In [None]:
import os
import json
import cv2
import numpy as np

# Function to load images and bounding boxes from a folder
def load_images_and_bboxes_from_folder(folder):
    images = []
    bboxes = []

    for filename in os.listdir(folder):
        if filename.endswith('.jpg'):
            # Load the image
            image_path = os.path.join(folder, filename)
            image = cv2.imread(image_path)
            image = cv2.resize(image, (224, 224))  # Resize to a standard size
            image = image / 255.0  # Normalize pixel values
            images.append(image)

            # Load the corresponding JSON file
            json_path = os.path.splitext(image_path)[0] + '.json'
            with open(json_path, 'r') as f:
                bbox_data = json.load(f)

                # Extract all x and y coordinates from the 'hand_pts' list
                x_coords = [point[0] for point in bbox_data['hand_pts']]
                y_coords = [point[1] for point in bbox_data['hand_pts']]

                # Determine bounding box (min x, min y, max x, max y)
                x_min, y_min = min(x_coords), min(y_coords)
                x_max, y_max = max(x_coords), max(y_coords)

                # Normalize the bounding box coordinates
                x_min /= image.shape[1]
                y_min /= image.shape[0]
                x_max /= image.shape[1]
                y_max /= image.shape[0]

                bboxes.append([x_min, y_min, x_max, y_max])

    return np.array(images), np.array(bboxes)

# Example usage
base_path = 'hand_labels_synth'
folders = ['synth1', 'synth2', 'synth3', 'synth4']

all_images = []
all_bboxes = []

for folder in folders:
    folder_path = os.path.join(base_path, folder)
    images, bboxes = load_images_and_bboxes_from_folder(folder_path)
    all_images.append(images)
    all_bboxes.append(bboxes)

# Convert lists to numpy arrays
all_images = np.concatenate(all_images, axis=0)
all_bboxes = np.concatenate(all_bboxes, axis=0)

print(f"Loaded {all_images.shape[0]} images and {all_bboxes.shape[0]} bounding boxes.")


In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into training, validation, and test sets
X_train, X_val, y_train, y_val = train_test_split(all_images, all_bboxes, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")


In [25]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load the YOLOv3 pre-trained model (converted to Keras)
model = load_model('yolov3.h5')  # Convert darknet weights to .h5 format

# Freeze all layers except the last layers
for layer in model.layers[:-3]:  # Keep only the last few layers trainable
    layer.trainable = False

# Modify the final output layers to match your dataset (for example, hands detection)
# Modify the last layer's output for 1 class (for hands)
output_layer = model.layers[-1].output  # Get the last layer's output
new_output_layer = Dense(3, activation='sigmoid')(output_layer)  # Modify output for your number of classes

# Create a new model with the new output layer
new_model = tf.keras.models.Model(inputs=model.input, outputs=new_output_layer)

# Compile the model with Adam optimizer and custom loss
new_model.compile(optimizer=Adam(learning_rate=1e-4),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

# Print summary of the new model
new_model.summary()

# Load your custom dataset for hands detection
# (X_train and y_train are your training data)
# Example of training the model:



Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None, None, 3)]      0         []                            
                                                                                                  
 conv_0 (Conv2D)             (None, None, None, 32)       864       ['input_1[0][0]']             
                                                                                                  
 bnorm_0 (BatchNormalizatio  (None, None, None, 32)       128       ['conv_0[0][0]']              
 n)                                                                                               
                                                                                                  
 leaky_0 (LeakyReLU)         (None, None, None, 32)       0         ['bnorm_0[0][0]']       

In [28]:
from tensorflow.keras.utils import to_categorical

# Convert labels to one-hot encoding (assuming 4 classes, update as per your task)
y_train = to_categorical(y_train, num_classes=4)
y_val = to_categorical(y_val, num_classes=4)
print(y_train)

print(y_val)

[[[[1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [0. 1. 0. 0.]
   [1. 0. 0. 0.]]

  [[1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [0. 1. 0. 0.]
   [1. 0. 0. 0.]]

  [[1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [0. 1. 0. 0.]
   [1. 0. 0. 0.]]

  [[1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [0. 1. 0. 0.]
   [1. 0. 0. 0.]]]


 [[[0. 1. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]]

  [[0. 1. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]]

  [[1. 0. 0. 0.]
   [0. 1. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]]

  [[0. 1. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]]]


 [[[1. 0. 0. 0.]
   [0. 1. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]]

  [[1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [0. 1. 0. 0.]
   [1. 0. 0. 0.]]

  [[1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [0. 1. 0. 0.]
   [1. 0. 0. 0.]]

  [[1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [0. 1. 0. 0.]
   [1. 0. 0. 0.]]]


 ...


 [[[0. 1. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]]

  [[0. 1. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]
   [1. 0. 0. 0.]]

  [[0. 

In [30]:
from tensorflow.keras.utils import to_categorical

# Assuming you have 4 classes
num_classes = 4

# Convert labels to one-hot encoding
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)

# Load the YOLOv3 pre-trained model (ensure the last layers are correctly configured)
# Example: Modify the output layer for your specific task (e.g., hand detection or classification)
new_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = new_model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))


Epoch 1/10


ValueError: in user code:

    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\hp\anaconda3\lib\site-packages\keras\src\backend.py", line 5573, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 4, 4, 4, 4) and (None, 28, 28, 3) are incompatible


In [ ]:
new_model.save('yolov3_finetuned.h5')


In [16]:
import cv2
import mediapipe as mp
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.models import load_model

# Load the sign language model
sign_language_model = load_model('sign_language.h5')

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Function to detect hands and predict sign language
def detect_hand_and_predict_sign(image_path):
    # Read the image
    original_image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)

    # Perform hand detection using MediaPipe
    results = hands.process(image_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Draw hand landmarks on the image
            mp_drawing.draw_landmarks(original_image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # Get bounding box around the hand
            x_min = int(min([landmark.x for landmark in hand_landmarks.landmark]) * original_image.shape[1])
            y_min = int(min([landmark.y for landmark in hand_landmarks.landmark]) * original_image.shape[0])
            x_max = int(max([landmark.x for landmark in hand_landmarks.landmark]) * original_image.shape[1])
            y_max = int(max([landmark.y for landmark in hand_landmarks.landmark]) * original_image.shape[0])

            # Crop the hand region
            cropped_image = original_image[y_min:y_max, x_min:x_max]
            cropped_image_resized = cv2.resize(cropped_image, (28, 28))
            cropped_image_gray = cv2.cvtColor(cropped_image_resized, cv2.COLOR_BGR2GRAY)
            cropped_image_gray = cropped_image_gray / 255.0
            cropped_image_gray = np.expand_dims(cropped_image_gray, axis=-1)
            cropped_image_gray = np.expand_dims(cropped_image_gray, axis=0)

            # Predict the sign language letter
            sign_language_prediction = sign_language_model.predict(cropped_image_gray)
            predicted_sign_language_class = np.argmax(sign_language_prediction[0])
            sign_language_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
            predicted_letter = sign_language_letters[predicted_sign_language_class]

            # Draw the bounding box and prediction on the image
            cv2.rectangle(original_image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
            cv2.putText(original_image, f"Predicted: {predicted_letter}", (x_min, y_min - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    # Display the image with the bounding box and prediction
    plt.imshow(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

# Example usage
image_path = 'G.jpg'
detect_hand_and_predict_sign(image_path)


error: OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\dnn\src\darknet\darknet_importer.cpp:210: error: (-212:Parsing error) Failed to open NetParameter file: yolov3_hand.cfg in function 'cv::dnn::dnn4_v20231225::readNetFromDarknet'


In [None]:
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
  9/357 [..............................] - ETA: 3:38:30 - loss: 0.6532

In [None]:
test_loss = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")


In [9]:
import matplotlib.pyplot as plt

def predict_and_visualize(image, model):
    predicted_bbox = model.predict(image[np.newaxis, ...])[0]
    x_min, y_min, x_max, y_max = predicted_bbox * [image.shape[1], image.shape[0], image.shape[1], image.shape[0]]
    
    # Draw bounding box on the image
    cv2.rectangle(image, (int(x_min), int(y_min)), (int(x_max), int(y_max)), (255, 0, 0), 2)
    
    # Display the image
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

# Example usage with a test image
predict_and_visualize(X_test[0], model)




error: OpenCV(4.9.0) d:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.simd_helpers.hpp:94: error: (-2:Unspecified error) in function '__cdecl cv::impl::`anonymous-namespace'::CvtHelper<struct cv::impl::`anonymous namespace'::Set<3,4,-1>,struct cv::impl::A0x59191d0d::Set<3,4,-1>,struct cv::impl::A0x59191d0d::Set<0,2,5>,4>::CvtHelper(const class cv::_InputArray &,const class cv::_OutputArray &,int)'
> Unsupported depth of input image:
>     'VDepth::contains(depth)'
> where
>     'depth' is 6 (CV_64F)


In [10]:
model.save('hand_detection_model.h5')

  saving_api.save_model(


In [None]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt

# Load the pre-trained models
hand_detection_model = load_model('hand_detection_model.h5')
sign_language_model = load_model('sign_language.h5')

def detect_hand_and_predict_sign(image_path):
    # Step 1: Predict the bounding box using the hand detection model
    original_image = cv2.imread(image_path)
    image = cv2.resize(original_image, (224, 224))
    image = image / 255.0
    image = np.expand_dims(image, axis=0)
    predicted_bbox = hand_detection_model.predict(image)[0]

    # Step 2: Crop the hand region from the original image
    x_min, y_min, x_max, y_max = predicted_bbox * [original_image.shape[1], original_image.shape[0], original_image.shape[1], original_image.shape[0]]
    cropped_image = original_image[int(y_min):int(y_max), int(x_min):int(x_max)]

    # Step 3: Preprocess the cropped image for sign language recognition
    cropped_image_resized = cv2.resize(cropped_image, (28, 28))
    cropped_image_gray = cv2.cvtColor(cropped_image_resized, cv2.COLOR_BGR2GRAY)
    cropped_image_gray = cropped_image_gray / 255.0
    cropped_image_gray = np.expand_dims(cropped_image_gray, axis=-1)  # Add channel dimension
    cropped_image_gray = np.expand_dims(cropped_image_gray, axis=0)  # Add batch dimension

    # Step 4: Predict the sign language using the sign language model
    sign_language_prediction = sign_language_model.predict(cropped_image_gray)

    # Get the predicted class (letter) from the output
    predicted_sign_language_class = np.argmax(sign_language_prediction[0])
    sign_language_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    predicted_letter = sign_language_letters[predicted_sign_language_class]

    # Step 5: Display the original image with bounding box and predicted letter
    plt.figure(figsize=(10, 10))
    plt.imshow(cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB))
    plt.gca().add_patch(plt.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min,
                                      linewidth=2, edgecolor='r', facecolor='none'))
    plt.text(x_min, y_min - 10, f'Predicted: {predicted_letter}', color='red', fontsize=15, weight='bold')
    plt.axis('off')
    plt.show()

    return predicted_letter, (int(x_min), int(y_min), int(x_max), int(y_max))

# Example usage:
image_path = 'A.jpg'
predicted_letter, bbox = detect_hand_and_predict_sign(image_path)
print(f"Predicted Letter: {predicted_letter}")
print(f"Bounding Box: {bbox}")
