# Develop a hand gesture recognition that can accurately identify and classify different hand gestures from image data

## 1. Importing the specific libraries

In [3]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import mediapipe as mp
import cv2

In [4]:
folder_to_gesture = {
    '01_palm': 'palm',
    '02_l': 'l',
    '03_fist': 'fist',
    '04_fist_moved': 'fist_moved',
    '05_thumb': 'thumb',
    '06_index': 'index',
    '07_ok': 'ok',
    '08_palm_moved': 'palm_moved',
    '09_c': 'c',
    '010_down' : 'down'
}

## 2. Preprocessing the data

In [5]:
def data(folder_parent, img_height, img_width):
    x,y = [], []
    gestures = set()

    for folder in os.listdir(folder_parent):
        child_folder = os.path.join(folder_parent, folder)
        if os.path.isdir(child_folder):
            for gesture_folder in os.listdir(child_folder):
                gesture_photo = os.path.join(child_folder, gesture_folder)
                gesture_label = folder_to_gesture.get(gesture_folder)
                gestures.add(gesture_label)

                for img_type in os.listdir(gesture_photo):
                    img_path = os.path.join(gesture_photo, img_type)
                    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                    if img is not None:
                        img = cv2.resize(img, (img_height, img_width))
                        img = img_to_array(img)
                        x.append(img)
                        y.append(gesture_label)
    gestures = list(gestures)
    mapping = {gesture: idx for idx, gesture in enumerate(gestures)}
    y = np.array([mapping[gesture] for gesture in y])

    x = np.array(x , dtype="float")/255.0
    y = to_categorical(y, num_classes= len(gestures))

    return x , y , gestures

In [6]:
parent_folder = r"C:\Users\mayank dandriyal\Desktop\prodigy_internship\task4\leapGestRecog"
height, width = 64,64

In [7]:
x , y , gestures = data(parent_folder, height, width)

In [9]:
gestures

[None,
 'l',
 'index',
 'palm',
 'palm_moved',
 'ok',
 'c',
 'fist',
 'fist_moved',
 'thumb']

In [10]:
x

array([[[[0.01960784],
         [0.02352941],
         [0.01568627],
         ...,
         [0.01176471],
         [0.01568627],
         [0.01568627]],

        [[0.01568627],
         [0.01960784],
         [0.02352941],
         ...,
         [0.01176471],
         [0.01568627],
         [0.00784314]],

        [[0.01568627],
         [0.02352941],
         [0.01568627],
         ...,
         [0.00784314],
         [0.01176471],
         [0.01568627]],

        ...,

        [[0.01960784],
         [0.02352941],
         [0.02352941],
         ...,
         [0.03137255],
         [0.02352941],
         [0.01960784]],

        [[0.01960784],
         [0.01960784],
         [0.01568627],
         ...,
         [0.01960784],
         [0.01960784],
         [0.01960784]],

        [[0.01568627],
         [0.02352941],
         [0.01960784],
         ...,
         [0.01568627],
         [0.01960784],
         [0.01568627]]],


       [[[0.01568627],
         [0.01568627],
         [0.01

In [11]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [12]:
x_train, x_temp, y_train, y_temp = train_test_split(x , y , test_size= 0.2, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size= 0.2, random_state= 42)

In [13]:
print(f"Total gestures : {gestures}")
print(f"Training set : {len(x_train)}")
print(f"Testing set : {len(x_test)}")
print(f"Validation set : {len(x_val)}")

Total gestures : [None, 'l', 'index', 'palm', 'palm_moved', 'ok', 'c', 'fist', 'fist_moved', 'thumb']
Training set : 16000
Testing set : 800
Validation set : 3200


In [14]:
num_classes = len(gestures)

## 3. Intializing CNN and fitting the model with the data

In [15]:
input_layer = Input(shape= (height, width, 1))

x = Conv2D(32, (3, 3), activation='relu')(input_layer)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output_layer = Dense(num_classes, activation='softmax')(x)

In [16]:
model = Model(inputs = input_layer, outputs = output_layer)

In [17]:
model.compile(optimizer= Adam(), loss= 'categorical_crossentropy', metrics= ['accuracy'])

In [18]:
model.fit(x_train, y_train, epochs= 30, validation_data=(x_val, y_val), batch_size= 32)

Epoch 1/30
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.6651 - loss: 0.9547 - val_accuracy: 0.9956 - val_loss: 0.0135
Epoch 2/30
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - accuracy: 0.9841 - loss: 0.0516 - val_accuracy: 0.9997 - val_loss: 0.0025
Epoch 3/30
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.9888 - loss: 0.0347 - val_accuracy: 0.9981 - val_loss: 0.0061
Epoch 4/30
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.9908 - loss: 0.0275 - val_accuracy: 1.0000 - val_loss: 3.4432e-04
Epoch 5/30
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.9947 - loss: 0.0196 - val_accuracy: 0.9997 - val_loss: 7.4737e-04
Epoch 6/30
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 0.9925 - loss: 0.0241 - val_accuracy: 0.9997 - val_loss: 7.2220e-04
Epoch 

<keras.src.callbacks.history.History at 0x1e45e258c80>

In [19]:
Loss, Accuracy = model.evaluate(x_test, y_test)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 1.0000 - loss: 3.8832e-10


In [20]:
print(f"The accuracy for this model is:- {Accuracy}")
print(f"The loss for this model is:- {Loss}")

The accuracy for this model is:- 1.0
The loss for this model is:- 4.4703479806784685e-10


In [21]:
model.save("hand_gesture.keras")

## 4. Using cv2 to capture hand gestures from the camera and labelling them as predicted by the model

In [28]:
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Open the camera
cap = cv2.VideoCapture(0)

while True:
    # Capture frame-by-frame
    ret, frame = cap.read()
    if not ret:
        break

    # Convert the frame to RGB
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Detect hand landmarks
    results = hands.process(frame_rgb)

    # Extract hand landmarks
    if results.multi_hand_landmarks:
        hand_landmarks = results.multi_hand_landmarks[0]  # Assuming only one hand is detected
        hand_landmarks_np = np.array([[lm.x, lm.y] for lm in hand_landmarks.landmark]).flatten()
        hand_landmarks_np = hand_landmarks_np.reshape(-1,2)
        # Convert normalized coordinates to pixel coordinates
        height, width, _ = frame.shape
        hand_landmarks_px = np.multiply(hand_landmarks_np, [width, height]).astype(int)

        # Extract ROI (Region of Interest) corresponding to the hand
        min_x, min_y = np.min(hand_landmarks_px, axis=0)
        max_x, max_y = np.max(hand_landmarks_px, axis=0)
        if min_x< max_x and min_y < max_y:
            hand_roi = frame[min_y:max_y, min_x:max_x]
            if not hand_roi.size == 0:
                hand_roi_resized = cv2.resize(hand_roi, (64, 64))
                hand_roi_gray = cv2.cvtColor(hand_roi_resized, cv2.COLOR_BGR2GRAY)
                hand_roi_normalized = hand_roi_gray / 255.0
                hand_roi_input = np.expand_dims(hand_roi_normalized, axis=0)
                hand_roi_input = np.expand_dims(hand_roi_input, axis=-1)  # Add batch and channel dimensions

                # Predict hand gesture
                prediction = model.predict(hand_roi_input)
                predicted_gesture = gestures[np.argmax(prediction)]
                cv2.putText(frame, "Predicted Gesture: " + predicted_gesture, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            else:
                print("Empty ROI. Skipping processing")
        else:
            print("Invalid ROI")

        # Draw landmarks on the frame
        for landmark in hand_landmarks.landmark:
            x, y = int(landmark.x * width), int(landmark.y * height)
            cv2.circle(frame, (x, y), 5, (0, 255, 0), -1)

    # Display the frame
    cv2.imshow('Hand Gesture Recognition', frame)

    # Exit loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
cv2.destroyAllWindows()