Importing Libraries

In [32]:
import cv2

In [33]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
import pandas as pd
import time
from collections import deque
from sklearn.preprocessing import LabelEncoder

Model loading

In [34]:
mobilenet_model = tf.keras.models.load_model("sign_language_model_MobileNetV2.h5")
mlp_model = tf.keras.models.load_model("asl_mediapipe_mlp_model.h5")


Reading the mediapipe keypoints and defining the class labels

In [35]:
df = pd.read_csv("asl_mediapipe_keypoints_dataset.csv")
encoder = LabelEncoder()
encoder.fit(df["label"]) 

# Correct class labels
class_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
                'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space']

Mediapipe for hand keypoints

In [36]:
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7)

Define the height and width of the box, where mobilenet is to be predicting classes

In [37]:
HEIGHT_EXPAND = 220
WIDTH_EXPAND = 150

In [38]:
# Store the predicted sentence
predicted_sentence = ""
last_predicted_label = None
last_prediction_time = 0 

# 5 seconds cooldown for repeated letters
cooldown_time = 5 

In [39]:
def extract_landmark_features(hand_landmarks, handedness):
    """
    Extracts and normalizes 21 hand landmarks from MediaPipe.
    If the hand is right-handed, mirror it to match left-hand training data.
    """
    landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark])

    # Flip x-coordinates for right hand to match training data
    if handedness.classification[0].label == "Right":
        landmarks[:, 0] = 1 - landmarks[:, 0]

    return landmarks.flatten().reshape(1, -1)

Main Logic for comparision of confidence score of both models - MobileNet V2 and Mediapipe

In [40]:
cap = cv2.VideoCapture('http://192.168.100.33:4747/video')
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Flip frame for a mirrored effect
    frame = cv2.flip(frame, 1)

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)

    if results.multi_hand_landmarks:
        for hand_landmarks, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            ### MLP Prediction (Landmark-based)
            landmark_features = extract_landmark_features(hand_landmarks, handedness)
            mlp_pred = mlp_model.predict(landmark_features)
            mlp_class_index = np.argmax(mlp_pred)
            mlp_confidence = mlp_pred[0][mlp_class_index]
            mlp_label = encoder.inverse_transform([mlp_class_index])[0]

            ### Bounding Box Extraction for MobileNetV2
            x_min = min([lm.x for lm in hand_landmarks.landmark]) * frame.shape[1]
            y_min = min([lm.y for lm in hand_landmarks.landmark]) * frame.shape[0]
            x_max = max([lm.x for lm in hand_landmarks.landmark]) * frame.shape[1]
            y_max = max([lm.y for lm in hand_landmarks.landmark]) * frame.shape[0]

            x_min = max(0, int(x_min - WIDTH_EXPAND))   
            y_min = max(0, int(y_min - HEIGHT_EXPAND)) 
            x_max = min(frame.shape[1], int(x_max + WIDTH_EXPAND))  
            y_max = min(frame.shape[0], int(y_max + HEIGHT_EXPAND)) 

            hand_crop = frame[y_min:y_max, x_min:x_max]

            ### MobileNetV2 Prediction (Image-based)
            if hand_crop.shape[0] > 0 and hand_crop.shape[1] > 0:
                hand_resized = cv2.resize(hand_crop, (128, 128))
                hand_resized = np.expand_dims(hand_resized, axis=0) / 255.0  

                mobilenet_pred = mobilenet_model.predict(hand_resized)
                mobilenet_class_index = np.argmax(mobilenet_pred)
                mobilenet_confidence = mobilenet_pred[0][mobilenet_class_index]
                mobilenet_label = class_labels[mobilenet_class_index]

            ### Decision Fusion: Pick Most Confident Prediction
            if mobilenet_confidence > mlp_confidence:
                final_label = mobilenet_label
                final_confidence = mobilenet_confidence
            else:
                final_label = mlp_label
                final_confidence = mlp_confidence

            ### Logic to Prevent Repeated Predictions
            current_time = time.time()
            if final_label == last_predicted_label:
                if current_time - last_prediction_time < cooldown_time:
                    final_label = None  # Ignore repeated prediction
            else:
                last_predicted_label = final_label
                last_prediction_time = current_time

            ### Sentence Formation Logic
            if final_label and final_label not in ["nothing", "del", "space"]:
                predicted_sentence += final_label
            elif final_label == "space":
                predicted_sentence += " "
            elif final_label == "del":
                predicted_sentence = predicted_sentence[:-1]  # Remove last character

            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
            if final_label:
                cv2.putText(frame, f"{final_label} ({final_confidence:.2f})", (x_min, y_min - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    # Create a black bar for displaying sentence
    bar_height = 60
    frame_height, frame_width, _ = frame.shape
    cv2.rectangle(frame, (0, frame_height - bar_height), (frame_width, frame_height), (0, 0, 0), -1)
    cv2.putText(frame, predicted_sentence, (50, frame_height - 20),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

    cv2.imshow("Sign Language Recognition (MediaPipe + MobileNetV2)", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

From the experiment above it can be concluded that models based on convolutional neural networks aren't able to predict the signs correctly and even when combined with predictions of mediapipe multi-level perceptron, it still gives the wrong output.

Therefore fine tuning the mediapipe will be the best option available in the market.

Fine tuning the Mediapipe Model

In [41]:
mlp_model = tf.keras.models.load_model("asl_mediapipe_mlp_model.h5")

In [42]:
df = pd.read_csv("asl_mediapipe_keypoints_dataset.csv")
encoder = LabelEncoder()
encoder.fit(df["label"]) 

In [43]:
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7, max_num_hands=2)  # Allow max 2 hands

In [44]:
# Sentence formation logic
predicted_sentence = ""
last_predicted_label = None
last_prediction_time = 0

# 5 seconds cooldown for repeated letters
cooldown_time = 5 


In [45]:
# Stabilization buffer
# Stores last 5 predictions
stabilization_window = deque(maxlen=5)

# Must match for 4 out of 5 frames
stabilization_threshold = 4

two_hands_detected = False

In [46]:
def extract_landmark_features(hand_landmarks, handedness):
    """Extract and normalize 21 hand landmarks from MediaPipe."""
    landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark])

    # Flip x-coordinates for right hand to match left-hand training data
    if handedness.classification[0].label == "Right":
        landmarks[:, 0] = 1 - landmarks[:, 0]

    return landmarks.flatten().reshape(1, -1)

What is fine tuned

- Making of a buffer ensuring that model doesnt predicts images in between the hand sign change
- only predic when there is one hand in the frame, when 2 hand appear, give an warning

In [None]:
        bar_height = 60
        frame_height, frame_width, _ = frame.shape
        cv2.rectangle(frame, (0, frame_height - bar_height), (frame_width, frame_height), (0, 0, 0), -1)

        # Display Warning if Two Hands Detected
        if two_hands_detected:
            cv2.putText(frame, "Only One Hand Allowed!", (50, frame_height - 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        else:
            cv2.putText(frame, predicted_sentence, (50, frame_height - 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        # MINIMIZE DISPLAY - Resize to tiny window (320x240) but keep capture quality high
        display_frame = cv2.resize(frame, (320, 240))
        cv2.imshow("Sign Language Recognition (Fine Tuned MediaPipe)", display_frame)

Attempting to connect to DroidCam...
‚úÖ Camera initialized. Actual resolution: 1920x1080
Starting main loop... Press 'q' to quit.
Camera released and windows closed.


KeyboardInterrupt: 

In [59]:

# TEST: Identify which camera is being used

import cv2
import time

print("=" * 60)
print("CAMERA IDENTIFICATION TEST")
print("=" * 60)

# Try to open the same way as the main code
cap = cv2.VideoCapture(1)

if not cap.isOpened():
    cap = cv2.VideoCapture(0)
    test_index = 0
else:
    test_index = 1

if not cap.isOpened():
    cap = cv2.VideoCapture(1, cv2.CAP_DSHOW)
    test_index = 1

print(f"\n‚úÖ Camera opened at Index: {test_index}")

# Get camera properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
backend = cap.get(cv2.CAP_PROP_BACKEND_NAME)

print(f"üì∑ Resolution: {width}x{height}")
print(f"üé¨ FPS: {fps}")
print(f"üîß Backend: {backend}")

# Try to read a frame
ret, frame = cap.read()
if ret:
    print(f"\n‚úÖ Frame captured successfully!")
    print(f"   Frame shape: {frame.shape}")
    
    # Try to identify if it's DroidCam
    # DroidCam usually has specific characteristics
    if "droid" in str(cap.getBackendName()).lower():
        print("\nüéØ DETECTED: DroidCam (mobile camera)")
    elif width == 1280 and height == 720:
        print("\nüéØ LIKELY: DroidCam (1280x720 is common for DroidCam)")
    elif width == 640 and height == 480:
        print("\nü§î UNCLEAR: Could be either PC or DroidCam at 640x480")
    else:
        print(f"\nüíª LIKELY: PC's built-in camera ({width}x{height})")
else:
    print("\n‚ùå Could not read frame from camera")

# Display camera for 3 seconds to visually verify
print("\nüì∫ Displaying camera feed for 3 seconds...")
start_time = time.time()
while time.time() - start_time < 3:
    ret, frame = cap.read()
    if not ret:
        break
    
    # Add text to show which camera it is
    cv2.putText(frame, f"Camera Index: {test_index} | {width}x{height}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
    cv2.putText(frame, "Check if this is your phone (DroidCam) or PC camera", (10, 60),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 0), 2)
    
    cv2.imshow("Camera Test - Close window to continue", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

print("\n‚úÖ Camera test complete!")
print("=" * 60)

CAMERA IDENTIFICATION TEST

‚úÖ Camera opened at Index: 1


AttributeError: module 'cv2' has no attribute 'CAP_PROP_BACKEND_NAME'

In [None]:
import cv2

# Try indices 0, 1, 2, 3
for i in range(4):
    cap = cv2.VideoCapture(i)
    if cap.isOpened():
        print(f"Camera found at index {i}")
        cap.release()
    else:
        print(f"No camera at index {i}")

Camera found at index 0
Camera found at index 1
No camera at index 2
No camera at index 3
