In [22]:
import cv2
import mediapipe as mp
import numpy as np
import os
import imghdr
import time
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import save_model, load_model
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization

from sklearn.utils import shuffle

In [48]:
# Define the brightness and contrast adjustment function
def adjust_brightness_contrast(image, brightness=40, contrast=1.0):
    # Convert to float to prevent clipping
    img = image.astype(np.float32)
    # Adjust brightness and contrast
    img = img * contrast + brightness
    # Clip to keep pixel values between 0 and 255 and convert back to uint8
    img = np.clip(img, 0, 255).astype(np.uint8)
    return img

# Ranges for brightness and contrast to test
brightness_values = range(-20, 71, 5)  # e.g., from -50 to 50 in steps of 20
contrast_values = [0.5, 0.75, 1.0, 1.25, 1.5]

In [33]:
def is_image(file_path):
    # Returns the type of image if valid, otherwise None
    return imghdr.what(file_path) is not None

In [45]:
# Initialize MediaPipe Hand model
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Path where images are stored for each letter in the ASL alphabet
data_dir = "../raw_data/asl_alphabet_dataset/asl_alphabet_train" # Change according to local dataset
#data_dir = "../raw_data/asl_cropped_dataset/asl_dataset"

landmark_data = []
labels = []

I0000 00:00:1730454428.798871  240529 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1730454428.835648  294382 gl_context.cc:357] GL version: 3.1 (OpenGL ES 3.1 Mesa 23.2.1-1ubuntu3.1~22.04.2), renderer: D3D12 (NVIDIA Quadro RTX 3000)


In [46]:
# Collect landmarks for each letter
for letter in os.listdir(data_dir):
    #if letter=="C":
    #    break
    letter_dir = os.path.join(data_dir, letter)
    letter_count = 0
    for i, img_path in enumerate(os.listdir(letter_dir)):        
        if not is_image(os.path.join(letter_dir, img_path)):
            continue
        #if i >= 300:
        #    break
        #print(os.path.join(letter_dir, img_path))
        img = cv2.imread(os.path.join(letter_dir, img_path))
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        results = hands.process(img_rgb)
        if results.multi_hand_landmarks:
            letter_count += 1
            landmarks = []
            for lm in results.multi_hand_landmarks[0].landmark:
                landmarks.extend([lm.x, lm.y, lm.z])  # Flattened landmark vector
            landmark_data.append(landmarks)
            labels.append(letter.upper())  # Store the label (e.g., "A", "B", etc.)
    print(letter, letter_count)

H 2204
X 1860


KeyboardInterrupt: 

In [49]:
# Collect landmarks for each letter
for letter in os.listdir(data_dir):
    #if letter=="C":
    #    break
    letter_dir = os.path.join(data_dir, letter)
    letter_count = 0
    for i, img_path in enumerate(os.listdir(letter_dir)):        
        if not is_image(os.path.join(letter_dir, img_path)):
            continue
        #if i >= 300:
        #    break
        #print(os.path.join(letter_dir, img_path))
        img = cv2.imread(os.path.join(letter_dir, img_path))
        landmarks_found = False
        for brightness in brightness_values:
            for contrast in contrast_values:
                # Adjust brightness and contrast
                adjusted_image = adjust_brightness_contrast(img, brightness, contrast)                
                # Run MediaPipe hand detection
                results = hands.process(cv2.cvtColor(adjusted_image, cv2.COLOR_BGR2RGB))
                # Check for hand landmarks and store them
                if results.multi_hand_landmarks:
                    landmarks = []
                    landmarks_found = True
                    letter_count += 1
                    for lm in results.multi_hand_landmarks[0].landmark:
                        landmarks.extend([lm.x, lm.y, lm.z])  # Flattened landmark vector
                    landmark_data.append(landmarks)
                    labels.append(letter)  # Store the label (e.g., "A", "B", etc.)
                    break
            if landmarks_found:
                #print(f'Brightness: {brightness}, Contrast: {contrast}')
                break
    print(letter, letter_count)

H 2878


KeyboardInterrupt: 

In [6]:
# Convert to arrays for model input
landmark_data = np.array(landmark_data)
labels = np.array(labels)

print(labels)
# Save the arrays to .npy files
np.save("landmark_data_cropped.npy", landmark_data)
np.save("labels_cropped.npy", labels)

# Normalize landmarks between 0 and 1
landmark_data = landmark_data / np.max(landmark_data)

# Encode labels as integers and convert to categorical
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

np.save('label_classes.npy', label_encoder.classes_)

['H' 'H' 'H' ... 'A' 'A' 'A']


In [23]:
labels = np.load("labels.npy")
landmark_data = np.load("landmark_data.npy")

# Normalize landmarks relative to the wrist (landmark 0) for each frame
normalized_landmark_data = []
for frame in landmark_data:
    # Extract wrist coordinates
    wrist_x, wrist_y, wrist_z = frame[0], frame[1], frame[2]

    # Normalize each landmark in the frame relative to the wrist
    normalized_frame = []
    for i in range(0, len(frame), 3):  # Iterate over (x, y, z) coordinates
        normalized_x = frame[i] - wrist_x
        normalized_y = frame[i + 1] - wrist_y
        normalized_z = frame[i + 2] - wrist_z
        normalized_frame.extend([normalized_x, normalized_y, normalized_z])

    normalized_landmark_data.append(normalized_frame)

# Convert to numpy array
normalized_landmark_data = np.array(normalized_landmark_data)

# Encode labels as integers and convert to categorical
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

In [50]:
labels = np.load("../models/production_model/labels.npy")
landmark_data = np.load("../models/production_model/landmark_data.npy")

# Encode labels as integers and convert to categorical
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

np.save('../models/production_model/label_classes.npy', label_encoder.classes_)

In [24]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(normalized_landmark_data, labels_categorical, test_size=0.2, random_state=42)

In [25]:
# Shuffle training data and labels in unison
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Optionally shuffle test data too
X_test, y_test = shuffle(X_test, y_test, random_state=42)

In [9]:
# Build 1D CNN model
model = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(2),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1730444519.839502    2492 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4043 MB memory:  -> device: 0, name: Quadro RTX 3000, pci bus id: 0000:01:00.0, compute capability: 7.5


In [27]:
# Deeper CNN model with regularization
model = Sequential([
    # First convolutional block
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    # Second convolutional block
    Conv1D(128, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    # Third convolutional block
    Conv1D(256, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    # Fully connected layers
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')  # Output layer for classification
])

In [28]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

es = EarlyStopping(patience = 5, restore_best_weights=True)

# Train the model
model.fit(X_train[..., np.newaxis], y_train, epochs=40, batch_size=32, validation_data=(X_test[..., np.newaxis], y_test), callbacks=es)
#model.save("asl_sign_language_model.h5")
save_model(model, 'asl_sign_language_model_large_normalized.keras')

Epoch 1/40
[1m1863/1863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 9ms/step - accuracy: 0.6924 - loss: 1.1158 - val_accuracy: 0.9391 - val_loss: 0.2240
Epoch 2/40
[1m1863/1863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.8975 - loss: 0.3502 - val_accuracy: 0.9477 - val_loss: 0.1930
Epoch 3/40
[1m1863/1863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.9123 - loss: 0.3052 - val_accuracy: 0.9574 - val_loss: 0.1677
Epoch 4/40
[1m1863/1863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.9227 - loss: 0.2633 - val_accuracy: 0.9633 - val_loss: 0.1408
Epoch 5/40
[1m1863/1863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9279 - loss: 0.2410 - val_accuracy: 0.9617 - val_loss: 0.1329
Epoch 6/40
[1m1863/1863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.9321 - loss: 0.2354 - val_accuracy: 0.9637 - val_loss: 0.1265
Epoch 7/40

In [29]:
# Load the trained model
#model = tf.keras.models.load_model("asl_sign_language_model.h5")
model = tf.keras.models.load_model("asl_sign_language_model_large_normalized.keras")

In [13]:
# Encode labels as integers and convert to categorical
labels = np.load("labels.npy")
landmark_data = np.load("landmark_data.npy")

label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

In [51]:
# Initialize MediaPipe Hands and drawing utilities
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1)
mp_drawing = mp.solutions.drawing_utils

scaling_factor = np.max(landmark_data)

# Initialize webcam
#cap = cv2.VideoCapture(0)
#cap = cv2.VideoCapture("http://localhost:5000/video_feed")
cap = cv2.VideoCapture("http://192.168.2.31:5000/video_feed")

sequence = []
sequence_length = 1  # Set sequence length to 10 frames for rolling window approach - not working yet

while cap.isOpened():
    ret, frame = cap.read()
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        landmarks = []
        #for lm in results.multi_hand_landmarks[0].landmark:
        #    landmarks.extend([lm.x, lm.y, lm.z])
        # Retrieve wrist coordinates (landmark 0)
        wrist = results.multi_hand_landmarks[0].landmark[0]

        for lm in results.multi_hand_landmarks[0].landmark:
            # Normalize each landmark relative to the wrist
            normalized_x = lm.x - wrist.x
            normalized_y = lm.y - wrist.y
            normalized_z = lm.z - wrist.z
            landmarks.extend([normalized_x, normalized_y, normalized_z])

        # Draw hand landmarks on the frame
        mp_drawing.draw_landmarks(
            frame, 
            results.multi_hand_landmarks[0], 
            mp_hands.HAND_CONNECTIONS
        )

        # Append new frame landmarks to sequence
        sequence.append(landmarks)
        if len(sequence) > sequence_length:
            sequence.pop(0)

        # Predict only if the sequence is full
        if len(sequence) == sequence_length:
            sequence_input = np.array(sequence).flatten()[np.newaxis, ..., np.newaxis]
            #sequence_input = sequence_input/scaling_factor
            prediction = model.predict(sequence_input)
            predicted_label_index = np.argmax(prediction)
            predicted_label = label_encoder.inverse_transform([predicted_label_index])
            confidence = prediction[0][predicted_label_index] * 100  # Get confidence percentage

            # Display prediction and confidence
            cv2.putText(frame, f"Predicted: {predicted_label[0]} ({confidence:.2f}%)", 
                        (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)  # Black text color
    
    # Display the frame
    cv2.imshow("ASL Sign Language Detection", frame)
    
    # Press 'C' to terminate the loop
    if cv2.waitKey(1) & 0xFF == ord("c"):
        break

cap.release()
cv2.destroyAllWindows()

I0000 00:00:1730461524.655586  240529 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1730461524.699214    5343 gl_context.cc:357] GL version: 3.1 (OpenGL ES 3.1 Mesa 23.2.1-1ubuntu3.1~22.04.2), renderer: D3D12 (NVIDIA Quadro RTX 3000)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19

In [30]:
# Load the trained model
model = tf.keras.models.load_model("asl_sign_language_model_large.keras")

labels = np.load("labels.npy")
#landmark_data = np.load("landmark_data.npy")

label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

# Initialize MediaPipe Hands and drawing utilities
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1)
mp_drawing = mp.solutions.drawing_utils

# Root directory with test images in subdirectories
root_test_dir = "../raw_data/test_set_pics"

# Initialize a list to store processed images
processed_images = []

# Iterate through each subdirectory in the root directory
for subdir_name in os.listdir(root_test_dir):
    subdir_path = os.path.join(root_test_dir, subdir_name)
    
    # Ensure it is a directory
    if os.path.isdir(subdir_path):
        actual_label = subdir_name  # Use the subdirectory name as the actual label

        # Iterate through each image in the subdirectory
        for img_name in os.listdir(subdir_path):
            img_path = os.path.join(subdir_path, img_name)
            print(img_path)
            img = cv2.imread(img_path)
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            results = hands.process(img_rgb)

            if results.multi_hand_landmarks:
                # Prepare landmarks for model prediction
                landmarks = []
                wrist = results.multi_hand_landmarks[0].landmark[0]
                for lm in results.multi_hand_landmarks[0].landmark:
                    # Normalize landmarks relative to the wrist
                    normalized_x = lm.x - wrist.x
                    normalized_y = lm.y - wrist.y
                    normalized_z = lm.z - wrist.z
                    landmarks.extend([normalized_x, normalized_y, normalized_z])

                # Prepare the input for the model
                sequence_input = np.array(landmarks)[np.newaxis, ..., np.newaxis]

                # Make prediction
                prediction = model.predict(sequence_input)
                predicted_label_index = np.argmax(prediction)
                predicted_label = label_encoder.inverse_transform([predicted_label_index])
                confidence = prediction[0][predicted_label_index] * 100  # Confidence percentage

                # Draw landmarks on the image
                mp_drawing.draw_landmarks(
                    img,
                    results.multi_hand_landmarks[0],
                    mp_hands.HAND_CONNECTIONS
                )

                # Add actual label, predicted label, and confidence to the image
                cv2.putText(img, f"Actual: {actual_label}", (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 2.0, (0, 255, 0), 6)  # Green text color
                cv2.putText(img, f"Predicted: {predicted_label[0]} ({confidence:.2f}%)", (10, 80), cv2.FONT_HERSHEY_SIMPLEX, 2.0, (0, 0, 255), 6)  # Red text color


            # Append processed image to the list
            processed_images.append(img)

# Display each processed image and check for "ESC" key press
for processed_img in processed_images:
    cv2.imshow("Processed Test Image", processed_img)
    key = cv2.waitKey(0)
    if key == 27:  # ESC key
        break

cv2.destroyAllWindows()


I0000 00:00:1730452480.733129  240529 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1730452480.760146  284905 gl_context.cc:357] GL version: 3.1 (OpenGL ES 3.1 Mesa 23.2.1-1ubuntu3.1~22.04.2), renderer: D3D12 (NVIDIA Quadro RTX 3000)


../raw_data/test_set_pics/H/test_H_4.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 316ms/step
../raw_data/test_set_pics/H/test_H_2.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
../raw_data/test_set_pics/H/test_H_5.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
../raw_data/test_set_pics/H/test_H_3.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
../raw_data/test_set_pics/H/test_H_1.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
../raw_data/test_set_pics/X/test_X_2.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
../raw_data/test_set_pics/X/test_X_1.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
../raw_data/test_set_pics/X/test_X_3.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
../raw_data/test_set_pics/X/test_X_4.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [20]:
import numpy as np
from scipy import ndimage

labels = np.load("labels.npy")
landmark_data = np.load("landmark_data.npy")

count_H = np.count_nonzero(labels == 'H')

print("Number of 'H's:", count_H)

Number of 'H's: 2762
