In [1]:
import cv2
import mediapipe as mp
import numpy as np
import os
import time
import tensorflow as tf
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import save_model, load_model
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
def adjust_brightness_contrast(image, brightness=40, contrast=1.0):
    # Convert to float to prevent clipping
    img = image.astype(np.float32)
    # Adjust brightness and contrast
    img = img * contrast + brightness
    # Clip to keep pixel values between 0 and 255 and convert back to uint8
    img = np.clip(img, 0, 255).astype(np.uint8)
    return img

In [3]:
# Initialize MediaPipe Hand model
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.2)
mp_drawing = mp.solutions.drawing_utils

# Path where images are stored for each letter in the ASL alphabet
data_dir = "raw_data/asl_alphabet_train/asl_alphabet_train" # Change according to local dataset

landmark_data = []
labels = []

I0000 00:00:1730835815.338194 6166503 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1730835815.366423 6170260 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1730835815.373965 6170258 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [None]:
# Collect landmarks for each letter
for letter in os.listdir(data_dir):
    #if letter=="C":
    #    break
    letter_dir = os.path.join(data_dir, letter)
    for i, img_path in enumerate(os.listdir(letter_dir)):
        #if i >= 300:
        #    break
        img = cv2.imread(os.path.join(letter_dir, img_path))
        img = adjust_brightness_contrast(img, 40, 1)
        #img = adjust_brightness_contrast(img, 20, 0.7)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        results = hands.process(img_rgb)

        # Check for hand landmarks and store them
        if results.multi_hand_landmarks:
            landmarks = []
            for lm in results.multi_hand_landmarks[0].landmark:
                landmarks.extend([lm.x, lm.y, lm.z])  # Flattened landmark vector
            landmark_data.append(landmarks)
            labels.append(letter)  # Store the label (e.g., "A", "B", etc.)
    print(letter)

W0000 00:00:1730836034.243590 6170257 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


R
U
I
N
G
Z
T
S
A
F
O
H
del
nothing
space
M
J
C
D
V
Q
X
E
B
K
L
Y
P
W


In [6]:
print('done')

done


In [7]:
# Convert to arrays for model input
landmark_data = np.array(landmark_data)
labels = np.array(labels)

print(labels)
# Save the arrays to .npy files
np.save("landmark_data.npy", landmark_data)
np.save("labels.npy", labels)

# Normalize landmarks between 0 and 1
#landmark_data = landmark_data / np.max(landmark_data)

# Encode labels as integers and convert to categorical
#label_encoder = LabelEncoder()
#labels_encoded = label_encoder.fit_transform(labels)
#labels_categorical = to_categorical(labels_encoded)

['R' 'R' 'R' ... 'W' 'W' 'W']


In [10]:
# Normalize landmarks relative to the wrist (landmark 0) for each frame
normalized_landmark_data = []
for frame in landmark_data:
    # Extract wrist coordinates
    wrist_x, wrist_y, wrist_z = frame[0], frame[1], frame[2]

    # Normalize each landmark in the frame relative to the wrist
    normalized_frame = []
    for i in range(0, len(frame), 3):  # Iterate over (x, y, z) coordinates
        normalized_x = frame[i] - wrist_x
        normalized_y = frame[i + 1] - wrist_y
        normalized_z = frame[i + 2] - wrist_z
        normalized_frame.extend([normalized_x, normalized_y, normalized_z])

    normalized_landmark_data.append(normalized_frame)

# Convert to numpy array
normalized_landmark_data = np.array(normalized_landmark_data)

# Encode labels as integers and convert to categorical
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

In [11]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(normalized_landmark_data, labels_categorical, test_size=0.2, random_state=42)

In [12]:
# Build 1D CNN model
model = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(2),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

es = EarlyStopping(patience = 5, restore_best_weights=True)

# Train the model
model.fit(X_train[..., np.newaxis], y_train, epochs=40, batch_size=32, validation_data=(X_test[..., np.newaxis], y_test), callbacks=es)
#model.save("asl_sign_language_model.h5")
#save_model(model, 'asl_sign_language_model.keras')

Epoch 1/40
[1m5750/5750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.7984 - loss: 0.6512 - val_accuracy: 0.9833 - val_loss: 0.0580
Epoch 2/40
[1m5750/5750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.9659 - loss: 0.1122 - val_accuracy: 0.9898 - val_loss: 0.0409
Epoch 3/40
[1m5750/5750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - accuracy: 0.9772 - loss: 0.0787 - val_accuracy: 0.9921 - val_loss: 0.0339
Epoch 4/40
[1m5750/5750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.9817 - loss: 0.0632 - val_accuracy: 0.9915 - val_loss: 0.0347
Epoch 5/40
[1m5750/5750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 5ms/step - accuracy: 0.9844 - loss: 0.0556 - val_accuracy: 0.9942 - val_loss: 0.0247
Epoch 6/40
[1m5750/5750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - accuracy: 0.9866 - loss: 0.0462 - val_accuracy: 0.9943 - val_loss: 0.0245
Epoch 7/40

<keras.src.callbacks.history.History at 0x30981b7f0>

In [14]:
save_model(model, 'old_asl_sign_language_model.keras')

In [4]:
# Load the trained model
#model = tf.keras.models.load_model("asl_sign_language_model.h5")
#model = tf.keras.models.load_model("asl_sign_language_model_tf_2.18.keras")

In [5]:
# Encode labels as integers and convert to categorical
#labels = np.load("labels.npy")

#label_encoder = LabelEncoder()
#labels_encoded = label_encoder.fit_transform(labels)
#labels_categorical = to_categorical(labels_encoded)

In [None]:
labels

In [None]:
# Initialize MediaPipe Hands and drawing utilities
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1)
mp_drawing = mp.solutions.drawing_utils

# Initialize webcam
cap = cv2.VideoCapture(0)

sequence = []
sequence_length = 1  # Set sequence length to 10 frames for rolling window approach - not working yet

while cap.isOpened():
    ret, frame = cap.read()
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        landmarks = []
        for lm in results.multi_hand_landmarks[0].landmark:
            landmarks.extend([lm.x, lm.y, lm.z])

        # Draw hand landmarks on the frame
        mp_drawing.draw_landmarks(
            frame,
            results.multi_hand_landmarks[0],
            mp_hands.HAND_CONNECTIONS
        )

        # Append new frame landmarks to sequence
        sequence.append(landmarks)
        if len(sequence) > sequence_length:
            sequence.pop(0)

        # Predict only if the sequence is full
        if len(sequence) == sequence_length:
            sequence_input = np.array(sequence).flatten()[np.newaxis, ..., np.newaxis]
            prediction = model.predict(sequence_input)
            predicted_label_index = np.argmax(prediction)
            predicted_label = label_encoder.inverse_transform([predicted_label_index])
            confidence = prediction[0][predicted_label_index] * 100  # Get confidence percentage

            # Display prediction and confidence
            cv2.putText(frame, f"Predicted: {predicted_label[0]} ({confidence:.2f}%)",
                        (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)  # Black text color

    # Display the frame
    cv2.imshow("ASL Sign Language Detection", frame)

    # Press 'C' to terminate the loop
    if cv2.waitKey(1) & 0xFF == ord("c"):
        break

cap.release()
cv2.destroyAllWindows()

NameError: name 'mp' is not defined