# Media Pipe + Data Modelling for Model Creation

In [1]:
import cv2
import mediapipe as mp
import numpy as np
import os
import time
import tensorflow as tf
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import save_model, load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

## Read this:

### check all paths before running

## Required Functions

In [2]:
def adjust_brightness_contrast(image, brightness=40, contrast=1.0):
    # Convert to float to prevent clipping
    img = image.astype(np.float32)
    # Adjust brightness and contrast
    img = img * contrast + brightness
    # Clip to keep pixel values between 0 and 255 and convert back to uint8
    img = np.clip(img, 0, 255).astype(np.uint8)
    return img

In [3]:
def normalization_landmarks(landmarks):
    # Normalize landmarks relative to the wrist (landmark 0) for each frame
    normalized_landmark_data = []
    for frame in landmarks:
        # Extract wrist coordinates
        wrist_x, wrist_y, wrist_z = frame[0], frame[1], frame[2]

        # Normalize each landmark in the frame relative to the wrist
        normalized_frame = []
        for i in range(0, len(frame), 3):  # Iterate over (x, y, z) coordinates
            normalized_x = frame[i] - wrist_x
            normalized_y = frame[i + 1] - wrist_y
            normalized_z = frame[i + 2] - wrist_z
            normalized_frame.extend([normalized_x, normalized_y, normalized_z])

        normalized_landmark_data.append(normalized_frame)

    # Convert to numpy array
    normalized_landmark_data = np.array(normalized_landmark_data)
    
    return normalized_landmark_data

In [4]:
def predict_image(directory):
    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1,min_detection_confidence=0.4)
    mp_drawing = mp.solutions.drawing_utils

    img = cv2.imread(directory)
    img = adjust_brightness_contrast(img, 40, 1)
    img_rbg =  cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    result = hands.process(img_rbg)

    sequence = []
    sequence_length = 1

    if result.multi_hand_landmarks:
        landmarks = []
        for lm in result.multi_hand_landmarks[0].landmark:
            landmarks.extend([lm.x, lm.y, lm.z])

        # Draw hand landmarks on the frame
        mp_drawing.draw_landmarks(
            img,
            result.multi_hand_landmarks[0],
            mp_hands.HAND_CONNECTIONS
        )

        # Append new frame landmarks to sequence

        sequence.append(landmarks)
        if len(sequence) > sequence_length:
            sequence.pop(0)

        if len(sequence) == sequence_length:
            sequence_input = np.array(sequence)
            sequence_input = normalization_landmarks(sequence_input)
            sequence_input = sequence_input.flatten()[np.newaxis, ..., np.newaxis]
            prediction = model.predict(sequence_input)
            predicted_label_index = np.argmax(prediction)
            predicted_label = label_encoder.inverse_transform([predicted_label_index])
            confidence = prediction[0][predicted_label_index]

    plt.axis('off')
    plt.imshow(img)
    return predicted_label

In [5]:
def evaluate_model(test_data_dir):
        mp_hands = mp.solutions.hands
        hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.4)


        labels1 = []
        landmark_data1 = []

        for letter in os.listdir(test_data_dir):

            letter_dir = os.path.join(test_data_dir, letter)
            for i, img_path in enumerate(os.listdir(letter_dir)):

                img = cv2.imread(os.path.join(letter_dir, img_path))
                img = adjust_brightness_contrast(img, 40, 1)

                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                results = hands.process(img_rgb)


                if results.multi_hand_landmarks:
                    landmarks = []
                    for lm in results.multi_hand_landmarks[0].landmark:
                        landmarks.extend([lm.x, lm.y, lm.z])
                    landmark_data1.append(landmarks)
                    labels1.append(letter)
        landmark_data1 = np.array(landmark_data1)
        labels1 = np.array(labels1)


        # Normalize landmarks between 0 and 1
        landmark_data1 = normalization_landmarks(landmark_data1)

        # Encode labels as integers and convert to categorical

        labels_encoded1 = label_encoder.transform(labels1)
        labels_categorical1 = to_categorical(labels_encoded1)
        landmark_data1 = np.reshape(landmark_data1,(-1,63,1))

        return model.evaluate(landmark_data1,labels_categorical1)

## MediaPipe Landmark Creation for Training Data

In [None]:
# Initialize MediaPipe Hand model
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.2)
mp_drawing = mp.solutions.drawing_utils

# Path where images are stored for each letter in the ASL alphabet
data_dir = "../raw_data/asl_alphabet_train/asl_alphabet_train" # Change according to local dataset

landmark_data = []
labels = []

In [None]:
# Collect landmarks for each letter
# This function does not normalize inside. We need to run Normalization function outside
for letter in os.listdir(data_dir):
    #if letter=="C":
    #    break
    letter_dir = os.path.join(data_dir, letter)
    for i, img_path in enumerate(os.listdir(letter_dir)):
        #if i >= 300:
        #    break
        img = cv2.imread(os.path.join(letter_dir, img_path))
        img = adjust_brightness_contrast(img, 40, 1)
        #img = adjust_brightness_contrast(img, 20, 0.7)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        results = hands.process(img_rgb)
        
        # Check for hand landmarks and store them
        if results.multi_hand_landmarks:
            landmarks = []
            for lm in results.multi_hand_landmarks[0].landmark:
                landmarks.extend([lm.x, lm.y, lm.z])  # Flattened landmark vector
            landmark_data.append(landmarks)
            labels.append(letter)  # Store the label (e.g., "A", "B", etc.)
    print(letter)

In [183]:
# Save the arrays to .npy files
# landmark_data is NOT normalized until now. Only what MediaPipe does.
np.save("landmark_data_v_large.npy", landmark_data)
np.save("labels_v_large.npy", labels)

## Preprocessing

### If you already have landmark_data and labels you can start running the following:

In [14]:
labels = np.load('modeling_stuff/labels_v_large.npy')
landmark_data = np.load('modeling_stuff/landmark_data_v_large.npy')

In [15]:
#Label encoding as integers and convert to categorical
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

In [16]:
#Applying Normalization function:
normalized_landmarks = normalization_landmarks(landmark_data)

### Model Prep

This is only if required but as we are using a different test set I will not test/train split

In [17]:
# Split into train and test sets if needed
X_train, X_test, y_train, y_test = train_test_split(normalized_landmarks, labels_categorical, stratify=labels_categorical, test_size=0.2, random_state=42)

## Modelling

### Building & Compiling

In [59]:
model = Sequential([
    # First dense block
    Dense(512, input_shape=(63,)),  # Input layer with a high number of neurons
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.4),

    # Second dense block
    Dense(512, kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.4),

    # Third dense block
    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.2),

    # Fourth dense block
    Dense(128),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.25),

    # Fifth dense block
    Dense(64),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),
    
    # Output layer for classification with softmax activation
    Dense(len(label_encoder.classes_), activation='softmax')
])



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [60]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'],)

### Fitting

In [61]:
es = EarlyStopping(monitor = 'val_accuracy',
                   patience = 5, 
                   restore_best_weights=True)

In [62]:
# Train the model
model.fit(X_train, 
          y_train, 
          epochs=100, 
          batch_size=64, 
          validation_data=(X_test[..., np.newaxis], y_test), 
          callbacks=es)

Epoch 1/100
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - accuracy: 0.7220 - loss: 1.3376 - val_accuracy: 0.9795 - val_loss: 0.2016
Epoch 2/100
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - accuracy: 0.9531 - loss: 0.3134 - val_accuracy: 0.9781 - val_loss: 0.2079
Epoch 3/100
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - accuracy: 0.9609 - loss: 0.2828 - val_accuracy: 0.9878 - val_loss: 0.1743
Epoch 4/100
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - accuracy: 0.9656 - loss: 0.2657 - val_accuracy: 0.9852 - val_loss: 0.1795
Epoch 5/100
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - accuracy: 0.9679 - loss: 0.2606 - val_accuracy: 0.9904 - val_loss: 0.1672
Epoch 6/100
[1m2875/2875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - accuracy: 0.9712 - loss: 0.2459 - val_accuracy: 0.9894 - val_loss: 0.1635
Epoc

<keras.src.callbacks.history.History at 0x33c251c00>

### Saving

In [64]:
save_model(model, 'asl_new_model.keras')


## Model Testing

### If you already have a model. You can load it up here:

In [None]:
# Load the trained model
# model = tf.keras.models.load_model("old_asl_sign_language_model.keras")

In [63]:
evaluate_model("raw_data/test_set_pics")
evaluate_model("raw_data/test_set_pics")

I0000 00:00:1731093544.839786 6620178 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1 Pro
W0000 00:00:1731093544.850211 6674290 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1731093544.855598 6674290 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7424 - loss: 1.4007


I0000 00:00:1731093559.559922 6620178 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1 Pro
W0000 00:00:1731093559.566208 6674442 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1731093559.572037 6674442 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7424 - loss: 1.4007


[1.2766408920288086, 0.7753623127937317]