In [2]:

import pandas as pd
import cv2
import mediapipe as mp
import numpy as np
import os
import re
import time
import tensorflow as tf
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,Masking,Conv1D, Flatten, Conv2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from functions.augmentation import augment_frame

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5, min_tracking_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

In [38]:
def extract_landmarks(video_path):
    #video_path =os.path.join("raw_data/ASL/videos",str(video_name))

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return np.nan

    else:
        landmarks_list = []


        frame_index = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            #frame = normalize_frame(frame)
            #frame = augment_frame(frame, flip=True, rotate=True, brightness=True)

            frame = cv2.flip(frame, 1)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            results = hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    landmarks = [(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark]
                    landmarks_list.append(landmarks)

                    mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                    #cv2.imshow('landmarks',frame)
                    #cv2.waitKey(200)

        """normalized_lm = []
        for frame in landmarks_list:
            # Extract wrist coordinates
            wrist_x, wrist_y, wrist_z = frame[0]

            # Normalize each landmark in the frame relative to the wrist
            normalized_frame = []
            for landmark in frame:  # Iterate over (x, y, z) coordinates
                normalized_x = landmark[0] - wrist_x
                normalized_y = landmark[1] - wrist_y
                normalized_z = landmark[2] - wrist_z
                normalized_frame.append((normalized_x, normalized_y, normalized_z))



            normalized_lm.append(normalized_frame)"""


        cap.release()
        return np.array(landmarks_list)


In [39]:
def extract_landmarks_augmented(video_path):
    #video_path =os.path.join("raw_data/ASL/videos",str(video_name))

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return np.nan

    else:
        landmarks_list = []


        frame_index = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            #frame = normalize_frame(frame)
            frame = augment_frame(frame, flip=True, rotate=True, brightness=False)

            frame = cv2.flip(frame, 1)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            results = hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    landmarks = [(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark]
                    landmarks_list.append(landmarks)

                    mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                    #cv2.imshow('landmarks',frame)
                    #cv2.waitKey(200)

        """normalized_lm = []
        for frame in landmarks_list:
            # Extract wrist coordinates
            wrist_x, wrist_y, wrist_z = frame[0]

            # Normalize each landmark in the frame relative to the wrist
            normalized_frame = []
            for landmark in frame:  # Iterate over (x, y, z) coordinates
                normalized_x = landmark[0] - wrist_x
                normalized_y = landmark[1] - wrist_y
                normalized_z = landmark[2] - wrist_z
                normalized_frame.append((normalized_x, normalized_y, normalized_z))



            normalized_lm.append(normalized_frame)"""


        cap.release()
        return np.array(landmarks_list)

In [40]:
def extract_landmarks_bright(video_path):
    #video_path =os.path.join("raw_data/ASL/videos",str(video_name))

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return np.nan

    else:
        landmarks_list = []


        frame_index = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            #frame = normalize_frame(frame)
            frame = augment_frame(frame, flip=False, rotate=False, brightness=True)

            frame = cv2.flip(frame, 1)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            results = hands.process(rgb_frame)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    landmarks = [(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark]
                    landmarks_list.append(landmarks)

                    mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                    #cv2.imshow('landmarks',frame)
                    #cv2.waitKey(200)

        """normalized_lm = []
        for frame in landmarks_list:
            # Extract wrist coordinates
            wrist_x, wrist_y, wrist_z = frame[0]

            # Normalize each landmark in the frame relative to the wrist
            normalized_frame = []
            for landmark in frame:  # Iterate over (x, y, z) coordinates
                normalized_x = landmark[0] - wrist_x
                normalized_y = landmark[1] - wrist_y
                normalized_z = landmark[2] - wrist_z
                normalized_frame.append((normalized_x, normalized_y, normalized_z))



            normalized_lm.append(normalized_frame)"""


        cap.release()
        return np.array(landmarks_list)

In [41]:
lm_list = []

path = 'raw_data/Citizen/videos'
for item in tqdm(os.listdir(path),desc = 'Processing videos'):
    item_path = os.path.join(path,item)
    landmark = extract_landmarks(item_path)
    lm_list.append(landmark)

"""for item in tqdm(os.listdir(path),desc = 'Processing videos'):
    item_path = os.path.join(path,item)
    landmark = extract_landmarks_augmented(item_path)
    lm_list.append(landmark)"""

for item in tqdm(os.listdir(path),desc = 'Processing videos'):
    item_path = os.path.join(path,item)
    landmark = extract_landmarks_bright(item_path)
    lm_list.append(landmark)

Processing videos: 100%|██████████| 459/459 [24:01<00:00,  3.14s/it]
Processing videos: 100%|██████████| 459/459 [27:34<00:00,  3.60s/it] 


In [42]:
len(lm_list)

918

In [43]:
landmarks_padded = pad_sequences(lm_list,padding = "post")

In [44]:
word_list = []

for item in os.listdir(path):
    item_path = os.path.join(path,item)

    name = lambda x:re.sub(r'\s|seed|\d+|-', '', x[0:-4])

    word_list.append(name(item))

In [11]:
len(word_list)

459

In [45]:
full_word_list = word_list+word_list

In [46]:
len(full_word_list)

918

In [47]:
np.save('lms-bright.npy',landmarks_padded)
np.save("labels-bright.npy",full_word_list)

In [48]:
y = np.load("labels-bright.npy")
X = np.load("lms-bright.npy")


In [49]:

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_encoded = to_categorical(y_encoded)
label_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}
y_encoded.shape

(918, 16)

In [50]:
X_reshaped = np.array([x.reshape(x.shape[0], -1) for x in X])

In [51]:
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped, y_encoded, test_size=0.20, random_state=42)

In [52]:
X_train.shape

(734, 276, 63)

In [57]:
sequence_length = X_train[0].shape[0]  # Number of time steps (max length)
num_features = X_train[0].shape[1]    # This is now 63 (flattened features per time step)



model = Sequential()

model.add(Masking(mask_value=0.0, input_shape=(sequence_length, num_features)))

#model.add(Conv1D(filters=128, kernel_size=2, activation='relu'))
#model.add(Dropout(0.3))
#model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
#model.add(Dropout(0.3))


model.add(LSTM(256, return_sequences=True))  # First LSTM layer with sequences returned
#model.add(Dropout(0.3))
model.add(LSTM(128, return_sequences=False))  # Second LSTM layer
model.add(Dropout(0.3))

model.add(Dense(16,activation = 'softmax'))


model.compile(loss = 'categorical_crossentropy',optimizer = Adam(),metrics =["accuracy"])

model.summary()

  super().__init__(**kwargs)


In [59]:

es = EarlyStopping(patience = 5, restore_best_weights=True)

model.fit(X_train,y_train,validation_split = 0.2,epochs = 100,batch_size =32,callbacks=es)

Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1s/step - accuracy: 0.1922 - loss: 2.5477 - val_accuracy: 0.1973 - val_loss: 2.5501
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1s/step - accuracy: 0.2707 - loss: 2.4340 - val_accuracy: 0.1973 - val_loss: 2.5133
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 1s/step - accuracy: 0.2297 - loss: 2.4651 - val_accuracy: 0.1905 - val_loss: 2.4807
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1s/step - accuracy: 0.2613 - loss: 2.3943 - val_accuracy: 0.2177 - val_loss: 2.4784
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1s/step - accuracy: 0.2422 - loss: 2.4255 - val_accuracy: 0.2177 - val_loss: 2.4618
Epoch 6/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1s/step - accuracy: 0.2523 - loss: 2.3487 - val_accuracy: 0.2313 - val_loss: 2.4501
Epoch 7/100
[1m19/19[0m [32m━━━

<keras.src.callbacks.history.History at 0x7f4e189cfbb0>