In [16]:
import tensorflow
import mediapipe as mp
import cv2
import numpy as np
import sklearn
import matplotlib
import os
import sys

In [3]:
# Initialize mediapipe elements: Drawing utils & Hand model
# mp_hands supplies the Hands model and the HAND_CONNECTIONS topology used by the cells below.
mp_hands = mp.solutions.hands
# mp_drawing supplies draw_landmarks(), used to overlay detected joints on a frame.
mp_drawing = mp.solutions.drawing_utils

## Landmarks Detection Function

In [4]:
# Mediapipe Hands Detection

def hand_detection(image, model):
    """Run a MediaPipe model on a single OpenCV frame.

    Args:
        image: Frame in OpenCV's default BGR channel order.
        model: Object exposing ``process(rgb_image)`` (e.g. an open
            ``mp_hands.Hands`` instance).

    Returns:
        A ``(bgr_image, results)`` tuple: the frame converted back to BGR so it
        can be drawn on and shown with OpenCV, and whatever ``model.process``
        returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)    # the model is fed RGB, OpenCV delivers BGR

    # The frame is read-only for the duration of the inference call only.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)    # back to OpenCV's colour space
    return bgr_frame, results

In [5]:
def draw_landmarks(image, result):
    """Overlay every detected hand's joints and connections onto ``image``.

    Args:
        image: Frame to draw on; it is modified in place by
            ``mp_drawing.draw_landmarks``.
        result: Detection output exposing ``multi_hand_landmarks`` (falsy when
            no hand was found, otherwise an iterable with one entry per hand).

    Returns:
        None.
    """
    # Read the argument, not a notebook-level global, so the function draws
    # exactly what the caller passed in.
    detected_hands = result.multi_hand_landmarks
    if not detected_hands:    # nothing recognised in this frame
        return

    for hand in detected_hands:
        # Draw the joints of this hand and the connections between them.
        mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS)

In [18]:
# Live preview: show the webcam feed with hand landmarks drawn on top until
# 'q' is pressed or the camera stops delivering frames.
cap = cv2.VideoCapture(0)

try:
    # Using the mediapipe hands model
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=1) as hands:
        while cap.isOpened():
            ret, frame = cap.read()    # Get a frame from the webcam stream
            if not ret:
                # No frame was delivered (camera unplugged / busy); stop
                # instead of handing an empty frame to the colour conversion.
                break

            # Detections
            image, results = hand_detection(frame, hands)

            # Draw Landmarks
            draw_landmarks(image, results)

            cv2.imshow('Hand Tracking', image)    # Frame after applying the mediapipe drawings

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

## Extracting Keypoints from Mediapipe output

In [7]:
def extract_keypoints(results):
    """Flatten the world-space hand landmarks of ``results`` into a 1-D array.

    Args:
        results: Detection output exposing ``multi_hand_world_landmarks``
            (falsy when no hand was found; otherwise an iterable of hands, each
            with a ``landmark`` sequence of points having ``x``, ``y``, ``z``).

    Returns:
        A 1-D NumPy array ``[x0, y0, z0, x1, y1, z1, ...]`` covering every
        landmark of every detected hand, in order. When nothing was detected,
        an all-zero array of length ``21 * 3`` (63) is returned instead.
    """
    detected_hands = results.multi_hand_world_landmarks
    if not detected_hands:
        # Placeholder of 63 zeros for frames without a detection.
        return np.zeros(21 * 3)

    coordinates = [
        np.array([point.x, point.y, point.z])
        for hand in detected_hands
        for point in hand.landmark
    ]
    return np.array(coordinates).flatten()

In [32]:
# Sanity check on the flattened output. This relies on the global `results`
# left behind by the most recently executed capture loop.
extract_keypoints(results)

array([-0.03188626,  0.08063897, -0.01312534,  0.0064655 ,  0.0662742 ,
       -0.01613111,  0.03590117,  0.04104966, -0.01226182,  0.05813647,
        0.01183308, -0.00921023,  0.05867096, -0.01700037, -0.00477235,
        0.02659452,  0.00316647,  0.0090667 ,  0.04010674, -0.00659361,
       -0.01769856,  0.03818551,  0.00760557, -0.02270604,  0.02904305,
        0.01832306, -0.00430081,  0.00260312, -0.00394715,  0.00725539,
        0.01834306, -0.0179193 , -0.0281985 ,  0.01896388,  0.01029161,
       -0.03747998,  0.00750468,  0.01345858, -0.00973626, -0.01968946,
       -0.00460242, -0.00481195, -0.00340459, -0.01191723, -0.03401646,
        0.00016375,  0.0122586 , -0.04314898, -0.00785952,  0.02235452,
       -0.02049945, -0.03983998,  0.00471871, -0.01792581, -0.02849677,
       -0.00354113, -0.03641857, -0.01968493,  0.01249862, -0.04668313,
       -0.0286568 ,  0.02091947, -0.03463709])

## Setup Directories

In [30]:
# Dataset layout and size settings shared by the collection and training cells.
data_path = 'Data'    # root folder of the recorded keypoint files

# Class labels: 24 letters, 'j' and 'z' are not part of the set
# (presumably because they are signed with motion -- TODO confirm).
letters = np.array(list('abcdefghiklmnopqrstuvwxy'))

no_sequences = 25     # clips recorded per letter (original note: left hand = 25 && right hand = 25)
sequence_length = 10  # frames stored per clip

In [31]:
# Create data_path/<letter>/<sequence>/ for every letter and clip index.
for letter in letters:
    for sequence in range(no_sequences):
        # exist_ok=True makes re-running this cell harmless when the folders
        # already exist, while genuine failures (permissions, a file in the
        # way, ...) still raise instead of being silently swallowed.
        os.makedirs(os.path.join(data_path, letter, str(sequence)), exist_ok=True)

## Extract Data and store in directories

In [37]:
# Record the dataset: for every letter wait for the operator to press 's',
# then capture `no_sequences` clips of `sequence_length` frames each and store
# every frame's flattened keypoints as
# data_path/<letter>/<sequence>/<frame_number>.npy (np.save appends ".npy").
# Pressing 'q' at any point, or a failed camera read, aborts the whole run.
cap = cv2.VideoCapture(0)
quit_requested = False

try:
    # Using the mediapipe hands model
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=1) as hands:
        for letter in letters:
            # Hold on a live preview with a prompt until collection for this
            # letter is started with 's'.
            while not quit_requested:
                ret, frame = cap.read()
                if not ret:
                    quit_requested = True
                    break
                cv2.putText(frame, 'START Collection for letter \'{}\', press S.'.format(letter), (120, 200),
                            cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
                cv2.imshow('Frames Collection', frame)
                key = cv2.waitKey(10) & 0xFF
                if key == ord('s'):
                    break
                if key == ord('q'):
                    quit_requested = True

            for sequence in range(no_sequences):
                if quit_requested:
                    break

                for frame_number in range(sequence_length):
                    ret, frame = cap.read()    # Get a frame from the webcam stream
                    if not ret:
                        quit_requested = True
                        break

                    # Detections
                    image, results = hand_detection(frame, hands)
                    draw_landmarks(image, results)

                    cv2.putText(image, 'Collecting for letter \'{}\' video number {}'.format(letter, sequence), (120, 200),
                                cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
                    cv2.imshow('Frames Collection', image)    # Frame after applying the mediapipe drawings

                    # Extracting and saving NumPy arrays
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(data_path, letter, str(sequence), str(frame_number))
                    np.save(npy_path, keypoints)

                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        quit_requested = True
                        break

            if quit_requested:
                # Propagate the abort out of the letter loop as well; a plain
                # `break` in the frame loop would only end the current clip.
                break
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

## Pre-Processing data and Creating Labels

In [38]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [41]:
# Map each letter to its integer class id (its position in `letters`).
label_map = dict(zip(letters, range(len(letters))))

In [43]:
# Read the stored keypoint files back: one window (list of per-frame arrays)
# per recorded clip, plus the integer class id of the clip's letter.
sequences = []
labels = []
for letter in letters:
    letter_dir = os.path.join(data_path, letter)
    for sequence in range(no_sequences):
        window = [
            np.load(os.path.join(letter_dir, str(sequence), f"{frame_number}.npy"))
            for frame_number in range(sequence_length)
        ]
        sequences.append(window)
        labels.append(label_map[letter])

In [45]:
# Check the dimensions of the loaded dataset before it is turned into X.
np.array(sequences).shape    # (total_videos, frames_per_video, keypoints_per_frame)

(600, 10, 63)

In [50]:
# Stack the per-clip windows into a single NumPy array of model inputs.
X = np.array(sequences)

In [64]:
X.shape    # (videos, frames, keypoints); the trailing (frames, keypoints) pair is the input shape of the first LSTM layer

(600, 10, 63)

In [53]:
# Turn the integer class ids into categorical target vectors and cast them to int.
y = to_categorical(labels).astype(int)

In [55]:
# Hold out 5% of the clips for testing. A fixed random_state makes the split
# (and every metric computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

## Building and Training the model

In [61]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [62]:
# TensorBoard callback writing its logs to the 'Logs' folder; it is passed to model.fit below.
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [66]:
# Stacked-LSTM classifier: three recurrent layers followed by a small dense
# head with one softmax output per letter.
# The input shape is taken from the data, (frames_per_video, keypoints_per_frame),
# so it cannot drift from `sequence_length` or the keypoint vector size; the
# output size is derived from `letters` in the same way.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=X.shape[1:]),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),    # last recurrent layer emits one vector per clip
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(letters.shape[0], activation='softmax'),
])

In [71]:
# Categorical cross-entropy pairs with the categorical targets in y and the softmax output layer.
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# Train on the training split only; progress is logged through the TensorBoard callback.
# NOTE(review): 2000 epochs with no validation data or early stopping -- confirm this is intended.
model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])

In [73]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 10, 64)            32768     
                                                                 
 lstm_4 (LSTM)               (None, 10, 128)           98816     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 24)                792       
                                                                 
Total params: 188024 (734.47 KB)
Trainable params: 188

## Making Predictions and Saving Model

In [75]:
# Predictions for the held-out clips; one row of class scores per test sample.
res = model.predict(X_test)



In [None]:
# Testing: for every held-out clip, print whether the predicted letter matches
# the true one. The loop follows the actual number of predictions, so it keeps
# working when the test split size changes.
predicted_ids = np.argmax(res, axis=1)
true_ids = np.argmax(y_test, axis=1)
for i in range(len(res)):
    print(i, ":")
    print(letters[predicted_ids[i]] == letters[true_ids[i]])

In [90]:
# Persist the trained model to 'asl.keras' in the working directory.
model.save('asl.keras')

## Testing the Model

In [93]:
# Live test: classify a sliding window of the latest `sequence_length` frames
# from the webcam and display the most recent confident prediction.
sequence = []      # sliding window of per-frame keypoint vectors
threshold = 0.7    # minimum class score required to update the shown letter
letter = ''        # letter currently displayed; empty until a confident prediction exists

cap = cv2.VideoCapture(0)

try:
    # Using the mediapipe hands model
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=1) as hands:
        while cap.isOpened():
            ret, frame = cap.read()    # Get a frame from the webcam stream
            if not ret:
                # No frame was delivered; stop instead of processing an empty frame.
                break

            # Detections
            image, results = hand_detection(frame, hands)

            # Draw Landmarks
            draw_landmarks(image, results)

            # Testing Logic
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            # Predict only once a full window is available, and threshold that
            # fresh prediction here so no stale `res` from an earlier cell is read.
            if len(sequence) == sequence_length:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                best = np.argmax(res)
                print(letters[best])
                if res[best] > threshold:
                    letter = letters[best]

            # Rendering result in OpenCV: white banner across the full frame width
            cv2.rectangle(image, (0, 0), (image.shape[1], 40), (255, 255, 255), -1)
            cv2.putText(image, str(letter), (3, 30),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)

            cv2.imshow('Hand Tracking', image)    # Frame after applying the mediapipe drawings

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

SyntaxError: unmatched ')' (310028508.py, line 32)