In [6]:
import tensorflow
import mediapipe as mp
import cv2
import numpy as np
import sklearn
import matplotlib
import os




In [7]:
# MediaPipe handles used throughout the notebook:
#   mp_hands   -> the hands solution module (model class and HAND_CONNECTIONS)
#   mp_drawing -> drawing utilities used to render landmarks on frames
mp_hands, mp_drawing = mp.solutions.hands, mp.solutions.drawing_utils

## Landmarks Detection Function

In [8]:
# Mediapipe Hands Detection
def hand_detection(image, model):
    """Run the hand model on one OpenCV frame.

    Args:
        image: Frame in BGR channel order (OpenCV's default).
        model: Object exposing ``process(rgb_image)``, e.g. an ``mp_hands.Hands`` instance.

    Returns:
        A ``(image, results)`` tuple: the frame converted back to BGR, and
        whatever ``model.process`` returned for it.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)    # The model is fed RGB, not BGR

    # Keep the frame read-only for the duration of the model call
    rgb_frame.flags.writeable = False
    detections = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    # Hand back an OpenCV-compatible (BGR) frame together with the detections
    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), detections

In [9]:
def draw_landmarks(image, result):
    """Draw every detected hand's joints and connections onto ``image``.

    Args:
        image: Frame handed to ``mp_drawing.draw_landmarks`` as the drawing target.
        result: Output of the hands model; only its ``multi_hand_landmarks``
            attribute is read.

    Returns:
        None. Nothing is drawn when no hand landmarks were recognized.
    """
    # Read from the `result` argument. The previous body referenced a global
    # named `results`, which raised NameError on a fresh kernel and otherwise
    # silently ignored whatever the caller passed in.
    if not result.multi_hand_landmarks:
        return

    for hand in result.multi_hand_landmarks:    # One entry per detected hand
        mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS)

In [None]:
# Testing the Webcam Stream: show the live feed with hand landmarks drawn on it

cap = cv2.VideoCapture(0)    # Webcam stream component

try:
    # Using the mediapipe hands model
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=1) as hands:
        while cap.isOpened():
            ret, frame = cap.read()    # Get a frame from webcam stream
            if not ret:
                # No frame was delivered; stop instead of passing None to the detector
                break

            # Detections
            image, results = hand_detection(frame, hands)

            # Draw Landmarks
            draw_landmarks(image, results)

            cv2.imshow('Hand Tracking', image)    # Frame after applying the mediapipe drawings

            if cv2.waitKey(10) & 0xFF == ord('q'):    # Press 'Q' to exit
                break
finally:
    # Always free the camera and close the window, even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

## Extracting Keypoints from Mediapipe output

In [10]:
def extract_keypoints(results):
    """Flatten the world landmarks of the detected hand(s) into a 1-D array.

    Args:
        results: Output of the hands model; only ``multi_hand_world_landmarks``
            is read. Each entry is expected to expose a ``landmark`` iterable
            whose items have ``x``, ``y`` and ``z`` attributes.

    Returns:
        A 1-D NumPy array holding x, y, z for every landmark of every hand, in
        order. When no hand is detected, an all-zero array of length
        21 * 3 = 63 is returned.
    """
    detected_hands = results.multi_hand_world_landmarks
    if not detected_hands:
        return np.zeros(21 * 3)    # Placeholder of shape (63,) when no hand is visible

    coordinates = [
        [point.x, point.y, point.z]
        for hand_landmarks in detected_hands
        for point in hand_landmarks.landmark
    ]
    return np.array(coordinates).flatten()

## Setup Directories

In [39]:
# Root folder for the collected keypoint arrays
data_path = 'Data'

# Letters to recognize; 'J' & 'Z' are omitted as they are motion based signs
letters = np.array(list('abcdefghiklmnopqrstuvwxy'))

# Videos per letter: left hand = 25 && right hand = 25,
# so data is collected twice, once for each hand
no_sequences = 50

# Number of frames per video collected
sequence_length = 10

In [43]:
# Make directories to store the NumPy arrays: one folder per (letter, sequence).
# A single pass over range(no_sequences) covers both collection sets
# (sequences 0-24 and 25-49).
for letter in letters:
    for sequence in range(no_sequences):
        # exist_ok=True makes re-running the cell harmless without hiding real
        # failures (e.g. permission errors) the way a bare `except: pass` did
        os.makedirs(os.path.join(data_path, letter, str(sequence)), exist_ok=True)

## Extract Data and store in directories

In [36]:
# Record keypoints for every letter and save one .npy file per frame under
# data_path/<letter>/<sequence>/<frame_number>.npy
cap = cv2.VideoCapture(0)
stop_requested = False    # Set when 'Q' is pressed so that every nested loop exits

try:
    # Using the mediapipe hands model
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=1) as hands:
        for letter in letters:
            flag = True    # True until 'S' has been pressed for this letter

            # for sequence in range(no_sequences):  # For first set of data
            for sequence in range(25, 50):    # For second set of data

                # Frame -1 is never saved: it hosts the "press S" prompt for the first
                # sequence of a letter and is simply skipped for the following ones
                for frame_number in range(-1, sequence_length):

                    ret, frame = cap.read()    # Get a frame from webcam stream
                    if not ret:
                        raise RuntimeError('Could not read a frame from the webcam')

                    if frame_number == -1:
                        while flag:
                            ret, frame = cap.read()
                            if not ret:
                                raise RuntimeError('Could not read a frame from the webcam')
                            cv2.putText(frame, 'START Collection for letter \'{}\', press S.'.format(letter), (120, 200),
                                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
                            cv2.imshow('Frames Collection', frame)
                            if cv2.waitKey(10) & 0xFF == ord('s'):    # Press 'S' to start capturing data for the particular letter
                                flag = False
                        continue

                    # Detections
                    image, results = hand_detection(frame, hands)
                    draw_landmarks(image, results)

                    cv2.putText(image, 'Collecting for letter \'{}\' video number {}'.format(letter, sequence), (120, 200),
                                cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)
                    cv2.imshow('Frames Collection', image)    # Frame after applying the mediapipe drawings

                    # Extracting and saving NumPy arrays
                    keypoints = extract_keypoints(results)

                    npy_path = os.path.join(data_path, letter, str(sequence), str(frame_number))    # Path to store the array at
                    np.save(npy_path, keypoints)    # Save arrays for individual frames

                    if cv2.waitKey(10) & 0xFF == ord('q'):    # Press 'Q' to abort the whole collection run
                        stop_requested = True
                        break

                # Propagate the quit request: a bare `break` above only left the frame loop
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the window, even if collection raised
    cap.release()
    cv2.destroyAllWindows()

In [31]:
# Stand-alone cleanup: release the webcam and close all OpenCV windows.
# NOTE(review): presumably meant to be run by hand after interrupting a capture
# cell before it reached its own release/destroy calls - confirm.
cap.release()
cv2.destroyAllWindows()

## Pre-Processing data and Creating Labels

In [37]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [38]:
# Map each letter to its position in `letters`; these integers are the class labels
label_map = dict(zip(letters, range(len(letters))))

In [44]:
# Load every saved frame back from disk and pair each video with its label.
# `sequences` ends up as a list of videos (each a list of per-frame arrays);
# `labels` holds the matching integer class from label_map.
sequences, labels = [], []
for letter in letters:
    for sequence in range(no_sequences):
        video_dir = os.path.join(data_path, letter, str(sequence))
        frames = [
            np.load(os.path.join(video_dir, "{}.npy".format(frame_number)))
            for frame_number in range(sequence_length)
        ]
        sequences.append(frames)
        labels.append(label_map[letter])

In [45]:
# Sanity check: dimensions of the loaded data once stacked into a single array
np.array(sequences).shape    # (total_videos, frames_per_video, keypoints_per_frame)

(1200, 10, 63)

In [46]:
# Feature array built from the loaded sequences
X = np.array(sequences)    # The coordinate data to train the model with

In [47]:
# The last two dimensions must match the input_shape given to the first LSTM layer
X.shape    # (a, b, c) => (b, c) is input shape for LSTM layer

(1200, 10, 63)

In [48]:
# One-hot encode the integer labels produced via label_map
y = to_categorical(labels).astype(int)    # Labels for the corresponding X elements e.g., X[0] => y[0] : {'a': 0}

In [49]:
# Split the dataset into training (95%) and testing (5%).
# random_state pins the shuffle so the same split (and therefore comparable
# test results) is obtained after every kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

## Building and Training the model

In [50]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [51]:
# Folder handed to the TensorBoard callback as its log directory
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)    # (Optional) Callback passed to model.fit for inspecting training with TensorBoard

In [52]:
# Sequence classifier: three stacked LSTM layers followed by dense layers.
# The input shape is derived from the configuration instead of hard-coding 10,
# so it stays in sync with sequence_length; each frame carries
# 21 landmarks * 3 coordinates = 63 values.
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(sequence_length, 21 * 3)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))    # Last LSTM emits a single vector per video
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(letters.shape[0], activation='softmax'))    # One probability per letter

In [53]:
# Categorical cross-entropy pairs with the one-hot labels in `y` and the softmax output layer
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [18]:
# Load the pre-trained model (h5 or keras format). Use only one!
# Option 1 (disabled): replace `model` with the full model stored in 'asl.keras'
# model = tensorflow.keras.models.load_model('asl.keras')
# Option 2: load the weights stored in 'asl.h5' into the `model` object built above
model.load_weights('asl.h5')

In [55]:
# !!!! Do not run if model is already trained !!!!
# Train the model with the dataset, change epochs based on size of dataset.
# Training progress is also logged through tb_callback. The recorded output
# below ends in a KeyboardInterrupt, i.e. this run was stopped by hand.
model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])

Epoch 1/2000


Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000

KeyboardInterrupt: 

In [54]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 10, 64)            32768     
                                                                 
 lstm_4 (LSTM)               (None, 10, 128)           98816     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 24)                792       
                                                                 
Total params: 188024 (734.47 KB)
Trainable params: 188

## Making Predictions and Saving Model

In [56]:
# Predicted class probabilities for every sample in the test split
res = model.predict(X_test)



In [None]:
# Testing - Check if the model can predict the test split accurately.
# Iterate over the actual number of test samples instead of a hard-coded 50,
# which skipped samples on larger splits and raised IndexError on smaller ones.
predicted_classes = np.argmax(res, axis=1)
true_classes = np.argmax(y_test, axis=1)
for i, (predicted, expected) in enumerate(zip(predicted_classes, true_classes)):
    print(i, ":")
    print(predicted == expected)

# Overall share of correctly classified test samples
print("Correct: {}/{}".format(int(np.sum(predicted_classes == true_classes)), len(true_classes)))

In [59]:
# Persist the model twice: once to 'asl.h5' and once to 'asl.keras'
# (these are the two files the loading cell can read back)
model.save('asl.h5')
model.save('asl.keras')

  saving_api.save_model(


## Testing the Model

In [60]:
# Real-time test: classify the most recent `sequence_length` frames of the
# webcam stream and show the predicted letter at the top of the window.
sequence = []          # Rolling window of the latest per-frame keypoint arrays
threshold = 0.7        # Minimum probability needed before the shown letter is updated
res = np.array([0])    # Placeholder until the first full window has been predicted
letter = ' '

cap = cv2.VideoCapture(0)

try:
    # Using the mediapipe hands model
    with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=1) as hands:
        while cap.isOpened():
            ret, frame = cap.read()    # Get a frame from webcam stream
            if not ret:
                # No frame was delivered; stop instead of passing None to the detector
                break

            # Detections
            image, results = hand_detection(frame, hands)

            # Draw Landmarks
            draw_landmarks(image, results)

            # Testing Logic: keep only the newest `sequence_length` frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                # verbose=0 avoids printing a progress bar for every single frame
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                # print(letters[np.argmax(res)])    # To print predicted output

            # Rendering result in OpenCV: only switch letters on a confident prediction
            best = np.argmax(res)
            if res[best] > threshold:
                letter = letters[best]

            cv2.rectangle(image, (0, 0), (640, 40), (255, 255, 255), -1)
            cv2.putText(image, str(letter), (3, 30),
                        cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)

            cv2.imshow('Hand Tracking', image)    # Frame after applying the mediapipe drawings

            if cv2.waitKey(10) & 0xFF == ord('q'):    # Press 'Q' to exit
                break
finally:
    # Always free the camera and close the window, even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

