## Install dependencies ##

In [1]:
# Essential if not installed
!pip install gTTS pydub



## 1. Import necessary packages ##

In [1]:
# Essential
import cv2 as cv
import numpy as np
import os
import matplotlib.pyplot as plt
import mediapipe as mp
import sys # to use getsizeof() function
import pickle

## 2. Keypoints using MP Holistic ##

In [2]:
# Essential
# Short aliases for the two MediaPipe solution modules used by the cells below.
mp_holistic = mp.solutions.holistic # holistic model (used to make the detections)
mp_drawing = mp.solutions.drawing_utils # drawing utilities (used to draw the detections)
# we use mp_holistic to make our detections and mp_drawing to
# draw them

In [3]:
# Essential
def mediapipe_detection(image, model):
    """Run a MediaPipe model on a single OpenCV frame.

    Args:
        image: frame in BGR channel order (the colour space OpenCV uses).
        model: object exposing ``process(rgb_image)``; in this notebook a
            ``mp_holistic.Holistic`` instance.

    Returns:
        A tuple ``(image, results)``: the frame converted back to BGR and
        whatever ``model.process`` returned for it.
    """
    # The model is fed RGB, so convert from OpenCV's BGR first.
    rgb_frame = cv.cvtColor(image, cv.COLOR_BGR2RGB)
    rgb_frame.flags.writeable = False  # read-only while the model processes it
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True  # writable again before converting back
    # Back to BGR so the frame can be drawn on and displayed with OpenCV.
    return cv.cvtColor(rgb_frame, cv.COLOR_RGB2BGR), results

In [4]:
# Essential
def draw_styled_landmarks(image, results):
    """Draw the face, pose and hand landmarks found in `results` onto `image`.

    Args:
        image: BGR frame to draw on; it is modified in place.
        results: detection result exposing ``face_landmarks``, ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.

    Returns:
        None.
    """
    spec = mp_drawing.DrawingSpec
    # One row per body part:
    # (landmarks, connections, landmark drawing spec, connection drawing spec)
    layers = (
        # Face (colour channels are 8-bit, so the maximum is 255, not 256)
        (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
         spec(color=(80, 110, 10), thickness=1, circle_radius=1),
         spec(color=(80, 255, 121), thickness=1, circle_radius=1)),
        # Pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         spec(color=(0, 13, 255), thickness=2, circle_radius=3),
         spec(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(121, 22, 76), thickness=2, circle_radius=3),
         spec(color=(121, 44, 250), thickness=2, circle_radius=2)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(245, 117, 66), thickness=2, circle_radius=3),
         spec(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )
    for landmarks, connections, landmark_spec, connection_spec in layers:
        mp_drawing.draw_landmarks(image, landmarks, connections,
                                  landmark_spec, connection_spec)

In [6]:
# Essential for checking camera connection, press q to close the camera window
cap = cv.VideoCapture(1) # try changing this number to 0, 1, 2 if camera does not open, this is the device no.
print("Resolution: {} X {}".format(cap.get(cv.CAP_PROP_FRAME_WIDTH), cap.get(cv.CAP_PROP_FRAME_HEIGHT)))
print("FPS: {}".format(cap.get(cv.CAP_PROP_FPS)))
# Instantiate mediapipe model
# we can play around with the kwargs
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence= 0.5) as holistic:
        while cap.isOpened():
            # read feed from camera or video
            ret, frame = cap.read()
            if not ret:
                # no frame was delivered (camera unplugged / end of video); a None frame
                # would make cvtColor fail, so stop instead
                print("Could not read a frame from the camera, stopping")
                break

            # make detections
            image, results = mediapipe_detection(frame, holistic)
            # to mirror the view, pass cv.flip(frame, 1) instead of frame
            # print(results) # uncomment to see what the results look like

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # show to screen
            cv.imshow('OpenCV Feed', image)

            # break gracefully
            if cv.waitKey(10) & 0xFF == ord('q'): # https://stackoverflow.com/questions/35372700/whats-0xff-for-in-cv2-waitkey1
                break
finally:
    # always free the camera and close the window, even if a cell error occurred
    cap.release()
    cv.destroyAllWindows()

Resolution: 640.0 X 480.0
FPS: 30.0


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


Run the cell below if the webcam is still active for some reason, or if running the cell above raised an error.

In [35]:
# Manual cleanup: release the capture device and close every OpenCV window
cap.release()
cv.destroyAllWindows()

The function extract_keypoints() takes the landmarks of each type and flattens them. It also takes care of error handling, i.e., if a particular type of landmark is not present then an array of zeros of the same length is returned. Pose has an extra attribute visibility.

In [5]:
# Essential
def _flatten_landmarks(landmark_list, fields, count):
    """Flatten one landmark group to 1-D, or return zeros when it is missing."""
    if not landmark_list:
        # nothing detected: placeholder of the same length as a real detection
        return np.zeros(count * len(fields))
    rows = [[getattr(point, name) for name in fields] for point in landmark_list.landmark]
    return np.array(rows).flatten()


def extract_keypoints(results):
    """Concatenate pose, face, left-hand and right-hand keypoints into one 1-D array.

    Args:
        results: detection result exposing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is either
            falsy (not detected) or has a ``landmark`` iterable of points.

    Returns:
        1-D numpy array in the order pose (x, y, z, visibility per point),
        face, left hand, right hand (x, y, z per point). A missing group is
        replaced by zeros of length 33*4, 468*3, 21*3 and 21*3 respectively.
    """
    xyz = ("x", "y", "z")
    return np.concatenate([
        _flatten_landmarks(results.pose_landmarks, xyz + ("visibility",), 33),
        _flatten_landmarks(results.face_landmarks, xyz, 468),
        _flatten_landmarks(results.left_hand_landmarks, xyz, 21),
        _flatten_landmarks(results.right_hand_landmarks, xyz, 21),
    ])

## Be careful, run the cell below only when entering the first data ##

In [10]:
# run this when there is no data available, i.e, you are about to enter the first data, otherwise don't run this
# WARNING: this replaces actions_vids_no.pkl with an empty dictionary
actions_vids_no = dict() # keeps track of the number of videos available per action (starts empty)
# storing the dictionary on disk so that the program doesn't have to count the number of videos per action each
# time the program is run
try:
    # the with-block closes the file even when pickling fails
    with open('actions_vids_no.pkl', 'wb') as actions_file:
        pickle.dump(actions_vids_no, actions_file)
except (OSError, pickle.PicklingError) as err:
    # report the actual cause instead of hiding it
    print("Something went wrong:", err)

In [8]:
# Root folder (relative to the notebook) for the exported keypoint numpy arrays
DATA_PATH = "MP_DATA"

In [6]:
# Essential
# Number of frames recorded (and later loaded) per video; videos are 30 frames in length
sequence_length = 30

Run the cells below before adding new training data

In [7]:
# Essential
# Load the per-action video counts that were stored on disk.
# NOTE: pickle.load can execute arbitrary code, only load a file you created yourself.
with open("actions_vids_no.pkl", "rb") as actions_file:  # closed automatically after loading
    actions_vids_no = pickle.load(actions_file)
print(actions_vids_no)

{'hello': 130, 'no action': 90, 'thank you': 175, 'i love you': 140, 'again': 150, 'food': 150, 'me': 170, 'want': 130, 'forget': 140, 'friend': 140, 'please': 185}


In [87]:
# run this cell to enter new training data
print("actions_vids_no =", actions_vids_no, "\nTotal =",sum(actions_vids_no.values()))
action = input("Enter the action for which you want to record sign language : ").lower().strip()
if not action:
    # an empty name would write the videos straight into DATA_PATH
    raise ValueError("The action name must not be empty")
if action not in actions_vids_no:
    print("This is a new action")
no_sequences = int(input("Enter the number of sequences (aka videos) you want to record : "))

# Folders are numbered from 0, so the first new video gets the current count as its
# index (e.g. with 30 videos already present, the next folder is "30").
first_sequence = actions_vids_no.get(action, 0)

# making the required directories
# exist_ok=True tolerates folders left over from an aborted recording, while real
# problems (e.g. missing permissions) are no longer silently ignored
for sequence in range(first_sequence, first_sequence + no_sequences):
    os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

waiting_time = input("Enter the time you need to reset yourself after each action (default 2000 ms) : ").strip()
waiting_time = int(waiting_time) if waiting_time else 2000

completed_sequences = 0  # videos for which all sequence_length frames were saved
stop_requested = False   # set when q is pressed or the camera stops delivering frames
cap = cv.VideoCapture(1)
try:
    # Instantiate mediapipe model
    # we can play around with the kwargs
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence= 0.5) as holistic:
        # only one action here, so no need to loop through actions
        # Loop through sequences aka videos
        for sequence in range(first_sequence, first_sequence + no_sequences):
            frames_saved = 0
            # Loop through sequence length aka video length:
            for frame_num in range(sequence_length):

                # read feed from camera or video
                ret, frame = cap.read()
                if not ret:
                    # a None frame would make the detection step fail
                    print("Could not read a frame from the camera, stopping the recording")
                    stop_requested = True
                    break

                # make detections
                image, results = mediapipe_detection(frame, holistic)
                # print(results) # uncomment to see what the results look like

                # Draw landmarks
                draw_styled_landmarks(image, results)

                # Apply wait logic: the first frame of each video shows a banner and
                # pauses so that the person can get ready
                if frame_num == 0:
                    cv.putText(image, 'STARTING COLLECTION, GET READY', (120,200),
                               cv.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv.LINE_AA)
                cv.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                           cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv.LINE_AA)
                # Show to screen
                cv.imshow('OpenCV Feed', image)
                if frame_num == 0:
                    cv.waitKey(waiting_time)

                # Export keypoints (np.save appends the .npy extension to the path)
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)
                frames_saved += 1

                # Break gracefully; waitKey also gives the window time to display the frame
                if cv.waitKey(10) & 0xFF == ord('q'):
                    stop_requested = True
                    break

            if frames_saved == sequence_length:
                completed_sequences += 1
            if stop_requested:
                # stop the whole recording, not only the current video
                break
finally:
    # always free the camera and close the window, even if an error occurred
    cap.release()
    cv.destroyAllWindows()

# Count only fully recorded videos, so that the stored number always matches the
# complete folders on disk; an aborted video is overwritten by the next recording.
if completed_sequences:
    actions_vids_no[action] = first_sequence + completed_sequences
print(actions_vids_no, "\nTotal =",sum(actions_vids_no.values()))

actions_vids_no = {'hello': 120, 'no action': 90, 'thank you': 175, 'i love you': 140, 'again': 150, 'food': 150, 'me': 170, 'want': 130, 'forget': 140, 'friend': 140, 'please': 185} 
Total = 1590
Enter the action for which you want to record sign language : hello
Enter the number of sequences (aka videos) you want to record : 10
Enter the time you need to reset yourself after each action (default 2000 ms) : 
{'hello': 130, 'no action': 90, 'thank you': 175, 'i love you': 140, 'again': 150, 'food': 150, 'me': 170, 'want': 130, 'forget': 140, 'friend': 140, 'please': 185} 
Total = 1600


No need to run the cell below

In [88]:
#run this after entering new data
# Persist the updated per-action video counts.
try:
    # the with-block closes the file even when pickling fails
    with open('actions_vids_no.pkl', 'wb') as actions_file:
        pickle.dump(actions_vids_no, actions_file)
except (OSError, pickle.PicklingError) as err:
    # report the actual cause instead of hiding it
    print("Something went wrong:", err)

In [11]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

2022-04-16 00:31:23.723661: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/soumadeep/Documents/Sign language project/my_env/lib/python3.9/site-packages/cv2/../../lib64:
2022-04-16 00:31:23.723692: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [12]:
# Map each action name to an integer class index, following the dictionary's key order
label_map = dict(zip(actions_vids_no, range(len(actions_vids_no))))

In [13]:
label_map

{'hello': 0,
 'no action': 1,
 'thank you': 2,
 'i love you': 3,
 'again': 4,
 'food': 5,
 'me': 6,
 'want': 7,
 'forget': 8,
 'friend': 9,
 'please': 10}

In [14]:
# New essential for training new data, this loads all the available training data into Python lists
# sequences[i] holds the sequence_length keypoint arrays of one video, labels[i] its class index
sequences = []
labels = []
for action, video_count in actions_vids_no.items():
    action_dir = os.path.join(DATA_PATH, action)
    for sequence in range(video_count):
        window = []  # all frames of the current video
        for frame_num in range(sequence_length):
            frame_file = os.path.join(action_dir, str(sequence), "{}.npy".format(frame_num))
            res = np.load(frame_file)
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [15]:
actions_vids_no

{'hello': 130,
 'no action': 90,
 'thank you': 175,
 'i love you': 140,
 'again': 150,
 'food': 150,
 'me': 170,
 'want': 130,
 'forget': 140,
 'friend': 140,
 'please': 185}

Saving whole dataset into two numpy arrays to help in uploading to google drive in order to train the model in google colab

In [94]:
# Write the features (X) and the labels (Y) to one file each; np.save appends ".npy"
for npy_path, payload in (("CombinedDatasetX", sequences), ("CombinedDatasetY", labels)):
    np.save(npy_path, np.array(payload))

In [95]:
np.array(sequences).shape

(1600, 30, 1662)

In [96]:
np.array(labels).shape

(1600,)

In [97]:
sys.getsizeof(sequences)

14360

In [98]:
# Stack the loaded windows into one array (the recorded shape is (1600, 30, 1662))
X = np.array(sequences)

In [99]:
X.shape

(1600, 30, 1662)

In [100]:
sys.getsizeof(X)

638208144

In [101]:
# One-hot encode the integer class labels and cast the result to int
y = to_categorical(labels).astype(int)

In [110]:
# Hold out 6.25% of the videos for testing; the fixed random_state makes the split
# (and therefore every result computed from it) reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0625, random_state=42)

In [111]:
y_test.shape

(100, 11)

In [112]:
y_train.shape

(1500, 11)

In [113]:
X_train.shape

(1500, 30, 1662)

In [25]:
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## 7. Build and train LSTM model ##

In [8]:
# Essential
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, BatchNormalization, LeakyReLU, GRU
from tensorflow.keras.callbacks import TensorBoard
import tensorflow as tf

2022-04-27 09:28:36.866090: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/soumadeep/Documents/Sign language project/my_env/lib/python3.10/site-packages/cv2/../../lib64:
2022-04-27 09:28:36.866159: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [9]:
# Essential
# TensorBoard callback configured to use the "Logs" folder as its log directory
log_dir = 'Logs'
tb_callback = TensorBoard(log_dir=log_dir)

In [10]:
# Essential
# One LSTM layer followed by two dense layers with LeakyReLU, and a softmax layer
# with one unit per action. Input: 30 frames with 1662 keypoint values each.
# (earlier experiments, kept out of the model: an explicit Input layer and extra LSTM layers)
model4 = Sequential([
    LSTM(
        64,
        return_sequences=False,
        activation='tanh',
        input_shape=(30, 1662),
        kernel_regularizer=tf.keras.regularizers.L2(l2=0.04),
        recurrent_regularizer=tf.keras.regularizers.L2(l2=0.01),
    ),
    Dense(64, kernel_regularizer=tf.keras.regularizers.L2(l2=0.02)),
    LeakyReLU(),
    Dense(32, kernel_regularizer=tf.keras.regularizers.L2(l2=0.02)),
    LeakyReLU(),
    Dense(len(actions_vids_no), activation='softmax'),
])

2022-04-27 09:28:40.434374: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/soumadeep/Documents/Sign language project/my_env/lib/python3.10/site-packages/cv2/../../lib64:
2022-04-27 09:28:40.434494: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-27 09:28:40.434539: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (soumadeep-HP-Laptop-15s-eq0xxx): /proc/driver/nvidia/version does not exist
2022-04-27 09:28:40.437885: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the a

In [11]:
model4.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 64)                442112    
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 11)                363       
                                                                 
Total params: 448,715
Trainable params: 448,715
Non-trai

In [12]:
# Essential
# learning rate decay, used it because the loss was abruptly increasing while fitting the model, which
# meant that the learning rate was not small enough
# The schedule starts at 0.002 and is handed to the Adam optimizer used for model4.
lr_schedule4 = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate= 0.002 ,
    decay_steps=8,
    decay_rate=0.5,
    staircase=False)

In [13]:
# Essential
# Adam with the decaying learning rate; categorical cross-entropy loss and categorical accuracy metric
model4.compile(
    optimizer=tf.optimizers.Adam(learning_rate=lr_schedule4),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [14]:
# Train model4 for 150 epochs with a batch size of 64.
# Needs X_train / y_train from the train_test_split cell; the NameError recorded below
# shows that this cell was last run in a session where that cell had not been run.
history = model4.fit(X_train, y_train, epochs=150, batch_size = 64)

NameError: name 'X_train' is not defined

In [15]:
# Essential
# Load saved weights for model4 from the file action4_1500_2.h5
model4.load_weights("action4_1500_2.h5")

# Final 2 models which are discussed in project report #

In [16]:
# Essential
# LSTM classifier: each hidden dense layer is followed by batch normalization and
# LeakyReLU; the softmax output has one unit per action.
model_LSTM_Batch_Norm = Sequential([
    LSTM(
        64,
        return_sequences=False,
        activation='tanh',
        input_shape=(30, 1662),
        kernel_regularizer=tf.keras.regularizers.L2(l2=0.044),
        recurrent_regularizer=tf.keras.regularizers.L2(l2=0.014),
    ),
    Dense(64, kernel_regularizer=tf.keras.regularizers.L2(l2=0.027)),
    BatchNormalization(),
    LeakyReLU(),
    Dense(32, kernel_regularizer=tf.keras.regularizers.L2(l2=0.027)),
    BatchNormalization(),
    LeakyReLU(),
    Dense(len(actions_vids_no), activation='softmax'),
])

In [17]:
# Essential
# GRU classifier: each hidden dense layer is followed by batch normalization and
# LeakyReLU; the softmax output has one unit per action.
model_GRU_Batch_Norm = Sequential([
    GRU(
        64,
        return_sequences=False,
        activation='tanh',
        input_shape=(30, 1662),
        kernel_regularizer=tf.keras.regularizers.L2(l2=0.044),
        recurrent_regularizer=tf.keras.regularizers.L2(l2=0.014),
    ),
    Dense(64, kernel_regularizer=tf.keras.regularizers.L2(l2=0.027)),
    BatchNormalization(),
    LeakyReLU(),
    Dense(32, kernel_regularizer=tf.keras.regularizers.L2(l2=0.027)),
    BatchNormalization(),
    LeakyReLU(),
    Dense(len(actions_vids_no), activation='softmax'),
])

In [18]:
# Essential
# learning rate decay, used it because the loss was abruptly increasing while fitting the model, which
# meant that the learning rate was not small enough
# PS. The problem was alleviated more after implementing batch normalization
# The same schedule object is handed to the optimizers of both batch-norm models below.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate= 0.002 ,
    decay_steps=8,
    decay_rate=0.5,
    staircase=False)

In [19]:
# Essential
# Adam with the decaying learning rate; categorical cross-entropy loss and categorical accuracy metric
model_LSTM_Batch_Norm.compile(
    optimizer=tf.optimizers.Adam(learning_rate=lr_schedule),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [20]:
# Essential
# Adam with the decaying learning rate; categorical cross-entropy loss and categorical accuracy metric
model_GRU_Batch_Norm.compile(
    optimizer=tf.optimizers.Adam(learning_rate=lr_schedule),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [21]:
model_LSTM_Batch_Norm.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 64)                442112    
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 batch_normalization (BatchN  (None, 64)               256       
 ormalization)                                                   
                                                                 
 leaky_re_lu_2 (LeakyReLU)   (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 batch_normalization_1 (Batc  (None, 32)               128       
 hNormalization)                                      

In [22]:
model_GRU_Batch_Norm.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 64)                331776    
                                                                 
 dense_6 (Dense)             (None, 64)                4160      
                                                                 
 batch_normalization_2 (Batc  (None, 64)               256       
 hNormalization)                                                 
                                                                 
 leaky_re_lu_4 (LeakyReLU)   (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 32)                2080      
                                                                 
 batch_normalization_3 (Batc  (None, 32)               128       
 hNormalization)                                      