## Importing Required Libraries

In [1]:
import mediapipe as mp
import cv2
import numpy as np
import uuid
import os
import pyttsx3

## Data Collection 

In [2]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False # Image is no longer writeable
    results = model.process(image) # Make prediction
    image.flags.writeable = True # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS) # Draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS) # Draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw right hand connections
    
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS, 
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )
    
    
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    
    # return np.concatenate([pose, lh, rh])
    return np.concatenate([pose, face, lh, rh])

In [9]:
# Path for exported data, numpy arrays
# DATA_PATH = os.path.join("Dataset")
# DATA_PATH = os.path.join("Dataset2")
DATA_PATH = os.path.join("Ind-Dataset")

# Actions that we try to detect
# actions = np.array(['Yellow'])
# actions = np.array(['White','Gray','Black','Goodbye','Hello'])
actions = np.array(['Thank You'])

# Thirty videos worth of data
no_sequences = 30

# Videos are going to be 30 frames in length
sequence_length = 30

# Folder start
start_folder = 1

In [10]:
for action in actions:
    # dirmax = np.max(np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int))
    for sequence in range(1, no_sequences+1):
        try:
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
            # os.makedirs(os.path.join(DATA_PATH, action, str(dirmax + sequence)))
        except:
            pass

In [11]:
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

fourcc = cv2.VideoWriter_fourcc(*'X264')

In [12]:
try:
    cap = cv2.VideoCapture(0)
    # Set mediapipe model 
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        
        # NEW LOOP
        # Loop through actions
        for action in actions:
            # Loop through sequences aka videos
            for sequence in range(1,no_sequences+1):
                # Loop through video length aka sequence length
                for frame_num in range(1,sequence_length+1):

                    # Read feed
                    ret, frame = cap.read()

                    # Make detections
                    image, results = mediapipe_detection(frame, holistic)

                    # Draw landmarks
                    draw_styled_landmarks(image, results)
                    
                    # NEW Apply wait logic
                    if frame_num == 1: 
                        cv2.putText(image, 'STARTING COLLECTION', (120,200), 
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                        cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                        # Show to screen
                        cv2.imshow('OpenCV Feed', image)
                        cv2.waitKey(1500)
                    else: 
                        cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12), 
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                        # Show to screen
                        cv2.imshow('OpenCV Feed', image)
                    
                    # NEW Export keypoints
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                    np.save(npy_path, keypoints)

                    # Break gracefully
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        break
                        
        cap.release()
        cv2.destroyAllWindows()
        
finally:
    cap.release()
    cv2.destroyAllWindows()

KeyboardInterrupt: 

In [6]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [7]:
label_map = {label: num for num, label in enumerate(actions)}
print(label_map)

{'Thank You': 0, 'Welcome': 1, 'Good Night': 2, 'Hello': 3, 'Skill': 4, 'Draw': 5, 'Play': 6, 'Computer': 7, 'Clock': 8, 'Afternoon': 9, 'Kids': 10, 'Chair': 11, 'Exercise': 12, 'Travel': 13, 'Black': 14, 'Blue': 15}


In [8]:
sequences, labels = [], []
for action in actions:
    for sequence in np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int):
        window = []
        for frame_num in range(1,sequence_length+1):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [9]:
len(sequences)

480

In [10]:
np.array(sequences).shape

(480, 30, 1662)

In [11]:
np.array(labels).shape

(480,)

In [12]:
x = np.array(sequences)
y = to_categorical(labels).astype(int)

In [13]:
x.shape

(480, 30, 1662)

In [14]:
y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05)

In [16]:
x_train.shape

(456, 30, 1662)

In [17]:
y_test.shape

(24, 16)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [19]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [20]:
actions

array(['Thank You', 'Welcome', 'Good Night', 'Hello', 'Skill', 'Draw',
       'Play', 'Computer', 'Clock', 'Afternoon', 'Kids', 'Chair',
       'Exercise', 'Travel', 'Black', 'Blue'], dtype='<U10')

In [21]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [22]:
actions

array(['Thank You', 'Welcome', 'Good Night', 'Hello', 'Skill', 'Draw',
       'Play', 'Computer', 'Clock', 'Afternoon', 'Kids', 'Chair',
       'Exercise', 'Travel', 'Black', 'Blue'], dtype='<U10')

In [50]:
x.shape

(480, 30, 1662)

In [51]:
model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=['categorical_accuracy'])

In [62]:
model.fit(x_train,y_train,epochs=40)
# callbacks=[tb_callback]

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1a66423a460>

In [63]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 30, 64)            442112    
                                                                 
 lstm_4 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 16)                528       
                                                                 
Total params: 597,104
Trainable params: 597,104
Non-tr

In [64]:
res = model.predict(x_test)



In [65]:
actions[np.argmax(res[15])]

'Welcome'

In [66]:
actions[np.argmax(y_test[15])]

'Welcome'

In [71]:
# model.save('project.h5')
# model.save('project2.h5')
model.save('IndHandSigns.h5')

In [31]:
# del model

In [5]:
# model.load_weights('project.h5')
# model.load_weights('project2.h5')
model.load_weights('IndHandSigns.h5')

NameError: name 'model' is not defined

In [74]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [75]:
ypred = model.predict(x_test)
ytrue = np.argmax(y_test, axis=1).tolist()
ypred = np.argmax(ypred, axis=1).tolist()



In [76]:
multilabel_confusion_matrix(ytrue, ypred)

array([[[22,  0],
        [ 0,  2]],

       [[22,  0],
        [ 0,  2]],

       [[23,  0],
        [ 0,  1]],

       [[23,  1],
        [ 0,  0]],

       [[21,  0],
        [ 0,  3]],

       [[22,  0],
        [ 1,  1]],

       [[21,  1],
        [ 1,  1]],

       [[19,  0],
        [ 0,  5]],

       [[23,  1],
        [ 0,  0]],

       [[23,  0],
        [ 1,  0]],

       [[20,  0],
        [ 0,  4]],

       [[22,  0],
        [ 0,  2]]], dtype=int64)

In [77]:
accuracy_score(ytrue, ypred)

0.875

In [78]:
# colors = [(205,17,16), (117,145,16), (126,117,245), (122,111,16), (225,17,16),(205,17,16), (117,145,16), (126,117,245), (122,111,16), (225,17,16),(205,17,16), (117,145,16), (126,117,245), (122,111,16), (225,17,16),(122,111,16)]

In [79]:
# def probability_vizulization(res, actions, input_frame, colors):
#     output_frame = input_frame.copy()
#     for num, prob in enumerate(res):
#         cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
#         cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

#     return output_frame

In [80]:
# 1. New detection variables
sequence = []
sentence = []
predictions = []
threshold = 0.7

try:
    cap = cv2.VideoCapture(0)
    # Set mediapipe model
    with mp_holistic.Holistic(
        min_detection_confidence=0.6, min_tracking_confidence=0.6
    ) as holistic:
        while cap.isOpened():

            # Read camera
            ret, frame = cap.read()

            # Make detections
            image, results = mediapipe_detection(frame, holistic)
            # print(results)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # 2. Prediction logic
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]

            if len(sequence) == 30:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                # print(actions[np.argmax(res)])
                predictions.append(np.argmax(res))

                # 3. Vizualization logic
                if np.unique(predictions[-1:])[0] == np.argmax(res):
                    if res[np.argmax(res)] > threshold:

                        word = ""
                        if len(sentence) > 0:
                            if actions[np.argmax(res)] != sentence[-1]:
                                sentence.append(actions[np.argmax(res)])
                                word = actions[np.argmax(res)]
                        else:
                            sentence.append(actions[np.argmax(res)])
                            word = actions[np.argmax(res)]

                        # engine = pyttsx3.init()
                        # engine.say(word)
                        # # play the speech
                        # engine.runAndWait()

                if len(sentence) > 1:
                    sentence = sentence[-1:]

                # Probabilities Vizualization
                # image = probability_vizulization(res, actions, image, colors)
                
                # cv2.putText(image, 'PROB', (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
                # cv2.putText(image, str(round(body_language_prob[np.argmax(body_language_prob)],2)) , (10,40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
            cv2.putText(
                image,
                " ".join(sentence),
                (3, 30),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (255, 255, 255),
                2,
                cv2.LINE_AA,
            )
            
            # Flip the image
            # image = cv2.flip(image, 1)

            # Show to screen
            cv2.imshow("OpenCV Feed", image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord("q"):
                break
        cap.release()
        cv2.destroyAllWindows()

finally:
    cap.release()
    cv2.destroyAllWindows()

