In [2]:
import cv2 as cv
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [3]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [4]:
def mediapipe_detection(image,model):
    image = cv.cvtColor(image,cv.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv.cvtColor(image,cv.COLOR_RGB2BGR)
    return image,results

In [5]:
def draw_landmarks(image,results):
    mp_drawing.draw_landmarks(image,results.face_landmarks,mp_holistic.FACEMESH_TESSELATION)
    mp_drawing.draw_landmarks(image,results.pose_landmarks,mp_holistic.POSE_CONNECTIONS)    
    mp_drawing.draw_landmarks(image,results.left_hand_landmarks,mp_holistic.HAND_CONNECTIONS)    
    mp_drawing.draw_landmarks(image,results.right_hand_landmarks,mp_holistic.HAND_CONNECTIONS)    

In [None]:
capture = cv.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence = 0.5,min_tracking_confidence = 0.5) as holistic:
    while capture.isOpened():

        ret, frame = capture.read()

        image,results = mediapipe_detection(frame,holistic)
        print(results)
        
        draw_landmarks(image,results)
        
        if ret==True:
            cv.imshow('video',image)

            if cv.waitKey(20) & 0xFF == ord('q'):
                break

    capture.release()
    cv.destroyAllWindows()

## Extract Keypoint Values

In [6]:
def extract_landmarks(results):
    pose = np.array([[res.x , res.y , res.z , res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)
    face = np.array([[res.x , res.y , res.z ] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)
    lh = np.array([[res.x , res.y , res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(63)
    rh = np.array([[res.x , res.y , res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(63)
    return np.concatenate([pose,face,lh,rh])

In [None]:
result_test = extract_landmarks(results)

In [None]:
print(result_test)

In [None]:
np.save('0',result_test)

In [None]:
np.load('0.npy')

# Setup Folders For Collection

In [7]:
#path for exported data 
DATA_PATH = os.path.join("MP_DATA")

#actions that we try to detect
actions = np.array(['hello','thanks','iloveyou'])

#thirty videos worth of data
no_sequences = 30

#videos are going to be 30 frames in length
sequence_length = 30

In [8]:
for action in actions:
    for sequence in range(no_sequences):
        try:
            os.makedirs(os.path.join(DATA_PATH,action,str(sequence)))
        except:
            pass

# Collecting keypoints for training and testing

In [None]:
capture = cv.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence = 0.5,min_tracking_confidence = 0.5) as holistic:
    
    for action in actions:
        for sequence in range(no_sequences):
            for frame_no in range(sequence_length):

                ret, frame = capture.read()

                image,results = mediapipe_detection(frame,holistic)
                print(results)

                draw_landmarks(image,results)
                
                if frame_no == 0:
                    cv.putText(image,'Starting collection',(120,200),cv.FONT_HERSHEY_SIMPLEX,1,(0,255,0),4,cv.LINE_AA)
                    cv.putText(image,'collecting frames {} for the sequence {} '.format(action,sequence),(15,12),cv.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),4,cv.LINE_AA)
                    cv.waitKey(2000)
                else:
                    cv.putText(image,'collecting frames {} for the sequence {} '.format(action,sequence),(15,12),cv.FONT_HERSHEY_SIMPLEX,0.5,(0,0,255),4,cv.LINE_AA)
                    
                keypoints = extract_landmarks(results)
                npy_path = os.path.join(DATA_PATH , action , str(sequence),str(frame_no))
                np.save(npy_path,keypoints)
                
                if ret==True:
                    cv.imshow('video',image)

                if cv.waitKey(20) & 0xFF == ord('q'):
                    break

    capture.release()
    cv.destroyAllWindows()

# Preprocess data and create labels and features

In [9]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [10]:
label_map = {label:num for num, label in enumerate(actions)}

In [11]:
label_map

{'hello': 0, 'thanks': 1, 'iloveyou': 2}

In [12]:
sequences,labels = [],[]
for action in actions:
    for sequence in range(no_sequences):
        window = []
        for frame_no in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH,action,str(sequence),"{}.npy".format(frame_no)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

In [13]:
x = np.array(sequences)
x.shape

(90, 30, 1662)

In [14]:
y = to_categorical(labels).astype(int)
y.shape

(90, 3)

In [15]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.05)

In [None]:
y_train.shape,y_test.shape

# Build and Train LSTM Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM , Dense
from tensorflow.keras.callbacks import TensorBoard

In [None]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [None]:
model = Sequential()
model.add(LSTM(64,return_sequences=True,activation='relu',input_shape=(30,1662)))
model.add(LSTM(128,return_sequences=True,activation='relu'))
model.add(LSTM(64,return_sequences=False,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(32,activation='relu'))
model.add(Dense(actions.shape[0],activation='softmax'))

In [None]:
res = [0.7,0.2,0.1]
actions[np.argmax(res)]         #sample working of model

In [None]:
model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=['categorical_accuracy'])

In [None]:
model.fit(x_train, y_train, epochs=2000, callbacks=[tb_callback])

In [None]:
model.summary()

In [22]:
res = model.predict(x_test)



In [None]:
actions[np.argmax(res[2])]

In [None]:
actions[np.argmax(y_test[2])]

In [None]:
model.save('action.h5')

In [23]:
res

array([[3.1475742e-25, 9.9924135e-01, 7.5868383e-04],
       [2.2690001e-27, 9.9988353e-01, 1.1645824e-04],
       [1.1666619e-03, 7.6626751e-16, 9.9883336e-01],
       [1.4998470e-15, 9.5194709e-01, 4.8052944e-02],
       [3.3397790e-10, 1.9160666e-11, 1.0000000e+00]], dtype=float32)

# evaluating using confusion matrix

In [None]:
from sklearn.metrics import multilabel_confusion_matrix , accuracy_score

In [None]:
yhat = model.predict(x_test)

In [None]:
ytrue = np.argmax(y_test,axis=1).tolist()
yhat = np.argmax(yhat,axis=1).tolist()

In [None]:
multilabel_confusion_matrix(ytrue,yhat)

In [None]:
accuracy_score(ytrue,yhat)

# Testing in Real Time

In [19]:
import tensorflow
model = tensorflow.keras.models.load_model('action.h5')

In [21]:
colors = [(245,117,16),(117,245,16),(16,117,245)]
def prob_viz(res,actions,input_frame,colors):
    output_frame = input_frame.copy()
    for num,prob in enumerate(res):
        cv.rectangle(output_frame,(0,60+num*40),(int(prob*100),90+num*40),colors[num],-1)
        cv.putText(output_frame,actions[num],(0,85+num*40),cv.FONT_HERSHEY_SIMPLEX,1,(255,255,255),2,cv.LINE_AA)
        
    return output_frame

In [34]:
len(res) ,print(res)

[0.31654367 0.38543078 0.2980256 ]


(3, None)

In [33]:
res

array([0.31654367, 0.38543078, 0.2980256 ], dtype=float32)

In [35]:
sequence = []
sentence = []
threshold = 0.7

capture = cv.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence = 0.5,min_tracking_confidence = 0.5) as holistic:
    while capture.isOpened():

        ret, frame = capture.read()

        image,results = mediapipe_detection(frame,holistic)
#         print(results)
        
        draw_landmarks(image,results)
        
        keypoints = extract_landmarks(results)
        sequence.append(keypoints)
#         sequence.insert(0,keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence,axis=0))[0]
            print(actions[np.argmax(res)])
            
        if res[np.argmax(res)] > threshold:
            if len(sentence) > 0:
                if actions[np.argmax(res)] != sentence[-1]:
                    sentence.append(actions[np.argmax(res)])
            else:
                sentence.append(actions[np.argmax(res)])
                    
        if len(sentence) >5:
            sentence = sentence[-5:]
            
        image = prob_viz(res,actions,image,colors)
        
        cv.rectangle(image,(0,0),(640,40),(245,117,16),-1)
        cv.putText(image,' '.join(sentence),(3,30),cv.FONT_HERSHEY_SIMPLEX,1,(255,255,255),2,cv.LINE_AA)
            
        if ret==True:
            cv.imshow('video',image)

            if cv.waitKey(20) & 0xFF == ord('q'):
                break

    capture.release()
    cv.destroyAllWindows()

iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
hello
hello
hello
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
hello
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou
iloveyou


In [31]:
capture.release()
cv.destroyAllWindows()