In [119]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

import tensorflow as tf
from tensorflow import keras,lite

from keras.models import load_model
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import TensorBoard

from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

import pyttsx3


In [124]:
mp_hands = mp.solutions.hands # MediaPipe Hands solution module (hand landmarks only, not the Holistic model)
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and hand back a BGR frame plus the results.

    Args:
        image: frame in BGR channel order, as accepted by ``cv2.cvtColor``.
        model: any object exposing ``process(rgb_image)``.

    Returns:
        ``(bgr_image, results)`` where ``bgr_image`` is the frame converted
        to RGB and back to BGR (writeable again) and ``results`` is whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The RGB frame is read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results


def draw_styled_landmarks(image, results):
    """Draw every detected hand onto ``image`` with custom drawing specs.

    Nothing is drawn when ``results.multi_hand_landmarks`` is None or empty.
    Each hand is passed to ``mp_drawing.draw_landmarks`` together with
    ``mp_hands.HAND_CONNECTIONS`` and two ``DrawingSpec`` objects.
    Always returns None.
    """
    detected_hands = results.multi_hand_landmarks
    if detected_hands:
        for hand in detected_hands:
            first_spec = mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4)
            second_spec = mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
            mp_drawing.draw_landmarks(
                image, hand, mp_hands.HAND_CONNECTIONS, first_spec, second_spec
            )
    
def draw_landmarks(image, results):
    """Draw every detected hand onto ``image`` with the default drawing style.

    Nothing is drawn when ``results.multi_hand_landmarks`` is None or empty.
    Always returns None.
    """
    detected_hands = results.multi_hand_landmarks
    if detected_hands:
        for hand in detected_hands:
            mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS)
    
# def extract_keypoints(results):
#     lh = np.array([[res.x, res.y, res.z] for res in results.multi_hand_landmarks.landmark]).flatten() if results.multi_hand_landmarks else np.zeros(21*3)
#     return np.concatenate([lh])

# def extract_keypoints(results):
#     lh = np.array([[[res.landmark[point].x, res.landmark[point].y, res.landmark[point].z] for point in mp_hands.HandLandmark for res in results.multi_hand_landmarks]], dtype=float).flatten() if results.multi_hand_landmarks else np.zeros(21*3*2)
#     return np.concatenate([lh])

def extract_keypoints(results):
    """Flatten the detected hand landmarks into a 1-D float feature vector.

    Layout produced by this function:
      * no hands detected -> 126 zeros (21 landmarks * 3 coords * 2 hands);
      * one hand          -> its 63 values followed by the same 63 values
                             again (the hand is duplicated, not zero-padded);
      * several hands     -> values ordered landmark by landmark, and within
                             each landmark hand by hand (x, y, z each time).

    NOTE(review): the duplication and the landmark-major ordering look
    unintentional, but the saved model was presumably trained on exactly
    this layout -- confirm against the data-collection code before changing.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return np.zeros(21*3*2)

    coords = np.array(
        [
            [hand.landmark[point].x, hand.landmark[point].y, hand.landmark[point].z]
            for point in mp_hands.HandLandmark
            for hand in detected_hands
        ],
        dtype=float,
    ).flatten()

    # Exactly one hand: repeat its values to fill the second hand's slot.
    if coords.shape[0] == 63:
        return np.concatenate([coords, coords])
    return coords

# Palette of three-channel colour tuples passed to prob_viz for the probability bars.
# It holds 7 entries, while `actions` (defined in a later cell) lists 10 labels.
colors = [(155,17,16),(245,117,16), (117,245,16), (16,117,245),(225,17,16), (11,245,16), (126,117,24)]

def prob_viz(res, actions, input_frame, colors):
    """Draw one labelled probability bar per class on a copy of the frame.

    Args:
        res: iterable of per-class probabilities; bar ``i`` is
            ``int(prob * 100)`` pixels wide.
        actions: sequence of labels, indexed in the same order as ``res``.
        input_frame: image to annotate; it is copied and not modified.
        colors: sequence of colour tuples. When there are more classes than
            colours the palette is reused cyclically; an empty palette falls
            back to a single default colour.

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()
    palette_size = len(colors)
    for num, prob in enumerate(res):
        # Wrap around the palette: indexing colors[num] directly raised
        # IndexError as soon as the model had more classes than colours.
        bar_color = colors[num % palette_size] if palette_size else (245, 117, 16)
        top = 60 + num * 40  # bars are stacked every 40 px below the banner
        cv2.rectangle(output_frame, (0, top), (int(prob * 100), top + 30), bar_color, -1)
        cv2.putText(output_frame, actions[num], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame


In [125]:
# Gesture labels to recognise; the detection loop indexes this array with the
# argmax of the model output, so the order here matters.
actions = np.array([
    'hello', 'play', 'okay', 'nice', 'iloveu',
    'peace', 'promise', 'why', 'hungry', 'temple',
])

# TensorBoard callback pointed at the "Logs" directory.
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [126]:
# Reference only: the commented-out network below appears to be the architecture
# behind "action.h5" (sequences of 30 frames x 126 features, one softmax unit per
# entry in `actions`) -- TODO confirm against the training notebook.
# model = Sequential()
# model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30,126)))
# model.add(LSTM(128, return_sequences=True, activation='relu'))
# model.add(LSTM(64, return_sequences=False, activation='relu'))
# model.add(Dense(64, activation='relu'))
# model.add(Dense(32, activation='relu'))
# model.add(Dense(actions.shape[0], activation='softmax'))

# model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# model.load_weights("action.h5")
# Load the complete saved model from "action.h5" instead of rebuilding it and loading weights.
model = load_model("action.h5")

In [127]:
# 1. New detection variables
sequence = []      # rolling window of the latest keypoint vectors (at most 30)
sentence = []      # recognised word(s) rendered in the top banner
predictions = []   # latest predicted class indices (at most 10)
threshold = 0.5    # minimum probability required to accept a prediction

# Open the camera before entering `try` so that `finally` never touches an
# undefined `cap` if the capture object cannot be created.
cap = cv2.VideoCapture(0)

try:
    # Set mediapipe model
    with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        while cap.isOpened():

            # Read feed; stop when no frame is delivered (camera unplugged,
            # stream ended) instead of passing None to cv2.cvtColor.
            ret, frame = cap.read()
            if not ret:
                break

            # Make detections
            image, results = mediapipe_detection(frame, hands)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # 2. Prediction logic: keep only the 30 most recent frames.
            sequence.append(extract_keypoints(results))
            sequence = sequence[-30:]

            if len(sequence) == 30:
                # verbose=0 stops Keras from printing a progress bar per frame.
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = int(np.argmax(res))

                # Only the last 10 predictions are ever inspected, so cap the
                # history to keep it from growing for the whole session.
                predictions.append(best)
                predictions = predictions[-10:]

                # 3. Viz logic: accept the class only when every recent
                # prediction agrees with it. (np.unique(...)[0] compared
                # against the smallest recent class index, not agreement.)
                is_stable = all(p == best for p in predictions)
                if is_stable and res[best] > threshold:
                    word = actions[best]
                    if not sentence or word != sentence[-1]:
                        sentence.append(word)

                        # Optional speech output for the new word (disabled):
                        # engine = pyttsx3.init()
                        # newVoiceRate = 125
                        # engine.setProperty('rate',newVoiceRate)
                        # engine.say(word)
                        # # play the speech
                        # engine.runAndWait()

                # Show only the most recent word in the banner.
                if len(sentence) > 1:
                    sentence = sentence[-1:]

                # Viz probabilities
                image = prob_viz(res, actions, image, colors)

            # Banner spans the full frame width rather than a fixed 640 px.
            cv2.rectangle(image, (0,0), (image.shape[1], 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

finally:
    # Single cleanup path, reached on normal exit, 'q', or any exception.
    cap.release()
    cv2.destroyAllWindows()



In [None]:
# Manual cleanup cell: release the capture device and close every OpenCV window.
# Requires `cap` to exist, i.e. the detection cell must have been run first.
cap.release()
cv2.destroyAllWindows()