In [1]:
import mediapipe as mp 
import cv2 
import seaborn 
import matplotlib.pyplot as plt 
import time 
import os 
import numpy as np

# Load up MediaPipeline Model

This will help us extract key points

In [8]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [2]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # To improve performance, optionally mark the image as not writeable to
    # pass by reference.
    image.flags.writeable = False 
    results = model.process(image)
    image.flags.writeable = True 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results 

def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                              mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1), # landmark drawing color
                              mp_drawing.DrawingSpec(color=(80, 256, 121), thickness=1, circle_radius=1)) # connection color 
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                               mp_drawing.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=4), 
                               mp_drawing.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2))
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                               mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4), 
                               mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2))
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                               mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
                               mp_drawing.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2))

# Test Live Video Feed

In [30]:
vc = cv2.VideoCapture(0) # open up device camera 

# set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence =0.5, min_tracking_confidence=0.5) as holistic:
    
    # read the feed from the device camera while its open 
    while vc.isOpened():
        
        # Read video capture 
        ret, frame = vc.read()
        
        # Make detection 
        image, result = mediapipe_detection(frame, holistic)
        
        # draw landmarks in real time and display them
        draw_landmarks(frame, result)
        #plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Show image 
        cv2.imshow("Camera Feed", frame)
        
        # check if the q keep is pressed and break if it was 
        if cv2.waitKey(10) == ord('q'):
            break 
    vc.release()
    cv2.destroyAllWindows()

# Extract Key Points

In [33]:

HAND_LANDMARKS = 21 * 3 # 21 points, 3 dimensions 
POS_LANDMARKS = 33 * 4 # 33 points, 4 dimensions 
FACE_LANDMARKS = 468 * 3 # 468 poitns, 3 dimensions
DATA_LEN = (2 * HAND_LANDMARKS) + POS_LANDMARKS + FACE_LANDMARKS # total length of data when flattened
def get_keypoints(result):
    
    # it is important to flatten these as it will be passed into our model
    lh_kp = np.array([[res.x, res.y, res.z]  for res in result.left_hand_landmarks.landmark]).flatten() if result.left_hand_landmarks else np.zeros(HAND_LANDMARKS)
    rh_kp = np.array([[res.x, res.y, res.z]  for res in result.right_hand_landmarks.landmark]).flatten() if result.right_hand_landmarks else np.zeros(HAND_LANDMARKS)
    pos_kp = np.array([[res.x, res.y, res.z, res.visibility]  for res in result.pose_landmarks.landmark]).flatten() if result.pose_landmarks else np.zeros(POS_LANDMARKS)
    face_kp = np.array([[res.x, res.y, res.z]  for res in result.face_landmarks.landmark]).flatten() if result.face_landmarks else np.zeros(FACE_LANDMARKS)

    # concat all our data into 1 flat array
    return np.concatenate([lh_kp, rh_kp, pos_kp, face_kp])



# Setup Folders for Data Collection

In [5]:
DATA_PATH = os.path.join('MP_DATA')

# Actions that we can detect 
actions = np.array(['lift', 'land', 'follow'])
TOTAL_SEQENCES = 30 # the number of sequences we collect (think number of examples)
SEQUENCE_LENGTH = 30 # the number of frames used to classify action (think length of examples)

In [6]:
# create folders for training data
for action in actions:
    for sequences in range(TOTAL_SEQENCES):
        try:
            # make folder for each example for each action 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequences)))
        except:
            pass 

In [9]:
vc = cv2.VideoCapture(0) # open up device camera 

# set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence =0.5, min_tracking_confidence=0.5) as holistic:
    
    # go through each action 
    for action in actions:
        
        # create desired amount of examples
        for seq in range(TOTAL_SEQENCES):
            
            # collect data for each sequence length
            for frame_num in range(SEQUENCE_LENGTH):
                
                # get frame from stream 
                ret, frame = vc.read()
                
                # make detection
                image, results = mediapipe_detection(frame, holistic)
                
                # show detection
                draw_landmarks(image, results)
                
                # display useful info on collecting or starting new collection
                if frame_num == 0:
                    cv2.putText(image, 'STARTING COLLECTION', (120, 200),
                                cv2.FONT_HERSHEY_SIMPLEX, .5, (0, 0, 255), 4, cv2.LINE_AA)
                    
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, seq), (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, .5, (0, 0, 255), 1, cv2.LINE_AA)
                    cv2.waitKey(2000)
                else:
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, seq), (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, .5, (0, 0, 255), 1, cv2.LINE_AA)

                # save kp 
                keypoints = get_keypoints(results)
                np.save(os.path.join(DATA_PATH, action, str(seq), str(frame_num)), keypoints)
                
                # show the feed
                cv2.imshow('Live Feed', image)
                
                # break gracefully 
                if cv2.waitKey(10) == ord('q'):
                    break
    vc.release()
    cv2.destroyAllWindows()

# Preprocess Data + Create Labels and Features

In [12]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [14]:
label_map = {label:num for num, label in enumerate(actions)}
label_map

{'lift': 0, 'land': 1, 'follow': 2}

In [15]:
sequences, labels = [], []
for action in actions: 
    for sequence in range(TOTAL_SEQENCES):
        
        # load in all numpy data for a given example 
        window = []
        for frame in range(SEQUENCE_LENGTH):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame)))
            window.append(res)
        
        # for current example, add all data for that sequence and appropraite label 
        sequences.append(window)
        labels.append(label_map[action])

In [18]:
print(np.array(sequences).shape, np.array(labels).shape)

(90, 30, 1662) (90,)


In [19]:
X = np.array(sequences)
y = to_categorical(labels).astype(int)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.05) # 5% to test and 95% to train 

# Build and Train LSTM NN

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense 
from tensorflow.keras.callbacks import TensorBoard

In [22]:
# web app that allows you to see training
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [35]:
# build model architecture 
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape = (SEQUENCE_LENGTH, DATA_LEN))) # to stack LSTMs return_sequences needs to be True, the next layer will recieve the "history"
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))


In [36]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [37]:
model.fit(X_train, y_train, epochs=1000, callbacks=[tb_callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

KeyboardInterrupt: 

In [38]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 30, 64)            442112    
                                                                 
 lstm_4 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 3)                 99        
                                                                 
Total params: 596675 (2.28 MB)
Trainable params: 59667

In [47]:
model.save('gesture.keras')

# Evaluate Model

In [50]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score 

In [48]:
yhat = model.predict(X_test)
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()



In [51]:
accuracy_score(ytrue, yhat)

1.0

In [52]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[3, 0],
        [0, 2]],

       [[2, 0],
        [0, 3]]], dtype=int64)

# Real Time Detection

In [62]:
colors = [(245, 117, 16), (117, 245, 16), (16, 117, 245)]
def prob_viz(res, actions, input_Frame, colors):
    output_frame = input_Frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0, 60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
    return output_frame

In [66]:
sequence = []
sentence = []
thres = .6

vc = cv2.VideoCapture(0) # open up device camera 

# set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence =0.5, min_tracking_confidence=0.5) as holistic:
    
    # read the feed from the device camera while its open 
    while vc.isOpened():
        
        # Read video capture 
        ret, frame = vc.read()
        
        # Make detection 
        image, result = mediapipe_detection(frame, holistic)
        
        # draw landmarks in real time and display them
        draw_landmarks(image, result)
        
        # prediction logic
        kp = get_keypoints(result)
        sequences.insert(0, kp) # add new frames to beginning so we make prediction from most recent to past  (you may need to reverse this list)
        sequence = sequences[:SEQUENCE_LENGTH] # get the last n frames
        sequence.reverse()
        
        # make prediction if you can
        if len(sequence) == SEQUENCE_LENGTH:
            res = model.predict(np.array([sequence]))[0]
            print(actions[np.argmax(res)])
        
        # see if prediction is above threshold 
        if res[np.argmax(res)] > thres:
            
            if len(sentence) > 0: 
                if actions[np.argmax(res)] != sentence[-1]:
                    sentence.append(actions[np.argmax(res)])
            else:
                sentence.append(actions[np.argmax(res)])
        
        if len(sentence) > 5:
            sentence = sentence[-5:]
        
        image = prob_viz(res, actions, image, colors)
        
        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show image 
        cv2.imshow("Camera Feed", image)
        
        # check if the q keep is pressed and break if it was 
        if cv2.waitKey(10) == ord('q'):
            break 
    vc.release()
    cv2.destroyAllWindows()

land
lift
land
land
land
land
lift
land
lift
land
lift
follow
lift
land
lift
land
lift
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
follow
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
land
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
follow
land
land
land
land
land
lift
lift
lift
lift
li