## Importing Libraries

In [1]:
import os
import cv2
import json
import time
import numpy as np
import mediapipe as mp
import tensorflow as tf
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor

tf.get_logger().setLevel('ERROR')




## MediaPipe Implementation

In [2]:
# Indices of the landmarks kept from each MediaPipe detector's output.

# Every hand landmark is kept (indices 0..20).
filtered_hand = [*range(21)]

# Pose landmarks 11-16 (presumably shoulders, elbows and wrists in the
# MediaPipe Pose landmark map -- verify against the MediaPipe docs).
filtered_pose = [11, 12, 13, 14, 15, 16]

# Subset of face-mesh landmark indices, in ascending order.
filtered_face = [
    0, 4, 7, 8, 10, 13, 14, 17, 21, 33, 37, 39, 40, 46, 52, 53, 54, 55, 58,
    61, 63, 65, 66, 67, 70, 78, 80, 81, 82, 84, 87, 88, 91, 93, 95, 103, 105,
    107, 109, 127, 132, 133, 136, 144, 145, 146, 148, 149, 150, 152, 153, 154,
    155, 157, 158, 159, 160, 161, 162, 163, 172, 173, 176, 178, 181, 185, 191,
    234, 246, 249, 251, 263, 267, 269, 270, 276, 282, 283, 284, 285, 288, 291,
    293, 295, 296, 297, 300, 308, 310, 311, 312, 314, 317, 318, 321, 323, 324,
    332, 334, 336, 338, 356, 361, 362, 365, 373, 374, 375, 377, 378, 379, 380,
    381, 382, 384, 385, 386, 387, 388, 389, 390, 397, 398, 400, 402, 405, 409,
    415, 454, 466, 468, 473,
]

# Number of landmarks contributed per hand, by the pose and by the face.
HAND_NUM, POSE_NUM, FACE_NUM = map(len, (filtered_hand, filtered_pose, filtered_face))

In [3]:
# One detector instance of each kind, created once and reused for every frame.
hands = mp.solutions.hands.Hands()
pose = mp.solutions.pose.Pose()
# NOTE(review): filtered_face contains indices 468 and 473; these presumably
# only exist in the refined (iris) mesh, hence refine_landmarks=True -- confirm.
face_mesh = mp.solutions.face_mesh.FaceMesh(refine_landmarks=True)


def _landmarks_to_array(landmarks, keep):
    """Return the (x, y, z) coordinates of the landmarks at indices `keep`."""
    coords = np.array([(lm.x, lm.y, lm.z) for lm in landmarks])
    return coords[keep]


def get_frame_landmarks(frame):
    """Run the hand, pose and face detectors on one frame and stack the results.

    The three detectors run concurrently in a thread pool; each one writes to
    its own, non-overlapping slice of the result array.

    Args:
        frame: Image handed unchanged to each detector's ``process()`` method.

    Returns:
        Array of shape ``(HAND_NUM * 2 + POSE_NUM + FACE_NUM, 3)`` holding
        ``(x, y, z)`` per landmark, laid out as:
        rows ``[0, HAND_NUM)``            hand whose handedness index is 0,
        rows ``[HAND_NUM, 2 * HAND_NUM)`` the other hand,
        next ``POSE_NUM`` rows            pose landmarks in ``filtered_pose``,
        remaining rows                    face landmarks in ``filtered_face``.
        Rows belonging to anything that was not detected stay at zero.

    Raises:
        Exception: whatever a detector raised while processing the frame.
            (Previously such errors were silently discarded and a partially
            zero-filled array was returned.)
    """
    all_landmarks = np.zeros((HAND_NUM * 2 + POSE_NUM + FACE_NUM, 3))

    def get_hands(frame):
        results_hands = hands.process(frame)
        if not results_hands.multi_hand_landmarks:
            return
        for i, hand_landmarks in enumerate(results_hands.multi_hand_landmarks):
            coords = _landmarks_to_array(hand_landmarks.landmark, filtered_hand)
            # NOTE(review): the slot order must match what the model was
            # trained on; the original author labelled index 0 as "right".
            if results_hands.multi_handedness[i].classification[0].index == 0:
                all_landmarks[:HAND_NUM, :] = coords  # right
            else:
                all_landmarks[HAND_NUM:HAND_NUM * 2, :] = coords  # left

    def get_pose(frame):
        results_pose = pose.process(frame)
        if results_pose.pose_landmarks:
            all_landmarks[HAND_NUM * 2:HAND_NUM * 2 + POSE_NUM, :] = _landmarks_to_array(
                results_pose.pose_landmarks.landmark, filtered_pose)

    def get_face(frame):
        results_face = face_mesh.process(frame)
        if results_face.multi_face_landmarks:
            # Only the first detected face is used.
            all_landmarks[HAND_NUM * 2 + POSE_NUM:, :] = _landmarks_to_array(
                results_face.multi_face_landmarks[0].landmark, filtered_face)

    # Leaving the `with` block waits for all three detectors to finish.
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(detect, frame)
                   for detect in (get_hands, get_pose, get_face)]

    # result() re-raises any exception that occurred in a worker thread.
    for future in futures:
        future.result()

    return all_landmarks

## Load Trained Model

In [6]:
# NOTE(review): absolute, machine-specific paths -- adjust them (or derive them
# from a configurable base directory) before running on another machine.
# Every backslash is escaped; the previous "Desktop\AI" relied on the invalid
# escape sequence "\A", which newer Python versions warn about.
model_path = "C:\\Users\\dell\\Desktop\\AI\\Grad\\2nd\\Testing Model\\test.h5"
index_mapping_path = "C:\\Users\\dell\\Desktop\\AI\\Grad\\2nd\\Testing Model\\index_mapping_586.json"
label_mapping_path = "C:\\Users\\dell\\Desktop\\AI\\Grad\\2nd\\Testing Model\\label_mapping_586.json"

model = tf.keras.models.load_model(model_path)

# Context managers close the JSON files as soon as they have been parsed.
# index_mapping is looked up with the stringified class index to get a label.
with open(index_mapping_path, "r") as mapping_file:
    index_mapping = json.load(mapping_file)
with open(label_mapping_path, "r") as mapping_file:
    label_mapping = json.load(mapping_file)

In [7]:
# Show the input and output shapes of the loaded model; the inference cells
# size their frame window from input_shape[1] and input_shape[2].
model.input_shape, model.output_shape

((None, 100, 180, 3), (None, 586))

In [8]:
# Live webcam demo: extract landmarks from every frame, keep a sliding window
# of the most recent frames and classify the window each time it has been
# completely refreshed. Press 'q' in the preview window to stop.
from collections import deque

seq_len = model.input_shape[1]     # frames per prediction window
num_points = model.input_shape[2]  # landmarks per frame
max_failed_reads = 100             # consecutive failed grabs tolerated before giving up

cap = cv2.VideoCapture(0)
try:
    if not cap.isOpened():
        raise RuntimeError("Could not open the default camera (index 0)")

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Start with a window full of zero frames so its shape is always valid.
    sequence = deque(maxlen=seq_len)
    for _ in range(seq_len):
        sequence.append(np.zeros((num_points, 3)))

    tic = tac = 0
    counter = 0
    step = seq_len  # predict once every `step` frames
    label = ''
    failed_reads = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            # Tolerate transient grab failures, but never spin forever on a
            # camera that has stopped delivering frames.
            failed_reads += 1
            if failed_reads >= max_failed_reads:
                raise RuntimeError("Camera stopped delivering frames")
            time.sleep(0.01)
            continue
        failed_reads = 0

        tac = time.time()
        frame_time = tac - tic
        # Two timestamps can be identical on coarse clocks; avoid dividing by zero.
        fps = str(int(1 / frame_time)) if frame_time > 0 else '0'
        tic = tac

        frame = cv2.flip(frame, 1)  # mirror the image horizontally
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_rgb.flags.writeable = False
        frame_landmarks = get_frame_landmarks(frame_rgb)

        for point in frame_landmarks:
            # All-zero rows are landmarks that were not detected; drawing them
            # would only stack dots in the top-left corner.
            if not point.any():
                continue
            x = int(point[0] * width)
            y = int(point[1] * height)
            cv2.circle(frame, (x, y), 2, (0, 255, 0), -1)
        cv2.putText(frame, fps, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.putText(frame, label, (30, 130), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        sequence.append(frame_landmarks)
        counter = (counter + 1) % step
        if counter == 0:
            sequence_array = np.array(sequence)
            sequence_array = sequence_array.reshape(1, seq_len, num_points, 3)

            prediction = model.predict(sequence_array)
            prediction = prediction.reshape(-1)
            prediction = prediction.argmax()

            label = index_mapping[str(prediction)]
            print(label)

        cv2.imshow("Test", frame)
        cv2.setWindowProperty("Test", cv2.WND_PROP_TOPMOST, 1)
        # Mask to the low byte so modifier/high bits do not hide the key code.
        k = cv2.waitKey(1) & 0xFF
        if k == ord('q'):
            break
finally:
    # Always free the camera and close the window, including after an error
    # or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()

go back and forth
go back and forth
firetruck
swear in
make believe
take off
river
know nothing
greet
know nothing


In [14]:
# Live webcam demo with recording: same landmark extraction and windowed
# prediction loop, plus one annotated frame written to 'output.avi' every
# `snapshot_interval` seconds. Press 'q' in the preview window to stop.
from collections import deque

seq_len = model.input_shape[1]     # frames per prediction window
num_points = model.input_shape[2]  # landmarks per frame
max_failed_reads = 100             # consecutive failed grabs tolerated before giving up
snapshot_interval = 4              # seconds between frames written to the video

cap = cv2.VideoCapture(0)
output = None
try:
    if not cap.isOpened():
        raise RuntimeError("Could not open the default camera (index 0)")

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Start with a window full of zero frames so its shape is always valid.
    sequence = deque(maxlen=seq_len)
    for _ in range(seq_len):
        sequence.append(np.zeros((num_points, 3)))

    tic = tac = 0
    counter = 0
    step = seq_len  # predict once every `step` frames
    label = ''
    failed_reads = 0

    # One frame is stored per `snapshot_interval` seconds, so the file's frame
    # rate is the reciprocal of that interval.
    output_fps = 1 / snapshot_interval
    output_width = width
    output_height = height
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    output = cv2.VideoWriter('output.avi', fourcc, output_fps, (output_width, output_height))
    if not output.isOpened():
        print("Warning: could not open 'output.avi' for writing; nothing will be recorded")

    start_time = time.time()
    while True:
        ret, frame = cap.read()
        if not ret:
            # Tolerate transient grab failures, but never spin forever on a
            # camera that has stopped delivering frames.
            failed_reads += 1
            if failed_reads >= max_failed_reads:
                raise RuntimeError("Camera stopped delivering frames")
            time.sleep(0.01)
            continue
        failed_reads = 0

        tac = time.time()
        frame_time = tac - tic
        # Two timestamps can be identical on coarse clocks; avoid dividing by zero.
        fps = str(int(1 / frame_time)) if frame_time > 0 else '0'
        tic = tac

        frame = cv2.flip(frame, 1)  # mirror the image horizontally
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_rgb.flags.writeable = False
        frame_landmarks = get_frame_landmarks(frame_rgb)

        for point in frame_landmarks:
            # All-zero rows are landmarks that were not detected; drawing them
            # would only stack dots in the top-left corner.
            if not point.any():
                continue
            x = int(point[0] * width)
            y = int(point[1] * height)
            cv2.circle(frame, (x, y), 2, (0, 255, 0), -1)
        cv2.putText(frame, fps, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 2, cv2.LINE_AA)
        cv2.putText(frame, label, (30, 130), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        sequence.append(frame_landmarks)
        counter = (counter + 1) % step
        if counter == 0:
            sequence_array = np.array(sequence)
            sequence_array = sequence_array.reshape(1, seq_len, num_points, 3)

            prediction = model.predict(sequence_array)
            prediction = prediction.reshape(-1)
            prediction = prediction.argmax()

            label = index_mapping[str(prediction)]
            print(label)

        current_time = time.time()
        elapsed_time = current_time - start_time
        cv2.putText(frame, f"Time: {elapsed_time:.2f}s", (30, 200), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
        if elapsed_time >= snapshot_interval:
            output.write(frame)
            start_time = current_time  # restart the snapshot timer

        cv2.imshow("Test", frame)
        cv2.setWindowProperty("Test", cv2.WND_PROP_TOPMOST, 1)
        # Mask to the low byte so modifier/high bits do not hide the key code.
        k = cv2.waitKey(1) & 0xFF
        if k == ord('q'):
            break
finally:
    # Always free the camera, finalize the video file and close the window,
    # including after an error or a kernel interrupt.
    cap.release()
    if output is not None:
        output.release()
    cv2.destroyAllWindows()

no big deal
go back and forth
basketball
cellphone
go back and forth
motorcycle
