In [2]:
# Import the necessary Packages for this software to run
import mediapipe as mp
import cv2
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
# Build the feature column names: one X/Y/Z triple per landmark per frame,
# ordered frame-major, e.g. "F1_L0_X", "F1_L0_Y", "F1_L0_Z", "F1_L1_X", ..., "F16_L20_Z".
N_FRAMES = 16      # frames per sample, numbered 1..16 in the column names
N_LANDMARKS = 21   # landmarks per frame, numbered 0..20 in the column names

df_columns = [
    f"F{frame}_L{landmark}_{axis}"
    for frame in range(1, N_FRAMES + 1)
    for landmark in range(N_LANDMARKS)
    for axis in "XYZ"
]


In [4]:
# Create an empty DataFrame (no rows yet) whose columns are the feature names in df_columns
df = pd.DataFrame(columns=df_columns)

In [5]:
# Preview the DataFrame; it holds no rows at this point, so only the column headers are shown
df.head()

Unnamed: 0,F1_L0_X,F1_L0_Y,F1_L0_Z,F1_L1_X,F1_L1_Y,F1_L1_Z,F1_L2_X,F1_L2_Y,F1_L2_Z,F1_L3_X,...,F16_L17_Z,F16_L18_X,F16_L18_Y,F16_L18_Z,F16_L19_X,F16_L19_Y,F16_L19_Z,F16_L20_X,F16_L20_Y,F16_L20_Z


In [6]:
# Short aliases for the two MediaPipe solution modules used by this notebook:
#   drawingModule - landmark drawing helpers (draw_landmarks)
#   handsModule   - the hand-tracking solution (Hands, HAND_CONNECTIONS)
drawingModule = mp.solutions.drawing_utils
handsModule = mp.solutions.hands


_NUM_LANDMARKS = 21                         # landmarks encoded per frame
_FEATURES_PER_FRAME = _NUM_LANDMARKS * 3    # x, y, z for every landmark
_hands_detector = None                      # created lazily by _get_hands_detector()


def _get_hands_detector():
    """Return the shared MediaPipe Hands detector, creating it on first use."""
    global _hands_detector
    if _hands_detector is None:
        # static_image_mode=True runs detection on every image independently,
        # so one detector can be reused across frames instead of being rebuilt
        # (and torn down) for each call.
        _hands_detector = handsModule.Hands(
            static_image_mode=True,
            min_detection_confidence=0.7,
            max_num_hands=1,
        )
    return _hands_detector


def extract_data(image, annotate=False):
    """Detect one hand in ``image`` and return its landmarks as a flat list.

    Args:
        image: BGR image array (it is converted with ``cv2.COLOR_BGR2RGB``
            before detection). A 640x480 resized copy is used for detection;
            ``image`` itself is only modified when ``annotate`` is True.
        annotate: If True, draw the detected hand skeleton onto ``image`` in
            place. Defaults to False: previously the overlay was drawn on the
            temporary resized copy, which was discarded, so it was never
            visible to the caller.

    Returns:
        A list of exactly 63 numbers: ``x, y, z`` of landmark 0, then of
        landmark 1, and so on up to landmark 20. Landmarks that are not
        reported are filled with 0, and the whole list is zeros when no hand
        is detected.
    """
    resized = cv2.resize(image, (640, 480))
    results = _get_hands_detector().process(cv2.cvtColor(resized, cv2.COLOR_BGR2RGB))

    if not results.multi_hand_landmarks:
        return [0] * _FEATURES_PER_FRAME

    # The detector is limited to one hand; taking only the first entry keeps
    # the row length fixed even if that limit is ever raised.
    hand = results.multi_hand_landmarks[0]
    if annotate:
        drawingModule.draw_landmarks(image, hand, handsModule.HAND_CONNECTIONS)

    row = []
    for landmark in list(hand.landmark)[:_NUM_LANDMARKS]:
        row.extend((landmark.x, landmark.y, landmark.z))

    # Zero-pad if fewer than 21 landmarks were reported.
    row.extend([0] * (_FEATURES_PER_FRAME - len(row)))
    return row

In [7]:
import torch
import torch.nn as nn

# Define the RNNClassifier class (as you have done before)
class RNNClassifier(nn.Module):
    """LSTM sequence classifier: LSTM -> dropout -> linear layer on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes.
        dropout: Dropout probability, applied between stacked LSTM layers
            (only when ``num_layers > 1``) and before the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is given for a single layer, so pass 0 in that case.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised class scores.
        """
        # With no initial state given, nn.LSTM starts from zeros that already
        # match the input's device and dtype (the explicit float32 zeros used
        # before failed for e.g. a double-precision model).
        out, _ = self.lstm(x)
        # Only the output of the final time step feeds the classifier head.
        return self.fc(self.dropout(out[:, -1, :]))
# Model configuration
input_size = 64     # features per time step fed to the LSTM
hidden_size = 256   # LSTM hidden-state width
num_layers = 2      # stacked LSTM layers
num_classes = 10    # number of output classes
dropout = 0.5       # Adjust this value as needed

# Build the classifier from the configuration above
model = RNNClassifier(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
    dropout=dropout,
)

In [8]:
# Use the MPS backend when it is available, otherwise fall back to the CPU
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(device)

# Restore the trained weights. weights_only=True limits unpickling to tensors
# and plain containers, so a tampered checkpoint file cannot run arbitrary code.
model.load_state_dict(torch.load('final_mit0.pth', map_location=device, weights_only=True))
model.to(device)
model.eval()  # switch to inference mode (disables dropout)

cpu


RNNClassifier(
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=256, out_features=10, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [15]:
import pyautogui
def on_input(label):
    print(label)
    print(type(label))
    match label:
        case 1:  # Pushing Hand Away
            pyautogui.click()
        case 2: # Sliding Two Fingers Down
            pyautogui.move(0,20,0)
        case 5: # Sliding Two Fingers Up
            pyautogui.move(0,-20,0)
        case 3: # Sliding Two Fingers Left
            pyautogui.move(-20,0,0)
        case 4: # Sliding Two Fingers Right
            pyautogui.move(20,0,0)
        case 8: # Zooming In With Two Fingers
            pyautogui.keyDown('ctrl')
            pyautogui.scroll(100)
            pyautogui.keyUp('ctrl')
        case 9: # Zooming Out With Two Fingers
            pyautogui.keyDown('ctrl')
            pyautogui.scroll(-100)
            pyautogui.keyUp('ctrl')
        case default:
            print("Nothing happened")

# Human-readable gesture name for every class index the model can predict.
# (An earlier nine-entry variant of this mapping had no "Doing other things"
# class, so every gesture sat one index lower.)
label_dict = dict(enumerate([
    "Doing other things",
    "Pushing hands away",
    "Sliding two fingers down",
    "Sliding two fingers left",
    "Sliding two fingers right",
    "Sliding two fingers up",
    "Thumbs down",
    "Thumbs up",
    "Zooming in with two fingers",
    "Zooming out with two fingers",
]))


# --- Live gesture recognition from the webcam --------------------------------
WINDOW_SIZE = 16             # consecutive frames that form one model input
FEATURES_PER_FRAME = 63      # values extract_data yields per frame
MOUSE_CONTROL_ENABLED = False  # set to True to let recognised gestures drive on_input()
MOVE_GESTURES = {2, 3, 4, 5}   # sliding gestures: forwarded on every prediction
TRIGGER_GESTURES = {1, 8, 9}   # click / zoom gestures: forwarded every TRIGGER_INTERVAL-th prediction
TRIGGER_INTERVAL = 10

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Could not open camera 0")

frame_rows = []   # sliding window holding the landmark rows of the latest frames
ct = 0            # predictions made so far while mouse control is enabled

try:
    while True:
        ret, frame = cap.read()
        if not ret or frame is None:
            # Camera unplugged / stream ended: stop instead of crashing on a None frame.
            print("Camera returned no frame; stopping.")
            break

        frame_rows.append(extract_data(frame))
        if len(frame_rows) > WINDOW_SIZE:
            del frame_rows[0]   # drop the oldest frame

        if len(frame_rows) == WINDOW_SIZE:
            # Apply the model. It was built with input_size = 64, so one constant
            # zero column is appended to the 63 landmark values of each frame.
            landmarks = np.array(frame_rows, dtype=float).reshape(WINDOW_SIZE, FEATURES_PER_FRAME)
            x = np.concatenate([landmarks, np.zeros((WINDOW_SIZE, 1))], axis=1)[None, :, :]

            # Only predict when enough data points are present (fewer than 50% zeros).
            if np.sum(x == 0) / x.size < 0.5:
                input_data = torch.tensor(x, dtype=torch.float32).to(device)
                with torch.no_grad():   # inference only
                    output = model(input_data)[0]
                label = torch.argmax(output).cpu().item()

                cv2.putText(frame, str(label_dict[label]), (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, 255, thickness=5)

                if MOUSE_CONTROL_ENABLED:
                    if label in MOVE_GESTURES:
                        on_input(label)
                    elif label in TRIGGER_GESTURES and ct % TRIGGER_INTERVAL == 0:
                        # Rate-limit clicks/zooms so one gesture does not fire on every frame.
                        on_input(label)
                    ct += 1

        # Show the current frame on the desktop
        cv2.imshow("Frame", frame)
        key = cv2.waitKey(1) & 0xFF

        # Pressing "q" stops the loop
        if key == ord("q"):
            break
finally:
    # Always free the camera and close the preview window, even after an error.
    cap.release()
    cv2.destroyAllWindows()