In [1]:
import os
import numpy as np
import cv2
import time
from pynput.keyboard import Key, Controller
# import screen_brightness_control as sbc
from keras.models import load_model

from lib.data_loader import DataLoader

In [2]:
# Dataset location and the three CSV files the DataLoader is built from.
root_directory = r'<path>\dataset'
labels_csv_path, train_csv_path, val_csv_path = (
    os.path.join(root_directory, csv_name)
    for csv_name in (
        'labels_extracted.csv',
        'train_extracted.csv',
        'validation_extracted.csv',
    )
)

# `data` is used later in the notebook for its `int_to_label` lookup.
data = DataLoader(labels_csv_path, train_csv_path, val_csv_path)

# The notebook only calls `model.predict`, so the model is loaded with
# compile=False.
print('loading model...')
model = load_model(
    'output/models/resnet_101.h5',
    compile=False,
)
print('model loaded successfully')

loading model...
model loaded successfully


In [5]:
# Gesture label -> human-readable description of the action it stands for.
# The webcam loop uses this to build the caption drawn on each frame.
hand_gesture_action_mapping = dict([
    ('Swiping Left', 'fast forward 10 seconds'),
    ('Swiping Right', 'rewind 10 seconds'),
    ('Swiping Down', 'previous video'),
    ('Swiping Up', 'next video'),
    ('Sliding Two Fingers Down', 'decrease volume'),
    ('Sliding Two Fingers Up', 'increase volume'),
    ('Thumb Down', 'mute / unmute'),
    ('Thumb Up', 'enter / exit full screen'),
    ('Stop Sign', 'play / pause'),
    ('No gesture', 'no action'),
])

In [16]:
# Real-time gesture control loop.
#
# Reads webcam frames, resizes and scales each one, and collects them into a
# clip of N_FRAMES frames. Every full clip is classified by `model`; the
# predicted class index selects a key press that is sent through pynput.
# The latest prediction is drawn on every displayed frame. Press 'q' in the
# OpenCV window to stop.

# Size each frame is resized to, and the number of frames per clip.
# NOTE(review): presumably these match the model's training input -- confirm.
WIDTH = 96
HEIGHT = 64
N_FRAMES = 16

# Predicted class index -> (key to tap, hold Shift while tapping?).
# An index missing from this table (e.g. 9) sends no key press.
GESTURE_KEYS = {
    0: ('l', False),
    1: ('j', False),
    2: ('p', True),
    3: ('n', True),
    4: (Key.down, False),
    5: (Key.up, False),
    6: ('m', False),
    7: ('f', False),
    8: ('k', False),
}

buffer = []
# Index 9 sends no key press; it is the value shown until the first clip has
# been classified.
# NOTE(review): presumably 9 maps to 'No gesture' in data.int_to_label -- confirm.
predicted_value = 9
hand_gesture = ""  # not read or updated by this loop

keyboard = Controller()

cam = cv2.VideoCapture(0)
try:
    cam.set(cv2.CAP_PROP_FRAME_WIDTH, 400)
    cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 400)

    while cam.isOpened():
        return_value, frame = cam.read()

        # A failed read is skipped; the loop keeps polling the camera.
        if return_value:
            image = cv2.resize(frame, (WIDTH, HEIGHT))
            # Scale pixel values by 1/255 before buffering.
            buffer.append(image / 255.0)

            # A full clip has been collected: classify it and act on it.
            if len(buffer) == N_FRAMES:
                # Add a leading batch axis; `buffer` itself stays a list.
                batch = np.expand_dims(buffer, 0)
                predicted_value = int(np.argmax(model.predict(batch, verbose=0)))
                buffer = []

                key_action = GESTURE_KEYS.get(predicted_value)
                if key_action is not None:
                    key, hold_shift = key_action
                    if hold_shift:
                        # The context manager releases Shift even if the tap
                        # raises, so the modifier cannot get stuck.
                        with keyboard.pressed(Key.shift):
                            keyboard.tap(key)
                    else:
                        keyboard.tap(key)

            # Caption the frame with the most recent prediction.
            gesture = data.int_to_label[predicted_value]
            text = f'{gesture} -> {hand_gesture_action_mapping[gesture]}'
            cv2.putText(frame, text, (20, 35), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 3)
            cv2.imshow('frame', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, including on an error or
    # a kernel interrupt; otherwise the device stays locked until restart.
    cam.release()
    cv2.destroyAllWindows()