## Hand Gesture Controls For YouTube

The Hand Gesture program is developed using Jupyter Notebook and implemented in Python 3. 
Make sure to install required libraries for the project:
    
    - Numpy:     !pip install numpy
    - OpenCV:    !pip install opencv-python 
    - Mediapipe: !pip install mediapipe
    - Pyautogui: !pip install PyAutoGUI
    - Time:      !pip install python-time
    - Sys:       !pip install numpy

The hand detection algorithm is written for the right hand, with the palm facing the camera. After successfully running the program, make sure the Browser window containing YouTube is focused on since the program automates keyboard presses based on the different gestures. The media player controls include Pause/Play, Skip video, Volume up/Volume down, Mute, Seek left/right, and Full screen control.

In [2]:
# Import libraries
import numpy as np
import cv2
import mediapipe as mp
import pyautogui
import time
import sys

# For drawing the hand landmarks and connections
mp_drawing = mp.solutions.drawing_utils
# For detecting hands and their landmarks
mp_hands = mp.solutions.hands

# initializes a VideoCapture (change value to 0 depending of the camera used)
cap = cv2.VideoCapture(1)

# set the initial time to the current time
last_press_time = time.time()   

if not cap.isOpened():
    print("Cannot open camera")
    exit()

# Helper function, that determines the current gesture from the given hand_landmarks
def Gesture_recognizer(hand_landmarks):
    """
    Args:
        hand_landmarks: landmarks from mediapipe
    Returns:
        None
    """
    
    global last_press_time
    # finger coordinates
    # Thumb coordinates
    thumb_tip = np.array([hand_landmarks[mp_hands.HandLandmark.THUMB_TIP.value].x, hand_landmarks[mp_hands.HandLandmark.THUMB_TIP.value].y])
    thumb_ip = np.array([hand_landmarks[mp_hands.HandLandmark.THUMB_IP.value].x, hand_landmarks[mp_hands.HandLandmark.THUMB_IP.value].y])
    thumb_mcp = np.array([hand_landmarks[mp_hands.HandLandmark.THUMB_MCP.value].x, hand_landmarks[mp_hands.HandLandmark.THUMB_MCP.value].y])
    thumb_cmc = np.array([hand_landmarks[mp_hands.HandLandmark.THUMB_CMC.value].x, hand_landmarks[mp_hands.HandLandmark.THUMB_CMC.value].y])
     
    # Index finger coordinates
    index_tip = np.array([hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_TIP.value].x, hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_TIP.value].y])
    index_dip = np.array([hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_DIP.value].x, hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_DIP.value].y])
    index_pip = np.array([hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_PIP.value].x, hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_PIP.value].y])
    index_mcp = np.array([hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_MCP.value].x, hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_MCP.value].y])

    # Middle finger coordinates
    middle_tip = np.array([hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_TIP.value].x, hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_TIP.value].y])
    middle_dip = np.array([hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_DIP.value].x, hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_DIP.value].y])
    middle_pip = np.array([hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_PIP.value].x, hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_PIP.value].y])
    middle_mcp = np.array([hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_MCP.value].x, hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_MCP.value].y])
    
    # ring finger coordinates
    ring_tip = np.array([hand_landmarks[mp_hands.HandLandmark.RING_FINGER_TIP.value].x, hand_landmarks[mp_hands.HandLandmark.RING_FINGER_TIP.value].y])
    ring_dip = np.array([hand_landmarks[mp_hands.HandLandmark.RING_FINGER_DIP.value].x, hand_landmarks[mp_hands.HandLandmark.RING_FINGER_DIP.value].y])
    ring_pip = np.array([hand_landmarks[mp_hands.HandLandmark.RING_FINGER_PIP.value].x, hand_landmarks[mp_hands.HandLandmark.RING_FINGER_PIP.value].y])
    ring_mcp = np.array([hand_landmarks[mp_hands.HandLandmark.RING_FINGER_MCP.value].x, hand_landmarks[mp_hands.HandLandmark.RING_FINGER_MCP.value].y])
    
    # pinky finger coordinates
    pinky_tip = np.array([hand_landmarks[mp_hands.HandLandmark.PINKY_TIP.value].x, hand_landmarks[mp_hands.HandLandmark.PINKY_TIP.value].y])
    pinky_dip = np.array([hand_landmarks[mp_hands.HandLandmark.PINKY_DIP.value].x, hand_landmarks[mp_hands.HandLandmark.PINKY_DIP.value].y])
    pinky_pip = np.array([hand_landmarks[mp_hands.HandLandmark.PINKY_PIP.value].x, hand_landmarks[mp_hands.HandLandmark.PINKY_PIP.value].y])
    pinky_mcp = np.array([hand_landmarks[mp_hands.HandLandmark.PINKY_MCP.value].x, hand_landmarks[mp_hands.HandLandmark.PINKY_MCP.value].y])
    
    # wrist coordinates
    wrist = np.array([hand_landmarks[mp_hands.HandLandmark.WRIST.value].x, hand_landmarks[mp_hands.HandLandmark.WRIST.value].y])


    ### Pause/Play control ###
    if index_tip[1]*100 < index_pip[1]*100 and middle_tip[1]*100 < middle_pip[1]*100 and ring_tip[1]*100 < ring_pip[1]*100 and pinky_tip[1]*100 < pinky_pip[1]*100 and thumb_tip[0]*100 < thumb_cmc[0]*100 and thumb_tip[0]*100 < thumb_ip[0]*100:
        cv2.putText(image, "Pause/Play", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        if abs(time.time() - last_press_time) >= 1:
            pyautogui.press("space")
            last_press_time = time.time()

    ### Full screen control ### 
    elif index_tip[1]*100 > index_pip[1]*100 and middle_tip[1]*100 < middle_pip[1]*100 and ring_tip[1]*100 < ring_pip[1]*100 and pinky_tip[1]*100 < pinky_pip[1]*100 and index_tip[1]*100 < wrist[1]*100:
        cv2.putText(image, "Full Screen", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        if abs(time.time() - last_press_time) >= 1:
            pyautogui.press("f")
            last_press_time = time.time() 
            
    ### Next video control ### 
    elif index_tip[1]*100 < index_pip[1]*100 and middle_tip[1]*100 < middle_pip[1]*100 and ring_tip[1]*100 > ring_pip[1]*100 and pinky_tip[1]*100 > pinky_pip[1]*100:
        cv2.putText(image, "Next Video", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        if abs(time.time() - last_press_time) >= 1:
            pyautogui.hotkey('shift', 'n')
            last_press_time = time.time()    

    ### Volume controls ###    
    # volume down
    elif index_tip[1]*100 > wrist[1]*100:
        cv2.putText(image, "Volume down", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        if abs(time.time() - last_press_time) >= 1:
            pyautogui.press("down")
            last_press_time = time.time()
    
    # volume up
    elif index_tip[1]*100 < index_pip[1]*100 and middle_tip[1]*100 > middle_pip[1]*100 and ring_tip[1]*100 > ring_pip[1]*100 and pinky_tip[1]*100 > pinky_pip[1]*100:
        cv2.putText(image, "Volume Up", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        if abs(time.time() - last_press_time) >= 1:
            pyautogui.press("up")
            last_press_time = time.time()    
    
    # Mute/unmute
    elif index_tip[1]*100 < index_pip[1]*100 and middle_tip[1]*100 > middle_pip[1]*100 and ring_tip[1]*100 > ring_pip[1]*100 and pinky_tip[1]*100 < pinky_pip[1]*100:
        cv2.putText(image, "Mute/Unmute", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        if abs(time.time() - last_press_time) >= 1:
            pyautogui.press("m")
            last_press_time = time.time()             
    
    ### Seek left control ### 
    elif thumb_tip[0]*100 < thumb_cmc[0]*100 and thumb_tip[0]*100 < thumb_ip[0]*100:
        cv2.putText(image, "Seek Left", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        if abs(time.time() - last_press_time) >= 1:
            pyautogui.press("left")
            last_press_time = time.time()

    ### Seek right control ###  
    elif index_tip[1]*100 > index_pip[1]*100 and middle_tip[1]*100 > middle_pip[1]*100 and ring_tip[1]*100 > ring_pip[1]*100 and pinky_tip[1]*100 < pinky_pip[1]*100:
        cv2.putText(image, "Seek right", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        if abs(time.time() - last_press_time) >= 1:
            pyautogui.press("right")
            last_press_time = time.time()
            
  
    
# set up opencv to gather video stream, run hand landmark recogntion until the program is intrupted by user   
with mp_hands.Hands(
    max_num_hands=1,  # Only detect one hand
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5) as hands:
    while cap.isOpened():
        success, image = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        # Flip the image horizontally for a later selfie-view display.
        image = cv2.flip(image, 1)
        
        # Resize the image to 640x480 pixels
        image = cv2.resize(image, (640, 480))

        # Convert the BGR image to RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # To improve performance, optionally mark the image as not writeable to
        # pass by reference.
        image.flags.writeable = False
        results = hands.process(image)

        # Draw the hand annotations on the image.
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Get the landmarks for the first hand
                hand_landmarks_1 = results.multi_hand_landmarks[0].landmark
                hand_landmarks = results.multi_hand_landmarks[0]
                # Get the location of the wrist landmark
                wrist = np.array([hand_landmarks_1[mp_hands.HandLandmark.WRIST.value].x, hand_landmarks_1[mp_hands.HandLandmark.WRIST.value].y])
                # Get the location of the thumb joint
                thumb_mcp = np.array([hand_landmarks_1[mp_hands.HandLandmark.THUMB_MCP.value].x, hand_landmarks_1[mp_hands.HandLandmark.THUMB_MCP.value].y])
                # check if the thumb is located to the right or left of the wrist to determine right or left hand
                if thumb_mcp[0] > wrist[0]:
                    continue 
                else:
                    # draw landmarks on the hand
                    mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                    #helper function to identify gestures
                    Gesture_recognizer(hand_landmarks_1)
            
        # Display the image with the hand landmarks marked.
        cv2.imshow('MediaPipe Hands', image)
        # wait for escape or 'q' key press to stop the program
        if cv2.waitKey(5) & 0xFF == 27 or cv2.waitKey(1) == ord('q'):
            break

# close the program
cap.release()
cv2.waitKey(1)
cv2.destroyAllWindows()
for i in range (1,5):
    cv2.waitKey(1)
sys.exit()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
