In [1]:
# Import the necessary libraries
import cv2
import mediapipe as mp
import datetime
import math
import pyautogui

class Control:
    """Webcam-driven media controller using hand gestures and head tilt.

    Each captured frame is passed through MediaPipe Hands and MediaPipe Face
    Detection. Hand landmarks are reduced to a five-element open/closed finger
    pattern that selects a pyautogui key press; the tilt of the line between
    the eyes triggers the ``j`` / ``l`` keys.
    """

    # Landmark IDs of the five fingertips, thumb first.
    TIP_IDS = (4, 8, 12, 16, 20)
    # Number of landmarks a hand must have before it is classified.
    EXPECTED_LANDMARKS = 21
    # Head angle (degrees) beyond which a tilt triggers a key press.
    TILT_THRESHOLD_DEG = 23
    # Minimum number of seconds between two tilt-triggered key presses.
    TILT_COOLDOWN_SECONDS = 1

    def __init__(self):
        """Create the MediaPipe detectors and open the default camera."""
        self.mp_hands = mp.solutions.hands
        self.mp_draw = mp.solutions.drawing_utils
        self.hands = self.mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.7)

        self.mp_face_detection = mp.solutions.face_detection
        self.face_detection = self.mp_face_detection.FaceDetection(min_detection_confidence=0.7)

        self.video = cv2.VideoCapture(0)
        # Time of the last tilt-triggered key press; used to rate-limit them.
        self.then = datetime.datetime.now()

    def detect_gesture(self, landmarks):
        """Classify one hand's landmarks into an open/closed finger pattern.

        Args:
            landmarks: Sequence of 21 ``(x, y)`` points in landmark-ID order
                (``run`` passes pixel coordinates of the mirrored frame).

        Returns:
            ``None`` when ``landmarks`` does not hold exactly 21 points.
            Otherwise a tuple ``(fingerlist, is_right_hand)``:

            - ``fingerlist`` is a list of five ints, thumb first, where 1
              means the finger is open and 0 means it is closed.
            - ``is_right_hand`` is ``True`` when the hand is treated as a
              right hand, ``False`` for a left hand.
        """
        if len(landmarks) != self.EXPECTED_LANDMARKS:
            return None

        # Handedness heuristic: landmark 9 lying left of landmark 17 (smaller
        # x) is treated as a right hand.
        is_right_hand = landmarks[9][0] < landmarks[17][0]

        # The thumb folds sideways, so compare x of its tip (4) with the joint
        # below it (3); the direction that counts as "closed" flips per hand.
        thumb_tip_x = landmarks[4][0]
        thumb_joint_x = landmarks[3][0]
        if is_right_hand:
            thumb_closed = thumb_tip_x > thumb_joint_x
        else:
            thumb_closed = thumb_tip_x < thumb_joint_x
        fingerlist = [0 if thumb_closed else 1]

        # The other fingers are open when the tip is above (smaller y than)
        # the landmark two IDs below it.
        for tip in self.TIP_IDS[1:]:
            fingerlist.append(1 if landmarks[tip][1] < landmarks[tip - 2][1] else 0)

        return fingerlist, is_right_hand

    def control_media_right_hand(self, gesture):
        """Send the key press mapped to a right-hand finger pattern.

        Args:
            gesture: Five-element finger list as returned by
                ``detect_gesture``. Patterns without a mapping do nothing.
        """
        if gesture == [0, 1, 0, 0, 1]:
            pyautogui.press("volumemute", interval=0.8)
        elif gesture == [0, 1, 0, 0, 0]:
            pyautogui.hotkey("shift", "<", interval=0.5)
        elif gesture == [0, 0, 0, 0, 1]:
            pyautogui.hotkey("shift", ">", interval=0.5)
        elif gesture == [1, 1, 1, 1, 1]:
            pyautogui.press("space", interval=0.8)
        elif gesture == [0, 0, 1, 1, 1]:
            pyautogui.scroll(10)  # Scroll up
        elif gesture == [0, 1, 1, 1, 0]:
            pyautogui.press("volumeup", interval=0.3)
        elif gesture == [0, 1, 1, 0, 0]:
            pyautogui.press("volumedown", interval=0.3)
        elif gesture == [0, 0, 0, 0, 0]:
            pyautogui.press("q")
        elif gesture == [1, 0, 0, 0, 0]:
            pyautogui.hotkey("shift", "p", interval=0.5)

    def control_media_left_hand(self, gesture):
        """Send the key press mapped to a left-hand finger pattern.

        Args:
            gesture: Five-element finger list as returned by
                ``detect_gesture``. Patterns without a mapping do nothing.
        """
        if gesture == [0, 0, 1, 1, 1]:
            pyautogui.scroll(-10)  # Scroll down
        elif gesture == [1, 1, 1, 1, 1]:
            pyautogui.press("esc")
        elif gesture == [1, 1, 0, 0, 0]:
            pyautogui.press("f", interval=0.8)
        elif gesture == [0, 0, 0, 0, 1]:
            pyautogui.hotkey("i", interval=0.5)
        elif gesture == [1, 0, 0, 0, 0]:
            pyautogui.hotkey("shift", "n", interval=0.5)

    @staticmethod
    def _fold_angle(angle):
        """Fold an angle in degrees from [-180, 180] into [-90, 90]."""
        if angle < -90:
            return angle + 180
        if angle > 90:
            return angle - 180
        return angle

    def detect_face_tilt(self, frame, face_results):
        """Draw face detections and press ``j`` / ``l`` on a head tilt.

        Args:
            frame: Image the annotations are drawn on (modified in place).
            face_results: Result of ``self.face_detection.process``; nothing
                happens when it has no detections.
        """
        if not face_results.detections:
            return

        h, w, _ = frame.shape
        for detection in face_results.detections:
            self.mp_draw.draw_detection(frame, detection)

            # Keypoints 0 and 1 are the eyes, given relative to the frame
            # size; scale them to pixel coordinates.
            keypoints = detection.location_data.relative_keypoints
            right_eye, left_eye = keypoints[0], keypoints[1]
            right_eye_coord = (int(right_eye.x * w), int(right_eye.y * h))
            left_eye_coord = (int(left_eye.x * w), int(left_eye.y * h))

            cv2.circle(frame, right_eye_coord, 3, (0, 255, 255), -1)
            cv2.circle(frame, left_eye_coord, 3, (0, 255, 255), -1)

            # Angle of the line joining the eyes, folded into [-90, 90].
            dx = right_eye_coord[0] - left_eye_coord[0]
            dy = right_eye_coord[1] - left_eye_coord[1]
            angle = self._fold_angle(math.degrees(math.atan2(dy, dx)))

            cv2.putText(frame, f"Head Angle: {int(angle)}", (50, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            # Rate-limit: ignore tilts until the cooldown since the last
            # tilt-triggered key press has elapsed.
            elapsed = (datetime.datetime.now() - self.then).total_seconds()
            if elapsed < self.TILT_COOLDOWN_SECONDS:
                continue

            if angle < -self.TILT_THRESHOLD_DEG:
                pyautogui.press("j")  # YouTube: go back 10 seconds
                self.then = datetime.datetime.now()
            elif angle > self.TILT_THRESHOLD_DEG:
                pyautogui.press("l")  # YouTube: skip forward 10 seconds
                self.then = datetime.datetime.now()

    def run(self):
        """Run the capture loop until the camera fails or ``q`` is pressed.

        The camera and the OpenCV windows are released even when an
        exception interrupts the loop.
        """
        try:
            while True:
                suc, frame = self.video.read()
                if not suc:
                    break

                frame = cv2.flip(frame, 1)  # Flip frame horizontally
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                hand_results = self.hands.process(rgb_frame)
                face_results = self.face_detection.process(rgb_frame)

                if hand_results.multi_hand_landmarks:
                    h, w, _ = frame.shape
                    for i, hand_landmarks in enumerate(hand_results.multi_hand_landmarks):
                        # Pixel coordinates of every landmark of this hand.
                        landmarks = [(int(lm.x * w), int(lm.y * h))
                                     for lm in hand_landmarks.landmark]

                        self.mp_draw.draw_landmarks(frame, hand_landmarks,
                                                    self.mp_hands.HAND_CONNECTIONS)

                        result = self.detect_gesture(landmarks)
                        if result is None:
                            # Incomplete landmark set: nothing to classify.
                            continue
                        gesture, is_right_hand = result
                        hand_label = "Right Hand" if is_right_hand else "Left Hand"

                        cv2.putText(frame, f"{hand_label} Gesture: {gesture}", (50, 50 + i * 30),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                        if is_right_hand:
                            self.control_media_right_hand(gesture)
                        else:
                            self.control_media_left_hand(gesture)

                self.detect_face_tilt(frame, face_results)

                cv2.imshow("YouTube Gesture Control", frame)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            self.video.release()
            cv2.destroyAllWindows()





In [4]:
# Start the controller only when this code is executed directly (a notebook
# cell also runs as "__main__"), so that importing it does not open the webcam.
if __name__ == "__main__":
    model = Control()
    model.run()

# Gesture Guide
- **Right hand**
    -  rock symbol = mute
    -  open hand = pause/unpause
    -  index finger = reduce the playback speed
    -  pinky = increase the playback speed
    -  combination of last 3 fingers = scroll up
    -  combination of middle and index finger = volume down
    -  combination of middle, ring, and index = volume up
    -  thumb open = go to previous video
    -  closed fist = press the `q` key
- **Left hand**
    -  combination of last 3 fingers = scroll down
    -  open hand = escape key
    -  L gesture = full screen
    -  pinky finger = minimise (miniplayer)
    -  thumb open = go to next video

- **Head**
    -  tilt right to skip 10 seconds
    -  tilt left to go back 10 seconds 

# Future Scope:
- As of now the project focuses only on YouTube control; we can create different functions here for other video players.
- We can add more gestures and more controls according to the features and keyboard shortcuts available.