In [24]:
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras.models import load_model

In [25]:
# MediaPipe drawing helpers, used later to render the detected hand key points.
mpDraw = mp.solutions.drawing_utils

# MediaPipe hand-tracking solution module, plus a detector instance that
# tracks at most two hands and requires a detection confidence of at least 0.7.
mpHands = mp.solutions.hands
hands = mpHands.Hands(
    min_detection_confidence=0.7,
    max_num_hands=2,
)

In [26]:
# Load the trained gesture classifier saved under 'mp_hand_gesture'.
model = load_model('mp_hand_gesture')

# Load class names, one per line of 'gesture.names'. The line order matters:
# a name's position in the list is used as the class index predicted by the model.
# The context manager closes the file even if reading fails, and splitlines()
# does not produce a trailing empty entry when the file ends with a newline.
with open('gesture.names', 'r') as f:
    classNames = f.read().splitlines()
print(classNames)

['okay', 'peace', 'thumbs up', 'thumbs down', 'call me', 'stop', 'rock', 'live long', 'fist', 'smile']


In [34]:
# Open the default webcam (device index 0).
cap = cv2.VideoCapture(0)

try:
    # Fail with a clear message instead of an AttributeError on a None frame.
    if not cap.isOpened():
        raise RuntimeError("Could not open webcam (device index 0)")

    while True:
        # Read each frame from the webcam
        ret, frame = cap.read()
        if not ret:
            # No frame was delivered (camera busy, unplugged, or stream ended).
            print("Failed to read a frame from the webcam; stopping.")
            break

        # NOTE(review): frame.shape is (rows, cols, channels), so `x` holds the
        # frame height and `y` the frame width. The landmark scaling below
        # therefore multiplies lm.x by the height and lm.y by the width, which
        # looks swapped. It is left unchanged because the classifier's input
        # depends on these exact values -- confirm against how the model was
        # trained before "fixing" it.
        x, y, c = frame.shape

        # Flip horizontally so the preview behaves like a mirror.
        frame = cv2.flip(frame, 1)
        # OpenCV delivers BGR frames; MediaPipe expects RGB.
        framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Get hand landmark prediction. `result.multi_hand_landmarks` is falsy
        # when no hand was detected in this frame.
        result = hands.process(framergb)

        all_landmarks = []

        if result.multi_hand_landmarks:
            for handslms in result.multi_hand_landmarks:
                landmarks = []
                # handslms.landmark holds the key points of one detected hand.
                for lm in handslms.landmark:
                    # Scale the normalized lm.x / lm.y values to integers
                    # (see the NOTE on `x` / `y` above).
                    lmx = int(lm.x * x)
                    lmy = int(lm.y * y)
                    landmarks.append([lmx, lmy])

                # Draw the key points and their connections for this hand.
                mpDraw.draw_landmarks(frame, handslms, mpHands.HAND_CONNECTIONS)

                # One entry per detected hand.
                all_landmarks.append(landmarks)

            # Recognize the gesture for each hand.
            for i, landmarks in enumerate(all_landmarks):
                # verbose=0 suppresses the per-call progress bar, which would
                # otherwise flood the notebook output on every frame.
                prediction = model.predict([landmarks], verbose=0)
                # Index of the highest score, mapped to its class name.
                classID = np.argmax(prediction)
                className = classNames[classID]

                # Stack the labels vertically: one text row per hand.
                hand_text_position = (10, 50 + i * 50)
                cv2.putText(frame, f"Hand {i + 1}: {className}", hand_text_position,
                            cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 255), 3, cv2.LINE_AA)

        # Show the final output
        cv2.imshow("Output", frame)

        # Mask to the low byte: some platforms set higher bits in the key code.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the webcam and destroy all active windows, even if the
    # loop raised; otherwise the camera stays locked by the notebook kernel.
    cap.release()
    cv2.destroyAllWindows()

