In [2]:
pip install mediapipe


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
!wget https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task

--2024-09-13 13:26:45--  https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task
Resolving storage.googleapis.com (storage.googleapis.com)... 34.101.5.27, 34.101.5.123, 142.251.221.155, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|34.101.5.27|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7819105 (7.5M) [application/octet-stream]
Saving to: ‘hand_landmarker.task.2’


2024-09-13 13:26:52 (1.10 MB/s) - ‘hand_landmarker.task.2’ saved [7819105/7819105]



In [3]:
import os
import numpy as np
import cv2
import time

import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2

In [4]:
MARGIN = 10  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green

def draw_landmarks_on_image(rgb_image, detection_result):
    hand_landmarks_list = detection_result.hand_landmarks
    handedness_list = detection_result.handedness
    annotated_image = np.copy(rgb_image)
    # Loop through the detected hands to visualize.
    for idx in range(len(hand_landmarks_list)):
        hand_landmarks = hand_landmarks_list[idx]
        handedness = handedness_list[idx]
        # Draw the hand landmarks.
        hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
        hand_landmarks_proto.landmark.extend([
          landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
        ])
        solutions.drawing_utils.draw_landmarks(
          annotated_image,
          hand_landmarks_proto,
          solutions.hands.HAND_CONNECTIONS,
          solutions.drawing_styles.get_default_hand_landmarks_style(),
          solutions.drawing_styles.get_default_hand_connections_style())
        # Get the top left corner of the detected hand's bounding box.
        height, width, _ = annotated_image.shape
        x_coordinates = [landmark.x for landmark in hand_landmarks]
        y_coordinates = [landmark.y for landmark in hand_landmarks]
        text_x = int(min(x_coordinates) * width)
        text_y = int(min(y_coordinates) * height) - MARGIN
        # Draw handedness (left or right hand) on the image.
        cv2.putText(annotated_image, f"{handedness[0].category_name}",
                    (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
                    FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)
    return annotated_image

In [5]:
# Setup options
base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
options = vision.HandLandmarkerOptions(base_options=base_options,
                                       num_hands=2)
detector = vision.HandLandmarker.create_from_options(options)

# Get inference
def get_annotation_from(frame):
    image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
    detection_result = detector.detect(image)
    annotated_image = draw_landmarks_on_image(image.numpy_view(), detection_result)
    
    return detection_result, annotated_image

I0000 00:00:1725962900.174924 1518134 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 76.3), renderer: Apple M2
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1725962900.270371 1519631 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1725962900.356484 1519631 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [6]:
# define a video capture object
cap = cv2.VideoCapture(0)
  
while True:
    # capture image
    ret, frame = cap.read()
    
    if ret:
        detection_result, annotation = get_annotation_from(cv2.flip(frame, 1))
    
        cv2.imshow('', annotation)  
    else:
        print("! No frame")
        
    time.sleep(0.05)
     
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
        
# After the loop release the cap object
cap.release()

# Destroy all the windows
cv2.destroyAllWindows()

W0000 00:00:1725962913.427498 1519633 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


In [5]:
import cv2
import mediapipe as mp
import numpy as np

# Initialize Mediapipe Hand Landmarker
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Initialize webcam feed
cap = cv2.VideoCapture(0)

def classify_gesture(hand_landmarks):
    # Extract landmark positions for relevant points
    thumb_tip = hand_landmarks.landmark[mp_hands.HandLandmark.THUMB_TIP]
    index_tip = hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP]
    middle_tip = hand_landmarks.landmark[mp_hands.HandLandmark.MIDDLE_FINGER_TIP]
    ring_tip = hand_landmarks.landmark[mp_hands.HandLandmark.RING_FINGER_TIP]
    pinky_tip = hand_landmarks.landmark[mp_hands.HandLandmark.PINKY_TIP]

    index_mcp = hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_MCP]
    wrist = hand_landmarks.landmark[mp_hands.HandLandmark.WRIST]

    # Simple heuristic to classify gestures for I, L, O, V, E, U (expand based on your needs)
    if (index_tip.y < index_mcp.y and thumb_tip.y > wrist.y):  # Index finger up, thumb down (letter "I")
        return "I"
    elif (index_tip.y > thumb_tip.y and middle_tip.x< thum_tip.x):  # Thumb and index make "L" shape
        return "L"
    elif (thumb_tip.x > index_tip.x and pinky_tip.y <wrist.y):  # Circle-like gesture for "O"
        return "O"
    elif (index_tip.y < wrist.y and middle_tip.y < wrist.y and ring_tip.y > wrist.y):  # "V" gesture
        return "V"
    elif (index_tip.y < wrist.y and middle_tip.y < wrist.y and ring_tip.y < wrist.y):  # Open hand (letter "E")
        return "E"
    elif (pinky_tip.y < wrist.y and index_tip.y < wrist.y):  # Thumb and pinky make "U" shape
        return "U"
    else:
        return ""

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("Ignoring empty frame.")
        continue

    # Convert the BGR image to RGB.
    image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image_rgb.flags.writeable = False

    # Process the image and find hand landmarks
    results = hands.process(image_rgb)

    # Convert the image back to BGR so OpenCV can display it
    image_rgb.flags.writeable = True
    image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)

    # Detect hand landmarks and handedness
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Draw hand landmarks on the image
            mp_drawing.draw_landmarks(image_bgr, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # Classify gesture and get the corresponding letter
            letter = classify_gesture(hand_landmarks)

            # If a letter is detected, display it on the screen
            if letter:
                # Get the wrist landmark position to display the letter near the hand
                wrist = hand_landmarks.landmark[mp_hands.HandLandmark.WRIST]
                h, w, _ = image_bgr.shape
                cx, cy = int(wrist.x * w), int(wrist.y * h)

                # Display the letter using OpenCV
                cv2.putText(image_bgr, letter, (cx, cy), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 3, cv2.LINE_AA)

    # Display the image
    cv2.imshow('Hand Gesture Recognition', image_bgr)

    # Break the loop when 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the webcam and close all windows
cap.release()
cv2.destroyAllWindows()


I0000 00:00:1725965623.515630 1616558 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 76.3), renderer: Apple M2
W0000 00:00:1725965623.543058 1666171 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1725965623.563794 1666171 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


NameError: name 'thum_tip' is not defined

In [5]:
import cv2
import numpy as np
import mediapipe as mp

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()
draw_color = (255, 255, 255)  # Color for drawing
erase_color = (0, 0, 0)        # Color for erasing
mp_drawing = mp.solutions.drawing_utils
# Initialize webcam
cap = cv2.VideoCapture(0)

# Create a blank canvas to draw
canvas = np.zeros((600, 400, 3), dtype=np.uint8)

# Initialize previous position variables
prev_x, prev_y = 0, 0

# Function to draw lines on canvas
def draw_line(canvas, start, end, color, thickness=2):
    cv2.line(canvas, start, end, color, thickness)

# Function to erase drawn areas on canvas
def erase_area(canvas, center, radius, color):
    cv2.circle(canvas, center, radius, color, -1)

# Main loop
while True:
    # Read frame from webcam
    ret, frame = cap.read()
    if not ret:
        break

    # Flip the frame horizontally
    frame = cv2.flip(frame, 1)

    # Convert frame to RGB for MediaPipe
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Detect hand landmarks
    results = hands.process(frame_rgb)

    # Draw landmarks and get hand positions
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            for id, lm in enumerate(hand_landmarks.landmark):
                # Get x, y coordinates of each landmark
                h, w, c = frame.shape
                cx, cy = int(lm.x * w), int(lm.y * h)

                if id == 8:  # Index finger tip (Left hand)
                    # Use index finger to draw
                    if prev_x != 0 and prev_y != 0:
                        draw_line(canvas, (prev_x, prev_y), (cx, cy), draw_color)
                    prev_x, prev_y = cx, cy

                elif id == 12:  # Index finger tip (Right hand)
                    # Use middle finger to erase
                    erase_area(canvas, (cx, cy), 50, erase_color)

    # Display frame and canvas
    cv2.imshow('Frame', frame)
    cv2.imshow('Canvas', canvas)

    # Check for key press to exit
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
cv2.destroyAllWindows()

I0000 00:00:1726208948.480701   47070 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 76.3), renderer: Apple M2
W0000 00:00:1726208948.497905   59238 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1726208948.509188   59238 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [6]:
import cv2
import mediapipe as mp
import numpy as np

from joblib import load
from sklearn.preprocessing import Normalizer

## Open capture with video path
capture = cv2.VideoCapture(0)

## Initialize mediapipe hand detection function
mpHands = mp.solutions.hands
hands = mpHands.Hands(max_num_hands=1)
mp_drawing = mp.solutions.drawing_utils

## Load trained model and initialize a normalizer 
model = load("model.joblib")
normalizer = Normalizer()

## Define variables for output video
h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
size = (w, h)

## Create VideoWriter instance with variables taken from input
outputVid = cv2.VideoWriter("result.avi", cv2.VideoWriter_fourcc('M','J','P','G'), 24, size, isColor = True)

## Helper function to create a bounding box around each hand.
## Takes in video frame img and hand landmarks lm
def createBoundingBox(img, lm):

	## Initialize empty array to store all landmarks of 
	## hand landmark lm
	lm_array = np.empty((0,2), int)

	## For each landmark in hand landmark, append 
	## minimum points to array
	for _, landmark in enumerate(lm.landmark):
		
		width, height = img.shape[1], img.shape[0]
		## Calculate minimum point between landmark
		## position and size of video frame
		lm_x = min(int(landmark.x * width), width - 1)
		lm_y = min(int(landmark.y * height), height - 1)

		## Create a point using the minimum for landmark
		lm_point = [np.array((lm_x, lm_y))]

		## Append point to array
		lm_array = np.append(lm_array, lm_point, axis=0)

	## Using built-in method boundingRect, get the x,y,w,h
	## from the bounding box of lm_array
	x, y, w, h = cv2.boundingRect(lm_array)

	## Define positions for bouding box to encapsulate hand
	x_min = x - 20

	y_min = y - 15

	x_max = x + w + 20

	y_max = y + h + 15

	return [x_min, y_min, x_max, y_max]


## While capture is open
while(capture.isOpened()):

	## Read the frame from capture
	read, frame = capture.read()

	frame = cv2.flip(frame,1)

	## If frame was properly read
	if read == True:
		
		## Convert frame to RGB for proper mediapipe detection
		rgbFrame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

		## Process each frame to get hand landmarks
		results = hands.process(rgbFrame)

		## If results exists
		if results.multi_hand_landmarks:

			## For each hand detected
			for handLms in results.multi_hand_landmarks:

				## Call upon createBoudningBox() method to get bounding box coordinates
				boudingBox = createBoundingBox(frame, handLms)

				## Draw a rectangle around each processed bounding box
				cv2.rectangle(frame, (boudingBox[0], boudingBox[1]), (boudingBox[2], boudingBox[3]), (0, 255, 0), 2)

				## Draw the connections between landmarks for better visualization
				mp_drawing.draw_landmarks(frame, handLms, mpHands.HAND_CONNECTIONS)

				## Define coords as the landmark's x and y coordinates and normalize them
				coords = handLms.landmark
				coords = list(np.array([[landmark.x, landmark.y] for landmark in coords]).flatten())
				coords = normalizer.transform([coords])

				## Predict which letter is being gestured using the trained model
				predicted_letter = model.predict(coords)

				# Write above the bouding box the predicted letter
				cv2.putText(frame, str(predicted_letter[0]),(boudingBox[0], boudingBox[1]), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

		cv2.imshow("Frame", frame)

		## Write frame with detection results to VideoWriter 
		## instance outputVid
		outputVid.write(frame)

		if cv2.waitKey(1) & 0xFF == ord('q'):
			break

	else:
		break

capture.release()
outputVid.release()
cv2.destroyAllWindows()

ModuleNotFoundError: No module named 'joblib'

In [15]:
import mediapipe as mp
import cv2

#Initializations: static code
mpHands = mp.solutions.hands
mpDraw = mp.solutions.drawing_utils



class HandDetector:
    def __init__(self, max_num_hands=2, min_detection_confidence=0.5, min_tracking_confidence=0.5):
        #when the mediapipe is first started, it detects the hands. After that it tries to track the hands
        #as detecting is more time consuming than tracking. If the tracking confidence goes down than the
        #specified value then again it switches back to detection
        self.hands = mpHands.Hands(max_num_hands=max_num_hands, min_detection_confidence=min_detection_confidence,
                                   min_tracking_confidence=min_tracking_confidence)


    def findHandLandMarks(self, image, handNumber=0, draw=False):
        originalImage = image
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # mediapipe needs RGB
        results = self.hands.process(image)
        landMarkList = []

        if results.multi_handedness:
            label = results.multi_handedness[handNumber].classification[0].label  # label gives if hand is left or right
            #account for inversion in webcams
            if label == "Left":
                label = "Right"
            elif label == "Right":
                label = "Left"


        if results.multi_hand_landmarks:  # returns None if hand is not found
            hand = results.multi_hand_landmarks[handNumber] #results.multi_hand_landmarks returns landMarks for all the hands

            for id, landMark in enumerate(hand.landmark):
                # landMark holds x,y,z ratios of single landmark
                imgH, imgW, imgC = originalImage.shape  # height, width, channel for image
                xPos, yPos = int(landMark.x * imgW), int(landMark.y * imgH)
                landMarkList.append([id, xPos, yPos, label])

            if draw:
                mpDraw.draw_landmarks(originalImage, hand, mpHands.HAND_CONNECTIONS)

        return landMarkList

In [14]:
from handDetector import HandDetector
import cv2
import math
import numpy as np


handDetector = HandDetector(min_detection_confidence=0.7)
webcamFeed = cv2.VideoCapture(0)


while True:
    status, image = webcamFeed.read()
    handLandmarks = handDetector.findHandLandMarks(image=image, draw=True)
    count=0

    if(len(handLandmarks) != 0):
        #we will get y coordinate of finger-tip and check if it lies above middle landmark of that finger
        #details: https://google.github.io/mediapipe/solutions/hands

        if handLandmarks[4][3] == "Right" and handLandmarks[4][1] > handLandmarks[3][1]:       #Right Thumb
            count = count+1
        elif handLandmarks[4][3] == "Left" and handLandmarks[4][1] < handLandmarks[3][1]:       #Left Thumb
            count = count+1
        if handLandmarks[8][2] < handLandmarks[6][2]:       #Index finger
            count = count+1
        if handLandmarks[12][2] < handLandmarks[10][2]:     #Middle finger
            count = count+1
        if handLandmarks[16][2] < handLandmarks[14][2]:     #Ring finger
            count = count+1
        if handLandmarks[20][2] < handLandmarks[18][2]:     #Little finger
            count = count+1

    cv2.putText(image, str(count), (45, 375), cv2.FONT_HERSHEY_SIMPLEX, 5, (255, 0, 0), 25)
    cv2.imshow("Volume", image)
    cv2.waitKey(1)

ModuleNotFoundError: No module named 'handDetector'