In [1]:
import io
import time

import cv2
import mediapipe as mp
import numpy as np
from IPython.display import clear_output, Image, display
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from PIL import Image as PILImage

In [None]:
def display_jupyter(frame, processed=None):
    """Render one frame, or two frames side by side, inline in the notebook.

    Args:
        frame: Image array; its channels are converted BGR -> RGB before display.
        processed: Optional second image array, converted the same way and
            placed to the right of ``frame``. ``np.hstack`` is used to join
            them, so both arrays need compatible shapes.

    The image is PNG-encoded in memory and shown with ``IPython.display``.
    ``clear_output(wait=True)`` is issued right after, so the image is
    replaced once the next output is produced by the cell.
    """
    # Convert every supplied frame to RGB, keeping the original first
    panels = [cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)]
    if processed is not None:
        panels.append(cv2.cvtColor(processed, cv2.COLOR_BGR2RGB))

    # Two panels are joined horizontally; a single panel is used unchanged
    rgb_view = np.hstack(panels) if len(panels) > 1 else panels[0]

    # Encode to PNG in an in-memory buffer
    png_buffer = io.BytesIO()
    PILImage.fromarray(rgb_view).save(png_buffer, format='PNG')

    display(Image(data=png_buffer.getvalue()))
    clear_output(wait=True)  # Deferred clear: takes effect when the next output arrives

In [4]:
def display_cv2(frame, processed=None):
    """Show frame(s) in the OpenCV window named 'Video Feed'.

    Args:
        frame: Image array to show.
        processed: Optional second image array; when given, it is stacked
            horizontally to the right of ``frame`` and both are shown as
            one image.

    The caller is responsible for pumping the GUI event loop (e.g. with
    ``cv2.waitKey``) so the window actually refreshes.
    """
    # A single frame is shown unchanged; two frames are joined side by side
    view = frame if processed is None else np.hstack((frame, processed))
    cv2.imshow('Video Feed', view)

In [None]:
def detect_initial_point(frame):
    """Detect a "Pointing_Up" gesture in a frame and locate the index fingertip.

    A MediaPipe gesture recognizer is created from
    ``models/gesture_recognizer.task`` for this call and closed again before
    returning, even if recognition raises.

    NOTE(review): building the recognizer on every call is costly when this
    is invoked once per video frame; consider creating it once and reusing it.

    Args:
        frame: BGR image array (as returned by ``cv2.VideoCapture.read``).

    Returns:
        ``(x, y)`` pixel coordinates (ints) of the index fingertip of the first
        detected hand when that hand's top gesture is "Pointing_Up";
        ``None`` otherwise, including when ``frame`` is ``None`` or empty.
    """
    # Nothing to analyse: avoid loading the model just to fail in cvtColor
    if frame is None or frame.size == 0:
        return None

    # Initialize gesture recognizer of mediapipe
    base_options = python.BaseOptions(model_asset_path='models/gesture_recognizer.task')
    options = vision.GestureRecognizerOptions(base_options=base_options)
    gesture_recognizer = vision.GestureRecognizer.create_from_options(options)

    try:
        # MediaPipe expects RGB data, OpenCV delivers BGR
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)

        # Detect gestures from the frame (run once per call)
        gesture_result = gesture_recognizer.recognize(mp_image)

        if not (gesture_result.gestures and gesture_result.hand_landmarks):
            return None

        # Only the first detected hand is considered; index 0 is its top-ranked gesture
        top_gesture = gesture_result.gestures[0]
        if top_gesture[0].category_name != "Pointing_Up":
            return None

        # The landmark 8 is the tip of the index finger based on the mediapipe hand landmarks
        # Available at https://ai.google.dev/edge/mediapipe/solutions/vision/hand_landmarker#get_started
        fingertip = gesture_result.hand_landmarks[0][8]

        # Landmark coordinates are scaled by the frame width/height to get pixels
        frame_height, frame_width = frame.shape[:2]
        return (
            int(fingertip.x * frame_width),
            int(fingertip.y * frame_height)
        )
    finally:
        # Always release the recognizer's native resources
        gesture_recognizer.close()


In [None]:
# Video path
video_path = "hand_video.mp4"
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError("Cannot open video file")

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# The playback cells compute their per-frame delay as 1000 / fps, so fps must
# never be 0. CAP_PROP_FPS can be 0 (or otherwise unusable) when the container
# does not report a frame rate; fall back to a sensible default in that case.
DEFAULT_FPS = 30
reported_fps = cap.get(cv2.CAP_PROP_FPS)
fps = int(reported_fps) if reported_fps >= 1 else DEFAULT_FPS

# Black image with the same dimensions as the video frames
canvas = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)

# State variables
initial_point = None      # (x, y) of the detected fingertip, once found
tracking_started = False  # becomes True after the pointing gesture is detected

### No Tracking

In [None]:
# Start from a clean state so this cell behaves the same on every run
initial_point = None
tracking_started = False

# The `finally` below releases the capture, so a re-run (or a run after the
# other playback cell) must reopen it; otherwise read() fails forever.
if not cap.isOpened():
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise IOError("Cannot open video file")

# Per-frame pause in seconds; guard against an unknown (0) frame rate
frame_delay_s = 1.0 / fps if fps > 0 else 1.0 / 30

try:
    # True when we have just rewound and not yet read a frame successfully
    rewound_without_frame = False

    while True:
        ret, frame = cap.read()
        if not ret:
            # A failed read right after a rewind means the video yields no
            # frames at all; stop instead of spinning in a busy loop.
            if rewound_without_frame:
                print("No frames could be read from the video")
                break
            # End of video: loop back to the first frame
            cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
            rewound_without_frame = True
            continue
        rewound_without_frame = False

        output_frame = frame.copy()

        if not tracking_started:
            # Try to detect initial pointing gesture
            initial_point = detect_initial_point(frame)
            if initial_point is not None:
                tracking_started = True
                print("Initial point detected! Starting tracking...")
                # Here we would initialize any tracking-specific variables or algorithms

        if tracking_started:
            cv2.circle(output_frame, initial_point, 5, (0, 255, 0), -1)
            cv2.putText(output_frame, "Tracking Active", (10, 30),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Here we would implement our own tracking logic
        else:
            cv2.putText(output_frame, "Waiting for pointing gesture...", (10, 30),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Display the original and annotated frames side by side in the notebook
        display_jupyter(frame, output_frame)

        # Pace the playback. cv2.waitKey is documented to work only while a
        # HighGUI window exists, and this cell renders inline, so sleep instead.
        time.sleep(frame_delay_s)

except KeyboardInterrupt:
    print("Interrupted by user")
finally:
    cap.release()

Interrupted by user


### With Tracking

In [None]:
# Start from a clean state: a previous cell may have left tracking_started=True,
# which would make the loop call tracker.update() on a tracker that is None.
initial_point = None
tracking_started = False
tracker = None

# A previous playback cell releases the capture in its `finally`; reopen it so
# read() does not fail forever.
if not cap.isOpened():
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise IOError("Cannot open video file")

# Per-frame delay for cv2.waitKey. It must be >= 1 ms because waitKey(0) blocks
# until a key is pressed; also guard against an unknown (0) frame rate.
frame_delay_ms = max(1, int(1000 / fps)) if fps > 0 else 33

try:
    # True when we have just rewound and not yet read a frame successfully
    rewound_without_frame = False

    while True:
        ret, frame = cap.read()
        if not ret:
            # A failed read right after a rewind means the video yields no
            # frames at all; stop instead of spinning in a busy loop.
            if rewound_without_frame:
                print("No frames could be read from the video")
                break
            # End of video: loop back to the first frame
            cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
            rewound_without_frame = True
            continue
        rewound_without_frame = False

        output_frame = frame.copy()
        if not tracking_started:

            initial_point = detect_initial_point(frame)
            if initial_point is not None:
                tracking_started = True
                print("Initial point detected! Starting tracking...")

                # Initialize tracker
                tracker = cv2.TrackerCSRT_create()  # or cv2.TrackerKCF_create()

                # Create bounding box around initial point of (up to) 50x50 pixels
                # The goal of the bounding box is to create an area of interest for the tracker
                box_size = 50
                frame_h, frame_w = frame.shape[:2]
                box_w = min(box_size, frame_w)
                box_h = min(box_size, frame_h)
                # Keep the box fully inside the frame when the fingertip is near a border
                box_x = min(max(initial_point[0] - box_w // 2, 0), frame_w - box_w)
                box_y = min(max(initial_point[1] - box_h // 2, 0), frame_h - box_h)
                bbox = (box_x, box_y, box_w, box_h)
                tracker.init(frame, bbox)
                current_point = initial_point

        if tracking_started:
            # Update tracker
            success, bbox = tracker.update(frame)

            if success:
                # Get center point of bounding box for plotting
                current_point = (
                    int(bbox[0] + bbox[2]//2),
                    int(bbox[1] + bbox[3]//2)
                )

                cv2.circle(output_frame, current_point, 5, (0, 255, 0), -1)
                cv2.rectangle(output_frame,
                            (int(bbox[0]), int(bbox[1])),
                            (int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3])),
                            (255, 0, 0), 2)
                cv2.putText(output_frame, "Tracking Active", (10, 30),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            else:
                cv2.putText(output_frame, "Tracking Lost", (10, 30),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                # Fall back to gesture detection on the next frame
                tracking_started = False
                tracker = None
        else:
            cv2.putText(output_frame, "Waiting for pointing gesture...", (10, 30),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Display the original and annotated frames side by side in the OpenCV window
        display_cv2(frame, output_frame)

        # Pace the playback and let the window process events.
        # 'q' or Esc in the video window stops the loop (the window has keyboard focus).
        key = cv2.waitKey(frame_delay_ms) & 0xFF
        if key in (ord('q'), 27):
            print("Stopped by user")
            break

except KeyboardInterrupt:
    print("Interrupted by user")
finally:
    cap.release()
    # Close the 'Video Feed' window instead of leaving it open after the loop
    cv2.destroyAllWindows()
    # NOTE(review): some GUI backends only close the window after one more event pump
    cv2.waitKey(1)

I0000 00:00:1738858581.011594 88864218 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1738858581.012665 88864218 gesture_recognizer_graph.cc:129] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleration to Xnnpack.
I0000 00:00:1738858581.016690 88864218 hand_gesture_recognizer_graph.cc:250] Custom gesture classifier is not defined.
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1738858581.034047 88864757 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738858581.049848 88864758 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1738858581.051534 88864761 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedb

Initial point detected! Starting tracking...
Interrupted by user
