In [1]:
import os
import cv2
import numpy as np
from dotenv import load_dotenv


from IPython.display import clear_output, Image, display
from PIL import Image as PILImage

from src.utils import (
    display_jupyter,
    display_cv2,
    setup_gesture_recognizer,
    detect_initial_point,
    detect_closed_fist,
    setup_text_detector,
    detect_text_from_canvas,
    draw_recognized_text,
)
from src.kalman_filter import KalmanFilter

## Initialization

This section defines the video sources and configures the parameters used in the tracking setup.

In [2]:
### Get the genAI recognizer

# Load environment variables
load_dotenv()

# Get the GEMINI_API_KEY
gemini_api_key = os.getenv("GEMINI_API_KEY")
if gemini_api_key is None:
    raise ValueError("GEMINI_API_KEY is not set")

text_detector = setup_text_detector(gemini_api_key)

### Get the video source

# Video path
video_path = "hand_tracking.mp4"
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise IOError("Cannot open video file")

# Frame dimensions
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Canvas to draw the gestures
canvas = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)

### State variables
initial_point = None
tracking_started = False
points = []
recognized_text = ""

## Tracking

In this section, we perform the tracking task based on predefined algorithms.

### Tracking with CSRT Tracker

In [None]:
try:
    gesture_recongizer = setup_gesture_recognizer()
    tracker = None
    
    while True:
        ret, frame = cap.read()
        if not ret:
            cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
            continue
        
        frame = cv2.flip(frame, 1)
        output_frame = frame.copy()
        
        if not tracking_started:
            # Detect the initial point using the MediaPipe library
            initial_point = detect_initial_point(frame, gesture_recongizer)
            
            if initial_point is not None:
                tracking_started = True
                print("Initial point detected! Starting tracking...")
                
                # Initialize tracker
                tracker = cv2.TrackerCSRT_create()
                
                # Create bounding box around initial point of 50x50 pixels
                box_size = 50
                bbox = (
                    initial_point[0] - box_size//2,
                    initial_point[1] - box_size//2,
                    box_size,
                    box_size
                )
                tracker.init(frame, bbox)
                current_point = initial_point
                points = [initial_point]
        
        if tracking_started:
            # Update tracker to get new bounding box
            success, bbox = tracker.update(frame)
            
            # If tracking is successful, get the center point of the bounding box
            if success:
                current_point = (
                    int(bbox[0] + bbox[2]//2),
                    int(bbox[1] + bbox[3]//2)
                )
                
                points.append(current_point)
                
                # Draw line on canvas
                if len(points) > 1:
                    cv2.line(canvas, points[-2], points[-1], (0, 0, 255), 3, cv2.LINE_AA)
                
                # Combine canvas with output frame
                output_frame = cv2.addWeighted(output_frame, 1.0, canvas, 1.0, 0)
                
                cv2.circle(output_frame, current_point, 5, (0, 255, 0), -1)
                cv2.rectangle(output_frame, 
                            (int(bbox[0]), int(bbox[1])), 
                            (int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3])),
                            (255, 0, 0), 2)
                cv2.putText(output_frame, "Tracking Active", (10, 30),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                
                if detect_closed_fist(frame, gesture_recongizer):
                    tracking_started = False
                    print("Closed fist detected - ending tracking")
                    
                    # Perform OCR on the canvas
                    if len(points) > 1:  # Only if something was drawn
                        text = detect_text_from_canvas(canvas, text_detector)
                
                        recognized_text += f"{text} "
                        print(f"Recognized text: {recognized_text}")
                        
                        canvas = np.zeros_like(frame)
                    
            else:
                cv2.putText(output_frame, "Tracking Lost", (10, 30),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                
                # Reset tracking variables
                tracking_started = False
                points = []
                canvas = np.zeros_like(frame)
        else:
            # Combine existing canvas with output frame
            output_frame = cv2.addWeighted(output_frame, 1.0, canvas, 0.5, 0)
            cv2.putText(output_frame, "Waiting for pointing gesture...", (10, 30),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        
        # Display the last recognized text as subtitle if it exists
        if recognized_text:
           draw_recognized_text(output_frame, recognized_text)
        
        display_cv2(frame, output_frame)
        cv2.waitKey(int(1000/fps))
        
except KeyboardInterrupt:
    print("Interrupted by user")
finally:
    gesture_recongizer.close()
    cap.release()

### Tracking with Kalman Filters

Tutorial Guide: https://machinelearningspace.com/2d-object-tracking-using-kalman-filter/

In [3]:
# Tracking using Lucas-Kanade Optical Flow and Kalman Filter

try:
    gesture_recognizer = setup_gesture_recognizer()
    tracker = None
    old_frame = None
    old_gray = None
    
    # Luka-Kanade parameters
    lk_params = dict(winSize=(15, 15),
                    maxLevel=2,
                    criteria=(cv2.TERM_CRITERIA_EPS |
                            cv2.TERM_CRITERIA_COUNT, 10, 0.03))
    
    # Define the Kalman Filter
    kf = KalmanFilter(0.1, 1, 1, 1, 0.1,0.1)
    
    while True:
        ret, frame = cap.read()
        if not ret:
            cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
            continue
        
        frame = cv2.flip(frame, 1)
        output_frame = frame.copy()
        
        if not tracking_started:
            initial_point = detect_initial_point(frame, gesture_recognizer)
            
            if initial_point is not None:
                tracking_started = True
                print("Initial point detected! Starting tracking...")
                
                current_point = initial_point
                points = [initial_point]
                
                p0 = np.array([[current_point[0], current_point[1]]], dtype=np.float32).reshape(-1, 1, 2)
            
            old_frame = frame
            old_gray = cv2.cvtColor(old_frame, cv2.COLOR_BGR2GRAY)
        
        if tracking_started:
            frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            
            (kalman_pred_x, kalman_pred_y) = kf.predict()
            p1, st, err = cv2.calcOpticalFlowPyrLK(old_gray, frame_gray, p0, None, **lk_params)
            
            if st[0][0] == 1:
                measurement = p1.reshape(-1, 2)
                x_meas, y_meas = measurement[0]
                
                (kalman_updated_x, kalman_updated_y) = kf.update([[x_meas], [y_meas]])
                
                points.append((int(kalman_updated_x), int(kalman_updated_y)))
                
                if len(points) > 1:
                    cv2.line(canvas, points[-2], points[-1], (0, 0, 255), 3)
                
                output_frame = cv2.addWeighted(output_frame, 1.0, canvas, 1.0, 0)
                
                # Draw tracking visualization
                cv2.circle(output_frame, (int(kalman_pred_x), int(kalman_pred_y)), 5, (0, 255, 255), -1)
                cv2.circle(output_frame, (int(kalman_updated_x), int(kalman_updated_y)), 5, (0, 255, 0), -1)
                cv2.rectangle(output_frame, 
                            (int(kalman_updated_x - 15), int(kalman_updated_y - 15)), 
                            (int(kalman_updated_x + 15), int(kalman_updated_y + 15)),
                            (255, 0, 0), 2)
                cv2.putText(output_frame, "Tracking Active", (10, 30),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                
                # Check for closed fist gesture
                if detect_closed_fist(frame, gesture_recognizer):
                    print("Closed fist detected - ending tracking")
                    detected_text = detect_text_from_canvas(canvas, text_detector)
                    if detected_text:
                        recognized_text += f"{detected_text} "
                    
                    tracking_started = False
                    points = []
                    canvas = np.zeros_like(frame)
                    continue
                
                p0 = p1.reshape(-1, 1, 2)
                old_frame = frame
                old_gray = frame_gray
                
            else:
                p0 = np.array([[kalman_pred_x, kalman_pred_y]], dtype=np.float32).reshape(-1, 1, 2)
                output_frame = cv2.addWeighted(output_frame, 1.0, canvas, 1.0, 0)
                
                cv2.circle(output_frame, (int(kalman_pred_x), int(kalman_pred_y)), 5, (0, 255, 255), -1)
                cv2.putText(output_frame, "Tracking Lost - Kalman Guesses only", (10, 30),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                
                tracking_started = False
                points = []
                canvas = np.zeros_like(frame)
        else:
            output_frame = cv2.addWeighted(output_frame, 1.0, canvas, 1.0, 0)
            cv2.putText(output_frame, "Waiting for pointing gesture...", (10, 30),
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        
        # Draw recognized text as subtitle
        if recognized_text:
            draw_recognized_text(output_frame, recognized_text)
        
        display_cv2(frame, output_frame)
        cv2.waitKey(int(1000/fps))
        
except KeyboardInterrupt:
    print("Interrupted by user")
finally:
    gesture_recognizer.close()

I0000 00:00:1739219705.799447 90964474 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1739219705.800520 90964474 gesture_recognizer_graph.cc:129] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleration to Xnnpack.
I0000 00:00:1739219705.805244 90964474 hand_gesture_recognizer_graph.cc:250] Custom gesture classifier is not defined.
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1739219705.820174 90965313 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1739219705.836605 90965316 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1739219705.838437 90965311 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedb

Initial point detected! Starting tracking...


  points.append((int(kalman_updated_x), int(kalman_updated_y)))
  cv2.circle(output_frame, (int(kalman_pred_x), int(kalman_pred_y)), 5, (0, 255, 255), -1)
  cv2.circle(output_frame, (int(kalman_updated_x), int(kalman_updated_y)), 5, (0, 255, 0), -1)
  (int(kalman_updated_x - 15), int(kalman_updated_y - 15)),
  (int(kalman_updated_x + 15), int(kalman_updated_y + 15)),


Closed fist detected - ending tracking
Initial point detected! Starting tracking...


  cv2.circle(output_frame, (int(kalman_pred_x), int(kalman_pred_y)), 5, (0, 255, 255), -1)


Initial point detected! Starting tracking...
Closed fist detected - ending tracking
Initial point detected! Starting tracking...
Closed fist detected - ending tracking
Initial point detected! Starting tracking...
Closed fist detected - ending tracking
Interrupted by user
