In [18]:
!pip install mediapipe opencv-python



In [19]:
import mediapipe as mp
import cv2
import numpy as np
import uuid
import os

# Hand Landmark Preprocessing Pipeline

This notebook cell contains a set of functions to **preprocess hand landmarks**.  
The preprocessing pipeline includes the following steps:

1. **Extract coordinates** – Convert each hand landmark into a flat array of `(x, y, z)` values.  
2. **Normalize wrist position** – Shift all landmarks so that the wrist (the first landmark) is at the origin `(0,0,0)`.  
3. **Scale according to hand size** – Adjust the landmarks so that hand size differences do not affect the analysis.  
4. **Normalize values** – Scale all coordinates to a fixed range (e.g., `[-1, 1]`) for consistent model input.

In [20]:
def extract_landmarks(hand_landmarks):
    """
    Extracts the x, y, z coordinates from each landmark of the hand.

    Parameters:
        hand_landmarks: The hand landmarks object from a hand tracking model.

    Returns:
        np.array: Flattened array containing all landmark coordinates [x1, y1, z1, x2, y2, z2, ...].
    """
    data = []
    for lm in hand_landmarks.landmark:
        data.extend([lm.x, lm.y, lm.z])
    return np.array(data)


def normalize_landmarks(landmarks):
    """
    Shifts all landmarks so that the wrist (first landmark) is at the origin (0,0,0).

    Parameters:
        landmarks (np.array): Flattened array of hand landmarks [x1, y1, z1, ...].

    Returns:
        np.array: Landmarks translated relative to the wrist position.
    """
    landmarks = landmarks.copy()
    base_x, base_y, base_z = landmarks[0], landmarks[1], landmarks[2]
    for i in range(0, len(landmarks), 3):
        landmarks[i] -= base_x
        landmarks[i+1] -= base_y
        landmarks[i+2] -= base_z
    return landmarks

def scale_landmarks(landmarks):
    """
    Normalizes the scale of the hand landmarks based on the distance from wrist to middle finger MCP joint.

    Parameters:
        landmarks (np.array): Wrist-normalized landmarks.

    Returns:
        np.array: Landmarks scaled to have a consistent hand size.
    """
    wrist = np.array([landmarks[0], landmarks[1], landmarks[2]])
    mcp = np.array([landmarks[27], landmarks[28], landmarks[29]])

    scale = np.linalg.norm(mcp - wrist)
    return landmarks / scale

def minmax_landmarks(landmarks):
    """
    Normalizes all landmark values to the range [-1, 1] based on the maximum absolute value.

    Parameters:
        landmarks (np.array): Scaled landmarks.

    Returns:
        np.array: Landmarks normalized to [-1, 1].
    """
    max_val = np.max(np.abs(landmarks))
    return landmarks / max_val

def preprocess_landmarks(hand_landmarks):
    """
    Applies the full preprocessing pipeline:
    1. Extracts landmarks
    2. Normalizes wrist position
    3. Scales landmarks according to hand size
    4. Normalizes values to [-1, 1]

    Parameters:
        hand_landmarks: The hand landmarks object.

    Returns:
        np.array: Fully preprocessed landmark array ready for model input.
    """
    landmarks = extract_landmarks(hand_landmarks)
    landmarks = normalize_landmarks(landmarks)
    landmarks = scale_landmarks(landmarks)
    landmarks = minmax_landmarks(landmarks)
    return landmarks

# Hand Landmark Drawing and Overlay Function

This function draws hand landmarks on a video frame and overlays the processed landmark coordinates as text annotations.

In [21]:
mp_drawing = mp.solutions.drawing_utils
mp_hands_connections = mp.solutions.hands.HAND_CONNECTIONS

def draw_overlays(frame, hand_landmarks, processed_landmarks):
    """
    Draws hand landmarks and overlays processed coordinates on a given frame.

    Parameters:
        frame (np.array): The image frame on which to draw.
        hand_landmarks (mp.framework.formats.landmark_pb2.NormalizedLandmarkList): 
            Hand landmarks detected by MediaPipe.
        processed_landmarks (np.array): Preprocessed landmark coordinates from the pipeline.
    """
    h, w, _ = frame.shape
    mp_drawing.draw_landmarks(
        frame, hand_landmarks, mp_hands_connections,
		mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
		mp_drawing.DrawingSpec(color=(250,44,250), thickness=2, circle_radius=2)
  )
    for idx, lm in enumerate(hand_landmarks.landmark):
        cx, cy = int(lm.x * w), int(lm.y * h)
        px, py = processed_landmarks[idx*3], processed_landmarks[idx*3 + 1]
        text = f"{idx}:({px:.2f},{py:.2f})"
        cv2.putText(frame, text, (cx + 5, cy - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,255,255), 1, cv2.LINE_AA)

# Flexible Video Capture Loop

The `video_capture_loop` function:

- Reads frames from a video source (`cv2.VideoCapture`)
- Optionally flips frames horizontally or vertically
- Processes frames using a user-defined function
- Can display frames in a window or skip GUI for automated processing
- Ensures proper resource cleanup


In [22]:
from contextlib import AbstractContextManager
from typing import Callable, TypeVar
import cv2
import numpy as np

ContextType = TypeVar("ContextType", bound=AbstractContextManager)

class ContextError(Exception):
    pass

class VideoCaptureError(Exception):
    pass


def flip_frame(frame: np.ndarray, flipH: bool, flipV: bool) -> np.ndarray:
	if flipH and flipV:
		return cv2.flip(frame, -1)
	elif flipH:
		return cv2.flip(frame, 1) 
	elif flipV:
		return cv2.flip(frame, 0)
	return frame

def video_capture_loop(
    cap: cv2.VideoCapture,
    context: ContextType,
    loop: Callable[[ContextType, np.ndarray], bool],
    flipH: bool = False,
    flipV: bool = False,
    show_window: bool = True,
) -> None:
    """
    Continuously captures frames from a video source, processes them using a user-defined 
    function, optionally flips frames, and optionally displays them in a window.

    This function manages the video capture and a user-provided context manager, ensuring 
    proper cleanup of resources even in case of errors.

    Parameters
    ----------
    cap : cv2.VideoCapture
        An OpenCV video capture object (camera or video file).
    context : ContextType
        A context manager instance used during the video processing loop.
    loop : Callable[[ContextType, np.ndarray], bool]
        A user-defined function that processes each frame.
        - Arguments:
            context: the active context manager instance
            frame: the current video frame as a NumPy array
        - Returns:
            True to terminate the loop early, False to continue processing.
    flipH : bool, optional
        If True, each frame is flipped horizontally before processing. Default is False.
    flipV : bool, optional
        If True, each frame is flipped vertically before processing. Default is False.
    show_window : bool, optional
        If True, frames are displayed in a window named "VideoInput".
        If False, no GUI window is shown and processing runs as fast as possible.
        Default is True.

    Returns
    -------
    None

    Raises
    ------
    VideoCaptureError
        Raised if a frame cannot be read from the video capture or if an error occurs 
        during frame processing.
    ContextError
        Raised if an error occurs while entering or using the provided context manager.
    """
    fps = cap.get(cv2.CAP_PROP_FPS)
    delay = int(1000 / fps) if fps > 0 else 1

    try:
        with context as ctx:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret or frame is None:
                    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
                    current_frame = cap.get(cv2.CAP_PROP_POS_FRAMES)
                    if total_frames > 0 and current_frame >= total_frames:
                        break
                    else:
                        raise VideoCaptureError("Unable to read video capture.")
                
                frame = flip_frame(frame, flipH, flipV)
                
                try:
                    if loop(ctx, frame):
                        break
                except Exception as e:
                    raise VideoCaptureError("Error while processing frame.") from e
                
                if show_window:
                    cv2.imshow("VideoInput", frame)
                    if cv2.waitKey(delay) & 0xFF == ord("q"):
                        break

    except Exception as e:
        if not isinstance(e, VideoCaptureError):
            raise ContextError("Error while using context.") from e
        else:
            raise

    finally:
        if cap.isOpened():
            cap.release()
        if show_window:
            cv2.destroyAllWindows()


In [23]:
from contextlib import AbstractContextManager
from typing import  Optional, Protocol, Sequence ,cast
from mediapipe.python.solutions.hands import Hands

class Landmark(Protocol):
    x: float
    y: float
    z: float


class WorldLandmark(Protocol):
    x: float
    y: float
    z: float

class Handedness(Protocol):
    index: int
    score: float
    label: str

class SafeHandsResult(Protocol):
    multi_hand_landmarks: Optional[Sequence]
    multi_hand_world_landmarks: Optional[Sequence]
    multi_handedness: Optional[Sequence]
    
class SafeHands(AbstractContextManager):
	def __init__(self, *args, **kwargs):
		self.hands = Hands(*args, **kwargs)
	
	def __enter__(self) -> "SafeHands":
		return self
	
	def __exit__(self, exc_type, exc_value, traceback) -> None:
		self.hands.close()

	def process(self, frame_rgb: np.ndarray) -> SafeHandsResult:
		return cast(SafeHandsResult, self.hands.process(frame_rgb))


In [24]:
hands_model = SafeHands(min_detection_confidence=0.8, min_tracking_confidence=0.5)
video_capture = cv2.VideoCapture(0)

def loop(hands: SafeHands, frame: np.ndarray) -> bool:
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        print(results.multi_hand_landmarks)
        for hand_landmarks in results.multi_hand_landmarks:
            processed = preprocess_landmarks(hand_landmarks)
            draw_overlays(frame, hand_landmarks, processed)
    return False

video_capture_loop(
    cap=video_capture,
    context=hands_model,
    loop=loop,
    flipH=True,
)

I0000 00:00:1764992873.772987   70807 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1764992873.774936   73472 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1764992873.800080   73468 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764992873.811859   73463 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[landmark {
  x: 0.772275567
  y: 0.448765755
  z: 3.85897152e-07
}
landmark {
  x: 0.681511879
  y: 0.559850693
  z: -0.0265681669
}
landmark {
  x: 0.551353
  y: 0.593076825
  z: -0.041889865
}
landmark {
  x: 0.441994667
  y: 0.581377506
  z: -0.0527501777
}
landmark {
  x: 0.358048826
  y: 0.562616706
  z: -0.0633614138
}
landmark {
  x: 0.502420604
  y: 0.467198879
  z: -0.0463165194
}
landmark {
  x: 0.377639532
  y: 0.423284411
  z: -0.0759307221
}
landmark {
  x: 0.301883578
  y: 0.39687264
  z: -0.0940941647
}
landmark {
  x: 0.243047059
  y: 0.37700066
  z: -0.105121709
}
landmark {
  x: 0.520269632
  y: 0.359380037
  z: -0.0507835411
}
landmark {
  x: 0.388059735
  y: 0.290177554
  z: -0.07364095
}
landmark {
  x: 0.302103
  y: 0.252785891
  z: -0.0899814665
}
landmark {
  x: 0.236215442
  y: 0.223972887
  z: -0.101180635
}
landmark {
  x: 0.555778265
  y: 0.273419321
  z: -0.0572301485
}
landmark {
  x: 0.421940476
  y: 0.202492446
  z: -0.0806430057
}
landmark {
  x: 0.341

# Real-Time Hand Landmark Preprocessing Visualization

This cell captures video from the webcam, processes hand landmarks in real-time, and visualizes both the original landmarks and the preprocessed coordinates.

## Main Components

1. **Video Capture**
    - `cv2.VideoCapture(0)` initializes the webcam.  
    - Frames are continuously captured in a loop until the user quits.

2. **Hand Detection**
    - MediaPipe's `Hands` module is used with:
        - `min_detection_confidence=0.8`
        - `min_tracking_confidence=0.5`  
    - The `hands.process(frame_rgb)` method detects hand landmarks in each frame.

3. **Preprocessing Pipeline**
    - For each detected hand:
        1. Extract landmarks.
        2. Normalize wrist position.
        3. Scale landmarks relative to hand size.
        4. Normalize values to the range [-1, 1].
    - This is done using the `preprocess_landmarks` function.

4. **Overlay Visualization**
    - `draw_overlays` draws both:
        - MediaPipe hand landmarks.
        - Preprocessed coordinates as text annotations.

5. **Display**
    - The annotated frame is shown in a window using `cv2.imshow`.
    - Press `q` to quit the visualization.