In [74]:
!pip install mediapipe opencv-python



In [75]:
import mediapipe as mp
import cv2
import numpy as np
import uuid
import os

# Hand Landmark Preprocessing Pipeline

This notebook cell contains a set of functions to **preprocess hand landmarks**.  
The preprocessing pipeline includes the following steps:

1. **Extract coordinates** – Convert each hand landmark into a flat array of `(x, y, z)` values.  
2. **Normalize wrist position** – Shift all landmarks so that the wrist (the first landmark) is at the origin `(0,0,0)`.  
3. **Scale according to hand size** – Adjust the landmarks so that hand size differences do not affect the analysis.  
4. **Normalize values** – Scale all coordinates to a fixed range (e.g., `[-1, 1]`) for consistent model input.

In [76]:
def normalize_landmarks(landmarks: np.ndarray) -> np.ndarray:
    """
    Shifts all landmarks so that the wrist (first landmark) is at the origin (0,0,0).

    Parameters:
        landmarks: np.ndarray of shape [NUM_LANDMARKS, 3]

    Returns:
        np.ndarray: Landmarks translated relative to the wrist.
    """
    landmarks = landmarks.copy()
    base = landmarks[0]  # [x, y, z] del primer landmark (muñeca)
    return landmarks - base


def scale_landmarks(landmarks: np.ndarray) -> np.ndarray:
    """
    Normalizes the scale of the hand landmarks based on the distance from wrist to middle finger MCP joint.

    Parameters:
        landmarks: np.ndarray de landmarks normalizados por la muñeca [NUM_LANDMARKS, 3]

    Returns:
        np.ndarray: Landmarks escalados a tamaño de mano consistente.
    """
    wrist = landmarks[0]
    mcp = landmarks[9]  # 9 es el MCP del dedo medio
    scale = np.linalg.norm(mcp - wrist)
    return landmarks / scale


def minmax_landmarks(landmarks: np.ndarray) -> np.ndarray:
    """
    Normalizes all landmark values to the range [-1, 1] based on the maximum absolute value.

    Parameters:
        landmarks: np.ndarray de landmarks escalados

    Returns:
        np.ndarray: Landmarks normalizados a [-1, 1]
    """
    max_val = np.max(np.abs(landmarks))
    return landmarks / max_val


def preprocess_landmarks(hand_landmarks: np.ndarray) -> np.ndarray:
    """
    Applies the full preprocessing pipeline, preserving [NUM_LANDMARKS, 3] shape.

    Parameters:
        hand_landmarks: Array de forma [21, 3]

    Returns:
        np.ndarray: Fully preprocessed landmark array.
    """
    landmarks = normalize_landmarks(hand_landmarks)
    landmarks = scale_landmarks(landmarks)
    landmarks = minmax_landmarks(landmarks)
    return landmarks


# Hand Landmark Drawing and Overlay Function

This function draws hand landmarks on a video frame and overlays the processed landmark coordinates as text annotations.

In [None]:
import cv2
import mediapipe as mp

mp_drawing = mp.solutions.drawing_utils
mp_hands_connections = mp.solutions.hands.HAND_CONNECTIONS

def draw_overlays(frame, hand_landmarks_np, processed_landmarks=None):
    """
    Draws hand landmarks and connections on a given frame.
    Parameters:
        frame (np.array): The image frame.
        hand_landmarks_np (np.array): Numpy array of shape (21, 3) with hand landmarks.
        processed_landmarks (np.array, optional): Preprocessed landmark coordinates [21,3].
    """
    h, w, _ = frame.shape

    for connection in mp_hands_connections:
        start_idx, end_idx = connection
        x0, y0 = int(hand_landmarks_np[start_idx][0] * w), int(hand_landmarks_np[start_idx][1] * h)
        x1, y1 = int(hand_landmarks_np[end_idx][0] * w), int(hand_landmarks_np[end_idx][1] * h)
        cv2.line(frame, (x0, y0), (x1, y1), (0, 255, 0), 2)

    for idx, lm in enumerate(hand_landmarks_np):
        cx, cy = int(lm[0] * w), int(lm[1] * h)
        cv2.circle(frame, (cx, cy), 6, (0, 0, 255), -1)

        if processed_landmarks is not None:
            px, py = processed_landmarks[idx][0], processed_landmarks[idx][1]
            text = f"{idx}:({px:.2f},{py:.2f})"
            cv2.putText(frame, text, (cx + 5, cy - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.35, (255, 255, 0), 1, cv2.LINE_AA)


# Flexible Video Capture Loop

The `video_capture_loop` function:

- Reads frames from a video source (`cv2.VideoCapture`)
- Optionally flips frames horizontally or vertically
- Processes frames using a user-defined function
- Can display frames in a window or skip GUI for automated processing
- Ensures proper resource cleanup


In [78]:
from contextlib import AbstractContextManager
from typing import Callable, TypeVar
import cv2
import numpy as np
import time

ContextType = TypeVar("ContextType", bound=AbstractContextManager)

class ContextError(Exception):
    pass

class VideoCaptureError(Exception):
    pass


def flip_frame(frame: np.ndarray, flipH: bool, flipV: bool) -> np.ndarray:
    if flipH and flipV:
        return cv2.flip(frame, -1)
    elif flipH:
        return cv2.flip(frame, 1) 
    elif flipV:
        return cv2.flip(frame, 0)
    return frame


def video_capture_loop(
    cap: cv2.VideoCapture,
    context: ContextType,
    loop: Callable[[ContextType, np.ndarray], bool],
    flipH: bool = False,
    flipV: bool = False,
    show_window: bool = True,
    show_fps: bool = False
) -> None:
    """
    Continuously captures frames from a video source, processes them using a user-defined 
    function, optionally flips frames, optionally displays them in a window, and optionally
    shows the current FPS on the video.
    """
    fps = cap.get(cv2.CAP_PROP_FPS)
    delay = int(1000 / fps) if fps > 0 else 1
    prev_time = time.time()

    try:
        with context as ctx:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret or frame is None:
                    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
                    current_frame = cap.get(cv2.CAP_PROP_POS_FRAMES)
                    if total_frames > 0 and current_frame >= total_frames:
                        break
                    else:
                        raise VideoCaptureError("Unable to read video capture.")
                
                frame = flip_frame(frame, flipH, flipV)
                
                try:
                    if loop(ctx, frame):
                        break
                except Exception as e:
                    raise VideoCaptureError("Error while processing frame.") from e
                
                if show_window:
                    if show_fps:
                        current_time = time.time()
                        current_fps = 1.0 / (current_time - prev_time) if (current_time - prev_time) > 0 else 0
                        prev_time = current_time
                        cv2.putText(
                            frame,
                            f"FPS: {current_fps:.2f}",
                            (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            1,
                            (0, 255, 0),
                            2,
                            cv2.LINE_AA
                        )
                    cv2.imshow("VideoInput", frame)
                    if cv2.waitKey(delay) & 0xFF == ord("q"):
                        break

    except Exception as e:
        if not isinstance(e, VideoCaptureError):
            raise ContextError("Error while using context.") from e
        else:
            raise

    finally:
        if cap.isOpened():
            cap.release()
        if show_window:
            cv2.destroyAllWindows()


In [85]:
from asyncio import Protocol
import numpy as np
from contextlib import AbstractContextManager
from typing import Optional, Sequence, Tuple, cast
from mediapipe.python.solutions.hands import Hands


class SafeHandsResult(Protocol):
    multi_hand_landmarks: Optional[Sequence]
    multi_hand_world_landmarks: Optional[Sequence]
    multi_handedness: Optional[Sequence]

class SafeHands(AbstractContextManager):
	def __init__(self, 
              	static_image_mode: bool = False,
			    max_num_hands: int = 2,
			    model_complexity: int = 1,
			    min_detection_confidence: float = 0.5,
			    min_tracking_confidence: float = 0.5):
		self.max_num_hands = max_num_hands
		self.hands = Hands(static_image_mode = static_image_mode,
			    max_num_hands = max_num_hands,
			    model_complexity = model_complexity,
			    min_detection_confidence = min_detection_confidence,
			    min_tracking_confidence = min_tracking_confidence)
    
	def __enter__(self) -> "SafeHands":
		return self
    
	def __exit__(self, exc_type, exc_value, traceback) -> None:
		self.hands.close()

	def process(self, frame_rgb: np.ndarray) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
		mp_result = cast(SafeHandsResult, self.hands.process(frame_rgb))
		if (not mp_result.multi_hand_landmarks and not mp_result.multi_hand_world_landmarks and not mp_result.multi_handedness):
			return None
		landmarks = np.zeros((self.max_num_hands, 21, 3), dtype=np.float32)
		world_landmarks = np.zeros((self.max_num_hands, 21, 3), dtype=np.float32)
		handedness = np.zeros((self.max_num_hands, 2), dtype=np.float32)
		if mp_result.multi_hand_landmarks:
			for i, hand in enumerate(mp_result.multi_hand_landmarks[:self.max_num_hands]):
				for j, lm in enumerate(hand.landmark):
					landmarks[i, j, 0] = lm.x
					landmarks[i, j, 1] = lm.y
					landmarks[i, j, 2] = lm.z

		if mp_result.multi_hand_world_landmarks:
			for i, hand in enumerate(mp_result.multi_hand_world_landmarks[:self.max_num_hands]):
				for j, lm in enumerate(hand.landmark):
					world_landmarks[i, j, 2] = lm.z
					world_landmarks[i, j, 0] = lm.x
					world_landmarks[i, j, 1] = lm.y

		if mp_result.multi_handedness:
			for i, h in enumerate(mp_result.multi_handedness[:self.max_num_hands]):
				handedness[i, 0] = 0 if h.classification[0].label == "Left" else 1
				handedness[i, 1] = h.classification[0].score

		return landmarks, world_landmarks, handedness


In [86]:
hands_model = SafeHands(min_detection_confidence=0.8, min_tracking_confidence=0.5)
video_capture = cv2.VideoCapture(0)

def loop(hands: SafeHands, frame: np.ndarray) -> bool:
	frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	result = hands.process(frame_rgb)
	if result:
		landmarks = result[0]
		for hand_landmarks in landmarks:
			draw_overlays(frame, hand_landmarks, preprocess_landmarks(hand_landmarks))
	return False

video_capture_loop(
    cap=video_capture,
    context=hands_model,
    loop=loop,
    flipH=True,
    show_fps=True,
)

I0000 00:00:1765030502.409102   77667 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1765030502.411976  109646 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1765030502.442793  109636 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1765030502.453916  109639 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
  return landmarks / scale


# Real-Time Hand Landmark Preprocessing Visualization

This cell captures video from the webcam, processes hand landmarks in real-time, and visualizes both the original landmarks and the preprocessed coordinates.

## Main Components

1. **Video Capture**
    - `cv2.VideoCapture(0)` initializes the webcam.  
    - Frames are continuously captured in a loop until the user quits.

2. **Hand Detection**
    - MediaPipe's `Hands` module is used with:
        - `min_detection_confidence=0.8`
        - `min_tracking_confidence=0.5`  
    - The `hands.process(frame_rgb)` method detects hand landmarks in each frame.

3. **Preprocessing Pipeline**
    - For each detected hand:
        1. Extract landmarks.
        2. Normalize wrist position.
        3. Scale landmarks relative to hand size.
        4. Normalize values to the range [-1, 1].
    - This is done using the `preprocess_landmarks` function.

4. **Overlay Visualization**
    - `draw_overlays` draws both:
        - MediaPipe hand landmarks.
        - Preprocessed coordinates as text annotations.

5. **Display**
    - The annotated frame is shown in a window using `cv2.imshow`.
    - Press `q` to quit the visualization.