In [2]:
!pip install mediapipe opencv-python



In [3]:
import mediapipe as mp
import cv2
import numpy as np
import uuid
import os

# Hand Landmark Preprocessing Pipeline

This notebook cell contains a set of functions to **preprocess hand landmarks**.  
The preprocessing pipeline includes the following steps:

1. **Extract coordinates** – Convert each hand landmark into a flat array of `(x, y, z)` values.  
2. **Normalize wrist position** – Shift all landmarks so that the wrist (the first landmark) is at the origin `(0,0,0)`.  
3. **Scale according to hand size** – Divide all coordinates by the distance from the wrist to the middle-finger MCP joint, so that hand size differences do not affect the analysis.  
4. **Normalize values** – Scale all coordinates to a fixed range (e.g., `[-1, 1]`) for consistent model input.

In [4]:
def extract_landmarks(hand_landmarks):
    """
    Flatten the landmarks of one hand into a single coordinate vector.

    Parameters:
        hand_landmarks: Object exposing an iterable ``landmark`` attribute whose
            items each carry ``x``, ``y`` and ``z`` attributes.

    Returns:
        np.array: One-dimensional array laid out as [x1, y1, z1, x2, y2, z2, ...],
            following the iteration order of ``hand_landmarks.landmark``.
    """
    coords = [
        value
        for point in hand_landmarks.landmark
        for value in (point.x, point.y, point.z)
    ]
    return np.array(coords)


def normalize_landmarks(landmarks):
    """
    Translate the landmarks so the first one (the wrist) sits at the origin.

    Parameters:
        landmarks (np.array): Flattened landmark coordinates [x1, y1, z1, ...].

    Returns:
        np.array: A new array in which the first landmark's (x, y, z) has been
            subtracted from every landmark; the input array is left untouched.
    """
    shifted = landmarks.copy()
    # Capture the wrist values up front: the first triple is zeroed by the
    # very first loop iteration.
    wrist = (shifted[0], shifted[1], shifted[2])
    for start in range(0, len(shifted), 3):
        for axis, offset in enumerate(wrist):
            shifted[start + axis] -= offset
    return shifted

def scale_landmarks(landmarks):
    """
    Normalizes the scale of the hand landmarks based on the distance from wrist to middle finger MCP joint.

    Parameters:
        landmarks (np.array): Wrist-normalized landmarks, flattened as [x1, y1, z1, ...].

    Returns:
        np.array: Landmarks divided by the wrist-to-MCP distance. If that
            distance is exactly zero (the two landmarks coincide), the
            landmarks are returned unscaled instead of dividing by zero.
    """
    # Landmark 9 is the reference joint; in the flattened layout its
    # coordinates start at offset 9 * 3 = 27.
    mcp_start = 9 * 3

    wrist = np.array([landmarks[0], landmarks[1], landmarks[2]])
    mcp = np.array([landmarks[mcp_start], landmarks[mcp_start + 1], landmarks[mcp_start + 2]])

    scale = np.linalg.norm(mcp - wrist)
    if scale == 0:
        # Degenerate hand: dividing would fill the vector with NaN/inf and
        # poison everything downstream, so fall back to a neutral scale.
        scale = 1.0
    return landmarks / scale

def minmax_landmarks(landmarks):
    """
    Normalizes all landmark values to the range [-1, 1] based on the maximum absolute value.

    Parameters:
        landmarks (np.array): Scaled landmarks.

    Returns:
        np.array: Landmarks divided by their largest absolute value. An
            all-zero input is returned as zeros rather than dividing by zero.
    """
    peak = np.max(np.abs(landmarks))
    if peak == 0:
        # Every coordinate is zero, so the result is already within [-1, 1];
        # dividing by zero would only produce NaN.
        peak = 1.0
    return landmarks / peak

def preprocess_landmarks(hand_landmarks):
    """
    Run a detected hand through the complete preprocessing chain.

    The stages are applied in this order:
    1. extract_landmarks   - flatten the landmarks into a coordinate vector
    2. normalize_landmarks - move the wrist to the origin
    3. scale_landmarks     - divide by the hand-size reference distance
    4. minmax_landmarks    - divide by the largest absolute value

    Parameters:
        hand_landmarks: The hand landmarks object.

    Returns:
        np.array: The vector produced by the last stage.
    """
    stages = (normalize_landmarks, scale_landmarks, minmax_landmarks)
    vector = extract_landmarks(hand_landmarks)
    for stage in stages:
        vector = stage(vector)
    return vector

# Camera Frame Capture Function

This function captures a single frame from a webcam or video stream using OpenCV and mirrors it horizontally. It returns `None` when no frame could be read.

In [5]:
def get_camera_frame(cap:cv2.VideoCapture) -> np.ndarray|None:
    """
    Read one frame from ``cap`` and mirror it horizontally.

    Parameters:
        cap (cv2.VideoCapture): OpenCV video capture object, usually created with `cv2.VideoCapture(0)`.

    Returns:
        frame (np.array or None): The frame flipped around the vertical axis
                                  (flip code 1), or None when ``cap.read()``
                                  reports failure.
    """
    success, image = cap.read()
    return cv2.flip(image, 1) if success else None

# Hand Landmark Drawing and Overlay Function

This function draws hand landmarks on a video frame and overlays the processed landmark coordinates as text annotations.

In [6]:
# Module-level aliases used by draw_overlays: the MediaPipe drawing helpers and
# the hand connection set passed to draw_landmarks.
mp_drawing = mp.solutions.drawing_utils
mp_hands_connections = mp.solutions.hands.HAND_CONNECTIONS

def draw_overlays(frame, hand_landmarks, processed_landmarks):
    """
    Draw the hand skeleton and label every landmark with its processed (x, y) values.

    Parameters:
        frame (np.array): Image to draw on; it is unpacked as (height, width, channels).
        hand_landmarks (mp.framework.formats.landmark_pb2.NormalizedLandmarkList):
            Hand landmarks detected by MediaPipe.
        processed_landmarks (np.array): Flattened preprocessed coordinates
            [x1, y1, z1, ...]; only the x and y of each landmark are shown.

    Returns:
        np.array: The ``frame`` object that was passed in, after drawing on it.
    """
    height, width, _ = frame.shape

    landmark_style = mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4)
    connection_style = mp_drawing.DrawingSpec(color=(250,44,250), thickness=2, circle_radius=2)
    mp_drawing.draw_landmarks(
        frame, hand_landmarks, mp_hands_connections, landmark_style, connection_style
    )

    for idx, point in enumerate(hand_landmarks.landmark):
        # Landmark x/y are multiplied by the frame size to get pixel
        # positions; the label is nudged 5 px right and 5 px up.
        anchor = (int(point.x * width) + 5, int(point.y * height) - 5)
        offset = idx * 3
        px = processed_landmarks[offset]
        py = processed_landmarks[offset + 1]
        cv2.putText(
            frame, f"{idx}:({px:.2f},{py:.2f})", anchor,
            cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,255,255), 1, cv2.LINE_AA,
        )
    return frame

In [None]:

from contextlib import AbstractContextManager
from typing import Callable, Protocol, TypeVar

# Type variable for the context manager given to cv_frame_loop; the per-frame
# callback is typed to receive that same context type.
ContextType = TypeVar("ContextType", bound=AbstractContextManager)

def cv_frame_loop(
    cap: cv2.VideoCapture,
    context: ContextType,
    loop: Callable[[ContextType, np.ndarray], None],
    flip: bool = True,
) -> None:
    """
    Read frames from ``cap`` and hand each one to ``loop`` until the stream ends or 'q' is pressed.

    Each frame is optionally mirrored, passed to ``loop(context, frame)`` and
    then displayed in the "VideoInput" window. The loop runs inside
    ``with context:``, so the context manager is entered once before the first
    frame and exited when the loop stops.

    Parameters:
        cap (cv2.VideoCapture): Capture to read from. It is released when this
            function returns, even if an exception is raised.
        context (ContextType): Context manager kept open for the whole loop and
            passed as the first argument to ``loop``.
        loop (Callable): Per-frame callback; the frame it receives is the one
            shown afterwards, so anything drawn on it in place is visible.
        flip (bool): When True, mirror each frame horizontally before processing.

    Returns:
        None
    """
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Clamp to at least 1 ms: a reported FPS above 1000 would truncate to 0,
    # and cv2.waitKey(0) waits for a key press indefinitely, freezing the loop.
    delay = max(1, int(1000 / fps)) if fps > 0 else 1
    try:
        with context:
            while cap.isOpened():
                ret, frame = cap.read()
                # Check for a failed read before touching the frame; the
                # message is the same one main() prints for this situation.
                if not ret or frame is None:
                    print("No se puede leer la cámara.")
                    break
                if flip:
                    frame = cv2.flip(frame, 1)
                loop(context, frame)
                cv2.imshow("VideoInput", frame)
                if cv2.waitKey(delay) & 0xFF == ord("q"):
                    break
    finally:
        # Always free the camera and close windows, including on errors.
        cap.release()
        cv2.destroyAllWindows()


In [None]:
from typing import Any, List,cast
from mediapipe.python.solutions.hands import Hands

class SafeHandsResult(Protocol):
    """Structural type for the result object that ``SafeHands.process`` casts its return value to."""
    # Annotated as a list or None; the callers in this notebook test it for
    # truthiness before iterating over it.
    multi_hand_landmarks: List[Any] | None
    
class SafeHands(AbstractContextManager):
    """
    Context-manager wrapper around a MediaPipe ``Hands`` instance with a typed ``process``.

    The wrapped object is created in ``__init__`` and closed in ``__exit__``.
    """

    def __init__(self, *args, **kwargs):
        """Create the wrapped ``Hands`` object, forwarding all arguments unchanged."""
        self.hands = Hands(*args, **kwargs)

    def __enter__(self) -> Hands:
        """Return the wrapped ``Hands`` object.

        NOTE(review): this returns the wrapped object rather than ``self``, so
        ``with SafeHands(...) as h`` binds the raw ``Hands`` and skips the
        typed ``process`` below - confirm this is intended.
        """
        return self.hands

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the wrapped ``Hands`` object; exceptions are not suppressed."""
        self.hands.close()

    def process(self, frame_rgb: np.ndarray) -> SafeHandsResult:
        """Run the wrapped ``Hands.process`` on ``frame_rgb`` and return its result typed as ``SafeHandsResult``."""
        raw_result = self.hands.process(frame_rgb)
        return cast(SafeHandsResult, raw_result)


In [19]:
# Hand detector wrapper and capture device 0, both handed to cv_frame_loop
# at the end of this cell.
hands_model = SafeHands(min_detection_confidence=0.8, min_tracking_confidence=0.5)
video_capture = cv2.VideoCapture(0)

def loop(hands: SafeHands, frame: np.ndarray) -> None:
    """
    Per-frame callback: detect hands in ``frame`` and draw the overlays onto it.

    Parameters:
        hands (SafeHands): Detector whose ``process`` is called on the RGB frame.
        frame (np.ndarray): BGR frame; it is converted to RGB for detection and
            the overlays are drawn on it for every detected hand.
    """
    rgb_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    detection = hands.process(rgb_image)

    # A None or empty result means no hands were found, so nothing is drawn.
    for hand_landmarks in detection.multi_hand_landmarks or ():
        features = preprocess_landmarks(hand_landmarks)
        frame = draw_overlays(frame, hand_landmarks, features)

# Start the capture loop; cv_frame_loop releases the capture and closes the
# OpenCV windows when it finishes.
cv_frame_loop(
    cap=video_capture,
    context=hands_model,
    loop=loop,
)

I0000 00:00:1764978490.996635   41546 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1764978490.998903   48047 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1764978491.028094   48037 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764978491.039397   48044 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


# Real-Time Hand Landmark Preprocessing Visualization

This cell captures video from the webcam, processes hand landmarks in real-time, and visualizes both the original landmarks and the preprocessed coordinates.

## Main Components

1. **Video Capture**
    - `cv2.VideoCapture(0)` initializes the webcam.  
    - Frames are continuously captured in a loop until the user quits.

2. **Hand Detection**
    - MediaPipe's `Hands` module is used with:
        - `min_detection_confidence=0.8`
        - `min_tracking_confidence=0.5`  
    - The `hands.process(frame_rgb)` method detects hand landmarks in each frame.

3. **Preprocessing Pipeline**
    - For each detected hand:
        1. Extract landmarks.
        2. Normalize wrist position.
        3. Scale landmarks relative to hand size.
        4. Normalize values to the range [-1, 1].
    - This is done using the `preprocess_landmarks` function.

4. **Overlay Visualization**
    - `draw_overlays` draws both:
        - MediaPipe hand landmarks.
        - Preprocessed coordinates as text annotations.

5. **Display**
    - The annotated frame is shown in a window using `cv2.imshow`.
    - Press `q` to quit the visualization.

In [9]:
# Alias for `mp.solutions.hands.Hands`, which main() instantiates and uses as a
# context manager.
mp_hands_module = mp.solutions.hands.Hands
   
def main():
    """
    Run the webcam demo: detect hands, preprocess their landmarks and show the annotated frames.

    Opens capture device 0, processes frames until a frame cannot be read or
    'q' is pressed, and displays each annotated frame in the "Webcam" window.
    The capture is released and all OpenCV windows are closed on exit, even
    if an exception is raised while processing.
    """
    cap = cv2.VideoCapture(0)
    try:
        with mp_hands_module(min_detection_confidence=0.8, min_tracking_confidence=0.5) as hands:
            while cap.isOpened():
                frame = get_camera_frame(cap)
                if frame is None:
                    print("No se puede leer la cámara.")
                    break

                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(frame_rgb)

                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        processed = preprocess_landmarks(hand_landmarks)
                        frame = draw_overlays(frame, hand_landmarks, processed)

                cv2.imshow("Webcam", frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # Without this, an error inside the loop would leave the camera
        # locked and the preview window open until the kernel is restarted.
        cap.release()
        cv2.destroyAllWindows()

# Entry point: start the demo when this code runs as the main module.
if __name__ == "__main__":
    main()


I0000 00:00:1764975777.017385   41546 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1764975777.019576   41682 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1764975777.050173   41675 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1764975777.069498   41675 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
