In [45]:
!pip install mediapipe opencv-python pandas tensorflow scikit-learn datasets

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m[31m1.4 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
Collecting fsspec<=2025.10.0

In [9]:
import mediapipe as mp
import cv2
import numpy as np
import uuid
import os

# Hand Landmark Preprocessing Pipeline

This notebook cell contains a set of functions to **preprocess hand landmarks**.  
The preprocessing pipeline includes the following steps:

1. **Extract coordinates** – Convert each hand landmark into a flat array of `(x, y, z)` values.  
2. **Normalize wrist position** – Shift all landmarks so that the wrist (the first landmark) is at the origin `(0,0,0)`.  
3. **Scale according to hand size** – Adjust the landmarks so that hand size differences do not affect the analysis.  
4. **Normalize values** – Scale all coordinates to a fixed range (e.g., `[-1, 1]`) for consistent model input.

In [22]:
def normalize_landmarks(landmarks: np.ndarray) -> np.ndarray:
    """
    Shifts all landmarks so that the wrist (first landmark) is at the origin (0,0,0).

    Parameters:
        landmarks: np.ndarray of shape [NUM_LANDMARKS, 3]

    Returns:
        np.ndarray: Landmarks translated relative to the wrist.
    """
    landmarks = landmarks.copy()
    base = landmarks[0]
    return landmarks - base


def scale_landmarks(landmarks: np.ndarray) -> np.ndarray:
    """
    Normalizes the scale of the hand landmarks based on the distance from wrist to middle finger MCP joint.

    Parameters:
        landmarks: np.ndarray de landmarks normalizados por la muñeca [NUM_LANDMARKS, 3]

    Returns:
        np.ndarray: Landmarks escalados a tamaño de mano consistente.
    """
    wrist = landmarks[0]
    mcp = landmarks[9]
    scale = np.linalg.norm(mcp - wrist)
    return landmarks / scale


def minmax_landmarks(landmarks: np.ndarray) -> np.ndarray:
    """
    Normalizes all landmark values to the range [-1, 1] based on the maximum absolute value.

    Parameters:
        landmarks: np.ndarray de landmarks escalados

    Returns:
        np.ndarray: Landmarks normalizados a [-1, 1]
    """
    max_val = np.max(np.abs(landmarks))
    return landmarks / max_val


def preprocess_landmarks(hand_landmarks: np.ndarray) -> np.ndarray:
    """
    Applies the full preprocessing pipeline, preserving [NUM_LANDMARKS, 3] shape.

    Parameters:
        hand_landmarks: Array de forma [21, 3]

    Returns:
        np.ndarray: Fully preprocessed landmark array.
    """
    landmarks = normalize_landmarks(hand_landmarks)
    landmarks = scale_landmarks(landmarks)
    landmarks = minmax_landmarks(landmarks)
    return landmarks


# Hand Landmark Drawing and Overlay Function

This function draws hand landmarks on a video frame and overlays the processed landmark coordinates as text annotations.

In [10]:
mp_drawing = mp.solutions.drawing_utils
mp_hands_connections = mp.solutions.hands.HAND_CONNECTIONS

def draw_overlays(frame, hand_landmarks_np, processed_landmarks=None):
    """
    Draws hand landmarks and connections on a given frame.
    Parameters:
        frame (np.array): The image frame.
        hand_landmarks_np (np.array): Numpy array of shape (21, 3) with hand landmarks.
        processed_landmarks (np.array, optional): Preprocessed landmark coordinates [21,3].
    """
    h, w, _ = frame.shape

    for connection in mp_hands_connections:
        start_idx, end_idx = connection
        x0, y0 = int(hand_landmarks_np[start_idx][0] * w), int(hand_landmarks_np[start_idx][1] * h)
        x1, y1 = int(hand_landmarks_np[end_idx][0] * w), int(hand_landmarks_np[end_idx][1] * h)
        cv2.line(frame, (x0, y0), (x1, y1), (0, 255, 0), 2)

    for idx, lm in enumerate(hand_landmarks_np):
        cx, cy = int(lm[0] * w), int(lm[1] * h)
        cv2.circle(frame, (cx, cy), 6, (0, 0, 255), -1)

        if processed_landmarks is not None:
            px, py = processed_landmarks[idx][0], processed_landmarks[idx][1]
            text = f"{idx}:({px:.2f},{py:.2f})"
            cv2.putText(frame, text, (cx + 5, cy - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.35, (255, 255, 0), 1, cv2.LINE_AA)


# Flexible Video Capture Loop

The `video_capture_loop` function:

- Reads frames from a video source (`cv2.VideoCapture`)
- Optionally flips frames horizontally or vertically
- Processes frames using a user-defined function
- Can display frames in a window or skip GUI for automated processing
- Ensures proper resource cleanup


In [11]:
from contextlib import AbstractContextManager
from typing import Callable, TypeVar
import time

ContextType = TypeVar("ContextType", bound=AbstractContextManager)

class ContextError(Exception):
    pass

class VideoCaptureError(Exception):
    pass


def flip_frame(frame: np.ndarray, flipH: bool, flipV: bool) -> np.ndarray:
    if flipH and flipV:
        return cv2.flip(frame, -1)
    elif flipH:
        return cv2.flip(frame, 1) 
    elif flipV:
        return cv2.flip(frame, 0)
    return frame


def video_capture_loop(
    cap: cv2.VideoCapture,
    context: ContextType,
    loop: Callable[[ContextType, np.ndarray], bool],
    flipH: bool = False,
    flipV: bool = False,
    show_window: bool = True,
    show_fps: bool = False
) -> None:
    """
    Continuously captures frames from a video source, processes them using a user-defined 
    function, optionally flips frames, optionally displays them in a window, and optionally
    shows the current FPS on the video.
    """
    fps = cap.get(cv2.CAP_PROP_FPS)
    delay = int(1000 / fps) if fps > 0 else 1
    prev_time = time.time()

    try:
        with context as ctx:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret or frame is None:
                    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
                    current_frame = cap.get(cv2.CAP_PROP_POS_FRAMES)
                    if total_frames > 0 and current_frame >= total_frames:
                        break
                    else:
                        raise VideoCaptureError("Unable to read video capture.")
                
                frame = flip_frame(frame, flipH, flipV)
                
                try:
                    if loop(ctx, frame):
                        break
                except Exception as e:
                    raise VideoCaptureError("Error while processing frame.") from e
                
                if show_window:
                    if show_fps:
                        current_time = time.time()
                        current_fps = 1.0 / (current_time - prev_time) if (current_time - prev_time) > 0 else 0
                        prev_time = current_time
                        cv2.putText(
                            frame,
                            f"FPS: {current_fps:.2f}",
                            (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            1,
                            (0, 255, 0),
                            2,
                            cv2.LINE_AA
                        )
                    cv2.imshow("VideoInput", frame)
                    if cv2.waitKey(delay) & 0xFF == ord("q"):
                        break

    except Exception as e:
        if not isinstance(e, VideoCaptureError):
            raise ContextError("Error while using context.") from e
        else:
            raise

    finally:
        if cap.isOpened():
            cap.release()
        if show_window:
            cv2.destroyAllWindows()


# Safe HandModel Wrapper

In [12]:
from asyncio import Protocol
from contextlib import AbstractContextManager
from typing import Optional, Sequence, Tuple, cast
from mediapipe.python.solutions.hands import Hands


class SafeHandsResult(Protocol):
    multi_hand_landmarks: Optional[Sequence]
    multi_hand_world_landmarks: Optional[Sequence]
    multi_handedness: Optional[Sequence]

class SafeHands(AbstractContextManager):
	def __init__(self, 
              	static_image_mode: bool = False,
			    max_num_hands: int = 2,
			    model_complexity: int = 1,
			    min_detection_confidence: float = 0.5,
			    min_tracking_confidence: float = 0.5):
		self.max_num_hands = max_num_hands
		self.hands = Hands(static_image_mode = static_image_mode,
			    max_num_hands = max_num_hands,
			    model_complexity = model_complexity,
			    min_detection_confidence = min_detection_confidence,
			    min_tracking_confidence = min_tracking_confidence)
    
	def __enter__(self) -> "SafeHands":
		return self
    
	def __exit__(self, exc_type, exc_value, traceback) -> None:
		self.hands.close()

	def process(self, frame_rgb: np.ndarray) -> Optional[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
		mp_result = cast(SafeHandsResult, self.hands.process(frame_rgb))
		if (not mp_result.multi_hand_landmarks and not mp_result.multi_hand_world_landmarks and not mp_result.multi_handedness):
			return None
		landmarks = np.zeros((self.max_num_hands, 21, 3), dtype=np.float32)
		world_landmarks = np.zeros((self.max_num_hands, 21, 3), dtype=np.float32)
		handedness = np.zeros((self.max_num_hands, 2), dtype=np.float32)
		if mp_result.multi_hand_landmarks:
			for i, hand in enumerate(mp_result.multi_hand_landmarks[:self.max_num_hands]):
				for j, lm in enumerate(hand.landmark):
					landmarks[i, j, 0] = lm.x
					landmarks[i, j, 1] = lm.y
					landmarks[i, j, 2] = lm.z

		if mp_result.multi_hand_world_landmarks:
			for i, hand in enumerate(mp_result.multi_hand_world_landmarks[:self.max_num_hands]):
				for j, lm in enumerate(hand.landmark):
					world_landmarks[i, j, 2] = lm.z
					world_landmarks[i, j, 0] = lm.x
					world_landmarks[i, j, 1] = lm.y

		if mp_result.multi_handedness:
			for i, h in enumerate(mp_result.multi_handedness[:self.max_num_hands]):
				handedness[i, 0] = 0 if h.classification[0].label == "Left" else 1
				handedness[i, 1] = h.classification[0].score

		return landmarks, world_landmarks, handedness


# LandMark Visualizadion

In [None]:
hands_model = SafeHands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=1)
video_capture = cv2.VideoCapture(0)

def loop(hands: SafeHands, frame: np.ndarray) -> bool:
	frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	result = hands.process(frame_rgb)
	if result:
		landmarks = result[0]
		for hand_landmarks in landmarks:
			draw_overlays(frame, hand_landmarks, preprocess_landmarks(hand_landmarks))
	return False

video_capture_loop(
    cap=video_capture,
    context=hands_model,
    loop=loop,
    flipH=True,
    show_fps=True,
)

# Unlabeld Dataset Creation

In [19]:
import pandas as pd

# ----------------------------
# Configuración del dataset
# ----------------------------

dataset_name = "Vincent-luo/hagrid-mediapipe-hands"
split_name = "train"
N = 15000  # número de imágenes a procesar

# Cargar dataset en streaming
dataset = load_dataset(dataset_name, split=split_name, streaming=True)

# ----------------------------
# Procesar imágenes y guardar en DataFrame
# ----------------------------

rows = []

with SafeHands(static_image_mode=True, max_num_hands=2) as safe_hands:
    for i, item in enumerate(dataset):
        print(f"Procesando ejemplo {i+1}...")
        if i >= N:
            break

        # Convertir a RGB np.array
        img = item['image']
        frame_rgb = np.array(img.convert('RGB'))

        # Procesar con mediapipe
        result = safe_hands.process(frame_rgb)

        if result is not None:
            landmarks, world_landmarks, handedness = result

            # Guardar como fila en DataFrame (puedes modificar la estructura)
            rows.append({
                'landmarks': landmarks,
                'world_landmarks': world_landmarks,
                'handedness': handedness
            })

        # Liberar memoria de la imagen
        del img, frame_rgb

# Crear DataFrame final
df = pd.DataFrame(rows)

print(df.head())

# Guardar a disco (opcional)
df.to_pickle("hands_landmarks.pkl")

print(f"Procesadas {len(df)} imágenes y guardadas en 'hands_landmarks.pkl'")


I0000 00:00:1765058877.866784  147411 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1765058877.869668  156035 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1765058877.909451  156025 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1765058877.923148  156030 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Procesando ejemplo 1...
Procesando ejemplo 2...
Procesando ejemplo 3...
Procesando ejemplo 4...
Procesando ejemplo 5...
Procesando ejemplo 6...
Procesando ejemplo 7...
Procesando ejemplo 8...
Procesando ejemplo 9...
Procesando ejemplo 10...
Procesando ejemplo 11...
Procesando ejemplo 12...
Procesando ejemplo 13...
Procesando ejemplo 14...
Procesando ejemplo 15...
Procesando ejemplo 16...
Procesando ejemplo 17...
Procesando ejemplo 18...
Procesando ejemplo 19...
Procesando ejemplo 20...
Procesando ejemplo 21...
Procesando ejemplo 22...
Procesando ejemplo 23...
Procesando ejemplo 24...
Procesando ejemplo 25...
Procesando ejemplo 26...
Procesando ejemplo 27...
Procesando ejemplo 28...
Procesando ejemplo 29...
Procesando ejemplo 30...
Procesando ejemplo 31...
Procesando ejemplo 32...
Procesando ejemplo 33...
Procesando ejemplo 34...
Procesando ejemplo 35...
Procesando ejemplo 36...
Procesando ejemplo 37...
Procesando ejemplo 38...
Procesando ejemplo 39...
Procesando ejemplo 40...
Procesand

# Manual Labeld Dataset Creation

In [31]:
import cv2
import numpy as np
from pathlib import Path

DATASET_DIR = Path("gestures_dataset")
DATASET_DIR.mkdir(exist_ok=True)

GESTURE_LABEL = input("Introduce el nombre del gesto: ").strip()
GESTURE_DIR = DATASET_DIR / GESTURE_LABEL
GESTURE_DIR.mkdir(exist_ok=True)

MAX_HANDS = 1
IMG_SIZE = 224

existing_files = list(GESTURE_DIR.glob(f"{GESTURE_LABEL}_*.png"))
sample_idx = len(existing_files)

def save_sample(image: np.ndarray, landmarks: np.ndarray, world_landmarks: np.ndarray, handedness: np.ndarray, idx: int):
    img_path = GESTURE_DIR / f"{GESTURE_LABEL}_{idx:03d}.png"
    cv2.imwrite(str(img_path), image)
    np.savez(GESTURE_DIR / f"{GESTURE_LABEL}_{idx:03d}_landmarks.npz",
             landmarks=landmarks,
             world_landmarks=world_landmarks,
             handedness=handedness)
    print(f"Muestra {idx} guardada en {GESTURE_DIR}")

save_next = False

def loop_fn(ctx: SafeHands, frame: np.ndarray):
    global sample_idx, save_next

    result = ctx.process(frame)
    if result is None:
        return False

    landmarks, world_landmarks, handedness = result
    lm = landmarks[0]
    if not lm.any():
        return False

    h, w, _ = frame.shape
    margin_ratio = 1  # 10% de margen

    # Centro de la mano
    x_center = int(np.mean(lm[:, 0]) * w)
    y_center = int(np.mean(lm[:, 1]) * h )

    # Tamaño fijo del recorte (ajústalo según tu necesidad)
    base_size = 200  # la mano siempre ocupará este tamaño aproximado
    side = int(base_size * (1 + margin_ratio))

    # Coordenadas del recorte
    x_min_s = max(x_center - side // 2, 0)
    x_max_s = min(x_center + side // 2, w)
    y_min_s = max(y_center - side // 2, 0)
    y_max_s = min(y_center + side // 2, h)

    if x_max_s <= x_min_s or y_max_s <= y_min_s:
        return False

    # Extraer recorte y escalar a tamaño fijo
    hand_img = frame[y_min_s:y_max_s, x_min_s:x_max_s]
    hand_img = cv2.resize(hand_img, (IMG_SIZE, IMG_SIZE))

    cv2.imshow("Hand", hand_img)

    key = cv2.waitKey(30) & 0xFF
    if key == ord(" "):
        save_next = True
    elif key == ord("q"):
        return True

    if save_next:
        save_sample(hand_img, landmarks, world_landmarks, handedness, sample_idx)
        sample_idx += 1
        save_next = False

    return False

cap = cv2.VideoCapture(0)
video_capture_loop(cap, SafeHands(static_image_mode=False, max_num_hands=MAX_HANDS), loop_fn, flipH=True, flipV=False, show_window=True)


I0000 00:00:1765037323.522073  119354 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1765037323.527498  123871 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1765037323.542757  123865 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1765037323.549784  123863 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


Muestra 0 guardada en gestures_dataset/none
Muestra 1 guardada en gestures_dataset/none
Muestra 2 guardada en gestures_dataset/none
Muestra 3 guardada en gestures_dataset/none
Muestra 4 guardada en gestures_dataset/none
Muestra 5 guardada en gestures_dataset/none
Muestra 6 guardada en gestures_dataset/none
Muestra 7 guardada en gestures_dataset/none
Muestra 8 guardada en gestures_dataset/none
Muestra 9 guardada en gestures_dataset/none


In [10]:
import numpy as np
import pandas as pd
from pathlib import Path

def load_landmarks_to_dataframe(dataset_dir: str):
    dataset_path = Path(dataset_dir)
    data = []

    # Recorre cada categoría de gesto
    for gesture_dir in dataset_path.iterdir():
        if not gesture_dir.is_dir():
            continue
        label = gesture_dir.name

        # Recorre todos los archivos npz de landmarks
        for npz_file in gesture_dir.glob("*_landmarks.npz"):
            try:
                npz = np.load(npz_file)
                landmarks = npz['landmarks']  # shape: (num_hands, num_landmarks, 3)

                # Tomamos solo la primera mano si hay varias
                lm = landmarks[0] if landmarks.shape[0] > 0 else np.zeros((21, 3))
                
                #TODO: Quitar
                #lm = preprocess_landmarks(lm)

                # Aplanar los landmarks a una sola fila
                lm_flat = lm.flatten()

                # Guardar como diccionario
                sample = {"label": label}
                for i, coord in enumerate(lm_flat):
                    sample[f"lm_{i}"] = coord

                data.append(sample)
            except Exception as e:
                print(f"No se pudo cargar {npz_file}: {e}")

    df = pd.DataFrame(data)
    return df

df = load_landmarks_to_dataframe("gestures_dataset")
print(df)


    label      lm_0      lm_1          lm_2      lm_3      lm_4      lm_5  \
0    none  0.718661  0.766555  3.567058e-07  0.730553  0.646978  0.026051   
1    none  0.752250  0.701544 -6.153338e-07  0.690851  0.629828  0.001988   
2    none  0.711098  0.756638 -4.836457e-08  0.726945  0.637981  0.038778   
3    none  0.737014  0.775776 -7.479043e-07  0.662198  0.776066 -0.025476   
4    none  0.782946  0.720399 -1.079192e-07  0.705931  0.701256 -0.021767   
5    none  0.729350  0.754570 -6.508913e-07  0.751450  0.646743  0.017810   
6    none  0.779380  0.653826  1.346741e-07  0.699118  0.695524 -0.025288   
7    none  0.711429  0.643413 -8.365785e-08  0.628100  0.634775 -0.015640   
8    none  0.724759  0.807863 -2.492916e-07  0.638554  0.764977 -0.018498   
9   right  0.303219  0.983664  1.218524e-08  0.397605  0.840492 -0.051059   
10  right  0.510764  0.715408 -2.747380e-07  0.574706  0.603763 -0.005561   
11  right  0.323002  0.768299 -3.304240e-07  0.408399  0.677710  0.005127   

# Training Model

In [None]:
import pandas as pd
import numpy as np

dfA = pd.read_pickle("hands_landmarks.pkl")
dfB = load_landmarks_to_dataframe("gestures_dataset")

# =====================================================
# Funciones de flatten y preprocesado
# =====================================================

def extract_flat_landmarks(landmarks, n_landmarks=21):
    if landmarks is None or len(landmarks) == 0:
        return None
    hand = landmarks[0][:n_landmarks]
    return np.array(hand)  # NO flatten aquí

def preprocess_landmarks_pipeline(hand):
    return preprocess_landmarks(hand)

# =====================================================
# Data Augmentation (landmarks)
# =====================================================

def jitter(landmarks, sigma=0.01):
    return landmarks + np.random.normal(0, sigma, landmarks.shape)

def random_scale(landmarks, scale_range=(0.9, 1.1)):
    scale = np.random.uniform(*scale_range)
    return landmarks * scale

def random_rotate_xy(landmarks, angle_deg=10):
    angle = np.deg2rad(np.random.uniform(-angle_deg, angle_deg))
    R = np.array([
        [np.cos(angle), -np.sin(angle), 0],
        [np.sin(angle),  np.cos(angle), 0],
        [0,              0,             1]
    ])
    return landmarks @ R.T

def augment_landmarks(landmarks):
    landmarks = jitter(landmarks, sigma=0.01)
    landmarks = random_scale(landmarks)
    landmarks = random_rotate_xy(landmarks)
    return landmarks

# =====================================================
# Dataset B → augmentación con balanceo a 200
# =====================================================

dfB_copy = dfB.copy()
dfB_copy['label'] = dfB_copy['label'].replace({'none': 'unknown'})

TARGET_SAMPLES = 200
augmented_rows = []
lm_cols_B = [f'lm_{i}' for i in range(21*3)]
classes_B = dfB_copy['label'].unique()

for cls in classes_B:
    df_class = dfB_copy[dfB_copy['label'] == cls]
    n_current = len(df_class)
    n_to_generate = max(0, TARGET_SAMPLES - n_current)

    # Guardamos las filas originales
    for _, row in df_class.iterrows():
        lm = row[lm_cols_B].values.reshape(21,3)
        new_row = dict(zip(lm_cols_B, lm.flatten()))
        new_row['label'] = cls
        augmented_rows.append(new_row)

    # Generamos augmentación solo si hace falta
    for _ in range(n_to_generate):
        row = df_class.sample(1, replace=True).iloc[0]
        lm = row[lm_cols_B].values.reshape(21,3)
        lm_aug = augment_landmarks(lm)
        new_row = dict(zip(lm_cols_B, lm_aug.flatten()))
        new_row['label'] = cls
        augmented_rows.append(new_row)

dfB_augmented = pd.DataFrame(augmented_rows)

# =====================================================
# Dataset A → tabular (sin augmentación)
# =====================================================

dfA_subset = dfA.head(1000)
flat_data = dfA_subset['landmarks'].apply(extract_flat_landmarks)
flat_data = flat_data.dropna()

# ⚠️ Aplanar antes de crear DataFrame
flat_data_flattened = flat_data.apply(lambda x: x.flatten())

dfA_flat = pd.DataFrame(flat_data_flattened.tolist(), columns=[f'lm_{i}' for i in range(21*3)])
dfA_flat['label'] = 'unknown'

# =====================================================
# Dataset C (concat)
# =====================================================

df = pd.concat([dfA_flat, dfB_augmented], ignore_index=True)

# =====================================================
# Preprocesado de landmarks
# =====================================================

lm_cols = [c for c in df.columns if c.startswith("lm_")]

def preprocess_row(row):
    hand = row[lm_cols].values.reshape(21,3)
    hand = preprocess_landmarks_pipeline(hand)
    return hand.flatten()

processed = df.apply(preprocess_row, axis=1)
df_final = pd.DataFrame(processed.tolist(), columns=lm_cols)
df_final['label'] = df['label'].values

# =====================================================
# Resultado final
# =====================================================

print(df_final)
print(df_final['label'].value_counts())


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models

# 1️⃣ Cargar DataFrame de landmarks
df = load_landmarks_to_dataframe("gestures_dataset")

# Todo lo que no sea right o left -> none
df['label'] = df['label'].apply(lambda x: x if x in ['right', 'left'] else 'none')

# Separar características y etiquetas
X = df.drop(columns=['label']).values.astype('float32')
y = df['label'].values

# Escalar características
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Codificar etiquetas a números
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # right=2, left=1, none=0, por ejemplo

# Dividir en train/validation
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# 2️⃣ Construir modelo MLP
input_size = X.shape[1]  # 63 landmarks

model = models.Sequential([
    layers.Input(shape=(input_size,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(3, activation='softmax')  # 3 clases: right, left, none
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# 3️⃣ Entrenar
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32
)

# 4️⃣ Evaluar
val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
print(f"Validation Accuracy: {val_acc:.4f}")

# 5️⃣ Predecir
def predict_gesture(landmarks_vector):
    lm_vector = np.array(landmarks_vector).reshape(1, -1).astype('float32')
    lm_vector = scaler.transform(lm_vector)
    pred = model.predict(lm_vector)
    class_idx = np.argmax(pred)
    return le.inverse_transform([class_idx])[0]

# Ejemplo:
# gesture = predict_gesture(new_landmarks_vector)
# print("Predicción:", gesture)




['left' 'none' 'right']
Epoch 1/50


KeyboardInterrupt: 

In [40]:
hands_model = SafeHands(min_detection_confidence=0.8, min_tracking_confidence=0.5, max_num_hands=1)
video_capture = cv2.VideoCapture(0)

def loop(hands: SafeHands, frame: np.ndarray) -> bool:
	frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	result = hands.process(frame_rgb)
	if result:
		hands_landmarks = result[0]
		for hand_landmarks in hands_landmarks:
			print(predict_gesture(preprocess_landmarks(hand_landmarks)))
	return False

video_capture_loop(
    cap=video_capture,
    context=hands_model,
    loop=loop,
    flipH=True,
    show_fps=True,
)

I0000 00:00:1765038641.671502  119354 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1765038641.673975  133766 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: Mesa Intel(R) UHD Graphics (CML GT2)
W0000 00:00:1765038641.704448  133758 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1765038641.718373  133757 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━