# Sistema Split‑and‑Merge para Alfabeto de Señas
Combina **SVM + Landmarks MediaPipe** para letras estáticas y **CNN 3D (R3D‑18)** para letras dinámicas  
*Generado: 2025-07-05*


## Resumen del flujo

1. **Detector de movimiento** con flujo óptico → decide si la mano está quieta o en movimiento.  
2. **Letras estáticas** (mano quieta) → MediaPipe Hands → 63 landmarks → `StandardScaler` → **SVM RBF**.  
3. **Letras dinámicas** (mano moviéndose) → acumulamos 16 frames 224×224 → **R3D‑18** en GPU.

Arquivos de modelo que debes tener en la misma carpeta del notebook:

| Archivo | Rol |
|---------|-----|
| `letters_landmarks_scaler.pkl` | Escalador para los vectores de 63 landmarks |
| `letters_landmarks_svm.pkl`    | SVM entrenado sobre esos landmarks |
| `r3d18_dynamic.pth`            | Pesos de la CNN 3D para las 5 letras dinámicas |


In [14]:

import cv2, joblib, time, collections, numpy as np
import mediapipe as mp
from pathlib import Path
import torch, torch.nn as nn
from torchvision.models.video import r3d_18
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

print('Torch CUDA disponible:', torch.cuda.is_available())


Torch CUDA disponible: True


In [33]:

# ─── Parámetros generales ────────────────────────────────────────
CLIP_LEN = 16          # nº de frames para la CNN 3D
VID_SIZE = 224         # tamaño cuadrado de entrada
TAU      = 1.2         # umbral de energía de movimiento (ajústalo)
LABELS_STATIC  = list("ABCDEFGHILMNOPRSTUVWY")   # 21 letras estáticas
LABELS_DYNAMIC = ['J', 'K', 'Q', 'X', 'Z', 'Ñ']
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [34]:

# ─── Cargar modelos estáticos ───────────────────────────────────
scaler = joblib.load('../models/letters_landmarks_scaler.pkl')
svm    = joblib.load('../models/letters_landmarks_svm.pkl')

# ─── Cargar modelo dinámico ─────────────────────────────────────
cnn = r3d_18(pretrained=False)
cnn.fc = nn.Linear(cnn.fc.in_features, len(LABELS_DYNAMIC))
cnn.load_state_dict(torch.load('../models/r3d18_dynamic.pth', map_location=DEVICE))
cnn = cnn.to(DEVICE).eval()

print('Modelos cargados correctamente')


  cnn.load_state_dict(torch.load('../models/r3d18_dynamic.pth', map_location=DEVICE))


Modelos cargados correctamente


### Funciones auxiliares

In [36]:

mp_hands = mp.solutions.hands

def motion_energy(frame_bgr, prev_gray):
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    if prev_gray is None:
        return 0, gray
    flow = cv2.calcOpticalFlowFarneback(prev_gray, gray,
                                        None, 0.5, 3, 15, 3, 5, 1.2, 0)
    return np.mean(np.linalg.norm(flow, axis=2)), gray

def extract_landmarks(frame_bgr, detector):
    img_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    res = detector.process(img_rgb)
    if not res.multi_hand_landmarks:
        return None
    lm = res.multi_hand_landmarks[0]
    coords = np.array([[p.x, p.y, p.z] for p in lm.landmark])
    coords -= coords[0]
    norm = np.linalg.norm(coords).mean()
    coords /= norm if norm else 1
    return coords.flatten()          # shape (63,)

def classify_static(frame, detector):
    feats = extract_landmarks(frame, detector)
    if feats is None:                       # no mano detectada
        return None, 0.0
    feats_std = scaler.transform(feats.reshape(1, -1))
    idx  = svm.predict(feats_std)[0]
    conf = svm.decision_function(feats_std).max()
    return idx, conf

def classify_dynamic(clip):
    tensor = np.stack([cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in clip])  # T,H,W,C
    tensor = torch.from_numpy(tensor.astype('float32')/255.).permute(3,0,1,2)  # C,T,H,W
    with torch.no_grad():
        out  = cnn(tensor.unsqueeze(0).to(DEVICE))
        idx  = out.argmax(1).item()
        conf = torch.softmax(out, dim=1)[0, idx].item()
    return LABELS_DYNAMIC[idx], conf



### Bucle de demo en vivo  
Ejecuta la siguiente celda para usar la webcam.  
Pulsa **ESC** para salir.


In [37]:
with mp_hands.Hands(static_image_mode=False,
                    max_num_hands=1,
                    min_detection_confidence=0.5,
                    min_tracking_confidence=0.5) as hands_detector:

    cap = cv2.VideoCapture(0)
    clip_buf = collections.deque(maxlen=CLIP_LEN)
    prev_gray, cooldown = None, 0

    try:
        while True:
            ok, frame = cap.read()
            if not ok:
                break

            energy, prev_gray = motion_energy(frame, prev_gray)
            clip_buf.append(cv2.resize(frame, (VID_SIZE, VID_SIZE)))

            text = ''
            if energy < TAU and len(clip_buf):
                letter, conf = classify_static(frame, hands_detector)
                if letter:
                    text = f'{letter} ({conf:.2f})'
                    cooldown = 0
            else:
                if len(clip_buf) == CLIP_LEN and cooldown == 0:
                    letter, conf = classify_dynamic(list(clip_buf))
                    text = f'{letter} ({conf:.2f})'
                    cooldown = CLIP_LEN
                else:
                    cooldown = max(0, cooldown-1)

            cv2.putText(frame, f'E={energy:.2f}', (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)
            cv2.putText(frame, text, (10, 70),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 255, 0), 3)
            cv2.imshow('Split-Merge Sign Detector', frame)
            if cv2.waitKey(1) & 0xFF == 27:   # ESC
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()
