# Implementación con cámara (tiempo real)

In [2]:
import cv2, time, collections, pickle, joblib, numpy as np, mediapipe as mp, tensorflow as tf
from pathlib import Path
from sklearn.preprocessing import StandardScaler

# ── Paths and models ────────────────────────────────
ROOT = Path("..").resolve()          # parent of the current working directory
M = ROOT / "models"                  # directory holding the trained artifacts
scaler = joblib.load(M / "letters_landmarks_scaler.pkl")           # scaler applied before the SVM
svm    = joblib.load(M / "letters_landmarks_svm.pkl")              # classifier used by svm_static
lstm   = tf.keras.models.load_model(M / "dynamic_letters_lstm.h5")  # model used by lstm_dynamic
# Use a context manager so the file handle is closed as soon as the encoder
# has been read.
# NOTE: pickle/joblib run arbitrary code on load — only load trusted files.
with open(M / "dynamic_letters_label_encoder.pkl", "rb") as fh:
    le_dyn = pickle.load(fh)
# Statistics used to normalise sequences before they are fed to the LSTM.
x_mean = np.load(M / "X_mean.npy")
x_std = np.load(M / "X_std.npy")

# Letters looked up by index when the SVM predicts an integer class.
LABELS_STATIC = list("ABCDEFGHILMNOPRSTUVWY")

# ── Parameters ──────────────────────────────────────
SEQ_LEN       = 30          # number of frames in the LSTM window
STRIDE        = 2           # run the LSTM every STRIDE frames
CONF_LSTM_MIN = 0.50        # minimum LSTM confidence to show a dynamic letter
CONF_SVM_MIN  = 0.50        # minimum SVM confidence for a static prediction to count as a vote
VOTE_STATIC   = 5           # number of SVM votes collected before taking the majority
COOLDOWN_FR   = 20          # frames to wait after an output, to avoid bouncing between letters

# ── MediaPipe Hands ────────────────────────────────
mp_hands = mp.solutions.hands
# Same five settings as before, spelled out by keyword for readability.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    model_complexity=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

def extract_landmarks(bgr):
    """Return normalised hand-landmark features for one BGR frame.

    The frame is converted to RGB and passed to the module-level MediaPipe
    ``hands`` object.  Only the first detected hand is used.

    Args:
        bgr: Image in OpenCV BGR channel order.

    Returns:
        A flat 1-D array with the (x, y, z) coordinates of every landmark,
        translated so that landmark 0 is the origin and divided by the norm
        of the whole coordinate matrix, or ``None`` when no hand is detected.
    """
    res = hands.process(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    if not res.multi_hand_landmarks:
        return None
    lm = res.multi_hand_landmarks[0]
    xyz = np.array([[p.x, p.y, p.z] for p in lm.landmark])
    xyz -= xyz[0]                        # translate so landmark 0 is the origin
    # np.linalg.norm without an axis yields one scalar for the whole matrix;
    # `or 1` avoids dividing by zero when that norm is 0.
    xyz /= (np.linalg.norm(xyz) or 1)
    return xyz.flatten()

def svm_static(feats):
    """Classify a single landmark feature vector with the static-letter SVM.

    Args:
        feats: Feature vector for one frame.

    Returns:
        ``(letter, conf)``: the predicted letter as a string and the largest
        score reported by the classifier for this sample.
    """
    scaled = scaler.transform([feats])
    label = svm.predict(scaled)[0]
    # NOTE(review): decision_function scores are usually margins rather than
    # probabilities, so the CONF_SVM_MIN comparison made by callers may not
    # mean "probability >= threshold" — confirm against the trained model.
    if hasattr(svm, "decision_function"):
        conf = svm.decision_function(scaled).max()
    else:
        conf = svm.predict_proba(scaled).max()
    # Integer predictions index into LABELS_STATIC; anything else is used as-is.
    if isinstance(label, (int, np.integer)):
        letter = LABELS_STATIC[int(label)]
    else:
        letter = str(label)
    return letter, conf

def lstm_dynamic(seq):
    """Classify a window of landmark features with the LSTM.

    Args:
        seq: Array of per-frame feature vectors (one row per frame).

    Returns:
        ``(label, confidence)``: the decoded label of the highest-scoring
        class and that class's score.
    """
    normalised = (seq - x_mean) / x_std
    batch = normalised[None, ...]                 # add a leading batch axis
    probs = lstm.predict(batch, verbose=0)[0]
    best = int(np.argmax(probs))
    label = le_dyn.inverse_transform([best])[0]
    return label, probs[best]

# ── Buffers and state ──────────────────────────────
clip = collections.deque(maxlen=SEQ_LEN)           # sliding window of features for the LSTM
vote_svm = collections.deque(maxlen=VOTE_STATIC)   # recent SVM letters awaiting a majority vote
frame_cnt = 0    # frames read so far
cooldown = 0     # frames left before another letter may be shown
disp_cnt = 0     # frames left during which the overlay text stays visible
last_txt = ''    # text currently drawn on the overlay

cap = cv2.VideoCapture(0)
print("🎥 Ready — ESC to quit")

# Main capture loop: read frames, classify them and draw the fused result.
try:
    while True:
        ok, frame = cap.read()
        if not ok:
            break                       # no frame could be read
        frame_cnt += 1

        feats = extract_landmarks(frame)
        if feats is None:
            # No hand detected: discard partial evidence so frames from
            # before the gap never mix with a new gesture.
            vote_svm.clear()
            clip.clear()
        else:
            # 1) Static SVM — only sufficiently confident predictions vote.
            letter_s, conf_s = svm_static(feats)
            if conf_s >= CONF_SVM_MIN:
                vote_svm.append(letter_s)

            # 2) Feed the LSTM window.
            clip.append(feats)

        # ----- Every STRIDE frames, once the window is full, try the LSTM -----
        if len(clip) == SEQ_LEN and frame_cnt % STRIDE == 0:
            letter_d, conf_d = lstm_dynamic(np.array(clip))
        else:
            letter_d, conf_d = None, 0.0

        # ----- Output decision (late fusion) -----
        # The LSTM result wins when confident enough; otherwise fall back to
        # the SVM majority once VOTE_STATIC votes have been collected.
        if cooldown == 0:
            if conf_d >= CONF_LSTM_MIN:
                last_txt, disp_cnt, cooldown = f'{letter_d} ({conf_d:.2f})', 30, COOLDOWN_FR
                vote_svm.clear()        # restart the vote
            elif len(vote_svm) == VOTE_STATIC:
                # Counter resolves ties by first-seen order, which is
                # deterministic (iterating a set of strings is not).
                maj = collections.Counter(vote_svm).most_common(1)[0][0]
                last_txt, disp_cnt, cooldown = f'{maj}', 25, COOLDOWN_FR
                vote_svm.clear()

        # ----- Overlay -----
        cooldown = max(0, cooldown - 1)
        disp_cnt = max(0, disp_cnt - 1)
        if disp_cnt:
            cv2.putText(frame, last_txt, (10, 80), cv2.FONT_HERSHEY_SIMPLEX, 1.6, (0, 255, 0), 3)
        cv2.imshow('Late Fusion (SVM+LSTM)', frame)
        if cv2.waitKey(1) & 0xFF == 27:   # ESC quits
            break
finally:
    # Release the camera, the window and the MediaPipe graph even on error.
    cap.release()
    cv2.destroyAllWindows()
    hands.close()




🎥 Ready — ESC to quit


# Implementación para vídeos

In [1]:
"""
Reconocimiento de letras en vídeo (estáticas + dinámicas) con
fusión tardía SVM + LSTM. Al finalizar muestra la palabra detectada.
"""

import cv2, collections, pickle, joblib, numpy as np, mediapipe as mp, tensorflow as tf
from pathlib import Path
from sklearn.preprocessing import StandardScaler

# ── Paths and models ───────────────────────────────────────────────
ROOT = Path("..").resolve()
M    = ROOT / "models"                      # adjust if the models live elsewhere
scaler = joblib.load(M / "letters_landmarks_scaler.pkl")           # scaler applied before the SVM
svm    = joblib.load(M / "letters_landmarks_svm.pkl")              # classifier used by svm_static
lstm   = tf.keras.models.load_model(M / "dynamic_letters_lstm.h5")  # model used by lstm_dynamic
# Use a context manager so the file handle is closed as soon as the encoder
# has been read.
# NOTE: pickle/joblib run arbitrary code on load — only load trusted files.
with open(M / "dynamic_letters_label_encoder.pkl", "rb") as fh:
    le_dyn = pickle.load(fh)
# Statistics used to normalise sequences before they are fed to the LSTM.
x_mean = np.load(M / "X_mean.npy")
x_std  = np.load(M / "X_std.npy")

# Letters looked up by index when the SVM predicts an integer class.
LABELS_STATIC = list("ABCDEFGHILMNOPRSTUVWY")

# ── Hyperparameters ────────────────────────────────────────────────
SEQ_LEN       = 30           # number of frames in the LSTM window
STRIDE        = 2            # evaluate the LSTM every STRIDE frames
CONF_LSTM_MIN = 0.60         # minimum LSTM confidence to accept a dynamic letter
CONF_SVM_MIN  = 0.50         # minimum SVM confidence for a static prediction to count as a vote
VOTE_STATIC   = 5            # number of SVM votes collected before taking the majority
COOLDOWN_FR   = 15           # frames to wait after an output, to avoid bouncing between letters

# ── MediaPipe Hands ────────────────────────────────────────────────
mp_hands = mp.solutions.hands
# Same five settings as before, passed by keyword instead of by position.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    model_complexity=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

def extract_landmarks(bgr):
    """Return 63 normalised features (21 landmarks × xyz), or None.

    The frame is converted to RGB and passed to the module-level MediaPipe
    ``hands`` object.  Only the first detected hand is used.

    Args:
        bgr: Image in OpenCV BGR channel order.

    Returns:
        A flat 1-D array with the (x, y, z) coordinates of every landmark,
        translated so that landmark 0 is the origin and divided by the norm
        of the whole coordinate matrix, or ``None`` when no hand is detected.
    """
    res = hands.process(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    if not res.multi_hand_landmarks:
        return None
    lm = res.multi_hand_landmarks[0]
    xyz = np.array([[p.x, p.y, p.z] for p in lm.landmark])
    xyz -= xyz[0]                        # translate so landmark 0 is the origin
    # np.linalg.norm without an axis yields one scalar for the whole matrix;
    # `or 1` avoids dividing by zero when that norm is 0.
    xyz /= (np.linalg.norm(xyz) or 1)
    return xyz.flatten()

def svm_static(feats):
    """Classify a single landmark feature vector with the static-letter SVM.

    Args:
        feats: Feature vector for one frame.

    Returns:
        ``(letter, conf)``: the predicted letter as a string and the largest
        score reported by the classifier for this sample.
    """
    scaled = scaler.transform([feats])
    label = svm.predict(scaled)[0]
    # NOTE(review): decision_function scores are usually margins rather than
    # probabilities, so the CONF_SVM_MIN comparison made by callers may not
    # mean "probability >= threshold" — confirm against the trained model.
    if hasattr(svm, "decision_function"):
        conf = svm.decision_function(scaled).max()
    else:
        conf = svm.predict_proba(scaled).max()
    # Integer predictions index into LABELS_STATIC; anything else is used as-is.
    if isinstance(label, (int, np.integer)):
        letter = LABELS_STATIC[int(label)]
    else:
        letter = str(label)
    return letter, conf

def lstm_dynamic(seq):
    """Classify a window of landmark features with the LSTM.

    Args:
        seq: Array of per-frame feature vectors (one row per frame).

    Returns:
        ``(label, confidence)``: the decoded label of the highest-scoring
        class and that class's score.
    """
    normalised = (seq - x_mean) / x_std           # global normalisation
    batch = normalised[None, ...]                 # add a leading batch axis
    probs = lstm.predict(batch, verbose=0)[0]
    best = int(np.argmax(probs))
    label = le_dyn.inverse_transform([best])[0]
    return label, probs[best]

# ── Buffers and state ──────────────────────────────────────────────
clip       = collections.deque(maxlen=SEQ_LEN)     # sliding window for the LSTM
vote_svm   = collections.deque(maxlen=VOTE_STATIC) # static-letter votes
frame_cnt  = 0                                     # frames read so far
cooldown   = 0                                     # frames left before another letter is accepted
last_letter = ''                                   # last accepted letter
word_letters = []                                  # final output, one entry per accepted letter

# ── Video path ─────────────────────────────────────────────────────
VIDEO_PATH = ROOT / "data" / "demo" / "deletreo_estatico.mp4"               # <── change here
cap = cv2.VideoCapture(str(VIDEO_PATH))
# Raise explicitly instead of asserting: assert statements are stripped when
# Python runs with -O, which would let an unopened capture slip through.
if not cap.isOpened():
    raise IOError(f"No se pudo abrir {VIDEO_PATH}")
print(f"🎞️  Procesando {VIDEO_PATH.name}...")

# Main processing loop: read frames, classify them and accumulate the word.
try:
    while True:
        ok, frame = cap.read()
        if not ok:
            break                                  # end of video

        # To mirror the frame horizontally, enable: frame = cv2.flip(frame, 1)
        frame_cnt += 1

        feats = extract_landmarks(frame)
        if feats is None:
            # No hand detected: discard partial evidence so frames from
            # before the gap never mix with a new gesture.
            vote_svm.clear()
            clip.clear()
        else:
            # 1) SVM for static letters — only confident predictions vote.
            letter_s, conf_s = svm_static(feats)
            if conf_s >= CONF_SVM_MIN:
                vote_svm.append(letter_s)

            # 2) Feature buffer for the LSTM.
            clip.append(feats)

        # ── Every STRIDE frames, once the window is full, run the LSTM ──
        if len(clip) == SEQ_LEN and frame_cnt % STRIDE == 0:
            letter_d, conf_d = lstm_dynamic(np.array(clip))
        else:
            letter_d, conf_d = None, 0.0

        # ── Late-fusion decision ───────────────────────────────────
        # The LSTM result wins when confident enough; otherwise fall back to
        # the SVM majority once VOTE_STATIC votes have been collected.
        letter_out = None
        if cooldown == 0:
            if conf_d >= CONF_LSTM_MIN:
                letter_out = letter_d
                cooldown   = COOLDOWN_FR
                vote_svm.clear()
            elif len(vote_svm) == VOTE_STATIC:
                # Counter resolves ties by first-seen order, which is
                # deterministic (iterating a set of strings is not).
                letter_out = collections.Counter(vote_svm).most_common(1)[0][0]
                cooldown   = COOLDOWN_FR
                vote_svm.clear()
        else:
            cooldown -= 1

        # ── Accumulate the word, skipping consecutive duplicates ───
        if letter_out and letter_out != last_letter:
            word_letters.append(letter_out)
            last_letter = letter_out

        # ── (Optional) show progress in the window ─────────────────
        cv2.putText(frame, ''.join(word_letters), (10, 60),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 3)
        cv2.imshow('Reconocimiento en Vídeo', frame)
        if cv2.waitKey(1) & 0xFF == 27:   # ESC aborts early
            break

finally:
    # Release the video, the window and the MediaPipe graph even on error.
    cap.release()
    cv2.destroyAllWindows()
    hands.close()

# ── Final result ───────────────────────────────────────────────────
palabra = ''.join(word_letters)
# An empty word means nothing was accepted; print the placeholder instead.
print(f"\n✅ Palabra detectada: {palabra or '(sin detección)'}")



🎞️  Procesando deletreo_estatico.mp4...

✅ Palabra detectada: STRBSR
