<a href="https://colab.research.google.com/github/Gibonn24/MexicanSignLanguage/blob/main/Proyecto_Final_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Traductor de Lenguaje de Señas a Texto

**Proyecto Final – Machine Learning**



---

## 1. Integrantes
| Nombre | % de contribución |
|--------|-------------------|
| Giordano Fuentes | 100% |

> Ajusta la tabla según corresponda.

## 2. Introducción


> La comunicación entre personas sordas y oyentes sigue siendo una barrera. Este proyecto busca traducir automáticamente videos de Lengua de Señas a texto en español, usando aprendizaje profundo y visión computacional, para facilitar la inclusión.

In [1]:
#Se encuentra en ("./notebooks/EDA_dynamics.ipynb") y ("./notebooks/EDA_letters.ipynb")

## 4. Metodología
Describe la arquitectura general:
1. **Extracción de características** con un modelo preentrenado (p.ej. *I3D* / *S3D*) usando [`video_features`](https://github.com/v-iashin/video_features).
2. **Modelo de traducción** secuencia–a–secuencia (GRU/Transformer) que mapea embeddings de video → texto (glosas o frases).
3. **Pérdida** CTC o CrossEntropy según alineación.

Incluye un diagrama opcional.

In [2]:
from models.r21d.extract_r21d import ExtractR21D
from utils.utils import build_cfg_path
from omegaconf import OmegaConf
import pandas as pd
import os
import glob
import numpy as np
import torch
from pyprojroot import here
from pathlib import Path
import random
# Seed every RNG the notebook uses so data splits and training are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Query CUDA availability once and reuse the answer below.
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.manual_seed_all(SEED)
device = 'cuda' if cuda_available else 'cpu'

# Cell output: name of the accelerator in use. The call is guarded because
# torch.cuda.get_device_name raises on machines without a CUDA device, which
# would defeat the CPU fallback chosen just above.
torch.cuda.get_device_name(0) if cuda_available else 'cpu'

'NVIDIA GeForce GTX 1660 Ti'

In [3]:
import av
import torchvision
import sys
import importlib.metadata

# Report the installed versions of the video-decoding stack.
# PyAV is expected to report 14.4.0 here.
for _label, _module in (("PyAV:", av), ("TorchVision:", torchvision)):
    print(_label, _module.__version__)


PyAV: 12.2.0
TorchVision: 0.20.1+cu121


In [4]:
from torchvision.io import read_video

# Decode one sample clip and print the shape of the frame tensor together
# with the metadata dictionary returned by read_video.
sample_clip = "C:/Users/User/Documents/ML/data/letters/dynamics/J/S1-J-perfil-1.mp4"
rgb, _, info = read_video(sample_clip)
print(rgb.shape, info)



torch.Size([57, 900, 900, 3]) {'video_fps': 30.0}


In [5]:
import torch
from omegaconf import OmegaConf
from utils.utils import build_cfg_path
from models.r21d.extract_r21d import ExtractR21D

# Load the base config for the "r21d" feature type and override the fields
# this notebook depends on.
args = OmegaConf.load(build_cfg_path("r21d"))
args.feature_type     = "r21d"
args.model_name       = "r2plus1d_34_8_ig65m_ft_kinetics"
args.stack_size       = 8
args.step_size        = 8
args.extraction_fps   = 15          # resample every video to a common frame rate
args.tmp_path         = "tmp"
args.output_path      = "feats"
args.on_extraction    = "return"    # or 'save_numpy'
# Fall back to the CPU when CUDA is unavailable instead of hard-coding
# "cuda:0", matching the device selection used by the rest of the notebook.
args.device           = "cuda:0" if torch.cuda.is_available() else "cpu"
args.show_pred        = False

extractor = ExtractR21D(args)


Using cache found in C:\Users\User/.cache\torch\hub\moabitcoin_ig65m-pytorch_master


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import pandas as pd

class CSVDataset(torch.utils.data.Dataset):
    """Image-classification dataset described by a CSV file.

    Each row supplies an ``image_path`` and a ``label`` column. Items are
    returned as ``(image, class_index)`` pairs, where the image is opened
    with PIL, converted to RGB and passed through ``transform`` if one is set.

    Args:
        csv_path: Path of the CSV file to load with pandas.
        transform: Optional callable applied to the PIL image.
        class_to_idx: Optional label -> index mapping. When omitted, the
            labels found in the CSV are numbered in sorted order.
    """

    def __init__(self, csv_path, transform=None, class_to_idx=None):
        self.data = pd.read_csv(csv_path)
        self.transform = transform

        # No external mapping: derive one from the labels present in the CSV.
        if class_to_idx is None:
            ordered = sorted(self.data["label"].unique())
            class_to_idx = dict(zip(ordered, range(len(ordered))))

        self.class_to_idx = class_to_idx

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load row ``idx`` and return ``(image, class_index)``."""
        record = self.data.iloc[idx]
        image = Image.open(record["image_path"]).convert("RGB")
        image = self.transform(image) if self.transform else image
        return image, self.class_to_idx[record["label"]]
    
# Build a single label -> index mapping from both CSVs so the static (image)
# and dynamic (video) datasets agree on class ids.
all_labels = pd.concat([
    pd.read_csv("letter_labels.csv")['label'],
    pd.read_csv("dynamics_videos.csv")['label']
]).unique()

class_to_idx = {cls: idx for idx, cls in enumerate(sorted(all_labels))}
# Number of output classes, derived from the data rather than hard-coded.
n_classes = len(class_to_idx)

# Same image transforms as before
tfm = transforms.Compose([
    transforms.Resize(128),
    transforms.CenterCrop(112),
    transforms.ToTensor(),
    transforms.Normalize([0.43216,0.39466,0.37645],
                         [0.22803,0.22145,0.21698]),
])

ds_static  = CSVDataset("letter_labels.csv",  tfm,   class_to_idx)
# 1. Reproducible 80/10/10 split
from torch.utils.data import random_split, DataLoader
N = len(ds_static)
train_len = int(0.8*N); val_len = int(0.1*N); test_len = N - train_len - val_len

train_s, val_s, test_s = random_split(
    ds_static, [train_len, val_len, test_len],
    generator=torch.Generator().manual_seed(42)
)

# 2. DataLoaders
dl_train = DataLoader(train_s, batch_size=64, shuffle=True, num_workers=0)
dl_val   = DataLoader(val_s,   batch_size=64, shuffle=False, num_workers=0)
dl_test  = DataLoader(test_s,  batch_size=64, shuffle=False, num_workers=0)
# Adapted ResNet model
from torchvision import models
import torch.nn as nn
# Load the pretrained model and replace its classification head. The head is
# sized from the label mapping so it cannot drift from the data (it was
# previously a literal 27).
model_img = models.resnet18(weights="IMAGENET1K_V1")
model_img.fc = nn.Linear(model_img.fc.in_features, n_classes)
model_img = model_img.to("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
class VideoCSVDataset(torch.utils.data.Dataset):
    """
    Dataset that reads video paths and labels from a CSV and extracts the
    features (embeddings) of each video with a 3D-CNN extractor.

    The CSV must have at least two columns:
        video_path,label

    Each item is ``(features, class_index)`` where ``features`` is the mean
    of the extractor output over its first axis (one vector per video).
    """

    def __init__(self, csv_path, extractor, class_to_idx):
        """
        Args
        ----
        csv_path : str
            Path to the CSV file (`video_path,label`).
        extractor : object
            Object exposing `.extract(path)["r21d"]`, which returns an
            array of shape (n_stacks, 512) per video according to the
            original notes.
        class_to_idx : dict
            Label -> index mapping shared with the static-image dataset.
        """
        self.data = pd.read_csv(csv_path)
        self.extractor = extractor
        self.class_to_idx = class_to_idx     # keep the external mapping
        # Pooled features per video path. Extraction is the expensive step,
        # so each video is processed once instead of once per epoch.
        self._feat_cache = {}

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(pooled_features, class_index)`` for row ``idx``.

        Raises
        ------
        ValueError
            If the extractor returns no feature stacks for the video; the
            mean of an empty array would otherwise be a silent NaN vector.
        KeyError
            If the row label is missing from ``class_to_idx``.
        """
        row = self.data.iloc[idx]
        video_path = row["video_path"]

        feats = self._feat_cache.get(video_path)
        if feats is None:
            stacks = self.extractor.extract(video_path)["r21d"]  # (stacks,512)
            stacks = torch.tensor(stacks, dtype=torch.float32)
            if stacks.numel() == 0:
                raise ValueError(
                    f"No features extracted from video: {video_path!r}"
                )
            feats = stacks.mean(0)            # temporal pooling over stacks
            self._feat_cache[video_path] = feats

        label = self.class_to_idx[row["label"]]
        return feats, label

# 0. Wrap the video CSV with the R21D extractor instance created earlier
ds_dynamic = VideoCSVDataset("dynamics_videos.csv", extractor, class_to_idx)

# 1. Split 80/10/10; the test part takes whatever the integer rounding leaves
M = len(ds_dynamic)
tr_len = int(0.8 * M)
va_len = int(0.1 * M)
te_len = M - tr_len - va_len

split_rng = torch.Generator().manual_seed(42)
video_train, video_val, video_test = random_split(
    ds_dynamic,
    [tr_len, va_len, te_len],
    generator=split_rng,
)

# 2. DataLoaders: only the training loader shuffles
loader_opts = dict(batch_size=16, num_workers=0)
dl_vtrain = DataLoader(video_train, shuffle=True, **loader_opts)
dl_vval   = DataLoader(video_val,   shuffle=False, **loader_opts)
dl_vtest  = DataLoader(video_test,  shuffle=False, **loader_opts)



## 5. Implementación
- Framework: **PyTorch**
- Semilla de reproducibilidad: `42`
- Enlace a notebook/Colab: <colab_link>

Describe cualquier optimización o técnica especial (e.g., *gradient clipping*, *mixed precision*, *early stopping*).

## Entrenamiento de imágenes estáticas

In [None]:
from tqdm import tqdm
import torch.nn.functional as F

# Fine-tune the image classifier for 10 epochs with Adam and cross-entropy,
# validating after every epoch.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_img.parameters(), lr=1e-4)

for epoch in range(10):
    model_img.train()
    running_loss = 0
    pbar = tqdm(dl_train, desc=f"Epoch {epoch+1:02d}", unit="batch")

    for imgs, labels in pbar:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        out = model_img(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch average is per-sample.
        running_loss += loss.item() * imgs.size(0)

        pbar.set_postfix(loss=loss.item())

    # Validation
    model_img.eval()
    val_loss, correct, total = 0, 0, 0
    with torch.no_grad():
        for imgs, labels in dl_val:
            imgs, labels = imgs.to(device), labels.to(device)
            out = model_img(imgs)
            val_loss += criterion(out, labels).item() * imgs.size(0)
            pred = out.argmax(1)
            correct += (pred == labels).sum().item()
            total += labels.size(0)

    # max(..., 1) keeps the report from raising ZeroDivisionError when a split
    # is empty (e.g. the 10% validation share of a very small dataset).
    train_avg = running_loss / max(len(train_s), 1)
    val_avg = val_loss / max(len(val_s), 1)
    val_acc = correct / max(total, 1)
    print(f"📊 Epoch {epoch+1:02d} | train_loss={train_avg:.4f} | "
          f"val_loss={val_avg:.4f} | val_acc={val_acc:.3f}")


Epoch 01:  15%|‚ñà‚ñç        | 155/1041 [02:05<12:45,  1.16batch/s, loss=0.019] 

In [None]:
from tqdm import tqdm
class DynClassifier(nn.Module):
    """
    Two-layer classifier over video embeddings.

    Accepts either pooled features of shape (B, in_dim) or per-stack
    features of shape (B, stacks, in_dim); the latter are mean-pooled over
    the stack axis before the fully connected layers. Returns logits of
    shape (B, n_classes).

    Args:
        in_dim: Size of each feature vector.
        n_classes: Number of output classes. When None, it is taken from the
            notebook-level ``class_to_idx`` mapping at construction time.
    """
    def __init__(self, in_dim=512, n_classes=None):
        super().__init__()
        if n_classes is None:
            # The old default referenced an undefined global at class
            # definition time; resolve it lazily from the label mapping.
            n_classes = len(class_to_idx)
        self.fc = nn.Sequential(
            nn.Linear(in_dim, 256), nn.ReLU(),
            nn.Linear(256, n_classes)
        )

    def forward(self, feats):             # feats: (B, in_dim) or (B, stacks, in_dim)
        # Pool only when a stack axis is present: already pooled (B, in_dim)
        # batches must not be averaged again over the feature axis.
        if feats.dim() == 3:
            feats = feats.mean(1)         # temporal pooling
        return self.fc(feats)

# Train the video classifier for 10 epochs with Adam and cross-entropy,
# validating after every epoch.
model_vid = DynClassifier().to(device)

opt_v = torch.optim.Adam(model_vid.parameters(), lr=1e-4)
crit  = nn.CrossEntropyLoss()

for epoch in range(10):
    model_vid.train()
    running_loss = 0
    pbar = tqdm(dl_vtrain, desc=f"Video Epoch {epoch+1:02d}", unit="batch")

    for feats, labels in pbar:
        feats, labels = feats.to(device), labels.to(device)
        opt_v.zero_grad()
        out = model_vid(feats)
        loss = crit(out, labels)
        loss.backward()
        opt_v.step()
        # Weight by batch size so the epoch average is per-sample.
        running_loss += loss.item() * feats.size(0)

        pbar.set_postfix(loss=loss.item())

    # Evaluation
    model_vid.eval()
    correct = total = val_loss = 0
    with torch.no_grad():
        for feats, labels in dl_vval:
            feats, labels = feats.to(device), labels.to(device)
            out = model_vid(feats)
            val_loss += crit(out, labels).item() * feats.size(0)
            preds = out.argmax(1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    # max(..., 1) keeps the report from raising ZeroDivisionError when a split
    # is empty (e.g. the 10% validation share of a very small dataset).
    train_avg = running_loss / max(len(video_train), 1)
    val_avg = val_loss / max(len(video_val), 1)
    val_acc = correct / max(total, 1)
    print(f"🎥 Epoch {epoch+1:02d} | train_loss={train_avg:.4f} | "
          f"val_loss={val_avg:.4f} | val_acc={val_acc:.3f}")

## 6. Experimentación
Presenta las configuraciones de entrenamiento y resultados. Usa tablas o gráficos (matplotlib) para loss y accuracy por época.

In [None]:
# Classify a single image with the trained ResNet.
img_path = "data/letters/statics/G/img_0123.jpg"
img = tfm(Image.open(img_path).convert("RGB")).unsqueeze(0).to(device)

# Inference runs in eval mode (so BatchNorm uses its running statistics on
# this batch of one) and without autograd bookkeeping.
model_img.eval()
with torch.no_grad():
    pred = model_img(img).argmax(1).item()

# Invert the mapping explicitly instead of relying on dict insertion order.
idx_to_class = {idx: cls for cls, idx in class_to_idx.items()}
print("Predicción imagen:", idx_to_class[pred])

In [None]:
# Classify a single video from its extracted per-stack features.
vid = "data/letters/dynamics/J/S1-J-perfil-1.mp4"
feats = extractor.extract(vid)["r21d"]          # (stacks,512)

# Match the float32 dtype of the model weights, and run inference in eval
# mode without autograd bookkeeping.
batch = torch.tensor(feats, dtype=torch.float32).unsqueeze(0).to(device)
model_vid.eval()
with torch.no_grad():
    out = model_vid(batch)
pred = out.argmax(1).item()

# Invert the mapping explicitly instead of relying on dict insertion order.
idx_to_class = {idx: cls for cls, idx in class_to_idx.items()}
print("Predicción vídeo :", idx_to_class[pred])

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import json, pandas as pd
import torch

def eval_model(model, dataloader, name):
    """Evaluate ``model`` on ``dataloader`` and save the resulting metrics.

    Predictions are the argmax over dimension 1 of the model output. Two
    files are written to the current working directory:

    * ``{name}_metrics.json`` with accuracy and macro-averaged F1.
    * ``{name}_confusion.csv`` with the confusion matrix (no header/index).

    A short summary is also printed.

    Args:
        model: Module mapping a batch ``X`` to per-class scores.
        dataloader: Iterable yielding ``(X, y)`` tensor batches.
        name: Prefix for the output files and label of the printed summary.

    Returns:
        None.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            # Image and video batches take the same call; the former
            # dim()-based branch ran identical code on both sides.
            preds = model(X).argmax(1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(y.cpu().tolist())

    # Cast to built-in floats so json.dump never sees a NumPy scalar type.
    acc = float(accuracy_score(all_labels, all_preds))
    f1  = float(f1_score(all_labels, all_preds, average="macro"))
    # NOTE(review): without labels=..., the matrix covers only the classes
    # present in the labels/predictions, so rows may not line up with
    # class_to_idx indices — confirm whether that is intended.
    cm  = confusion_matrix(all_labels, all_preds)

    # ----- save -----
    metrics_path = f"{name}_metrics.json"
    conf_path    = f"{name}_confusion.csv"

    with open(metrics_path, "w", encoding="utf-8") as fp:
        json.dump({"accuracy": acc, "macro_f1": f1}, fp, indent=2)

    pd.DataFrame(cm, dtype=int).to_csv(conf_path, index=False, header=False)

    print(f"\n[{name.upper()}]  accuracy={acc:.3f}  macro-F1={f1:.3f}")
    print(f"Matriz de confusión guardada en  {conf_path}")
    print(f"Métricas guardadas en            {metrics_path}")

# --------- evaluation -----------
# Score each trained model on its held-out test split.
eval_model(model=model_img, dataloader=dl_test, name="static")      # images
eval_model(model=model_vid, dataloader=dl_vtest, name="dynamic")    # videos


## 7. Discusión
Analiza los resultados: ¿qué patrones encuentras? ¿Qué gestos resultaron difíciles? ¿Cómo influyó la iluminación o el background?

## 8. Conclusiones
Resume los hallazgos más relevantes y menciona posibles mejoras futuras.

## 9. Declaración de Contribución
Describe el aporte de cada miembro del equipo con porcentajes de tiempo/actividad.