In [1]:
!pip install 'git+https://github.com/facebookresearch/fvcore'
!pip install simplejson
!pip install einops
!pip install timm
!pip install psutil
!pip install scikit-learn
!pip install opencv-python
!pip install tensorboard

Collecting git+https://github.com/facebookresearch/fvcore
  Cloning https://github.com/facebookresearch/fvcore to /tmp/pip-req-build-61mk65bb
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/fvcore /tmp/pip-req-build-61mk65bb
  Resolved https://github.com/facebookresearch/fvcore to commit b25ff8c84ebb2fe88b61b7a8994b9571a1e13bab
  Preparing metadata (setup.py) ... [?25ldone
Collecting yacs>=0.1.6 (from fvcore==0.1.6)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting iopath>=0.1.7 (from fvcore==0.1.6)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting portalocker (from iopath>=0.1.7->fvcore==0.1.6)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Downloading portalocker-2.10.1-py3-n

In [2]:
!pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.45.1
    Uninstalling transformers-4.45.1:
      Successfully uninstalled transformers-4.45.1
Successfully installed transformers-4.45.2


In [11]:
import os
import cv2
import torch
import numpy as np
import torchvision.transforms as T
from torch.utils.data import Dataset

class VideoDataset(Dataset):
    def __init__(self, root_dir, transform=None, max_frames=None, frame_skip=1):
        """
        Dataset para cargar videos desde carpetas categorizadas por etiquetas.
        
        Args:
            root_dir (str): Directorio raíz que contiene las carpetas con los videos.
            transform (callable, optional): Transformaciones que se aplicarán a cada frame.
            max_frames (int, optional): Máximo de frames a extraer por video.
            frame_skip (int, optional): Número de frames a saltar entre extracciones.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.max_frames = max_frames
        self.frame_skip = frame_skip
        self.video_paths, self.labels = self._load_videos_and_labels(root_dir)
        self.label_map = {label: idx for idx, label in enumerate(sorted(set(self.labels)))}

    def _load_videos_and_labels(self, root_dir):
        video_paths = []
        labels = []
        # Recorre cada subdirectorio (etiqueta) dentro del directorio raíz
        for label in os.listdir(root_dir):
            label_dir = os.path.join(root_dir, label)
            if os.path.isdir(label_dir):
                # Recorre todos los videos en el subdirectorio
                for video in os.listdir(label_dir):
                    if video.endswith(('.MOV', '.mov', '.mp4')):  # Ajusta según los formatos de video
                        video_paths.append(os.path.join(label_dir, video))
                        labels.append(label)
        return video_paths, labels

    def __len__(self):
        return len(self.video_paths)

    def extract_frames_from_video(self, video_path):
        frames = []
        cap = cv2.VideoCapture(video_path)
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % self.frame_skip == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if self.transform:
                    frame_rgb = self.transform(frame_rgb)
                frames.append(frame_rgb)

            frame_count += 1
            if self.max_frames and len(frames) >= self.max_frames:
                break

        cap.release()

        # Verificar el número mínimo de frames y aplicar padding si es necesario
        min_frames = 32
        while len(frames) < min_frames:
            padding_frame = torch.zeros_like(frames[0])
            frames.append(padding_frame)

        return torch.stack(frames)


    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        frames = self.extract_frames_from_video(video_path)
        label = self.label_map[self.labels[idx]] # Convertir la etiqueta a índice numérico
        # Depurar las dimensiones
        #print(f"Video {idx}: {frames.shape}, Label: {label}")
        return frames, label


In [13]:
def pad_collate_fn(batch):
    """
    Asegura que todos los videos en un lote tengan el mismo número de frames 
    mediante padding (relleno con ceros).
    """
    videos, labels = zip(*batch)

    # Encontrar el número máximo de frames en el lote
    max_frames = max(video.size(0) for video in videos)

    # Rellenar los videos con frames de ceros para que tengan la misma longitud
    padded_videos = []
    for video in videos:
        padding = torch.zeros((max_frames - video.size(0), *video.size()[1:]))
        padded_video = torch.cat((video, padding), dim=0)
        padded_videos.append(padded_video)

    # Apilar los videos y convertir las etiquetas en un tensor
    batch_videos = torch.stack(padded_videos)
    batch_labels = torch.tensor(labels)

    return batch_videos, batch_labels


In [18]:
from torchvision import transforms
from torch.utils.data import DataLoader, random_split

# Definir las transformaciones que se aplicarán a cada frame (resize, normalización, etc.)
transform = transforms.Compose([
    transforms.ToPILImage(),  # Convertir cada frame a formato PIL para aplicar transformaciones
    transforms.Resize((224, 224)),  # Redimensionar los frames a 224x224 píxeles (requerido por TimeSformer)
    #transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    #transforms.AugMix(),
    transforms.ToTensor(),  # Convertir a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalizar
])

#image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")

# Crear el dataset
dataset = VideoDataset(root_dir='/kaggle/input/words-lsc/Words', transform=transform, max_frames=32, frame_skip=2)

# Porcentaje de datos que usarás para entrenamiento (ej. 80% para entrenamiento, 20% para validación)
train_size = int(0.70 * len(dataset))
val_size = len(dataset) - train_size

# Dividir el dataset en entrenamiento y validación
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Crear DataLoaders para los conjuntos de entrenamiento y validación
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4, collate_fn=pad_collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=pad_collate_fn)

print(f"Tamaño del DataLoader de entrenamiento: {len(train_dataloader)}, Tamaño del DataLoader de validación: {len(val_dataloader)}")



Tamaño del DataLoader de entrenamiento: 50, Tamaño del DataLoader de validación: 22


In [19]:
import torch
from torchvision.transforms.functional import to_pil_image
import matplotlib.pyplot as plt
from PIL import Image

def visualize_sample_as_gif(dataloader, output_path='sample.gif'):
    # Obtener un batch del dataloader
    data_iter = iter(dataloader)
    batch = next(data_iter)

    # Extraer los frames y las etiquetas (asumiendo batch = (video_frames, labels))
    video_frames, _ = batch  # video_frames shape: [batch_size, frames, channels, height, width]

    # Selecciona un video del batch (ej. el primero)
    frames = video_frames[0]  # frames shape: [frames, channels, height, width]

    # Convertir cada frame a formato PIL
    pil_frames = [to_pil_image(frame) for frame in frames]

    # Guardar los frames como un GIF
    pil_frames[0].save(
        output_path, save_all=True, append_images=pil_frames[1:], 
        duration=100, loop=0
    )

    print(f"GIF guardado en {output_path}")

# Uso del DataLoader para obtener una muestra (ejemplo con el val_dataloader)
visualize_sample_as_gif(val_dataloader, output_path='video_sample.gif')


GIF guardado en video_sample.gif


In [20]:
# Load model directly
from transformers import AutoImageProcessor, AutoModelForVideoClassification, AdamW, TimesformerForVideoClassification, TimesformerConfig
from transformers import VideoMAEConfig, VideoMAEModel, VideoMAEForVideoClassification
from transformers import TimesformerConfig, TimesformerModel


model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")
model.config.num_labels = 3  # Cambiar el número de clases a 3
model.classifier = torch.nn.Linear(model.config.hidden_size, 3)  # Reemplazar la capa de clasificación

#configuration = VideoMAEConfig( image_size=224, patch_size=16, num_channels=3, num_frames=32, num_labels=3)

#model = VideoMAEForVideoClassification(configuration)


# Optimización solo de las últimas capas (ajuste fino)
for param in model.base_model.parameters():
    param.requires_grad = False  # Congelamos las capas base
    



config.json:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

In [21]:
import torch
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = torch.nn.CrossEntropyLoss()  # Función de pérdida para clasificación
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)  # Optimizador

num_epochs = 5  # Número de épocas

for epoch in range(num_epochs):
    # ---- Entrenamiento ----
    model.train()  # Ponemos el modelo en modo entrenamiento
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for i, (frames, labels) in enumerate(train_dataloader):
        frames = frames.to(device)  # Enviar los frames al dispositivo (GPU)
        labels = labels.to(device)

        optimizer.zero_grad()  # Resetear gradientes

        outputs = model(pixel_values=frames)  # Paso hacia adelante
        loss = criterion(outputs.logits, labels)  # Calcular la pérdida

        loss.backward()  # Retropropagación
        optimizer.step()  # Actualizar los pesos

        running_loss += loss.item()

        # Cálculo de precisión en entrenamiento
        _, predicted = torch.max(outputs.logits, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    train_loss = running_loss / len(train_dataloader)
    train_acc = 100 * correct_train / total_train
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%')

    # ---- Validación ----
    model.eval()  # Ponemos el modelo en modo evaluación (sin cálculo de gradientes)
    val_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():  # No calcular gradientes durante la validación
        for frames, labels in val_dataloader:
            frames = frames.to(device)
            labels = labels.to(device)

            outputs = model(pixel_values=frames)
            loss = criterion(outputs.logits, labels)
            val_loss += loss.item()

            # Cálculo de precisión en validación
            _, predicted = torch.max(outputs.logits, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    val_loss = val_loss / len(val_dataloader)
    val_acc = 100 * correct_val / total_val
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%')


Epoch [1/5], Train Loss: 1.0144, Train Accuracy: 50.00%
Epoch [1/5], Validation Loss: 0.8474, Validation Accuracy: 74.71%
Epoch [2/5], Train Loss: 0.6658, Train Accuracy: 92.50%
Epoch [2/5], Validation Loss: 0.5770, Validation Accuracy: 97.70%
Epoch [3/5], Train Loss: 0.4577, Train Accuracy: 98.50%
Epoch [3/5], Validation Loss: 0.4079, Validation Accuracy: 98.85%
Epoch [4/5], Train Loss: 0.3340, Train Accuracy: 100.00%
Epoch [4/5], Validation Loss: 0.3123, Validation Accuracy: 100.00%
Epoch [5/5], Train Loss: 0.2522, Train Accuracy: 100.00%
Epoch [5/5], Validation Loss: 0.2416, Validation Accuracy: 100.00%


In [9]:
# Guardar los pesos del modelo entrenado
torch.save(model, 'senaliza-videos.pth')

In [10]:
import torch
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)


