<h1 style="font-size:200%; font-family:cursive; color:white;">1. Import Required Libraries & Dataset</h1>

In [1]:
import os
import sys
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
# Resolve the project root relative to the working directory and put the local
# `Modules` folder on sys.path so the key-frame extractor can be imported.
cwd = Path.cwd() / 'AI-Module'
# Alternative root (parent of the working directory), kept for reference:
#cwd = Path(os.getcwd()).parent
base_path = cwd.joinpath('Modules')
sys.path.append(f"{base_path}")
from KeyFrameExtractorClass import KeyFrameExtractor

In [2]:
# Empty frame with the three columns the next cell fills in.
df_videos = pd.DataFrame(columns=['video_name', 'video_path', 'translation'])

videos_path = cwd / 'Resources' / 'Datasets' / 'Videos'

# Tab-separated sentence table, grouped by video so that every sentence of a
# given video can be fetched with `get_group(video_name)`.
translations_csv = videos_path.parent.parent / 'Translations' / 'how2sign_train.csv'
df_translation = pd.read_csv(
    str(translations_csv),
    sep='\t',
    usecols=['VIDEO_NAME', 'SENTENCE'],
).groupby('VIDEO_NAME')

In [3]:
# Build one record per video file and create the frame once at the end:
# concatenating inside the loop is quadratic and left every row with index 0.
video_columns = ['video_name', 'video_path', 'translation']
video_records = []

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# seeded train/val/test split differ between machines.
for video in sorted(os.listdir(str(videos_path))):
    # Ignore anything that is not a video (hidden files, checkpoints folders...);
    # such entries have no translation group and would raise KeyError below.
    if not video.endswith('.mp4'):
        continue
    video_path = videos_path / video
    video_name = video.removeprefix('filtered_').removesuffix('.mp4')
    # A video is annotated with several sentences; join them into one target text.
    sentence = df_translation.get_group(video_name)['SENTENCE'].str.cat(sep=' ')
    video_records.append({
        'video_name': video_name,
        'video_path': video_path,
        'translation': sentence,
    })

df_videos = pd.DataFrame(video_records, columns=video_columns)
df_videos.head()

Unnamed: 0,video_name,video_path,translation
0,-00cp1iGiDw-5-rgb_front,/app/AI-Module/Resources/Datasets/Videos/filte...,If you like to find more about my services you...
0,-08ZGCviCm4-5-rgb_front,/app/AI-Module/Resources/Datasets/Videos/filte...,Hello welcome my name is Julio Nutt and I am a...
0,-1KpdDGPCq4-5-rgb_front,/app/AI-Module/Resources/Datasets/Videos/filte...,"All right, the drink we're about to make is ca..."
0,-1XUnputrgk-5-rgb_front,/app/AI-Module/Resources/Datasets/Videos/filte...,"Okay, another thing that for me is important. ..."
0,-3sFcDBFxc8-5-rgb_front,/app/AI-Module/Resources/Datasets/Videos/filte...,"All right, we're ready to start our fire. What..."


In [4]:
import torch
import torch.nn as nn
from torchvision import transforms
from transformers import ViTModel, GPT2LMHeadModel, GPT2Tokenizer


# Compute device used by the training/evaluation cells.
# NOTE(review): this pins everything to the CPU even though the next cell probes
# for CUDA; no visible cell moves the models to `device`, so switching to "cuda"
# here would also require `model.to(device)` — confirm before changing.
device = torch.device("cpu")

### Check whether PyTorch can use a GPU

In [None]:
# Report the CUDA status. The per-device queries raise on machines without
# CUDA, so they are only evaluated when a GPU is actually available; the tuple
# keeps the same six positions as before, with None for unavailable entries.
cuda_available = torch.cuda.is_available()
(
    cuda_available,
    torch.cuda.device_count(),
    torch.cuda.current_device() if cuda_available else None,
    torch.cuda.device(0) if cuda_available else None,
    torch.cuda.get_device_name(0) if cuda_available else None,
    device,
)

In [6]:
from sklearn.model_selection import train_test_split
seed = 31991
batch_size = 32

# Inputs are the video paths, targets are the reference translations.
X_df = df_videos['video_path'].to_frame()
y_df = df_videos['translation'].to_frame()

# Hold out 20% for test, then 25% of the remaining 80% for validation,
# i.e. a 60/20/20 train/val/test split.
test_fraction = 100 / 500
val_fraction = 100 / 400

X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=test_fraction,
    random_state=seed,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=val_fraction,
    random_state=seed,
)

In [None]:
# Preview the first rows of every split (shown as a tuple of DataFrames).
X_train.head(), y_train.head(), X_val.head(), y_val.head(), X_test.head(), y_test.head()

In [None]:
# Sanity check: (rows, columns) of every split, in train / val / test order.
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

<h1 style="font-size:200%; font-family:cursive; color:white;">3. Import ViT and GPT-2</h1>

In [None]:
# Load the pretrained ViT encoder plus the GPT-2 language model and tokenizer.
vit_checkpoint = 'google/vit-base-patch16-224'
gpt_checkpoint = 'gpt2'

model = ViTModel.from_pretrained(vit_checkpoint)
gpt = GPT2LMHeadModel.from_pretrained(gpt_checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(gpt_checkpoint)
# Use the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

In [11]:
def tokenize_translations(translations: pd.DataFrame, text_tokenizer):
    """Tokenize the `translation` column of one split into padded PyTorch tensors.

    Args:
        translations: Frame with a `translation` column of reference sentences.
        text_tokenizer: Tokenizer to apply; it must have a padding token set.

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`, ...) with every
        sequence padded to the longest one in this split.
    """
    return text_tokenizer(
        translations['translation'].tolist(),
        padding=True,
        # Whole-video transcripts can be long; cap them at the tokenizer's
        # `model_max_length` so they never exceed the language model's context.
        truncation=True,
        return_tensors="pt",
    )


tokens_label_train = tokenize_translations(y_train, tokenizer)
tokens_label_val = tokenize_translations(y_val, tokenizer)
tokens_label_test = tokenize_translations(y_test, tokenizer)

<h1 style="font-size:200%; font-family:cursive; color:white;">5. List to Tensors</h1>

In [12]:
# Store the padded token ids next to each translation (one list per row).
# `.assign` builds a new frame instead of writing into the objects returned by
# train_test_split, which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
y_train = y_train.assign(token_ids=tokens_label_train['input_ids'].tolist())
y_val = y_val.assign(token_ids=tokens_label_val['input_ids'].tolist())
y_test = y_test.assign(token_ids=tokens_label_test['input_ids'].tolist())

In [8]:
# Pick one example video and show its path and reference translation.
example_mask = df_videos['video_name'] == "1NnjqlLI_6s-8-rgb_front"
row = df_videos[example_mask]
video_path = row["video_path"].iloc[0]
translation = row["translation"].iloc[0]
print(video_path, translation, sep="\n")

/app/AI-Module/Resources/Datasets/Videos/filtered_1NnjqlLI_6s-8-rgb_front.mp4
It's a history, a living history, paper, paper, paper. Just make sure that you take care of your paper because just like every other piece of collectibles condition is everything so pieces that are torn or damaged or water soiled, the value just goes down dramatically so it is all about condition so enjoy your paper and treat it with kindness. I'm Jan, see ya.


<h1 style="font-size:200%; font-family:cursive; color:white;">7. Model Architecture</h1>

In [14]:
class VideoToTextModel(nn.Module):
    """Video-to-text pipeline: key frames -> ViT embeddings -> GPT text generation.

    Args:
        vit: Vision encoder. It is called on a batch of frames and must return
            an object exposing `last_hidden_state`.
        gpt: Language model exposing `generate(inputs_embeds=..., pad_token_id=...)`.
        tokenizer: Tokenizer matching `gpt`. Its `pad_token` is overwritten
            with its `eos_token`.
    """

    def __init__(self, vit, gpt, tokenizer):
        super().__init__()
        self.vit = vit
        self.gpt = gpt
        self.tokenizer = tokenizer
        # Pad with the end-of-sequence token so generate() has a pad id to use.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def extract_frames(self, video_path: str) -> torch.Tensor:
        """Extract the key frames of a video as a normalized image batch.

        Args:
            video_path: Location of the video file (`str` or `pathlib.Path`).

        Returns:
            Tensor with one preprocessed 224x224 frame per extracted key frame,
            stacked along the first dimension.

        Raises:
            ValueError: If no key frame could be extracted.
        """
        keyframe_extractor = KeyFrameExtractor()
        # str() keeps this working when a pathlib.Path is passed, which is how
        # the notebook stores video paths.
        video = cv2.VideoCapture(str(video_path))
        # NOTE(review): Resize runs before ToTensor, so the extractor presumably
        # yields PIL images; raw OpenCV arrays would need converting first —
        # confirm against KeyFrameExtractor.
        preprocess = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])
        try:
            frames = keyframe_extractor.extractKeyFrames(return_frame=True, draw=False, video=video)
            # Preprocess while the capture is still open, in case the extractor
            # produces frames lazily.
            frame_tensors = [preprocess(frame) for frame in frames]
        finally:
            # Always free the underlying file handle / decoder.
            video.release()

        if not frame_tensors:
            raise ValueError(f"No key frames could be extracted from {video_path}")
        return torch.stack(frame_tensors)

    def forward(self, video_path: str, translation_df: "pd.core.groupby.DataFrameGroupBy | None" = None) -> str:
        """Generate a text prediction for one video.

        Args:
            video_path: Location of the video file (`str` or `pathlib.Path`).
            translation_df: Optional translations grouped by video name (it is
                queried with `get_group`). When given, the reference
                translation of the video is printed for comparison.

        Returns:
            The decoded text generated by the language model.
        """
        frames = self.extract_frames(video_path)

        # The vision encoder is used as a frozen feature extractor.
        with torch.no_grad():
            visual_embeddings = self.vit(frames).last_hidden_state

        # `generate` expects inputs_embeds shaped (batch, sequence, hidden).
        # Take the first token of every frame (the [CLS] summary in ViT) and
        # treat the frames as one sequence: (n_frames, hidden) -> (1, n_frames, hidden).
        # This requires the encoder's hidden size to equal the language
        # model's embedding size.
        gpt_input = visual_embeddings[:, 0, :].unsqueeze(0)

        generated_tokens = self.gpt.generate(inputs_embeds=gpt_input, pad_token_id=self.tokenizer.pad_token_id)

        predicted_text = self.tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

        if translation_df is not None:
            # Path(...).name handles both str and Path inputs and any OS separator.
            video_name = Path(str(video_path)).name.removeprefix('filtered_').removesuffix('.mp4')
            true_translation = translation_df.get_group(video_name)['SENTENCE'].str.cat(sep=' ')
            print(f"Traducción correcta: {true_translation}")

        return predicted_text

# Instantiate the pipeline with the pretrained ViT encoder (`model`), the GPT-2
# language model (`gpt`) and its tokenizer.
video_to_text_model = VideoToTextModel(model, gpt, tokenizer)


In [None]:
# Smoke test: run key-frame extraction and preprocessing on the example video.
a = video_to_text_model.extract_frames(video_path)

<h1 style="font-size:200%; font-family:cursive; color:white;">8. Fine - Tune</h1>

In [None]:
import wandb
# Log in to Weights & Biases; the training and evaluation code reports metrics
# through `wandb.log`.
wandb.login()

In [24]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu(preds, references):
    """Compute a smoothed sentence-level BLEU score for each prediction.

    Args:
        preds: Iterable of predictions. Each one is either a sentence string or
            an already tokenized sequence (words or token ids).
        references: Iterable of reference translations in the same format,
            aligned with `preds`.

    Returns:
        List of BLEU scores, one per (prediction, reference) pair.
    """
    def _as_tokens(sequence):
        # sentence_bleu works on token lists. A raw string would be iterated
        # character by character and scored as character n-grams, so split
        # sentences into words first.
        return sequence.split() if isinstance(sequence, str) else list(sequence)

    # Smoothing avoids zero scores for short sentences.
    smoothie = SmoothingFunction().method4
    return [
        sentence_bleu([_as_tokens(ref)], _as_tokens(pred), smoothing_function=smoothie)
        for pred, ref in zip(preds, references)
    ]

def decode_predictions(predictions, tokenizer):
    """Turn sequences of token ids (predictions or labels) back into text.

    Args:
        predictions: Iterable of token-id sequences.
        tokenizer: Tokenizer whose `decode` method maps ids to a string.

    Returns:
        List with one decoded string per sequence, special tokens removed.
    """
    return [
        tokenizer.decode(token_ids, skip_special_tokens=True)
        for token_ids in predictions
    ]

In [25]:
def train(
        model: torch.nn.Module,
        train_loader: DataLoader,
        optimizer: torch.optim.Optimizer,
        criterion: torch.nn.Module,
        device: torch.device,
        epoch: int):
    """Run one training epoch and report loss, token accuracy and BLEU.

    Args:
        model: Network returning logits shaped (batch, seq_length, vocab_size).
        train_loader: Yields `(inputs, mask, labels)` batches.
        optimizer: Optimizer updating the parameters of `model`.
        criterion: Loss applied to flattened logits and flattened labels.
        device: Device the batches are moved to.
        epoch: Epoch number, used only for logging.

    Returns:
        Tuple `(epoch_loss, train_bleu)`: the mean batch loss and the mean
        sentence-level BLEU over all training sequences (both floats, 0.0 when
        the loader is empty).
    """
    model.train()
    running_loss = 0.0
    total = 0
    correct = 0

    all_preds = []
    all_refs = []
    num_batches = len(train_loader)

    for batch_idx, (inputs, mask, labels) in enumerate(train_loader):
        inputs, mask, labels = inputs.to(device), mask.to(device), labels.to(device)

        # Forward pass.
        # NOTE(review): `mask` is moved to the device but never given to the
        # model, while the prediction cell calls `model(seq, mask)` — confirm
        # which signature is intended.
        outputs = model(inputs)
        flat_logits = outputs.view(-1, outputs.size(-1))
        flat_labels = labels.view(-1)
        loss = criterion(flat_logits, flat_labels)

        # Backward pass and optimization step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metrics.
        running_loss += loss.item()
        total += labels.numel()
        predicted = flat_logits.argmax(dim=1)
        correct += (predicted == flat_labels).sum().item()

        # BLEU is computed per sentence, so keep one token-id list per sequence
        # instead of a single flat list of ids.
        all_preds.extend(predicted.view(labels.size(0), -1).tolist())
        all_refs.extend(labels.view(labels.size(0), -1).tolist())

        # Console log.
        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch}], Step [{batch_idx}/{num_batches}], Loss: {loss.item():.4f}')

        # W&B log.
        wandb.log({
            "epoch": epoch,
            "batch_idx": batch_idx,
            "loss": loss.item(),
            "accuracy_batch": 100 * correct / total if total else 0.0
        })

    # Guard the averages so an empty loader does not divide by zero.
    epoch_loss = running_loss / num_batches if num_batches else 0.0
    epoch_accuracy = 100 * correct / total if total else 0.0

    # Average the per-sentence scores into the single number the caller logs.
    bleu_scores = calculate_bleu(all_preds, all_refs) if all_preds else []
    train_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

    print(f"Epoch [{epoch}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

    return epoch_loss, train_bleu

def evaluate(
        model: torch.nn.Module,
        val_loader: DataLoader,
        criterion: torch.nn.Module,
        device: torch.device,
        epoch: int):
    """Evaluate the model on a validation loader without updating weights.

    Args:
        model: Network returning logits shaped (batch, seq_length, vocab_size).
        val_loader: Yields `(inputs, mask, labels)` batches.
        criterion: Loss applied to flattened logits and flattened labels.
        device: Device the batches are moved to.
        epoch: Epoch number, used only for logging.

    Returns:
        Tuple `(epoch_loss, valid_bleu)`: the mean batch loss and the mean
        sentence-level BLEU over all validation sequences (both floats, 0.0
        when the loader is empty).
    """
    model.eval()
    running_loss = 0.0
    total = 0
    correct = 0

    all_preds = []
    all_refs = []
    num_batches = len(val_loader)

    with torch.no_grad():
        for batch_idx, (inputs, mask, labels) in enumerate(val_loader):
            inputs, mask, labels = inputs.to(device), mask.to(device), labels.to(device)

            # Forward pass.
            # NOTE(review): `mask` is moved to the device but never given to
            # the model — confirm the intended model signature.
            outputs = model(inputs)
            flat_logits = outputs.view(-1, outputs.size(-1))
            flat_labels = labels.view(-1)
            loss = criterion(flat_logits, flat_labels)

            # Metrics.
            running_loss += loss.item()
            total += labels.numel()
            predicted = flat_logits.argmax(dim=1)
            correct += (predicted == flat_labels).sum().item()

            # BLEU is computed per sentence, so keep one token-id list per
            # sequence instead of a single flat list of ids.
            all_preds.extend(predicted.view(labels.size(0), -1).tolist())
            all_refs.extend(labels.view(labels.size(0), -1).tolist())

            # Console log.
            if batch_idx % 100 == 0:
                print(f'Eval Step [{batch_idx}/{num_batches}], Loss: {loss.item():.4f}')

            # W&B log.
            wandb.log({
                "eval_batch_idx": batch_idx,
                "eval_loss": loss.item(),
                "eval_accuracy_batch": 100 * correct / total if total else 0.0
            })

    # Guard the averages so an empty loader does not divide by zero.
    epoch_loss = running_loss / num_batches if num_batches else 0.0
    epoch_accuracy = 100 * correct / total if total else 0.0

    # Average the per-sentence scores into the single number the caller logs.
    bleu_scores = calculate_bleu(all_preds, all_refs) if all_preds else []
    valid_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

    print(f"Validation Epoch [{epoch}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

    return epoch_loss, valid_bleu


In [None]:
# NOTE(review): `epochs`, `optimizer`, `cross_entropy`, `train_dataloader` and
# `val_dataloader` are not defined in any visible cell — they must exist before
# this cell runs on a fresh kernel.

def _mean_score(scores) -> float:
    """Collapse a BLEU result (one number or a list of per-sentence scores) to a float."""
    return float(np.mean(scores)) if np.size(scores) else 0.0


# Best-model tracking.
best_valid_loss = float('inf')
best_bleu_score = 0

# Per-epoch history of loss and BLEU.
train_losses = []
valid_losses = []
train_bleu_scores = []
valid_bleu_scores = []

# Hyperparameters recorded with the run. Assigning to `wandb.config` only
# replaces the module attribute; a run has to be started with `wandb.init`
# before `wandb.log` can be used.
run_config = {
    "epochs": epochs,
    "batch_size": batch_size,
    "learning_rate": optimizer.param_groups[0]['lr']
}
if wandb.run is None:
    wandb.init(config=run_config)
else:
    wandb.config.update(run_config, allow_val_change=True)

# Train and evaluate once per epoch.
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Pass the 1-based epoch so the per-batch logs match the header above and
    # the per-epoch W&B record below.
    train_loss, train_bleu = train(model, train_dataloader, optimizer, cross_entropy, device, epoch + 1)
    valid_loss, valid_bleu = evaluate(model, val_dataloader, cross_entropy, device, epoch + 1)

    # BLEU may come back as a list of per-sentence scores; reduce it to a
    # single number so it can be formatted and logged.
    train_bleu = _mean_score(train_bleu)
    valid_bleu = _mean_score(valid_bleu)

    # Keep the checkpoint with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
        print(f"Modelo guardado en epoch {epoch + 1} con loss de validación {valid_loss:.4f}")
    best_bleu_score = max(best_bleu_score, valid_bleu)

    # Record loss and BLEU for training and validation.
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    train_bleu_scores.append(train_bleu)
    valid_bleu_scores.append(valid_bleu)

    # Epoch summary.
    print(f"Training Loss: {train_loss:.4f} | Training BLEU: {train_bleu:.4f}")
    print(f"Validation Loss: {valid_loss:.4f} | Validation BLEU: {valid_bleu:.4f}")

    # Per-epoch record in W&B.
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "train_bleu": train_bleu,
        "valid_loss": valid_loss,
        "valid_bleu": valid_bleu
    })

# Close the W&B run.
wandb.finish()

In [None]:
# Load the weights of the best checkpoint. `map_location` remaps the stored
# tensors onto the configured device, so a checkpoint saved on a GPU can still
# be loaded on a CPU-only machine.
path = 'best_model.pt'
model.load_state_dict(torch.load(path, map_location=device))

<h1 style="font-size:200%; font-family:cursive; color:white;">9. Make Predictions</h1>

In [None]:
# Get predictions for the test data.
# NOTE(review): `test_seq` and `test_mask` are not defined in any visible cell.
# Switch to inference mode first so layers such as dropout are disabled and the
# predictions are deterministic.
model.eval()
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

In [None]:
# Model performance on the test set.
# The predicted labels get their own name: overwriting `preds` made this cell
# fail when run a second time (argmax over axis 1 of an already reduced array).
# NOTE(review): `test_y` is not defined in any visible cell.
pred_labels = np.argmax(preds, axis = 1)
print(classification_report(test_y, pred_labels))