# Environement Setup

In [1]:
import sys, os
try:
    from google.colab import drive, userdata
    IS_COLAB = True
except ImportError:
    IS_COLAB = False

REPO_NAME = 'MistakeDetection'

if IS_COLAB:
    print("‚òÅÔ∏è Colab rilevato.")
    if not os.path.exists('/content/drive'): drive.mount('/content/drive')

    GITHUB_USER = 'MarcoPernoVDP'
    try:
        TOKEN = userdata.get('GITHUB_TOKEN')
        REPO_URL = f'https://{TOKEN}@github.com/{GITHUB_USER}/{REPO_NAME}.git'
    except:
        REPO_URL = f'https://github.com/{GITHUB_USER}/{REPO_NAME}.git'

    ROOT_DIR = f'/content/{REPO_NAME}'
    if not os.path.exists(ROOT_DIR):
        !git clone {REPO_URL}
    else:
        %cd {ROOT_DIR}
        !git pull
        %cd /content


else:
    print("Ambiente locale rilevato.")
    ROOT_DIR = os.getcwd()
    while not os.path.exists(os.path.join(ROOT_DIR, '.gitignore')) and ROOT_DIR != os.path.dirname(ROOT_DIR):
        ROOT_DIR = os.path.dirname(ROOT_DIR)

if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)


Ambiente locale rilevato.


# Dataset Setup

In [4]:
from utils import setup_project
# Ora puoi passare agli import del modello
from dataset.capitain_cook_4d_mlp_dataset import CaptainCook4DMLP_Dataset, DatasetSource
from models.BaselineV2_Transformer import BaselineV2_Transformer
from dataset.utils import SplitType

# Esegue: Setup Dati (unzip/copy), Login WandB, Setup Device
device = setup_project.initialize(ROOT_DIR)

# Import wandb
import wandb

Setup Progetto in: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection
source_path: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\_file
Setup Dati da: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\_file
Inizio setup dati...
   Sorgente: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\_file
   Destinazione: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\data
Copia cartella: annotation_json...
Copia cartella: omnivore...
Estrazione ZIP: omnivore.zip...


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\marco\_netrc


‚úÖ Setup completato! Dati pronti in: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\data


[34m[1mwandb[0m: Currently logged in as: [33ms339450[0m ([33ms339450-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


WandB Logged in.
Device: cuda


# Configuration

In [5]:
# Configurazione esperimento
DATASET_SOURCE = DatasetSource.HIERO

config = {
    "dataset": "CaptainCook4D",
    "feature_extractor": DATASET_SOURCE.value,
    "input_dim": DATASET_SOURCE.input_dims(),
    "batch_size": 1,  # DEVE essere 1 per sequenze di lunghezza variabile
    "learning_rate": 1e-4,
    "epochs": 1,
    "pos_weight": 0.75,
    "optimizer": "Adam",
    "loss_function": "BCEWithLogitsLoss",
    "seed": 42,
}

# Leave-One-Out Cross-Validation Setup

In [6]:
import os
from dataset.capitain_cook_4d_mlp_dataset import DatasetSource
from dataset.capitain_cook_4d_task2subtask2_dataset import CaptainCook4DTask2Subtask2_Dataset
from torch.utils.data import DataLoader, Subset
from collections import defaultdict

try:
    full_dataset = CaptainCook4DTask2Subtask2_Dataset(
        root_dir=ROOT_DIR
    )
    
    # Raggruppa i video per recipe (activity_id)
    # L'activity_id √® il primo numero del video_id (es: "1_10" -> activity_id = 1)
    recipe_to_indices = defaultdict(list)
    for idx, video_id in enumerate(full_dataset.video_ids):
        activity_id = video_id.split('_')[0]
        recipe_to_indices[activity_id].append(idx)
    
    # Ordina le ricette per avere un ordine consistente
    recipes = sorted(recipe_to_indices.keys())
    
    print(f"\n{'='*80}")
    print(f"LEAVE-ONE-OUT CROSS-VALIDATION SETUP")
    print(f"{'='*80}")
    print(f"Total videos: {len(full_dataset)}")
    print(f"Total recipes (activities): {len(recipes)}")
    print(f"\nVideos per recipe:")
    for recipe_id in recipes:
        print(f"  Recipe {recipe_id}: {len(recipe_to_indices[recipe_id])} videos")
    print(f"{'='*80}\n")

except Exception as e:
    print(f"‚ùå Errore: {e}")

Loading from: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\data\hiero...
Dataset creato: 384 video completi

LEAVE-ONE-OUT CROSS-VALIDATION SETUP
Total videos: 384
Total recipes (activities): 24

Videos per recipe:
  Recipe 1: 18 videos
  Recipe 10: 12 videos
  Recipe 12: 18 videos
  Recipe 13: 14 videos
  Recipe 15: 15 videos
  Recipe 16: 16 videos
  Recipe 17: 20 videos
  Recipe 18: 15 videos
  Recipe 2: 16 videos
  Recipe 20: 14 videos
  Recipe 21: 19 videos
  Recipe 22: 17 videos
  Recipe 23: 16 videos
  Recipe 25: 15 videos
  Recipe 26: 17 videos
  Recipe 27: 15 videos
  Recipe 28: 18 videos
  Recipe 29: 18 videos
  Recipe 3: 13 videos
  Recipe 4: 17 videos
  Recipe 5: 15 videos
  Recipe 7: 16 videos
  Recipe 8: 16 videos
  Recipe 9: 14 videos



In [6]:
# V2: quando accedi a dataset[idx], dove idx √® l'indice dello STEP
full_dataset.print_item(0)

VIDEO DATASET ITEM [0]
Features shape:       torch.Size([8, 1024]) (num_steps, n_features)
Number of steps:      8
Label:                0 (No Errors)
Video ID:             10_16


In [7]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
import numpy as np
from models.BaselineV2_Transformer import BaselineV2_Transformer

# Inizializzazione W&B per l'intero esperimento LOO
run = wandb.init(
    project="mistake-detection",
    name=f"LOO-Task2Subtask2-{DATASET_SOURCE.value}",
    config=config,
    tags=["leave-one-out", "Task2Subtask2", DATASET_SOURCE.value],
    notes=f"Leave-One-Out CV with {DATASET_SOURCE.value} features for mistake detection"
)

print(f"üöÄ W&B Run: {run.name} (ID: {run.id})")

# Statistiche per aggregare i risultati di tutti i fold
all_fold_results = []

# LOO: per ogni ricetta, usala come test set
for fold_idx, test_recipe_id in enumerate(recipes):
    print(f"\n{'='*80}")
    print(f"FOLD {fold_idx + 1}/{len(recipes)} - Testing on Recipe {test_recipe_id}")
    print(f"{'='*80}")
    
    # Indici del test set (ricetta corrente)
    test_indices = recipe_to_indices[test_recipe_id]
    
    # Indici del training set (tutte le altre ricette)
    train_indices = []
    for recipe_id in recipes:
        if recipe_id != test_recipe_id:
            train_indices.extend(recipe_to_indices[recipe_id])
    
    print(f"Train videos: {len(train_indices)} | Test videos: {len(test_indices)}")
    
    # Crea i subset
    train_dataset = Subset(full_dataset, train_indices)
    test_dataset = Subset(full_dataset, test_indices)
    
    # Crea i DataLoader
    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False)
    
    # Inizializza un nuovo modello per questo fold
    model = BaselineV2_Transformer(DATASET_SOURCE.input_dims()).to(device)
    
    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])
    
    # Loss function con pos_weight
    train_pos_weight = torch.tensor([config["pos_weight"]], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=train_pos_weight)
    
    # Training loop per questo fold
    best_train_loss = np.inf
    
    for epoch in range(config["epochs"]):
        # TRAIN
        model.train()
        total_loss = 0
        train_preds_list = []
        train_targets_list = []
        train_probs_list = []
        
        for inputs, labels, video_ids in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device).float()
            
            probs, logits = model(inputs)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            with torch.no_grad():
                pred = (probs >= 0.5).long().item()
                train_preds_list.append(pred)
                train_targets_list.append(labels.item())
                train_probs_list.append(probs.item())
        
        avg_train_loss = total_loss / len(train_loader)
        
        # Metriche di training
        train_preds = np.array(train_preds_list)
        train_targets = np.array(train_targets_list)
        train_probs = np.array(train_probs_list)
        
        train_acc = accuracy_score(train_targets, train_preds)
        train_f1 = f1_score(train_targets, train_preds, zero_division=0)
        
        # Log su W&B per questo fold
        wandb.log({
            f"fold_{fold_idx+1}/train_loss": avg_train_loss,
            f"fold_{fold_idx+1}/train_accuracy": train_acc,
            f"fold_{fold_idx+1}/train_f1": train_f1,
            f"fold_{fold_idx+1}/epoch": epoch + 1
        })
        
        print(f"  Epoch {epoch+1}/{config['epochs']} - Train Loss: {avg_train_loss:.4f} - Train Acc: {train_acc:.4f} - Train F1: {train_f1:.4f}")
        
        # Salva il miglior modello per questo fold
        if avg_train_loss < best_train_loss:
            best_train_loss = avg_train_loss
            best_model_state = model.state_dict()
    
    # Carica il miglior modello
    model.load_state_dict(best_model_state)
    
    # TEST per questo fold
    model.eval()
    test_preds_list = []
    test_targets_list = []
    test_probs_list = []
    
    with torch.no_grad():
        for inputs, labels, video_ids in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device).float()
            
            probs, logits = model(inputs)
            pred = (probs >= 0.5).long().item()
            
            test_preds_list.append(pred)
            test_targets_list.append(labels.item())
            test_probs_list.append(probs.item())
    
    # Metriche di test per questo fold
    test_preds = np.array(test_preds_list)
    test_targets = np.array(test_targets_list)
    test_probs = np.array(test_probs_list)
    
    test_acc = accuracy_score(test_targets, test_preds)
    test_f1 = f1_score(test_targets, test_preds, zero_division=0)
    test_precision = precision_score(test_targets, test_preds, zero_division=0)
    test_recall = recall_score(test_targets, test_preds, zero_division=0)
    
    try:
        test_auc = roc_auc_score(test_targets, test_probs)
    except ValueError:
        test_auc = 0.0
    
    # Salva i risultati di questo fold
    fold_result = {
        'fold': fold_idx + 1,
        'test_recipe': test_recipe_id,
        'accuracy': test_acc,
        'f1': test_f1,
        'precision': test_precision,
        'recall': test_recall,
        'auc': test_auc,
        'test_targets': test_targets,
        'test_preds': test_preds
    }
    all_fold_results.append(fold_result)
    
    # Log su W&B
    wandb.log({
        f"fold_{fold_idx+1}/test_accuracy": test_acc,
        f"fold_{fold_idx+1}/test_f1": test_f1,
        f"fold_{fold_idx+1}/test_precision": test_precision,
        f"fold_{fold_idx+1}/test_recall": test_recall,
        f"fold_{fold_idx+1}/test_auc": test_auc,
    })
    
    print(f"\n  Test Results for Recipe {test_recipe_id}:")
    print(f"    Accuracy: {test_acc:.4f}")
    print(f"    F1: {test_f1:.4f}")
    print(f"    Precision: {test_precision:.4f}")
    print(f"    Recall: {test_recall:.4f}")
    print(f"    AUC: {test_auc:.4f}")


print(f"\n{'='*80}")
print("üéâ Leave-One-Out Cross-Validation completato!")
print(f"{'='*80}")

üöÄ W&B Run: LOO-Task2Subtask2-hiero (ID: zob6a9ok)

FOLD 1/24 - Testing on Recipe 1
Train videos: 366 | Test videos: 18
  Epoch 1/1 - Train Loss: 0.6349 - Train Acc: 0.5109 - Train F1: 0.5491

  Test Results for Recipe 1:
    Accuracy: 0.7222
    F1: 0.8387
    Precision: 0.7222
    Recall: 1.0000
    AUC: 0.7846

FOLD 2/24 - Testing on Recipe 10
Train videos: 372 | Test videos: 12
  Epoch 1/1 - Train Loss: 0.6451 - Train Acc: 0.4919 - Train F1: 0.5166

  Test Results for Recipe 10:
    Accuracy: 0.3333
    F1: 0.0000
    Precision: 0.0000
    Recall: 0.0000
    AUC: 0.6250

FOLD 3/24 - Testing on Recipe 12
Train videos: 366 | Test videos: 18
  Epoch 1/1 - Train Loss: 0.6692 - Train Acc: 0.4536 - Train F1: 0.4318

  Test Results for Recipe 12:
    Accuracy: 0.6111
    F1: 0.0000
    Precision: 0.0000
    Recall: 0.0000
    AUC: 0.8961

FOLD 4/24 - Testing on Recipe 13
Train videos: 370 | Test videos: 14
  Epoch 1/1 - Train Loss: 0.6277 - Train Acc: 0.5000 - Train F1: 0.5067

  Test R

KeyboardInterrupt: 

In [8]:
# Calcola le statistiche aggregate su tutti i fold
accuracies = [r['accuracy'] for r in all_fold_results]
f1_scores = [r['f1'] for r in all_fold_results]
precisions = [r['precision'] for r in all_fold_results]
recalls = [r['recall'] for r in all_fold_results]
aucs = [r['auc'] for r in all_fold_results]

# Medie e deviazioni standard
mean_acc = np.mean(accuracies)
std_acc = np.std(accuracies)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_auc = np.mean(aucs)
std_auc = np.std(aucs)

# Stampa i risultati aggregati
print(f"\n{'='*80}")
print("AGGREGATED RESULTS ACROSS ALL FOLDS")
print(f"{'='*80}")
print(f"\nMetric            | Mean      | Std Dev")
print(f"{'-'*80}")
print(f"Accuracy          | {mean_acc:.4f}    | {std_acc:.4f}")
print(f"F1 Score          | {mean_f1:.4f}    | {std_f1:.4f}")
print(f"Precision         | {mean_precision:.4f}    | {std_precision:.4f}")
print(f"Recall            | {mean_recall:.4f}    | {std_recall:.4f}")
print(f"AUC               | {mean_auc:.4f}    | {std_auc:.4f}")
print(f"{'='*80}")

# Stampa i risultati per ogni fold
print(f"\nRESULTS PER FOLD:")
print(f"{'-'*80}")
print(f"Fold | Recipe | Accuracy | F1       | Precision | Recall   | AUC")
print(f"{'-'*80}")
for result in all_fold_results:
    print(f"{result['fold']:<4} | {result['test_recipe']:<6} | {result['accuracy']:.4f}   | {result['f1']:.4f}   | {result['precision']:.4f}    | {result['recall']:.4f}   | {result['auc']:.4f}")
print(f"{'='*80}")

# Log delle metriche aggregate su W&B
wandb.log({
    "overall/mean_accuracy": mean_acc,
    "overall/std_accuracy": std_acc,
    "overall/mean_f1": mean_f1,
    "overall/std_f1": std_f1,
    "overall/mean_precision": mean_precision,
    "overall/std_precision": std_precision,
    "overall/mean_recall": mean_recall,
    "overall/std_recall": std_recall,
    "overall/mean_auc": mean_auc,
    "overall/std_auc": std_auc,
})

# Crea una tabella per W&B con i risultati per fold
fold_table_data = []
for result in all_fold_results:
    fold_table_data.append([
        result['fold'],
        result['test_recipe'],
        result['accuracy'],
        result['f1'],
        result['precision'],
        result['recall'],
        result['auc']
    ])

wandb.log({
    "fold_results_table": wandb.Table(
        columns=["Fold", "Test Recipe", "Accuracy", "F1", "Precision", "Recall", "AUC"],
        data=fold_table_data
    )
})

# Confusion Matrix aggregata (concatena tutti i target e le predizioni)
all_targets = np.concatenate([r['test_targets'] for r in all_fold_results])
all_preds = np.concatenate([r['test_preds'] for r in all_fold_results])

cm_overall = confusion_matrix(all_targets, all_preds)
print(f"\nOVERALL CONFUSION MATRIX:")
print(cm_overall)

wandb.log({
    "overall/confusion_matrix": wandb.plot.confusion_matrix(
        probs=None,
        y_true=all_targets,
        preds=all_preds,
        class_names=["No Error", "Error"]
    )
})


AGGREGATED RESULTS ACROSS ALL FOLDS

Metric            | Mean      | Std Dev
--------------------------------------------------------------------------------
Accuracy          | 0.4249    | 0.1469
F1 Score          | 0.1763    | 0.3125
Precision         | 0.1403    | 0.2560
Recall            | 0.2500    | 0.4330
AUC               | 0.6927    | 0.1445

RESULTS PER FOLD:
--------------------------------------------------------------------------------
Fold | Recipe | Accuracy | F1       | Precision | Recall   | AUC
--------------------------------------------------------------------------------
1    | 1      | 0.7222   | 0.8387   | 0.7222    | 1.0000   | 0.7846
2    | 10     | 0.3333   | 0.0000   | 0.0000    | 0.0000   | 0.6250
3    | 12     | 0.6111   | 0.0000   | 0.0000    | 0.0000   | 0.8961
4    | 13     | 0.3571   | 0.0000   | 0.0000    | 0.0000   | 0.7333
5    | 15     | 0.3333   | 0.0000   | 0.0000    | 0.0000   | 0.4600
6    | 16     | 0.3750   | 0.0000   | 0.0000    | 0.0000   |

In [None]:
# Chiudi il run di W&B
wandb.finish()
print("üèÅ W&B run terminato")