# Environement Setup

In [1]:
import sys, os
try:
    from google.colab import drive, userdata
    IS_COLAB = True
except ImportError:
    IS_COLAB = False

REPO_NAME = 'MistakeDetection'

if IS_COLAB:
    print("‚òÅÔ∏è Colab rilevato.")
    if not os.path.exists('/content/drive'): drive.mount('/content/drive')

    GITHUB_USER = 'MarcoPernoVDP'
    try:
        TOKEN = userdata.get('GITHUB_TOKEN')
        REPO_URL = f'https://{TOKEN}@github.com/{GITHUB_USER}/{REPO_NAME}.git'
    except:
        REPO_URL = f'https://github.com/{GITHUB_USER}/{REPO_NAME}.git'

    ROOT_DIR = f'/content/{REPO_NAME}'
    if not os.path.exists(ROOT_DIR):
        !git clone {REPO_URL}
    else:
        %cd {ROOT_DIR}
        !git pull
        %cd /content


else:
    print("Ambiente locale rilevato.")
    ROOT_DIR = os.getcwd()
    while not os.path.exists(os.path.join(ROOT_DIR, '.gitignore')) and ROOT_DIR != os.path.dirname(ROOT_DIR):
        ROOT_DIR = os.path.dirname(ROOT_DIR)

if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)


‚òÅÔ∏è Colab rilevato.
Mounted at /content/drive
Cloning into 'MistakeDetection'...
remote: Enumerating objects: 769, done.[K
remote: Counting objects: 100% (236/236), done.[K
remote: Compressing objects: 100% (182/182), done.[K
remote: Total 769 (delta 146), reused 106 (delta 51), pack-reused 533 (from 1)[K
Receiving objects: 100% (769/769), 86.94 MiB | 43.03 MiB/s, done.
Resolving deltas: 100% (418/418), done.


# Dataset Setup

In [2]:
from utils import setup_project
# Ora puoi passare agli import del modello
from dataset.capitain_cook_4d_mlp_dataset import CaptainCook4DMLP_Dataset, DatasetSource
from models.BaselineV2_Transformer import BaselineV2_Transformer
from dataset.utils import SplitType

# Esegue: Setup Dati (unzip/copy), Login WandB, Setup Device
device = setup_project.initialize(ROOT_DIR)

# Import wandb
import wandb

Setup Progetto in: /content/MistakeDetection
source_path: /content/drive/MyDrive/MistakeDetection
Setup Dati da: /content/drive/MyDrive/MistakeDetection
Inizio setup dati...
   Sorgente: /content/drive/MyDrive/MistakeDetection
   Destinazione: /content/MistakeDetection/data
Estrazione ZIP: omnivore.zip...
Copia cartella: annotation_json...
Estrazione ZIP: slowfast.zip...
Estrazione ZIP: 3dresnet.zip...
Estrazione ZIP: x3d.zip...
Estrazione ZIP: omnivore_test.zip...
Estrazione ZIP: error_recognition_best.zip...
Estrazione ZIP: features.zip...
Estrazione ZIP: perceptionencoder.zip...
Estrazione ZIP: egovlp.zip...
Copia cartella: hungarian_results...
Copia cartella: task_graphs...
Copia cartella: recipe_text_step_embeddings...
‚úÖ Setup completato! Dati pronti in: /content/MistakeDetection/data


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33ms339450[0m ([33ms339450-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


WandB Logged in.
Device: cuda


# Configuration

In [28]:
# Configurazione esperimento
DATASET_SOURCE = DatasetSource.HIERO

config = {
    "dataset": "CaptainCook4D",
    "feature_extractor": DATASET_SOURCE.value,
    "input_dim": DATASET_SOURCE.input_dims(),
    "batch_size": 32,
    "learning_rate": 1e-4,
    "epochs": 15,
    "pos_weight": 0.75,
    "optimizer": "Adam",
    "loss_function": "BCEWithLogitsLoss",
    "seed": 42,
    "weight_decay": 1e-6,
    "modality": "max_steps+1"
}

# Dataloader

In [4]:
from dataset.dagnn_dataset import DAGNNDataset

if IS_COLAB:
  dataset = DAGNNDataset(
      video_embeddings_path=os.path.join("/content/drive/MyDrive/MistakeDetection", "hiero_all_video_steps_max_steps_plus_1.npz"),
      recipe_embeddings_dir=os.path.join("/content/drive/MyDrive/MistakeDetection", "recipe_text_step_embeddings"),
      hungarian_results_path=os.path.join("/content/drive/MyDrive/MistakeDetection", "hungarian_results", "hungarian_matching_results_max_1_step.json"),
      annotation_path=os.path.join("/content/drive/MyDrive/MistakeDetection", "annotation_json", "video_level_annotations.json"),
  )
else:
  dataset = DAGNNDataset(
      video_embeddings_path=os.path.join(ROOT_DIR, "data", "hiero_all_video_steps.npz"),
      recipe_embeddings_dir=os.path.join(ROOT_DIR, "data", "recipe_text_step_embeddings"),
      hungarian_results_path=os.path.join(ROOT_DIR, "hungarian_results", "hungarian_matching_results.json"),
      annotation_path=os.path.join(ROOT_DIR, "data", "annotation_json", "video_level_annotations.json"),
  )

Loading video embeddings...
Loading Hungarian matching results...
Loading error annotations...
Dataset initialized with 384 samples


In [5]:
print(dataset[1]['node_features'].shape)  # Esempio di accesso ai dati
print(dataset[1]['edge_index'])  # Esempio di accesso agli indici degli edge

torch.Size([21, 1536])
tensor([[14, 15,  1,  7, 18,  2, 11,  5,  3, 17, 19, 13, 16,  8, 10,  4,  6, 12,
          0,  9, 15],
        [ 1,  2,  3,  4,  5, 20,  7,  8,  9, 10, 11, 12, 13, 14, 15, 17, 18, 19,
         16, 20,  6]])


# Leave-One-Out Cross-Validation Setup

Raggruppiamo i video per ricetta per fare LOO CV

In [6]:
from collections import defaultdict
from torch.utils.data import DataLoader, Subset

# Raggruppa i video per ricetta
recipe_to_indices = defaultdict(list)
for idx in range(len(dataset)):
    sample = dataset.samples[idx]
    recipe_name = sample['recipe_name']
    recipe_to_indices[recipe_name].append(idx)

# Ordina le ricette per avere un ordine consistente
recipes = sorted(recipe_to_indices.keys())

print(f"\n{'='*80}")
print(f"LEAVE-ONE-OUT CROSS-VALIDATION SETUP")
print(f"{'='*80}")
print(f"Total videos: {len(dataset)}")
print(f"Total recipes: {len(recipes)}")
print(f"\nVideos per recipe:")
for recipe_name in recipes:
    print(f"  {recipe_name:<30}: {len(recipe_to_indices[recipe_name])} videos")
print(f"{'='*80}\n")


LEAVE-ONE-OUT CROSS-VALIDATION SETUP
Total videos: 384
Total recipes: 24

Videos per recipe:
  blenderbananapancakes         : 19 videos
  breakfastburritos             : 16 videos
  broccolistirfry               : 16 videos
  buttercorncup                 : 14 videos
  capresebruschetta             : 18 videos
  cheesepimiento                : 15 videos
  coffee                        : 15 videos
  cucumberraita                 : 20 videos
  dressedupmeatballs            : 16 videos
  herbomeletwithfriedtomatoes   : 17 videos
  microwaveeggsandwich          : 18 videos
  microwavefrenchtoast          : 14 videos
  microwavemugpizza             : 13 videos
  mugcake                       : 17 videos
  panfriedtofu                  : 15 videos
  pinwheels                     : 12 videos
  ramen                         : 17 videos
  sautedmushrooms               : 14 videos
  scrambledeggs                 : 16 videos
  spicedhotchocolate            : 16 videos
  spicytunaavocadowraps   

# DAGNN Model

Implementiamo la DAGNN per error detection con:
- ProjectionLayer per ridurre dimensioni (1536 ‚Üí 256)
- Graph Convolutional layers
- Global pooling
- Binary classifier

In [7]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m63.7/63.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.3/1.3 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv, global_mean_pool

class DAGNN(nn.Module):
    """
    DAGNN model for cooking mistake detection.

    Architecture:
    1. ProjectionLayer: [1536] ‚Üí [128] (ridotto da 256)
    2. 2 GCN layers (ridotto da 3)
    3. Global pooling over nodes
    4. Binary classifier (error/no error)
    """

    def __init__(
        self,
        input_dim: int = 1536,
        hidden_dim: int = 128,  # Ridotto da 256
        num_gnn_layers: int = 1,  # Ridotto da 3
        dropout: float = 0.4,  # Bilanciato
    ):
        super().__init__()

        # Feature projection (learnable combination of text + visual)
        self.projection = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.LayerNorm(hidden_dim),
        )

        # GNN layers
        self.convs = nn.ModuleList()
        self.norms = nn.ModuleList()  # Aggiunto Layer Norm per ogni GNN layer
        for i in range(num_gnn_layers):
            self.convs.append(GCNConv(hidden_dim, hidden_dim))
            self.norms.append(nn.LayerNorm(hidden_dim))

        self.dropout = nn.Dropout(dropout)

        # Binary classifier - pi√π semplice
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1),  # Binary output
        )

    def forward(self, batch_data):
        """
        Args:
            batch_data: Batched PyG Data object with:
                - x: [total_nodes, 1536] node features
                - edge_index: [2, total_edges] edges
                - batch: [total_nodes] batch assignment

        Returns:
            logits: [batch_size, 1] - logits for binary classification
            probs: [batch_size, 1] - probabilities after sigmoid
        """
        x = batch_data.x
        edge_index = batch_data.edge_index
        batch = batch_data.batch

        # 1. Project features [total_nodes, 1536] ‚Üí [total_nodes, 128]
        x = self.projection(x)

        # 2. GNN layers with normalization
        for conv, norm in zip(self.convs, self.norms):
            x = conv(x, edge_index)
            x = norm(x)  # Layer norm per stabilit√†
            x = F.relu(x)
            x = self.dropout(x)

        # 3. Global pooling (one embedding per graph)
        x = global_mean_pool(x, batch)  # [batch_size, 128]

        # 4. Classification
        logits = self.classifier(x)  # [batch_size, 1]
        probs = torch.sigmoid(logits)

        return probs, logits

print("‚úÖ DAGNN model implementato con regolarizzazione aumentata")

‚úÖ DAGNN model implementato con regolarizzazione aumentata


# Helper Functions

Funzioni per convertire batch in formato PyTorch Geometric

In [9]:
from dataset.dagnn_dataset import collate_fn

def collate_to_pyg(batch_dict):
    """
    Convert batch from DAGNNDataset to PyTorch Geometric format.

    Args:
        batch_dict: Dictionary from DAGNN collate_fn

    Returns:
        Batched PyG Data object
    """
    graphs = []

    for i in range(len(batch_dict['node_features'])):
        # Verifica che edge_index sia nel formato corretto
        edge_index = batch_dict['edge_index'][i]
        num_nodes = batch_dict['node_features'][i].shape[0]

        # Debug: controlla che gli indici siano validi
        if edge_index.numel() > 0:
            max_idx = edge_index.max().item()
            if max_idx >= num_nodes:
                print(f"‚ö†Ô∏è Warning: edge_index contiene indice {max_idx} ma ci sono solo {num_nodes} nodi")
                # Filtra edge invalidi
                valid_edges = (edge_index[0] < num_nodes) & (edge_index[1] < num_nodes)
                edge_index = edge_index[:, valid_edges]

        graph = Data(
            x=batch_dict['node_features'][i],        # [N_i, 1536]
            edge_index=edge_index,                    # [2, E_i]
            y=batch_dict['labels'][i],               # Scalar
        )
        graphs.append(graph)

    # Batch graphs
    batched = Batch.from_data_list(graphs)

    return batched

print("‚úÖ Helper functions aggiornate con validazione edge_index")

‚úÖ Helper functions aggiornate con validazione edge_index


# Leave-One-Out Cross-Validation Training

Training con LOO CV: ogni ricetta usata come test set una volta

In [22]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score

# Inizializzazione W&B per l'intero esperimento LOO
run = wandb.init(
    project="mistake-detection",
    name=f"LOO-Task2Subtask4-DAGNN-{DATASET_SOURCE.value}-balanced",
    config={
        **config,
        "model": "DAGNN",
        "hidden_dim": 256,
        "num_gnn_layers": 1,
        "dropout": 0.3,
    },
    tags=["leave-one-out", "Task2Subtask4", "DAGNN", DATASET_SOURCE.value, "balanced"],
    notes=f"Leave-One-Out CV with DAGNN (balanced regularization) for mistake detection using {DATASET_SOURCE.value} features"
)

print(f"üöÄ W&B Run: {run.name} (ID: {run.id})")

# Aggiorna config
config.update({
    "model": "DAGNN",
    "hidden_dim": 128,
    "num_gnn_layers": 1,
    "dropout": 0.4,
})

# Statistiche per aggregare i risultati di tutti i fold
all_fold_results = []

# LOO: per ogni ricetta, usala come test set
for fold_idx, test_recipe_name in enumerate(recipes):
    print(f"\n{'='*80}")
    print(f"FOLD {fold_idx + 1}/{len(recipes)} - Testing on Recipe: {test_recipe_name}")
    print(f"{'='*80}")

    # Indici del test set (ricetta corrente)
    test_indices = recipe_to_indices[test_recipe_name]

    # Indici del training set (tutte le altre ricette)
    train_indices = []
    for recipe_name in recipes:
        if recipe_name != test_recipe_name:
            train_indices.extend(recipe_to_indices[recipe_name])

    print(f"Train videos: {len(train_indices)} | Test videos: {len(test_indices)}")

    # Crea i subset
    train_dataset = Subset(dataset, train_indices)
    test_dataset = Subset(dataset, test_indices)

    # Crea i DataLoader con collate_fn custom
    train_loader = DataLoader(
        train_dataset,
        batch_size=config["batch_size"],
        shuffle=True,
        collate_fn=collate_fn
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=config["batch_size"],
        shuffle=False,
        collate_fn=collate_fn
    )

    # Inizializza un nuovo modello per questo fold
    model = DAGNN(
        input_dim=1536,
        hidden_dim=config["hidden_dim"],
        num_gnn_layers=config["num_gnn_layers"],
        dropout=config["dropout"]
    ).to(device)

    # Optimizer con weight decay
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config["learning_rate"],
        weight_decay=config.get("weight_decay", 1e-4)  # L2 regularization leggera
    )

    # Loss function con pos_weight
    train_pos_weight = torch.tensor([config["pos_weight"]], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=train_pos_weight)

    # Training loop per questo fold
    best_train_loss = np.inf
    best_model_state = None

    for epoch in range(config["epochs"]):
        # TRAIN
        model.train()
        total_loss = 0
        train_preds_list = []
        train_targets_list = []
        train_probs_list = []

        for batch_dict in train_loader:
            # Converti a PyG format
            pyg_batch = collate_to_pyg(batch_dict).to(device)

            # Forward
            probs, logits = model(pyg_batch)

            # Loss
            labels = pyg_batch.y.float().unsqueeze(1)  # [batch_size, 1]
            loss = criterion(logits, labels)

            # Backward
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping per stabilit√†
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Metriche
            total_loss += loss.item()
            preds = (probs >= 0.5).long().cpu().numpy().flatten()
            targets = labels.long().cpu().numpy().flatten()
            probs_np = probs.detach().cpu().numpy().flatten()

            train_preds_list.extend(preds)
            train_targets_list.extend(targets)
            train_probs_list.extend(probs_np)

        avg_train_loss = total_loss / len(train_loader)

        # Metriche di training
        train_preds = np.array(train_preds_list)
        train_targets = np.array(train_targets_list)
        train_probs = np.array(train_probs_list)

        train_acc = accuracy_score(train_targets, train_preds)
        train_f1 = f1_score(train_targets, train_preds, zero_division=0)

        # Log su W&B per questo fold
        wandb.log({
            f"fold_{fold_idx+1}/train_loss": avg_train_loss,
            f"fold_{fold_idx+1}/train_accuracy": train_acc,
            f"fold_{fold_idx+1}/train_f1": train_f1,
            f"fold_{fold_idx+1}/epoch": epoch + 1
        })

        print(f"  Epoch {epoch+1}/{config['epochs']} - Loss: {avg_train_loss:.4f} - Acc: {train_acc:.4f} - F1: {train_f1:.4f}")

        # Salva il miglior modello per questo fold
        if avg_train_loss < best_train_loss:
            best_train_loss = avg_train_loss
            best_model_state = model.state_dict().copy()

    # Carica il miglior modello
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # TEST per questo fold
    model.eval()
    test_preds_list = []
    test_targets_list = []
    test_probs_list = []

    with torch.no_grad():
        for batch_dict in test_loader:
            # Converti a PyG format
            pyg_batch = collate_to_pyg(batch_dict).to(device)

            # Forward
            probs, logits = model(pyg_batch)

            # Predictions
            labels = pyg_batch.y.long().cpu().numpy()
            preds = (probs >= 0.5).long().cpu().numpy().flatten()
            probs_np = probs.cpu().numpy().flatten()

            test_preds_list.extend(preds)
            test_targets_list.extend(labels)
            test_probs_list.extend(probs_np)

    # Metriche di test per questo fold
    test_preds = np.array(test_preds_list)
    test_targets = np.array(test_targets_list)
    test_probs = np.array(test_probs_list)

    test_acc = accuracy_score(test_targets, test_preds)
    test_f1 = f1_score(test_targets, test_preds, zero_division=0)
    test_precision = precision_score(test_targets, test_preds, zero_division=0)
    test_recall = recall_score(test_targets, test_preds, zero_division=0)

    try:
        test_auc = roc_auc_score(test_targets, test_probs)
    except ValueError:
        test_auc = 0.0

    # Salva i risultati di questo fold
    fold_result = {
        'fold': fold_idx + 1,
        'test_recipe': test_recipe_name,
        'accuracy': test_acc,
        'f1': test_f1,
        'precision': test_precision,
        'recall': test_recall,
        'auc': test_auc,
        'test_targets': test_targets,
        'test_preds': test_preds
    }
    all_fold_results.append(fold_result)

    # Log su W&B
    wandb.log({
        f"fold_{fold_idx+1}/test_accuracy": test_acc,
        f"fold_{fold_idx+1}/test_f1": test_f1,
        f"fold_{fold_idx+1}/test_precision": test_precision,
        f"fold_{fold_idx+1}/test_recall": test_recall,
        f"fold_{fold_idx+1}/test_auc": test_auc,
    })

    print(f"\n  Test Results for Recipe {test_recipe_name}:")
    print(f"    Accuracy: {test_acc:.4f}")
    print(f"    F1: {test_f1:.4f}")
    print(f"    Precision: {test_precision:.4f}")
    print(f"    Recall: {test_recall:.4f}")
    print(f"    AUC: {test_auc:.4f}")

print(f"\n{'='*80}")
print("üéâ Leave-One-Out Cross-Validation completato!")
print(f"{'='*80}")

0,1
fold_1/epoch,‚ñÅ‚ñÉ‚ñÖ‚ñÜ‚ñà
fold_1/test_accuracy,‚ñÅ
fold_1/test_auc,‚ñÅ
fold_1/test_f1,‚ñÅ
fold_1/test_precision,‚ñÅ
fold_1/test_recall,‚ñÅ
fold_1/train_accuracy,‚ñÅ‚ñÖ‚ñÖ‚ñà‚ñà
fold_1/train_f1,‚ñá‚ñÅ‚ñÉ‚ñà‚ñÜ
fold_1/train_loss,‚ñà‚ñÇ‚ñÑ‚ñÅ‚ñÅ
fold_2/epoch,‚ñÅ‚ñÉ‚ñÖ‚ñÜ‚ñà

0,1
fold_1/epoch,5
fold_1/test_accuracy,0.31579
fold_1/test_auc,0.29762
fold_1/test_f1,0.43478
fold_1/test_precision,0.45455
fold_1/test_recall,0.41667
fold_1/train_accuracy,0.6137
fold_1/train_f1,0.65012
fold_1/train_loss,0.57826
fold_2/epoch,5


üöÄ W&B Run: LOO-Task2Subtask4-DAGNN-hiero-balanced (ID: wh118teu)

FOLD 1/24 - Testing on Recipe: blenderbananapancakes
Train videos: 365 | Test videos: 19
  Epoch 1/20 - Loss: 0.6037 - Acc: 0.4137 - F1: 0.3476
  Epoch 2/20 - Loss: 0.5916 - Acc: 0.5616 - F1: 0.6209
  Epoch 3/20 - Loss: 0.5868 - Acc: 0.6137 - F1: 0.7006
  Epoch 4/20 - Loss: 0.5830 - Acc: 0.6027 - F1: 0.6523
  Epoch 5/20 - Loss: 0.5842 - Acc: 0.6000 - F1: 0.6589
  Epoch 6/20 - Loss: 0.5713 - Acc: 0.6548 - F1: 0.6595
  Epoch 7/20 - Loss: 0.5720 - Acc: 0.6411 - F1: 0.6797
  Epoch 8/20 - Loss: 0.5739 - Acc: 0.6055 - F1: 0.6269
  Epoch 9/20 - Loss: 0.5639 - Acc: 0.6438 - F1: 0.6717
  Epoch 10/20 - Loss: 0.5551 - Acc: 0.6959 - F1: 0.7560
  Epoch 11/20 - Loss: 0.5528 - Acc: 0.6685 - F1: 0.6790
  Epoch 12/20 - Loss: 0.5418 - Acc: 0.7260 - F1: 0.7573
  Epoch 13/20 - Loss: 0.5254 - Acc: 0.7397 - F1: 0.7666
  Epoch 14/20 - Loss: 0.5198 - Acc: 0.7534 - F1: 0.7761
  Epoch 15/20 - Loss: 0.4995 - Acc: 0.7918 - F1: 0.8182
  Epoch 16/

# Results Analysis

Analisi dei risultati aggregati su tutti i fold

In [23]:
# Calcola le statistiche aggregate su tutti i fold
accuracies = [r['accuracy'] for r in all_fold_results]
f1_scores = [r['f1'] for r in all_fold_results]
precisions = [r['precision'] for r in all_fold_results]
recalls = [r['recall'] for r in all_fold_results]
aucs = [r['auc'] for r in all_fold_results]

# Medie e deviazioni standard
mean_acc = np.mean(accuracies)
std_acc = np.std(accuracies)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_auc = np.mean(aucs)
std_auc = np.std(aucs)

# Stampa i risultati aggregati
print(f"\n{'='*80}")
print("AGGREGATED RESULTS ACROSS ALL FOLDS")
print(f"{'='*80}")
print(f"\nMetric            | Mean      | Std Dev")
print(f"{'-'*80}")
print(f"Accuracy          | {mean_acc:.4f}    | {std_acc:.4f}")
print(f"F1 Score          | {mean_f1:.4f}    | {std_f1:.4f}")
print(f"Precision         | {mean_precision:.4f}    | {std_precision:.4f}")
print(f"Recall            | {mean_recall:.4f}    | {std_recall:.4f}")
print(f"AUC               | {mean_auc:.4f}    | {std_auc:.4f}")
print(f"{'='*80}")

# Stampa i risultati per ogni fold
print(f"\nRESULTS PER FOLD:")
print(f"{'-'*80}")
print(f"Fold | Recipe                         | Accuracy | F1       | Precision | Recall   | AUC")
print(f"{'-'*80}")
for result in all_fold_results:
    print(f"{result['fold']:<4} | {result['test_recipe']:<30} | {result['accuracy']:.4f}   | {result['f1']:.4f}   | {result['precision']:.4f}    | {result['recall']:.4f}   | {result['auc']:.4f}")
print(f"{'='*80}")

# Log delle metriche aggregate su W&B
wandb.log({
    "overall/mean_accuracy": mean_acc,
    "overall/std_accuracy": std_acc,
    "overall/mean_f1": mean_f1,
    "overall/std_f1": std_f1,
    "overall/mean_precision": mean_precision,
    "overall/std_precision": std_precision,
    "overall/mean_recall": mean_recall,
    "overall/std_recall": std_recall,
    "overall/mean_auc": mean_auc,
    "overall/std_auc": std_auc,
})

# Crea una tabella per W&B con i risultati per fold
fold_table_data = []
for result in all_fold_results:
    fold_table_data.append([
        result['fold'],
        result['test_recipe'],
        result['accuracy'],
        result['f1'],
        result['precision'],
        result['recall'],
        result['auc']
    ])

wandb.log({
    "fold_results_table": wandb.Table(
        columns=["Fold", "Test Recipe", "Accuracy", "F1", "Precision", "Recall", "AUC"],
        data=fold_table_data
    )
})

# Confusion Matrix aggregata (concatena tutti i target e le predizioni)
all_targets = np.concatenate([r['test_targets'] for r in all_fold_results])
all_preds = np.concatenate([r['test_preds'] for r in all_fold_results])

cm_overall = confusion_matrix(all_targets, all_preds)
print(f"\nOVERALL CONFUSION MATRIX:")
print(cm_overall)

wandb.log({
    "overall/confusion_matrix": wandb.plot.confusion_matrix(
        probs=None,
        y_true=all_targets,
        preds=all_preds,
        class_names=["No Error", "Error"]
    )
})


AGGREGATED RESULTS ACROSS ALL FOLDS

Metric            | Mean      | Std Dev
--------------------------------------------------------------------------------
Accuracy          | 0.5147    | 0.1329
F1 Score          | 0.5865    | 0.1551
Precision         | 0.5915    | 0.1844
Recall            | 0.6577    | 0.2327
AUC               | 0.5169    | 0.1797

RESULTS PER FOLD:
--------------------------------------------------------------------------------
Fold | Recipe                         | Accuracy | F1       | Precision | Recall   | AUC
--------------------------------------------------------------------------------
1    | blenderbananapancakes          | 0.3684   | 0.5385   | 0.5000    | 0.5833   | 0.1786
2    | breakfastburritos              | 0.5625   | 0.7200   | 0.6000    | 0.9000   | 0.3667
3    | broccolistirfry                | 0.1875   | 0.2353   | 0.1818    | 0.3333   | 0.1667
4    | buttercorncup                  | 0.6429   | 0.7826   | 0.6429    | 1.0000   | 0.4000
5    | c

In [24]:
# Chiudi il run di W&B
wandb.finish()
print("üèÅ W&B run terminato")

0,1
fold_1/epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà
fold_1/test_accuracy,‚ñÅ
fold_1/test_auc,‚ñÅ
fold_1/test_f1,‚ñÅ
fold_1/test_precision,‚ñÅ
fold_1/test_recall,‚ñÅ
fold_1/train_accuracy,‚ñÅ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÑ‚ñÖ‚ñÜ‚ñÖ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
fold_1/train_f1,‚ñÅ‚ñÖ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÖ‚ñÜ‚ñá‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñá‚ñà‚ñà‚ñà‚ñà
fold_1/train_loss,‚ñà‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÅ
fold_10/epoch,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà

0,1
fold_1/epoch,20
fold_1/test_accuracy,0.36842
fold_1/test_auc,0.17857
fold_1/test_f1,0.53846
fold_1/test_precision,0.5
fold_1/test_recall,0.58333
fold_1/train_accuracy,0.82466
fold_1/train_f1,0.84541
fold_1/train_loss,0.44882
fold_10/epoch,20


üèÅ W&B run terminato


# K-Fold Cross-Validation

K-Fold stratificato dove ogni fold contiene video di ricette diverse

In [29]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold

# Configurazione K-Fold
K_FOLDS = 5  # Numero di fold per la cross-validation

# Inizializzazione W&B per l'intero esperimento K-Fold CV
run = wandb.init(
    project="mistake-detection",
    name=f"KFold{K_FOLDS}-Task2Subtask4-DAGNN-{DATASET_SOURCE.value}-balanced",
    config={
        **config,
        "model": "DAGNN",
        "hidden_dim": 128,
        "num_gnn_layers": 1,
        "dropout": 0.4,
        "cv_type": f"{K_FOLDS}-fold-stratified",
        "k_folds": K_FOLDS
    },
    tags=[f"{K_FOLDS}-fold-cv", "Task2Subtask4", "DAGNN", DATASET_SOURCE.value, "balanced"],
    notes=f"{K_FOLDS}-Fold Stratified CV with DAGNN for mistake detection using {DATASET_SOURCE.value} features"
)

print(f"üöÄ W&B Run: {run.name} (ID: {run.id})")

# Aggiorna config
config.update({
    "model": "DAGNN",
    "hidden_dim": 128,
    "num_gnn_layers": 1,
    "dropout": 0.4,
    "cv_type": f"{K_FOLDS}-fold-stratified",
    "k_folds": K_FOLDS
})

# Statistiche per aggregare i risultati di tutti i fold
all_fold_results = []

# Prepara i labels per lo StratifiedKFold
all_labels = []
for idx in range(len(dataset)):
    # Usa dataset[idx] che restituisce il dizionario completo con tutte le chiavi
    data = dataset[idx]
    label = data['label']  # Il label √® nella chiave 'label' del dizionario restituito da __getitem__
    all_labels.append(label)
all_labels = np.array(all_labels)

# Crea lo StratifiedKFold (mantiene la distribuzione delle classi in ogni fold)
skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=config["seed"])

print(f"\n{'='*80}")
print(f"{K_FOLDS}-FOLD STRATIFIED CROSS-VALIDATION")
print(f"{'='*80}")
print(f"Total videos: {len(dataset)}")
print(f"Total folds: {K_FOLDS}")
print(f"Class distribution:")
print(f"  No Error (0): {np.sum(all_labels == 0)} samples ({100*np.sum(all_labels == 0)/len(all_labels):.1f}%)")
print(f"  Error (1):    {np.sum(all_labels == 1)} samples ({100*np.sum(all_labels == 1)/len(all_labels):.1f}%)")
print(f"{'='*80}\n")

# K-Fold CV: ogni fold contiene video di ricette diverse
for fold_idx, (train_indices, test_indices) in enumerate(skf.split(np.arange(len(dataset)), all_labels)):
    print(f"\n{'='*80}")
    print(f"FOLD {fold_idx + 1}/{K_FOLDS}")
    print(f"{'='*80}")

    # Converti da numpy array a list
    train_indices = train_indices.tolist()
    test_indices = test_indices.tolist()

    # Analizza la distribuzione delle ricette nel test set
    test_recipes = set()
    for idx in test_indices:
        recipe_name = dataset.samples[idx]['recipe_name']
        test_recipes.add(recipe_name)

    print(f"Train videos: {len(train_indices)} | Test videos: {len(test_indices)}")
    print(f"Test set recipes: {len(test_recipes)} different recipes")
    print(f"Test recipes: {', '.join(sorted(test_recipes))}")

    # Crea i subset
    train_dataset = Subset(dataset, train_indices)
    test_dataset = Subset(dataset, test_indices)

    # Crea i DataLoader con collate_fn custom
    train_loader = DataLoader(
        train_dataset,
        batch_size=config["batch_size"],
        shuffle=True,
        collate_fn=collate_fn
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=config["batch_size"],
        shuffle=False,
        collate_fn=collate_fn
    )

    # Inizializza un nuovo modello per questo fold
    model = DAGNN(
        input_dim=1536,
        hidden_dim=config["hidden_dim"],
        num_gnn_layers=config["num_gnn_layers"],
        dropout=config["dropout"]
    ).to(device)

    # Optimizer con weight decay
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config["learning_rate"],
        weight_decay=config.get("weight_decay", 1e-4)
    )

    # Loss function con pos_weight
    train_pos_weight = torch.tensor([config["pos_weight"]], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=train_pos_weight)

    # Training loop per questo fold
    best_train_loss = np.inf
    best_model_state = None

    for epoch in range(config["epochs"]):
        # TRAIN
        model.train()
        total_loss = 0
        train_preds_list = []
        train_targets_list = []
        train_probs_list = []

        for batch_dict in train_loader:
            # Converti a PyG format
            pyg_batch = collate_to_pyg(batch_dict).to(device)

            # Forward
            probs, logits = model(pyg_batch)

            # Loss
            labels = pyg_batch.y.float().unsqueeze(1)
            loss = criterion(logits, labels)

            # Backward
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Metriche
            total_loss += loss.item()
            preds = (probs >= 0.5).long().cpu().numpy().flatten()
            targets = labels.long().cpu().numpy().flatten()
            probs_np = probs.detach().cpu().numpy().flatten()

            train_preds_list.extend(preds)
            train_targets_list.extend(targets)
            train_probs_list.extend(probs_np)

        avg_train_loss = total_loss / len(train_loader)

        # Metriche di training
        train_preds = np.array(train_preds_list)
        train_targets = np.array(train_targets_list)
        train_probs = np.array(train_probs_list)

        train_acc = accuracy_score(train_targets, train_preds)
        train_f1 = f1_score(train_targets, train_preds, zero_division=0)
        train_balanced_acc = balanced_accuracy_score(train_targets, train_preds)

        # Log ogni 5 epoche per non intasare W&B
        if (epoch + 1) % 5 == 0 or epoch == 0:
            wandb.log({
                f"fold_{fold_idx+1}/train_loss": avg_train_loss,
                f"fold_{fold_idx+1}/train_accuracy": train_acc,
                f"fold_{fold_idx+1}/train_balanced_accuracy": train_balanced_acc,
                f"fold_{fold_idx+1}/train_f1": train_f1,
                f"fold_{fold_idx+1}/epoch": epoch + 1
            })

        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"  Epoch {epoch+1}/{config['epochs']} - Loss: {avg_train_loss:.4f} - Acc: {train_acc:.4f} - Bal.Acc: {train_balanced_acc:.4f} - F1: {train_f1:.4f}")

        # Salva il miglior modello
        if avg_train_loss < best_train_loss:
            best_train_loss = avg_train_loss
            best_model_state = model.state_dict().copy()

    # Carica il miglior modello
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # TEST per questo fold
    model.eval()
    test_preds_list = []
    test_targets_list = []
    test_probs_list = []

    with torch.no_grad():
        for batch_dict in test_loader:
            # Converti a PyG format
            pyg_batch = collate_to_pyg(batch_dict).to(device)

            # Forward
            probs, logits = model(pyg_batch)

            # Predictions
            labels = pyg_batch.y.long().cpu().numpy()
            preds = (probs >= 0.5).long().cpu().numpy().flatten()
            probs_np = probs.cpu().numpy().flatten()

            test_preds_list.extend(preds)
            test_targets_list.extend(labels)
            test_probs_list.extend(probs_np)

    # Metriche di test per questo fold
    test_preds = np.array(test_preds_list)
    test_targets = np.array(test_targets_list)
    test_probs = np.array(test_probs_list)

    test_acc = accuracy_score(test_targets, test_preds)
    test_f1 = f1_score(test_targets, test_preds, zero_division=0)
    test_precision = precision_score(test_targets, test_preds, zero_division=0)
    test_recall = recall_score(test_targets, test_preds, zero_division=0)
    test_balanced_acc = balanced_accuracy_score(test_targets, test_preds)

    try:
        test_auc = roc_auc_score(test_targets, test_probs)
    except ValueError:
        test_auc = 0.0

    # Analizza i risultati per ricetta nel test set
    test_results_by_recipe = defaultdict(lambda: {'targets': [], 'preds': []})
    for idx_pos, idx in enumerate(test_indices):
        sample = dataset.samples[idx]
        recipe_name = sample['recipe_name']
        # Ogni video ha un solo target/pred
        test_results_by_recipe[recipe_name]['targets'].append(test_targets[idx_pos])
        test_results_by_recipe[recipe_name]['preds'].append(test_preds[idx_pos])

    # Salva i risultati di questo fold
    fold_result = {
        'fold': fold_idx + 1,
        'test_indices': test_indices,
        'num_test_videos': len(test_indices),
        'test_recipes': sorted(test_recipes),
        'accuracy': test_acc,
        'balanced_accuracy': test_balanced_acc,
        'f1': test_f1,
        'precision': test_precision,
        'recall': test_recall,
        'auc': test_auc,
        'test_targets': test_targets,
        'test_preds': test_preds,
        'results_by_recipe': dict(test_results_by_recipe)
    }
    all_fold_results.append(fold_result)

    # Log su W&B
    wandb.log({
        f"fold_{fold_idx+1}/test_accuracy": test_acc,
        f"fold_{fold_idx+1}/test_balanced_accuracy": test_balanced_acc,
        f"fold_{fold_idx+1}/test_f1": test_f1,
        f"fold_{fold_idx+1}/test_precision": test_precision,
        f"fold_{fold_idx+1}/test_recall": test_recall,
        f"fold_{fold_idx+1}/test_auc": test_auc,
        f"fold_{fold_idx+1}/num_test_videos": len(test_indices),
        f"fold_{fold_idx+1}/num_test_recipes": len(test_recipes)
    })

    print(f"\n  Test Results for Fold {fold_idx+1}:")
    print(f"    Test videos: {len(test_indices)} from {len(test_recipes)} recipes")
    print(f"    Accuracy: {test_acc:.4f}")
    print(f"    Balanced Accuracy: {test_balanced_acc:.4f}")
    print(f"    F1: {test_f1:.4f}")
    print(f"    Precision: {test_precision:.4f}")
    print(f"    Recall: {test_recall:.4f}")
    print(f"    AUC: {test_auc:.4f}")

print(f"\n{'='*80}")
print(f"üéâ {K_FOLDS}-Fold Cross-Validation completato!")
print(f"{'='*80}")

0,1
fold_1/epoch,‚ñÅ‚ñÇ‚ñÑ‚ñÜ‚ñà
fold_1/num_test_recipes,‚ñÅ
fold_1/num_test_videos,‚ñÅ
fold_1/test_accuracy,‚ñÅ
fold_1/test_auc,‚ñÅ
fold_1/test_balanced_accuracy,‚ñÅ
fold_1/test_f1,‚ñÅ
fold_1/test_precision,‚ñÅ
fold_1/test_recall,‚ñÅ
fold_1/train_accuracy,‚ñÅ‚ñÇ‚ñÑ‚ñÜ‚ñà

0,1
fold_1/epoch,20
fold_1/num_test_recipes,24
fold_1/num_test_videos,77
fold_1/test_accuracy,0.46753
fold_1/test_auc,0.48554
fold_1/test_balanced_accuracy,0.48106
fold_1/test_f1,0.45333
fold_1/test_precision,0.54839
fold_1/test_recall,0.38636
fold_1/train_accuracy,0.88925


üöÄ W&B Run: KFold5-Task2Subtask4-DAGNN-hiero-balanced (ID: oaj2kqno)

5-FOLD STRATIFIED CROSS-VALIDATION
Total videos: 384
Total folds: 5
Class distribution:
  No Error (0): 164 samples (42.7%)
  Error (1):    220 samples (57.3%)


FOLD 1/5
Train videos: 307 | Test videos: 77
Test set recipes: 24 different recipes
Test recipes: blenderbananapancakes, breakfastburritos, broccolistirfry, buttercorncup, capresebruschetta, cheesepimiento, coffee, cucumberraita, dressedupmeatballs, herbomeletwithfriedtomatoes, microwaveeggsandwich, microwavefrenchtoast, microwavemugpizza, mugcake, panfriedtofu, pinwheels, ramen, sautedmushrooms, scrambledeggs, spicedhotchocolate, spicytunaavocadowraps, tomatochutney, tomatomozzarellasalad, zoodles
  Epoch 1/15 - Loss: 0.6049 - Acc: 0.4430 - Bal.Acc: 0.4986 - F1: 0.1972
  Epoch 5/15 - Loss: 0.5755 - Acc: 0.6254 - Bal.Acc: 0.6069 - F1: 0.6917
  Epoch 10/15 - Loss: 0.5573 - Acc: 0.6515 - Bal.Acc: 0.6521 - F1: 0.6806
  Epoch 15/15 - Loss: 0.5178 - Acc: 0.7948

# K-Fold Cross-Validation

K-Fold stratificato dove ogni fold contiene video di ricette diverse

In [32]:
# Calcola le statistiche aggregate su tutti i fold
accuracies = [r['accuracy'] for r in all_fold_results]
balanced_accuracies = [r['balanced_accuracy'] for r in all_fold_results]
f1_scores = [r['f1'] for r in all_fold_results]
precisions = [r['precision'] for r in all_fold_results]
recalls = [r['recall'] for r in all_fold_results]
aucs = [r['auc'] for r in all_fold_results]

# Medie e deviazioni standard
mean_acc = np.mean(accuracies)
std_acc = np.std(accuracies)
mean_balanced_acc = np.mean(balanced_accuracies)
std_balanced_acc = np.std(balanced_accuracies)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_auc = np.mean(aucs)
std_auc = np.std(aucs)

# Stampa i risultati aggregati
print(f"\n{'='*80}")
print(f"AGGREGATED RESULTS ACROSS ALL FOLDS ({K_FOLDS}-Fold CV)")
print(f"{'='*80}")
print(f"\nMetric                 | Mean      | Std Dev")
print(f"{'-'*80}")
print(f"Accuracy               | {mean_acc:.4f}    | {std_acc:.4f}")
print(f"Balanced Accuracy      | {mean_balanced_acc:.4f}    | {std_balanced_acc:.4f}")
print(f"F1 Score               | {mean_f1:.4f}    | {std_f1:.4f}")
print(f"Precision              | {mean_precision:.4f}    | {std_precision:.4f}")
print(f"Recall                 | {mean_recall:.4f}    | {std_recall:.4f}")
print(f"AUC                    | {mean_auc:.4f}    | {std_auc:.4f}")
print(f"{'='*80}")

# Log delle metriche aggregate su W&B
wandb.log({
    "overall/mean_accuracy": mean_acc,
    "overall/std_accuracy": std_acc,
    "overall/mean_balanced_accuracy": mean_balanced_acc,
    "overall/std_balanced_accuracy": std_balanced_acc,
    "overall/mean_f1": mean_f1,
    "overall/std_f1": std_f1,
    "overall/mean_precision": mean_precision,
    "overall/std_precision": std_precision,
    "overall/mean_recall": mean_recall,
    "overall/std_recall": std_recall,
    "overall/mean_auc": mean_auc,
    "overall/std_auc": std_auc,
})

# Analisi per ricetta (raggruppa i risultati per ricetta)
from collections import defaultdict
results_by_recipe = defaultdict(list)

for result in all_fold_results:
    for recipe_name in result['test_recipes']:
        recipe_data = result['results_by_recipe'][recipe_name]
        for target, pred in zip(recipe_data['targets'], recipe_data['preds']):
            results_by_recipe[recipe_name].append({
                'target': target,
                'pred': pred,
                'fold': result['fold']
            })

print(f"\nRESULTS BY RECIPE:")
print(f"{'-'*80}")
print(f"Recipe                         | Videos | Avg Acc  | Avg Bal.Acc | Avg F1   ")
print(f"{'-'*80}")

recipe_summary = []
for recipe_name in sorted(results_by_recipe.keys()):
    recipe_results = results_by_recipe[recipe_name]
    num_videos = len(recipe_results)

    targets = [r['target'] for r in recipe_results]
    preds = [r['pred'] for r in recipe_results]

    avg_acc = accuracy_score(targets, preds)
    avg_balanced_acc = balanced_accuracy_score(targets, preds)
    avg_f1 = f1_score(targets, preds, zero_division=0)

    print(f"{recipe_name:<30} | {num_videos:<6} | {avg_acc:.4f}   | {avg_balanced_acc:.4f}      | {avg_f1:.4f}")

    recipe_summary.append({
        'recipe': recipe_name,
        'num_videos': num_videos,
        'avg_accuracy': avg_acc,
        'avg_balanced_accuracy': avg_balanced_acc,
        'avg_f1': avg_f1
    })

print(f"{'='*80}")

# Confusion Matrix aggregata
all_targets = np.concatenate([r['test_targets'] for r in all_fold_results])
all_preds = np.concatenate([r['test_preds'] for r in all_fold_results])

cm_overall = confusion_matrix(all_targets, all_preds)
print(f"\nOVERALL CONFUSION MATRIX:")
print(cm_overall)
print(f"\nClass distribution:")
print(f"  No Error (0): {np.sum(all_targets == 0)} samples")
print(f"  Error (1):    {np.sum(all_targets == 1)} samples")

# Log confusion matrix su W&B
wandb.log({
    "overall/confusion_matrix": wandb.plot.confusion_matrix(
        probs=None,
        y_true=all_targets,
        preds=all_preds,
        class_names=["No Error", "Error"]
    )
})

# Crea una tabella riassuntiva per ricetta su W&B
recipe_table_data = []
for summary in recipe_summary:
    recipe_table_data.append([
        summary['recipe'],
        summary['num_videos'],
        summary['avg_accuracy'],
        summary['avg_balanced_accuracy'],
        summary['avg_f1']
    ])

wandb.log({
    "recipe_summary_table": wandb.Table(
        columns=["Recipe", "Num Videos", "Avg Accuracy", "Avg Balanced Accuracy", "Avg F1"],
        data=recipe_table_data
    )
})

print("\nüìä Analisi completata!")


AGGREGATED RESULTS ACROSS ALL FOLDS (5-Fold CV)

Metric                 | Mean      | Std Dev
--------------------------------------------------------------------------------
Accuracy               | nan    | nan
Balanced Accuracy      | nan    | nan
F1 Score               | nan    | nan
Precision              | nan    | nan
Recall                 | nan    | nan
AUC                    | nan    | nan

RESULTS BY RECIPE:
--------------------------------------------------------------------------------
Recipe                         | Videos | Avg Acc  | Avg Bal.Acc | Avg F1   
--------------------------------------------------------------------------------


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


ValueError: need at least one array to concatenate

In [30]:
# Chiudi il run di W&B
wandb.finish()
print(f"üèÅ W&B run {K_FOLDS}-Fold CV terminato")

0,1
fold_1/epoch,‚ñÅ‚ñÉ‚ñÖ‚ñà
fold_1/num_test_recipes,‚ñÅ
fold_1/num_test_videos,‚ñÅ
fold_1/test_accuracy,‚ñÅ
fold_1/test_auc,‚ñÅ
fold_1/test_balanced_accuracy,‚ñÅ
fold_1/test_f1,‚ñÅ
fold_1/test_precision,‚ñÅ
fold_1/test_recall,‚ñÅ
fold_1/train_accuracy,‚ñÅ‚ñÖ‚ñÖ‚ñà

0,1
fold_1/epoch,15
fold_1/num_test_recipes,24
fold_1/num_test_videos,77
fold_1/test_accuracy,0.51948
fold_1/test_auc,0.46143
fold_1/test_balanced_accuracy,0.49621
fold_1/test_f1,0.61053
fold_1/test_precision,0.56863
fold_1/test_recall,0.65909
fold_1/train_accuracy,0.79479


üèÅ W&B run 5-Fold CV terminato
