# Environment Setup

In [1]:
import sys, os
try:
    from google.colab import drive, userdata
    IS_COLAB = True
except ImportError:
    IS_COLAB = False

REPO_NAME = 'MistakeDetection'

if IS_COLAB:
    print("‚òÅÔ∏è Colab rilevato.")
    if not os.path.exists('/content/drive'): drive.mount('/content/drive')

    GITHUB_USER = 'MarcoPernoVDP'
    try:
        TOKEN = userdata.get('GITHUB_TOKEN')
        REPO_URL = f'https://{TOKEN}@github.com/{GITHUB_USER}/{REPO_NAME}.git'
    except:
        REPO_URL = f'https://github.com/{GITHUB_USER}/{REPO_NAME}.git'

    ROOT_DIR = f'/content/{REPO_NAME}'
    if not os.path.exists(ROOT_DIR):
        !git clone {REPO_URL}
    else:
        %cd {ROOT_DIR}
        !git pull
        %cd /content


else:
    print("Ambiente locale rilevato.")
    ROOT_DIR = os.getcwd()
    while not os.path.exists(os.path.join(ROOT_DIR, '.gitignore')) and ROOT_DIR != os.path.dirname(ROOT_DIR):
        ROOT_DIR = os.path.dirname(ROOT_DIR)

if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)


Ambiente locale rilevato.


# Dataset Setup

In [2]:
from utils import setup_project
# Project imports (dataset, model, split helpers); they resolve because
# ROOT_DIR was appended to sys.path in the environment setup cell.
from dataset.capitain_cook_4d_mlp_dataset import CaptainCook4DMLP_Dataset, DatasetSource
from models.BaselineV3_LSTM import BaselineV3_LSTM
from dataset.utils import SplitType

# Runs: data setup (unzip/copy), WandB login, device setup
device = setup_project.initialize(ROOT_DIR)

# Import wandb
import wandb

Setup Progetto in: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection
source_path: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\_file
Setup Dati da: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\_file
Inizio setup dati...
   Sorgente: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\_file
   Destinazione: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\data
Copia cartella: annotation_json...
Copia cartella: omnivore...
‚úÖ Setup completato! Dati pronti in: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\data


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\marco\_netrc
[34m[1mwandb[0m: Currently logged in as: [33ms339450[0m ([33ms339450-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


WandB Logged in.
Device: cuda


# Configuration

In [3]:
# Experiment configuration: feature extractor and split strategy drive
# both the run naming and the dataset loading below.
DATASET_SOURCE = DatasetSource.OMNIVORE
SPLIT_TYPE = SplitType.STEP_ID

config = dict(
    architecture=f"BaselineV3_LSTM_{DATASET_SOURCE.value}_{SPLIT_TYPE.value}",
    dataset="CaptainCook4D",
    feature_extractor=DATASET_SOURCE.value,
    input_dim=DATASET_SOURCE.input_dims(),
    # MUST be 1: the sequences have variable length
    batch_size=1,
    learning_rate=1e-5,
    epochs=1,
    pos_weight=1.5,
    optimizer="Adam",
    loss_function="BCEWithLogitsLoss",
    seed=42,
    split_type=SPLIT_TYPE.value,
)

# Dataset Split

In [4]:
import os
from dataset.capitain_cook_4d_mlp_dataset import DatasetSource
from dataset.capitain_cook_4d_transformer_dataset import CaptainCook4DTransformer_Dataset
from dataset.utils import get_transformer_loaders

# Build the step-level dataset and its train/validation/test loaders.
try:
    full_dataset = CaptainCook4DTransformer_Dataset(
        dataset_source=DATASET_SOURCE,
        root_dir=ROOT_DIR
    )
    train_loader, val_loader, test_loader = get_transformer_loaders(
        full_dataset,
        batch_size=config["batch_size"],
        seed=config["seed"],
        split_type=SPLIT_TYPE
    )

except Exception as e:
    print(f"❌ Errore: {e}")
    # Re-raise: every later cell needs full_dataset and the loaders, so
    # swallowing the error here would only surface as an unrelated NameError
    # further down and hide the real cause.
    raise

Loading from: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\data\omnivore...
Dataset creato: 200 step completi da 8828 secondi

DATASET INFO [TRANSFORMER - STEP-BASED]
   Total Steps: 200
   Features per second: 1024
   Step duration: min=3s, max=184s, avg=44.14s
FULL DATASET       | Tot: 200    | OK: 117   (58.5%) | ERR: 83    (41.5%) | Ratio: 1:1.4
-------------------------------------------------------------------------------------
TRAIN SET          | Tot: 140    | OK: 82    (58.6%) | ERR: 58    (41.4%) | Ratio: 1:1.4
VALIDATION SET     | Tot: 20     | OK: 11    (55.0%) | ERR: 9     (45.0%) | Ratio: 1:1.2
TEST SET           | Tot: 40     | OK: 24    (60.0%) | ERR: 16    (40.0%) | Ratio: 1:1.5



In [5]:
# V2: when accessing dataset[idx], idx is the index of the STEP
full_dataset.print_item(0)

STEP DATASET ITEM [0]
Features shape:       torch.Size([36, 1024]) (durata_step, n_features)
Step duration:        36 secondi
Label:                0 (OK)
Step ID:              3
Video ID:             1_10
Start time:           11.749052505026611 seconds


In [6]:
from utils.inspect_npz import inspect_npz_from_dataset

# NOTE(review): dataset_folder is assigned but not used in this cell.
dataset_folder = DATASET_SOURCE.value
npz_filename = "1_7_360p.mp4_1s_1s.npz"

# Inspect the .npz file (first n_rows rows of each array)
inspect_npz_from_dataset(full_dataset.features_dir(), npz_filename, n_rows=3)

File: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\data\omnivore\1_7_360p.mp4_1s_1s.npz
Chiavi presenti nel file: ['arr_0']

Array 'arr_0' - shape: (604, 1024), dtype: float32
[[ 0.6910985   0.09298898 -0.6608225  ... -0.75679165  1.2401273
  -0.5683658 ]
 [ 0.40254688 -0.4466254  -0.8645446  ... -1.2709565   0.7917245
  -0.5052321 ]
 [ 0.643613   -0.48683766 -0.88651866 ... -1.0358062   0.658605
  -0.27201462]]


In [5]:
# Initialise the W&B run; the run name, tags and notes are derived from the
# selected feature extractor and split type so runs are distinguishable.
run = wandb.init(
    project="mistake-detection",
    name=f"baseline-LSTM-v3-{DATASET_SOURCE.value}-{SPLIT_TYPE.value}",
    config=config,
    tags=["baseline", "LSTM", DATASET_SOURCE.value],
    notes=f"Baseline LSTM with {DATASET_SOURCE.value} features for mistake detection and {SPLIT_TYPE.value} split"
)

# Status message (emoji restored from its mis-encoded form).
print(f"🚀 W&B Run: {run.name} (ID: {run.id})")

üöÄ W&B Run: baseline-LSTM-v3-omnivore-step_id (ID: toapz0hs)


In [6]:
import torch
import torch.nn as nn

# Instantiate the model with the feature dimension of the selected source
# and move it to the device chosen during project setup.
model = BaselineV3_LSTM(DATASET_SOURCE.input_dims()).to(device)

# Watch the model to track gradients and parameters
wandb.watch(model, log="all", log_freq=10)

In [7]:
# Optimisation hyper-parameters come from the experiment config.
epochs = config["epochs"]
lr = config["learning_rate"]
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Weight of the "positive" class = class "1" = class "error" in the loss:
# - OPTION 1 (disabled): the actual class ratio of the dataset
#     train_pos_weight = train_cnt_0 / train_cnt_1
# - OPTION 2 (active): the ratio used in the paper
train_pos_weight = config["pos_weight"]
print(f"Peso classe positiva: {train_pos_weight}")

# The loss takes pos_weight as a tensor, created on the training device.
train_pos_weight = torch.tensor([train_pos_weight], device=device)
criterion = nn.BCEWithLogitsLoss(pos_weight=train_pos_weight)

Peso classe positiva: 1.5


In [8]:
import json

# Load the per-recording error annotations once; get_error_category_label
# reads them from the module-level `annotations` list.
error_annotations_path = os.path.join(ROOT_DIR, 'data', 'annotation_json', 'error_annotations.json')
# JSON is UTF-8; be explicit so the result does not depend on the platform's
# default text encoding (e.g. cp1252 on Windows).
with open(error_annotations_path, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

def get_error_category_label(step_id, recording_id, start_time) -> list[str] | None:
    # Apri error_annotations.json e cerca la categoria di errore corrispondente
    # a step_id, recording_id e start_time, ritornando le labels corrette, se non trovata ritorna None
    for annotation in annotations:
        if(annotation['recording_id'] == recording_id):
            for step in annotation['step_annotations']:
                if(step['step_id'] == step_id and
                   step['start_time'] == start_time and
                   'errors' in step):
                    return [error['tag'] for error in step['errors']]
    return None

In [9]:
import copy
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
import numpy as np

def _binary_metrics(targets, preds, probs):
    """Compute accuracy, F1, precision, recall and ROC-AUC for binary labels.

    Args:
        targets: Ground-truth labels.
        preds: Thresholded predictions.
        probs: Predicted probabilities (used for AUC, NOT the predictions).

    Returns:
        Dict with keys "accuracy", "f1", "precision", "recall", "auc".
        "auc" falls back to 0.0 when roc_auc_score raises ValueError.
    """
    try:
        auc = roc_auc_score(targets, probs)
    except ValueError:
        auc = 0.0
    return {
        "accuracy": accuracy_score(targets, preds),
        "f1": f1_score(targets, preds, zero_division=0),
        "precision": precision_score(targets, preds, zero_division=0),
        "recall": recall_score(targets, preds, zero_division=0),
        "auc": auc,
    }


def _run_epoch(loader, training):
    """Run one full pass of the global `model` over `loader`.

    When `training` is true the model is put in train mode and the global
    `optimizer` takes one step per batch; otherwise the model is put in eval
    mode and gradients are disabled.

    Each batch must hold a single sample: the model output and the label are
    converted with `.item()`.

    Returns:
        (average loss over batches, targets, predictions, probabilities),
        the last three as numpy arrays with one entry per batch.
    """
    if training:
        model.train()
    else:
        model.eval()

    total_loss = 0
    preds_list, targets_list, probs_list = [], [], []

    with torch.set_grad_enabled(training):
        for inputs, labels, step_ids, video_ids, start_times in loader:
            inputs = inputs.to(device)
            labels = labels.to(device).float()

            probs, logits = model(inputs)

            loss = criterion(logits, labels)
            total_loss += loss.item()

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # One prediction per batch, thresholded at 0.5
            preds_list.append((probs >= 0.5).long().item())
            targets_list.append(labels.item())
            probs_list.append(probs.item())

    avg_loss = total_loss / len(loader)
    return avg_loss, np.array(targets_list), np.array(preds_list), np.array(probs_list)


best_avg_val_loss = np.inf
final_val_acc = 0
final_val_f1 = 0
final_val_precision = 0
final_val_recall = 0
final_val_auc = 0

for epoch in range(epochs):

    # -------------------------
    #        TRAIN
    # -------------------------
    avg_train_loss, train_targets, train_preds, train_probs = _run_epoch(train_loader, training=True)

    train_metrics = _binary_metrics(train_targets, train_preds, train_probs)
    train_acc = train_metrics["accuracy"]
    train_f1 = train_metrics["f1"]
    train_precision = train_metrics["precision"]
    train_recall = train_metrics["recall"]
    train_auc = train_metrics["auc"]

    # -------------------------
    #        EVAL
    # -------------------------
    avg_val_loss, all_targets, all_preds, all_probs = _run_epoch(val_loader, training=False)

    val_metrics = _binary_metrics(all_targets, all_preds, all_probs)
    val_acc = val_metrics["accuracy"]
    val_f1 = val_metrics["f1"]
    val_precision = val_metrics["precision"]
    val_recall = val_metrics["recall"]
    val_auc = val_metrics["auc"]

    # Confusion Matrix
    cm = confusion_matrix(all_targets, all_preds)

    # Log to W&B
    wandb.log({
        # Training metrics
        "train/loss": avg_train_loss,
        "train/accuracy": train_acc,
        "train/f1": train_f1,
        "train/precision": train_precision,
        "train/recall": train_recall,
        "train/auc": train_auc,

        # Validation metrics
        "val/loss": avg_val_loss,
        "val/accuracy": val_acc,
        "val/f1": val_f1,
        "val/precision": val_precision,
        "val/recall": val_recall,
        "val/auc": val_auc,

        # Confusion Matrix
        "val/confusion_matrix": wandb.plot.confusion_matrix(
            probs=None,
            y_true=all_targets,
            preds=all_preds,
            class_names=["No Error", "Error"]
        ),

        "learning_rate": optimizer.param_groups[0]['lr'],
        "epoch": epoch + 1
    })

    print(f"Epoch {epoch+1}/{epochs} "
        f"- Train Loss: {avg_train_loss:.4f} "
        f"- Val Loss: {avg_val_loss:.4f} "
        f"- Val Acc: {val_acc:.4f} "
        f"- Val F1: {val_f1:.4f} "
        f"- Val Precision: {val_precision:.4f} "
        f"- Val Recall: {val_recall:.4f} "
        f"- Val AUC: {val_auc:.4f}")

    # Save the best model: selection is by lowest average validation loss
    if avg_val_loss < best_avg_val_loss:
        best_avg_val_loss = avg_val_loss
        final_val_acc = val_acc
        final_val_f1 = val_f1
        final_val_precision = val_precision
        final_val_recall = val_recall
        final_val_auc = val_auc
        # The loss value is part of the file name, so every improvement
        # writes a new file under ROOT_DIR/checkpoints.
        checkpoint_path = os.path.join(ROOT_DIR, "checkpoints", f"best_model_avg_val_loss_{best_avg_val_loss:.4f}.pth")
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_f1': val_f1,
            'val_acc': val_acc,
            'val_auc': val_auc,
        }, checkpoint_path)

        # The checkpoint is uploaded as a W&B artifact only when running on Colab
        if IS_COLAB:
            artifact = wandb.Artifact(
                name=f"model-{run.id}",
                type="model",
                description=f"Best model with avg_val_loss={best_avg_val_loss:.4f}",
                metadata={
                    "epoch": epoch + 1,
                    "val_f1": val_f1,
                    "val_acc": val_acc,
                    "val_auc": val_auc,
                    "architecture": config["architecture"]
                }
            )
            artifact.add_file(checkpoint_path)
            wandb.log_artifact(artifact)

        print(f"✅ Nuovo miglior modello salvato! avg_val_loss: {best_avg_val_loss:.4f}")

print("\n🎉 Training completato!")
print(f"Miglior avg_val_loss Score: {best_avg_val_loss:.4f}")

Epoch 1/1 - Train Loss: 0.8348 - Val Loss: 0.8411 - Val Acc: 0.6500 - Val F1: 0.5333 - Val Precision: 0.6667 - Val Recall: 0.4444 - Val AUC: 0.8182
‚úÖ Nuovo miglior modello salvato! avg_val_loss: 0.8411

üéâ Training completato!
Miglior avg_val_loss Score: 0.8411


# Test Evaluation

In [12]:
# -------------------------
#        TEST
# -------------------------
# Load the best saved model.
# NOTE: weights_only=False unpickles arbitrary Python objects; that is
# acceptable here only because the checkpoint was written by the training
# cell of this notebook. Do not point checkpoint_path at untrusted files.
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
print(f"✅ Modello migliore caricato da: {checkpoint_path}")
print(f"   Epoch: {checkpoint['epoch']}, Val F1: {checkpoint['val_f1']:.4f}, Val Acc: {checkpoint['val_acc']:.4f}")

model.eval()
total_test_loss = 0
test_preds_list = []
test_targets_list = []
test_probs_list = []
test_error_categories_list = []

with torch.no_grad():
    for inputs, labels, step_ids, video_ids, start_times in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device).float()

        probs, logits = model(inputs)

        test_loss = criterion(logits, labels)
        total_test_loss += test_loss.item()

        # One prediction per batch, thresholded at 0.5 and converted to a Python scalar
        pred = (probs >= 0.5).long().item()

        test_preds_list.append(pred)
        test_targets_list.append(labels.item())
        test_probs_list.append(probs.item())

        # Look up the annotated error tags (None when not found) for each sample
        for step_id, video_id, start_time in zip(step_ids, video_ids, start_times):
            error_categories = get_error_category_label(step_id.item(), video_id, start_time.item())
            test_error_categories_list.append(error_categories)

# The contingency-table cell zips these two lists; zip would silently truncate
# on a length mismatch, so fail loudly here instead.
assert len(test_error_categories_list) == len(test_preds_list), \
    "predictions and error-category lists are misaligned"

# Convert lists to numpy
test_preds = np.array(test_preds_list)
test_targets = np.array(test_targets_list)
test_probs = np.array(test_probs_list)

avg_test_loss = total_test_loss / len(test_loader)
test_acc = accuracy_score(test_targets, test_preds)
test_f1 = f1_score(test_targets, test_preds, zero_division=0)
test_precision = precision_score(test_targets, test_preds, zero_division=0)
test_recall = recall_score(test_targets, test_preds, zero_division=0)

# Test AUC (uses probabilities); falls back to 0.0 when roc_auc_score raises ValueError
try:
    test_auc = roc_auc_score(test_targets, test_probs)
except ValueError:
    test_auc = 0.0

# Confusion matrix
cm_test = confusion_matrix(test_targets, test_preds)

# Log to W&B
wandb.log({
    "test/loss": avg_test_loss,
    "test/accuracy": test_acc,
    "test/f1": test_f1,
    "test/precision": test_precision,
    "test/recall": test_recall,
    "test/auc": test_auc,

    # Confusion Matrix (dataset labels)
    "test/confusion_matrix": wandb.plot.confusion_matrix(
        probs=None,
        y_true=test_targets,
        preds=test_preds,
        class_names=["No Error", "Error"]
    ),
})

print(f"\n📊 Test Results:")
print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1: {test_f1:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test AUC: {test_auc:.4f}")
print(f"\n📈 Confusion Matrix (Dataset Labels):")
print(cm_test)

‚úÖ Modello migliore caricato da: c:\Users\marco\Desktop\Marco\Programmazione\C\EsPoli\Advanced Machine Learning\MistakeDetection\checkpoints\best_model_avg_val_loss_0.8411.pth
   Epoch: 1, Val F1: 0.5333, Val Acc: 0.6500

üìä Test Results:
Test Loss: 0.8247
Test Accuracy: 0.6000
Test F1: 0.4286
Test Precision: 0.5000
Test Recall: 0.3750
Test AUC: 0.6589

üìà Confusion Matrix (Dataset Labels):
[[18  6]
 [10  6]]


In [13]:
import pandas as pd

def _prediction_category_rows(preds, error_categories):
    """Expand (prediction, error tags) pairs into one table row per tag.

    A sample whose tags are None or empty yields a single 'No Error' row; a
    sample with several tags yields one row per tag, so such samples are
    counted more than once in the resulting table.
    """
    rows = []
    for pred, error_cats in zip(preds, error_categories):
        pred_label = "Error Predicted" if pred == 1 else "No Error Predicted"
        for cat in (error_cats or ['No Error']):
            rows.append({
                'Prediction': pred_label,
                'Actual Error Category': cat
            })
    return rows


# Prepare the data for the table
data_for_table = _prediction_category_rows(test_preds_list, test_error_categories_list)

# Build the DataFrame
df = pd.DataFrame(data_for_table)

# Contingency table (counts, with row/column totals)
contingency_table = pd.crosstab(
    df['Actual Error Category'],
    df['Prediction'],
    margins=True,
    margins_name='Total'
)

print("📊 Tabella di Contingenza: Predizioni vs Categorie di Errore Reali\n")
print(contingency_table)
print("\n")

# Same table as percentages, normalised within each prediction column
contingency_table_pct = pd.crosstab(
    df['Actual Error Category'],
    df['Prediction'],
    normalize='columns'
) * 100

print("📊 Tabella Percentuali (per colonna):\n")
print(contingency_table_pct.round(2))

# Log the count table to W&B
wandb.log({
    "test/predictions_vs_error_categories": wandb.Table(dataframe=contingency_table.reset_index())
})

üìä Tabella di Contingenza: Predizioni vs Categorie di Errore Reali

Prediction             Error Predicted  No Error Predicted  Total
Actual Error Category                                            
Measurement Error                    0                   1      1
No Error                             6                  18     24
Order Error                          3                   4      7
Other                                0                   1      1
Preparation Error                    1                   2      3
Technique Error                      0                   1      1
Temperature Error                    3                   2      5
Timing Error                         3                   3      6
Total                               16                  32     48


üìä Tabella Percentuali (per colonna):

Prediction             Error Predicted  No Error Predicted
Actual Error Category                                     
Measurement Error                 0.00     

In [14]:
# Close the W&B run
wandb.finish()
# Status message (emoji restored from its mis-encoded form).
print("🏁 W&B run terminato")

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch,‚ñÅ
learning_rate,‚ñÅ
test/accuracy,‚ñÅ‚ñÅ
test/annotation_accuracy,‚ñÅ
test/annotation_f1,‚ñÅ
test/annotation_precision,‚ñÅ
test/annotation_recall,‚ñÅ
test/auc,‚ñÅ‚ñÅ
test/f1,‚ñÅ‚ñÅ
test/loss,‚ñÅ‚ñÅ

0,1
epoch,1
learning_rate,1e-05
test/accuracy,0.6
test/annotation_accuracy,0.6
test/annotation_f1,0.42857
test/annotation_precision,0.5
test/annotation_recall,0.375
test/auc,0.65885
test/f1,0.42857
test/loss,0.82469


üèÅ W&B run terminato
