# 03 - Model Training and Evaluation

This notebook demonstrates training the transformer model and evaluating it with MC Dropout uncertainty estimates.

## Contents
1. Model architecture overview
2. Training the model
3. MC Dropout uncertainty quantification
4. Evaluation and visualisation

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score

from threatsim.data import get_dataloaders
from threatsim.models import create_model, mc_dropout_predict
from threatsim.utils import set_seed, get_device

# Fix the seed before anything stochastic runs. set_seed is a threatsim helper;
# which RNGs it seeds is not visible from this notebook.
set_seed(42)

# Whitegrid theme bundled with matplotlib (the 'seaborn-v0_8-*' names need matplotlib >= 3.6).
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Setup: pick the compute device once; later cells move the model and batches onto it.
device = get_device()
print(f"Using device: {device}")

# Load data: get_dataloaders hands back the three split loaders plus a class weight
# (a tensor, since .item() is called on it below).
loaders_and_weight = get_dataloaders(window_size=50, batch_size=32)
train_loader, val_loader, test_loader, class_weight = loaders_and_weight

# len() of a loader is its number of batches, not its number of samples.
print(f"Train batches: {len(train_loader)}, Val: {len(val_loader)}, Test: {len(test_loader)}")
print(f"Class weight: {class_weight.item():.2f}")

## 1. Model Architecture

Our transformer architecture:
- Linear projection to model dimension (64)
- Sinusoidal positional encoding
- 2 transformer encoder layers (4 attention heads)
- Mean pooling + classification head
- Dropout throughout for MC Dropout at inference

In [None]:
# Create model and place it on the selected device in one step.
# window_size matches the value passed to get_dataloaders earlier in the notebook.
model = create_model(window_size=50, d_model=64, num_layers=2, dropout=0.2).to(device)

# Count parameters: only tensors that will receive gradients are included.
trainable_sizes = [param.numel() for param in model.parameters() if param.requires_grad]
num_params = sum(trainable_sizes)

print(f"Model parameters: {num_params:,}")
print(f"\nArchitecture:\n{model}")

## 2. Training

In [None]:
# Training setup
# AdamW over all model parameters with a fixed learning rate.
optimiser = torch.optim.AdamW(params=model.parameters(), lr=1e-3)
# reduction='none' keeps one BCE value per element so weighted_loss can re-weight
# each sample before averaging.
criterion = torch.nn.BCELoss(reduction='none')

def weighted_loss(predictions, labels):
    """Class-weighted mean binary cross-entropy.

    Elements whose label equals 1 are scaled by the notebook-level
    ``class_weight``; all other elements keep weight 1. The per-element losses
    come from the notebook-level ``criterion`` (built with ``reduction='none'``
    in this cell), and the weighted losses are averaged over every element.

    Args:
        predictions: Model outputs passed straight to ``criterion``
            (presumably probabilities in [0, 1], as BCELoss expects — confirm
            against the model's output layer).
        labels: Target tensor compared against 1 to pick the weight; must be
            acceptable to ``criterion`` alongside ``predictions``.

    Returns:
        Scalar tensor holding the weighted mean loss.
    """
    positive_weight = class_weight.to(device)
    per_sample_weight = torch.where(labels == 1, positive_weight, torch.ones_like(labels))
    per_sample_loss = criterion(predictions, labels)
    return torch.mean(per_sample_loss * per_sample_weight)

# Training loop
# Per-epoch mean batch losses are kept for the history plot in the next cell.
n_epochs = 30
train_losses, val_losses = [], []

for epoch in range(1, n_epochs + 1):
    # --- Optimisation pass (dropout active) ---
    model.train()
    running_train = 0
    for windows, labels in train_loader:
        windows = windows.to(device)
        labels = labels.to(device)
        optimiser.zero_grad()
        batch_loss = weighted_loss(model(windows), labels)
        batch_loss.backward()
        optimiser.step()
        running_train += batch_loss.item()
    # Average over batches (not samples), so a short final batch counts the same as a full one.
    train_losses.append(running_train / len(train_loader))

    # --- Validation pass (eval mode, no gradient tracking) ---
    model.eval()
    val_batch_losses = []
    with torch.no_grad():
        for windows, labels in val_loader:
            windows = windows.to(device)
            labels = labels.to(device)
            val_batch_losses.append(weighted_loss(model(windows), labels).item())
    val_losses.append(sum(val_batch_losses) / len(val_loader))

    # Report every fifth epoch to keep the cell output short.
    if epoch % 5 == 0:
        print(f"Epoch {epoch:2d} | Train Loss: {train_losses[-1]:.4f} | Val Loss: {val_losses[-1]:.4f}")

In [None]:
# Plot training history
# The training loop counts epochs from 1, so plot against explicit 1-based epoch
# numbers; matplotlib's default x values would start at 0 and be off by one
# relative to the printed training log.
train_epochs = range(1, len(train_losses) + 1)
val_epochs = range(1, len(val_losses) + 1)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_epochs, train_losses, 'b-', label='Training')
ax.plot(val_epochs, val_losses, 'r-', label='Validation')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training History')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## 3. MC Dropout Uncertainty Quantification

Monte Carlo Dropout: keep dropout active at inference time and run multiple stochastic forward passes. The mean across passes is used as the prediction, and the spread (standard deviation) across passes serves as the uncertainty estimate.

In [None]:
# Collect predictions with uncertainty
all_labels = []
all_means = []
all_stds = []

# Inference only: disable autograd, as the validation loop does. This is a no-op
# if mc_dropout_predict already does so internally (not visible from this
# notebook); otherwise it avoids building a graph for each of the 30 passes and
# keeps the .numpy() conversions below valid.
with torch.no_grad():
    for windows, labels in test_loader:
        windows = windows.to(device)
        # Per the unpacking, the helper returns a (mean, std) pair of tensors;
        # presumably it enables dropout itself — confirm in threatsim.models.
        mean, std = mc_dropout_predict(model, windows, n_samples=30)
        all_labels.append(labels.numpy())  # labels are never moved off the CPU
        all_means.append(mean.cpu().numpy())
        all_stds.append(std.cpu().numpy())

# Stitch the per-batch arrays into one array per quantity, aligned by sample.
test_labels = np.concatenate(all_labels)
test_preds = np.concatenate(all_means)
test_uncertainty = np.concatenate(all_stds)

print(f"Test samples: {len(test_labels)}")
print(f"Mean uncertainty: {test_uncertainty.mean():.4f}")

## 4. Evaluation

In [None]:
# Classification report
# Threshold the MC-mean probabilities at 0.5 to get hard class predictions.
binary_preds = (test_preds >= 0.5).astype(int)
print("Classification Report:")
# Pass labels=[0, 1] explicitly: without it, classification_report raises a
# ValueError when only one class appears in labels and predictions, because the
# two target_names no longer match the inferred classes. zero_division=0 keeps
# the reported value at 0 for an absent class and silences the warning.
print(classification_report(
    test_labels,
    binary_preds,
    labels=[0, 1],
    target_names=['Normal', 'Anomaly'],
    zero_division=0,
))

# ROC-AUC is undefined when the test set contains a single class, so skip it then.
if len(np.unique(test_labels)) > 1:
    print(f"ROC-AUC: {roc_auc_score(test_labels, test_preds):.4f}")

In [None]:
# Uncertainty analysis
# Split the per-sample MC Dropout std by whether the thresholded prediction was right.
correct = binary_preds == test_labels
incorrect = ~correct
unc_correct = test_uncertainty[correct]
unc_incorrect = test_uncertainty[incorrect]

# Either group can be empty (e.g. a perfect classifier has no incorrect
# predictions); .mean() on an empty array yields nan plus a RuntimeWarning,
# so report "n/a" for an empty group instead.
mean_correct = f"{unc_correct.mean():.4f}" if unc_correct.size else "n/a"
mean_incorrect = f"{unc_incorrect.mean():.4f}" if unc_incorrect.size else "n/a"

print("\nUncertainty Analysis:")
print(f"  Average uncertainty (correct predictions): {mean_correct}")
print(f"  Average uncertainty (incorrect predictions): {mean_incorrect}")

# Plot: overlaid histograms of the two groups (an empty group draws nothing).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(unc_correct, bins=30, alpha=0.6, label=f'Correct (n={correct.sum()})', color='green')
ax.hist(unc_incorrect, bins=30, alpha=0.6, label=f'Incorrect (n={incorrect.sum()})', color='red')
ax.set_xlabel('Uncertainty (Std Dev)')
ax.set_ylabel('Count')
ax.set_title('Uncertainty Distribution: Correct vs Incorrect')
ax.legend()
plt.show()

In [None]:
# Predictions with uncertainty bands
# Only the first n_show test samples are drawn so the figure stays readable.
n_show = min(200, len(test_labels))
x = np.arange(n_show)
shown_preds = test_preds[:n_show]
shown_sigma = test_uncertainty[:n_show]
shown_labels = test_labels[:n_show]

# Band of +/- 2 standard deviations around the mean prediction, clipped to the
# valid probability range. Labelled '95% CI' below, which holds only approximately
# (it assumes roughly Gaussian spread across the MC passes).
band_lower = np.clip(shown_preds - 2*shown_sigma, 0, 1)
band_upper = np.clip(shown_preds + 2*shown_sigma, 0, 1)

fig, axes = plt.subplots(2, 1, figsize=(14, 6), sharex=True)
ax1, ax2 = axes

# Top panel: predicted probability with its uncertainty band and the 0.5 decision threshold.
ax1.fill_between(x, band_lower, band_upper, alpha=0.3, color='blue', label='95% CI')
ax1.plot(x, shown_preds, 'b-', linewidth=1, label='Prediction')
ax1.axhline(0.5, color='gray', linestyle='--', alpha=0.5)
ax1.set_title('Predictions with MC Dropout Uncertainty')
ax1.set_ylabel('Probability')
ax1.legend()

# Bottom panel: ground-truth labels on the shared x-axis.
ax2.fill_between(x, 0, shown_labels, alpha=0.5, color='red')
ax2.set_ylabel('True Label')
ax2.set_xlabel('Sample')

plt.tight_layout()
plt.show()

## Summary

We have:
1. Trained a transformer for anomaly detection
2. Applied MC Dropout for uncertainty quantification
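3. Compared MC Dropout uncertainty on correct vs. incorrect predictions — a useful uncertainty estimate should be higher when the model is wrong; check the averages and histogram above to confirm this holds for your run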

For a complete training run with more epochs and proper checkpointing, use:
```bash
python scripts/train.py
python scripts/evaluate.py
```