In [1]:
#!/usr/bin/env python
# coding: utf-8
#
# Autoencoder Anomaly Detection Champion
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore", category=UserWarning)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Autoencoder(nn.Module):
    """A simple Autoencoder for anomaly detection."""
    def __init__(self, input_dim, encoding_dim=32):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, encoding_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 64), nn.ReLU(),
            nn.Linear(64, 128), nn.ReLU(),
            nn.Linear(128, input_dim)
        )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

def load_and_prepare_data(config: Dict[str, Any]) -> (np.ndarray, np.ndarray):
    """Loads and prepares the flattened sequence data."""
    print("─" * 60 + "\n1. Loading and preparing sequence data...")
    df = pd.read_csv(config['csv_path']).loc[:, ~pd.read_csv(config['csv_path']).columns.duplicated()]
    df[config['quarter_col']] = pd.to_datetime(df[config['quarter_col']])
    df.sort_values([config['id_col'], config['quarter_col']], inplace=True)
    df = df.dropna()
    
    feat_cols = [c for c in df.columns if c not in config['meta_cols']]
    X, y = [], []
    for _, g in df.groupby(config['id_col']):
        g = g.sort_values(config['quarter_col'])
        arr, lbl = g[feat_cols].to_numpy(), g[config['target_col']].to_numpy()
        for i in range(config['lags'], len(g)):
            X.append(arr[i - config['lags']:i].ravel())
            y.append(lbl[i])
            
    return np.asarray(X), np.asarray(y)

def run_autoencoder_championship(config: Dict[str, Any]):
    """Orchestrates the entire Autoencoder experiment."""
    X_all, y_all = load_and_prepare_data(config)
    config['n_features'] = X_all.shape[1]
    
    # --- Step 1: Split data chronologically ---
    print("\n2. Splitting data into Train/History (60%), Validation (20%), and Test (20%) sets...")
    n = len(y_all)
    train_end = int(n * 0.6)
    val_end = int(n * 0.8)
    
    X_train, y_train = X_all[:train_end], y_all[:train_end]
    X_val, y_val = X_all[train_end:val_end], y_all[train_end:val_end]
    X_test, y_test = X_all[val_end:], y_all[val_end:]
    
    # --- Step 2: Train the Autoencoder ONLY on healthy data ---
    print("\n3. Training Autoencoder on 'healthy' data from the Train set...")
    X_train_healthy = X_train[y_train == 0]
    
    # Fit scaler ONLY on this healthy training data
    scaler = StandardScaler().fit(X_train_healthy)
    X_train_healthy_std = scaler.transform(X_train_healthy)
    
    train_loader = DataLoader(TensorDataset(torch.tensor(X_train_healthy_std, dtype=torch.float32)), 
                              batch_size=config['batch_size'], shuffle=True)
    
    model = Autoencoder(input_dim=config['n_features']).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    loss_fn = nn.MSELoss()

    for epoch in range(config['epochs']):
        for [xb] in train_loader:
            xb = xb.to(DEVICE)
            recon = model(xb)
            loss = loss_fn(recon, xb)
            optimizer.zero_grad(); loss.backward(); optimizer.step()
    print("   Training complete.")
    
    # --- Step 3: Find the best anomaly threshold on the Validation Set ---
    print("\n4. Finding optimal anomaly threshold on Validation Set...")
    model.eval()
    with torch.no_grad():
        X_val_std = scaler.transform(X_val)
        val_tensor = torch.tensor(X_val_std, dtype=torch.float32).to(DEVICE)
        reconstructions = model(val_tensor)
        val_errors = torch.mean((val_tensor - reconstructions)**2, axis=1).cpu().numpy()

    best_f1, best_thresh = 0, 0
    # Use percentiles of the error distribution as candidate thresholds
    for q in np.arange(80, 100, 0.5):
        threshold = np.percentile(val_errors, q)
        preds = (val_errors >= threshold).astype(int)
        current_f1 = f1_score(y_val, preds)
        if current_f1 > best_f1:
            best_f1, best_thresh = current_f1, threshold
    print(f"   Best threshold found: {best_thresh:.4f} (yields F1={best_f1:.4f} on Val Set)")

    # --- Step 4: Final Exam on the Test Set ---
    print("\n5. Final evaluation on unseen Test Set...")
    with torch.no_grad():
        X_test_std = scaler.transform(X_test)
        test_tensor = torch.tensor(X_test_std, dtype=torch.float32).to(DEVICE)
        reconstructions = model(test_tensor)
        test_errors = torch.mean((test_tensor - reconstructions)**2, axis=1).cpu().numpy()
    
    final_preds = (test_errors >= best_thresh).astype(int)
    
    final_f1 = f1_score(y_test, final_preds, zero_division=0)
    final_prec = precision_score(y_test, final_preds, zero_division=0)
    final_rec = recall_score(y_test, final_preds, zero_division=0)
    final_auc = roc_auc_score(y_test, test_errors)
    final_gmean = np.sqrt(final_prec * final_rec) if final_prec > 0 and final_rec > 0 else 0

    print(f"\n[Autoencoder Anomaly Detection] Final Test Set Performance:")
    print(f"  Used Threshold = {best_thresh:.4f}")
    print(f"  F1-Score       = {final_f1:.4f}")
    print(f"  AUC            = {final_auc:.4f}")
    print(f"  G-Mean         = {final_gmean:.4f}")
    print(f"  Precision      = {final_prec:.4f}")
    print(f"  Recall         = {final_rec:.4f}")
    print("\nAutoencoder Championship Complete!")
    
    
if __name__ == "__main__":
    CONFIG = {
        "csv_path": r'cvm_indicators_dataset_2011-2021.csv',
        "id_col": "ID", "quarter_col": "QUARTER", "target_col": "LABEL",
        "meta_cols": ["ID", "QUARTER", "LABEL"],
        "lags": 4,
        
        # Autoencoder specific parameters
        "epochs": 75, # Needs more epochs to learn the distribution well
        "batch_size": 256,
        "lr": 0.001,
    }
    
    run_autoencoder_championship(config=CONFIG)

────────────────────────────────────────────────────────────
1. Loading and preparing sequence data...

2. Splitting data into Train/History (60%), Validation (20%), and Test (20%) sets...

3. Training Autoencoder on 'healthy' data from the Train set...
   Training complete.

4. Finding optimal anomaly threshold on Validation Set...
   Best threshold found: 0.0889 (yields F1=0.0618 on Val Set)

5. Final evaluation on unseen Test Set...

[Autoencoder Anomaly Detection] Final Test Set Performance:
  Used Threshold = 0.0889
  F1-Score       = 0.2150
  AUC            = 0.7054
  G-Mean         = 0.2372
  Precision      = 0.1511
  Recall         = 0.3725

Autoencoder Championship Complete!
