# Hyperparameter Tuning Summary

This notebook aggregates and summarizes results from all hyperparameter tuning studies (notebooks 10-16).

**Contents**:
1. Summary table of best results per model
2. Top 3 trials per model to assess optimization quality
3. Performance spread across trials
4. Runtime comparison
5. Best hyperparameters for each model
6. Results grouped by task

**Outputs** (file names gain a `_quick` suffix when `QUICK_MODE` is enabled):
- Summary table: `outputs/hyperparams/summary.csv`
- Top trials table: `outputs/hyperparams/top_trials.csv`

In [1]:
import os
import json
import pickle
from pathlib import Path
import pandas as pd
import numpy as np

banner_rule = "=" * 70
print(banner_rule)
print("Hyperparameter Tuning Summary")
print(banner_rule)

# Quick mode is opt-in through the QUICK_MODE environment variable; any of
# 'true', '1' or 'yes' (case-insensitive) enables it. It only affects the
# suffix appended to the result/study file names below.
QUICK_MODE = os.environ.get('QUICK_MODE', 'False').lower() in {'true', '1', 'yes'}
MODE_SUFFIX = "_quick" if QUICK_MODE else ""
print(f"Mode: {'QUICK' if QUICK_MODE else 'FULL'}")

# All inputs are read relative to the shared outputs directory.
OUTPUT_DIR = Path('../outputs')
HYPERPARAM_DIR = OUTPUT_DIR / 'hyperparams'
STUDY_DIR = OUTPUT_DIR / 'optuna_studies'

# (display name, task, file-name stem) for every tuned model, in report order.
model_specs = (
    ('XGBoost', 'multiclass', 'xgboost'),
    ('LSTM', 'multiclass', 'lstm'),
    ('LSTM-FCN', 'multiclass', 'lstm_fcn'),
    ('CNN-Transformer', 'multiclass', 'cnn_transformer'),
    ('TransKal', 'multiclass', 'transkal'),
    ('LSTM-Autoencoder', 'binary', 'lstm_autoencoder'),
    ('Conv-Autoencoder', 'binary', 'conv_autoencoder'),
)

# Model configurations: each entry names the best-params JSON and the pickled
# study file for that model, both carrying the mode suffix.
MODELS = {
    display_name: {
        'task': task,
        'file': f'{stem}_best{MODE_SUFFIX}.json',
        'study': f'{stem}_study{MODE_SUFFIX}.pkl',
    }
    for display_name, task, stem in model_specs
}

print(f"Looking for {len(MODELS)} model results...")
print(banner_rule)

Hyperparameter Tuning Summary
Mode: QUICK
Looking for 7 model results...


## Load Results

In [2]:
# results: model name -> parsed best-result JSON
# studies: model name -> unpickled study object (only for models whose
#          study file exists next to a loaded JSON result)
# missing: model names whose JSON result file was not found
results = {}
studies = {}
missing = []

for model_name, config in MODELS.items():
    json_path = HYPERPARAM_DIR / config['file']
    study_path = STUDY_DIR / config['study']

    if not json_path.exists():
        missing.append(model_name)
        print(f"✗ Missing {model_name} ({config['file']})")
        continue

    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # default text encoding.
    with open(json_path, encoding='utf-8') as f:
        results[model_name] = json.load(f)
    print(f"✓ Loaded {model_name}")

    if study_path.exists():
        # SECURITY: pickle.load can execute arbitrary code. Only load study
        # files produced by this project's own tuning notebooks.
        with open(study_path, 'rb') as f:
            studies[model_name] = pickle.load(f)
    else:
        # Say so explicitly: without the study, the trial-level sections
        # further down will leave this model out.
        print(f"  ! No study file for {model_name} ({config['study']}); trial-level analysis will skip it")

print(f"\nLoaded {len(results)}/{len(MODELS)} models")
if missing:
    print(f"Missing: {', '.join(missing)}")

✓ Loaded XGBoost
✓ Loaded LSTM
✓ Loaded LSTM-FCN
✓ Loaded CNN-Transformer
✓ Loaded TransKal


✓ Loaded LSTM-Autoencoder
✓ Loaded Conv-Autoencoder

Loaded 7/7 models


## Summary Table

In [3]:
if not results:
    print("No results to summarize.")
else:
    f1_column = 'Best F1 (weighted)'
    summary_csv_path = HYPERPARAM_DIR / f'summary{MODE_SUFFIX}.csv'

    # One row per loaded model; absent fields fall back to neutral defaults.
    summary_data = []
    for model_name, result in results.items():
        timing = result.get('timing', {})
        # Prefer the 'run_fraction' key and fall back to the legacy
        # 'data_fraction' key when it is absent.
        fraction = result.get('run_fraction', result.get('data_fraction', 0))
        optimization_seconds = result.get('optimization_time_seconds', 0)
        summary_row = {
            'Model': model_name,
            'Task': result.get('task', 'unknown'),
            f1_column: result.get('best_f1_weighted', np.nan),
            'Trials': result.get('num_trials', 0),
            'Run Fraction': f"{fraction*100:.0f}%",
            'Max Epochs': result.get('max_epochs', 'N/A'),
            'Optimization Time': f"{optimization_seconds:.1f}s",
            'Total Runtime': timing.get('total_runtime_formatted', 'N/A'),
        }
        summary_data.append(summary_row)

    # Best-scoring model first.
    summary_df = pd.DataFrame(summary_data).sort_values(f1_column, ascending=False)

    rule = "=" * 70
    print("\n" + rule)
    print("SUMMARY: Best Results by Model")
    print(rule)

    # The printed copy shows F1 as a percentage; the saved frame keeps the
    # raw score.
    display_df = summary_df.copy()
    display_df[f1_column] = [f"{score*100:.2f}%" for score in display_df[f1_column]]
    print(display_df.to_string(index=False))

    # Save to CSV
    summary_df.to_csv(summary_csv_path, index=False)
    print(f"\n✓ Saved to {summary_csv_path}")


SUMMARY: Best Results by Model
           Model       Task Best F1 (weighted)  Trials Run Fraction Max Epochs Optimization Time Total Runtime
        TransKal multiclass             97.16%       5           1%         10            299.7s        5m 17s
LSTM-Autoencoder     binary             96.40%       5           1%         10             23.2s        0m 30s
 CNN-Transformer multiclass             94.91%       5           1%         10            344.7s        5m 52s
Conv-Autoencoder     binary             91.63%       5           1%         10             31.3s        0m 37s
            LSTM multiclass             91.50%       5           1%         10            225.6s        3m 52s
        LSTM-FCN multiclass             91.46%       5           1%         10            172.7s         3m 0s
         XGBoost multiclass             87.07%       5           1%        N/A            232.3s        3m 59s

✓ Saved to ../outputs/hyperparams/summary_quick.csv


## Top 3 Trials per Model

Lists the three best-scoring trials for each model, showing how far the runner-up configurations fall behind the best one.

In [4]:
if studies:
    # Fix the column set up front: if no trial in any study has a value, the
    # frame is still well-formed and the 'Model' lookup below cannot raise
    # KeyError on a column-less DataFrame.
    top_trial_columns = ['Model', 'Rank', 'F1 Score', 'Trial #']
    top_trials_data = []

    for model_name, study in studies.items():
        # Keep every trial that carries an objective value, best first.
        # NOTE(review): this filters on the value, not on the trial state; if
        # the studies use pruning, pruned trials may also carry a value.
        # Confirm whether a state filter is wanted here.
        completed_trials = [t for t in study.trials if t.value is not None]
        sorted_trials = sorted(completed_trials, key=lambda t: t.value, reverse=True)

        # Get top 3
        for rank, trial in enumerate(sorted_trials[:3], 1):
            top_trials_data.append({
                'Model': model_name,
                'Rank': rank,
                'F1 Score': trial.value,
                'Trial #': trial.number,
            })

    top_trials_df = pd.DataFrame(top_trials_data, columns=top_trial_columns)

    print("\n" + "="*70)
    print("TOP 3 TRIALS PER MODEL")
    print("="*70)

    if top_trials_df.empty:
        print("\nNo trials with an objective value were found in the loaded studies.")

    # Per-model listing, in the order the studies were loaded.
    for model_name in studies.keys():
        model_trials = top_trials_df[top_trials_df['Model'] == model_name]
        if len(model_trials) > 0:
            print(f"\n{model_name}:")
            # Rows were appended best-first, so the first row is the reference.
            best = model_trials.iloc[0]['F1 Score']
            for _, row in model_trials.iterrows():
                # Gap to the best trial, in percentage points.
                diff = (best - row['F1 Score']) * 100
                gap_note = f'(-{diff:.2f}%)' if row['Rank'] > 1 else '(best)'
                print(f"  #{row['Rank']}: F1={row['F1 Score']*100:.2f}% (trial {row['Trial #']}) {gap_note}")

    # Save to CSV (header-only file when there were no valued trials)
    top_trials_path = HYPERPARAM_DIR / f'top_trials{MODE_SUFFIX}.csv'
    top_trials_df.to_csv(top_trials_path, index=False)
    print(f"\n✓ Saved to {top_trials_path}")
else:
    print("No Optuna studies found.")


TOP 3 TRIALS PER MODEL

XGBoost:
  #1: F1=87.07% (trial 4) (best)
  #2: F1=86.27% (trial 1) (-0.80%)
  #3: F1=84.44% (trial 0) (-2.63%)

LSTM:
  #1: F1=91.50% (trial 3) (best)
  #2: F1=90.04% (trial 4) (-1.46%)
  #3: F1=88.81% (trial 1) (-2.69%)

LSTM-FCN:
  #1: F1=91.46% (trial 1) (best)
  #2: F1=91.30% (trial 2) (-0.16%)
  #3: F1=90.60% (trial 4) (-0.86%)

CNN-Transformer:
  #1: F1=94.91% (trial 2) (best)
  #2: F1=94.81% (trial 3) (-0.10%)
  #3: F1=94.32% (trial 4) (-0.59%)

TransKal:
  #1: F1=97.16% (trial 3) (best)
  #2: F1=96.64% (trial 0) (-0.52%)
  #3: F1=91.87% (trial 4) (-5.28%)

LSTM-Autoencoder:
  #1: F1=96.40% (trial 2) (best)
  #2: F1=95.26% (trial 1) (-1.14%)
  #3: F1=93.88% (trial 4) (-2.52%)

Conv-Autoencoder:
  #1: F1=91.63% (trial 1) (best)
  #2: F1=91.35% (trial 2) (-0.29%)
  #3: F1=90.79% (trial 3) (-0.84%)



✓ Saved to ../outputs/hyperparams/top_trials_quick.csv


## Performance Spread Analysis

Analyzes how much variation exists between the best and worst trials, and the distribution of results.

In [5]:
if studies:
    # Fix the column set up front so that sorting on 'Best F1' still works
    # when no study contributed any valued trial (a column-less DataFrame
    # would raise KeyError).
    spread_columns = [
        'Model', 'Best F1', 'Worst F1', 'Mean F1', 'Std F1',
        'Spread (Best-Worst)', 'Total Trials',
    ]
    spread_data = []

    for model_name, study in studies.items():
        # Only trials that carry an objective value take part in the statistics.
        values = [t.value for t in study.trials if t.value is not None]
        if not values:
            continue

        best_value, worst_value = max(values), min(values)
        spread_data.append({
            'Model': model_name,
            'Best F1': best_value,
            'Worst F1': worst_value,
            'Mean F1': np.mean(values),
            # np.std defaults to the population standard deviation (ddof=0).
            'Std F1': np.std(values),
            'Spread (Best-Worst)': best_value - worst_value,
            'Total Trials': len(values),
        })

    spread_df = pd.DataFrame(spread_data, columns=spread_columns)
    spread_df = spread_df.sort_values('Best F1', ascending=False)

    print("\n" + "="*70)
    print("PERFORMANCE SPREAD ANALYSIS")
    print("="*70)

    if spread_df.empty:
        print("\nNo trials with an objective value were found in the loaded studies.")

    # Scores are stored as fractions and printed as percentages.
    for _, row in spread_df.iterrows():
        print(f"\n{row['Model']}:")
        print(f"  Best:  {row['Best F1']*100:.2f}%")
        print(f"  Worst: {row['Worst F1']*100:.2f}%")
        print(f"  Mean:  {row['Mean F1']*100:.2f}% ± {row['Std F1']*100:.2f}%")
        print(f"  Spread: {row['Spread (Best-Worst)']*100:.2f}% over {row['Total Trials']} trials")
else:
    print("No Optuna studies found.")


PERFORMANCE SPREAD ANALYSIS

TransKal:
  Best:  97.16%
  Worst: 84.13%
  Mean:  91.04% ± 5.46%
  Spread: 13.02% over 5 trials

LSTM-Autoencoder:
  Best:  96.40%
  Worst: 89.74%
  Mean:  93.51% ± 2.34%
  Spread: 6.66% over 5 trials

CNN-Transformer:
  Best:  94.91%
  Worst: 88.44%
  Mean:  92.51% ± 2.71%
  Spread: 6.46% over 5 trials

Conv-Autoencoder:
  Best:  91.63%
  Worst: 90.15%
  Mean:  90.82% ± 0.60%
  Spread: 1.49% over 5 trials

LSTM:
  Best:  91.50%
  Worst: 84.90%
  Mean:  88.32% ± 2.40%
  Spread: 6.60% over 5 trials

LSTM-FCN:
  Best:  91.46%
  Worst: 87.80%
  Mean:  90.07% ± 1.39%
  Spread: 3.66% over 5 trials

XGBoost:
  Best:  87.07%
  Worst: 82.15%
  Mean:  84.77% ± 1.74%
  Spread: 4.91% over 5 trials


## Runtime Comparison

In [6]:
if not results:
    print("No results to analyze.")
else:
    # One record per model: total wall time, trial count, average per trial.
    runtime_data = []
    for model_name, result in results.items():
        total_seconds = result.get('timing', {}).get('total_runtime_seconds', 0)
        n_trials = result.get('num_trials', 1)
        # A non-positive trial count reports 0 rather than dividing by zero.
        seconds_per_trial = total_seconds / n_trials if n_trials > 0 else 0
        runtime_data.append({
            'Model': model_name,
            'Total Runtime (s)': total_seconds,
            'Trials': n_trials,
            'Time per Trial (s)': seconds_per_trial,
        })

    # Slowest model first.
    runtime_df = pd.DataFrame(runtime_data).sort_values('Total Runtime (s)', ascending=False)

    rule = "=" * 70
    print("\n" + rule)
    print("RUNTIME COMPARISON")
    print(rule)

    total_time = runtime_df['Total Runtime (s)'].sum()

    runtime_rows = zip(
        runtime_df['Model'].tolist(),
        runtime_df['Total Runtime (s)'].tolist(),
        runtime_df['Time per Trial (s)'].tolist(),
    )
    for model_label, model_seconds, per_trial_seconds in runtime_rows:
        # Split the total into whole minutes and leftover seconds.
        whole_minutes, leftover_seconds = divmod(model_seconds, 60)
        print(f"  {model_label:20s}: {int(whole_minutes):3d}m {int(leftover_seconds):02d}s ({per_trial_seconds:.1f}s/trial)")

    print(f"\n  {'TOTAL':20s}: {int(total_time // 60):3d}m {int(total_time % 60):02d}s")


RUNTIME COMPARISON
  CNN-Transformer     :   5m 52s (70.5s/trial)
  TransKal            :   5m 17s (63.5s/trial)
  XGBoost             :   3m 59s (48.0s/trial)
  LSTM                :   3m 52s (46.6s/trial)
  LSTM-FCN            :   3m 00s (36.0s/trial)
  Conv-Autoencoder    :   0m 37s (7.5s/trial)
  LSTM-Autoencoder    :   0m 30s (6.1s/trial)

  TOTAL               :  23m 10s


## Best Hyperparameters per Model

In [7]:
if not results:
    print("No results to display.")
else:
    rule = "=" * 70
    print("\n" + rule)
    print("BEST HYPERPARAMETERS")
    print(rule)

    for model_name, result in results.items():
        best_params = result.get('best_params', {})
        best_score = result.get('best_f1_weighted', 0)

        print(f"\n{model_name} (F1={best_score*100:.2f}%):")
        # Parameters are listed alphabetically; floats are shortened to six
        # significant digits, everything else is printed as stored.
        for param_name in sorted(best_params):
            param_value = best_params[param_name]
            shown = f"{param_value:.6g}" if isinstance(param_value, float) else param_value
            print(f"  {param_name}: {shown}")


BEST HYPERPARAMETERS

XGBoost (F1=87.07%):
  colsample_bytree: 0.948077
  gamma: 0.461309
  learning_rate: 0.204621
  max_depth: 4
  min_child_weight: 3
  n_estimators: 461
  reg_alpha: 0.0439618
  reg_lambda: 0.0730616
  subsample: 0.762399

LSTM (F1=91.50%):
  batch_size: 32
  dropout: 0.43764
  hidden_size: 64
  learning_rate: 0.00176532
  num_layers: 3
  sequence_length: 25

LSTM-FCN (F1=91.46%):
  batch_size: 64
  dropout: 0.490196
  learning_rate: 0.00279691
  lstm_hidden: 64
  lstm_layers: 1
  sequence_length: 37

CNN-Transformer (F1=94.91%):
  batch_size: 64
  conv_filters: 32
  d_model: 32
  dim_feedforward: 256
  dropout: 0.290604
  kernel_size: 3
  learning_rate: 0.00061075
  nhead: 4
  num_encoder_layers: 3
  sequence_length: 39

TransKal (F1=97.16%):
  batch_size: 128
  d_model: 128
  dropout: 0.109919
  kalman_Q: 4.64536e-06
  kalman_R: 0.136209
  learning_rate: 0.00167567
  nhead: 4
  num_layers: 1
  sequence_length: 28

LSTM-Autoencoder (F1=96.40%):
  batch_size: 64
  

## Results by Task

In [8]:
if not results:
    print("No results to display.")
else:
    rule = "=" * 70
    print("\n" + rule)
    print("RESULTS BY TASK")
    print(rule)

    # Split the loaded results by the 'task' field recorded in each result.
    multiclass = [(name, r) for name, r in results.items() if r.get('task') == 'multiclass']
    binary = [(name, r) for name, r in results.items() if r.get('task') == 'binary']

    # Both sections share one ranking routine; only the heading and the
    # member list differ.
    task_sections = (
        ("\nMulticlass Classification (18 fault types):", multiclass),
        ("\nBinary Anomaly Detection (normal vs fault):", binary),
    )
    for heading, members in task_sections:
        if not members:
            continue
        print(heading)
        # Highest weighted F1 first; a missing score counts as 0.
        ranked = sorted(members, key=lambda pair: pair[1].get('best_f1_weighted', 0), reverse=True)
        for position, (name, r) in enumerate(ranked, 1):
            score = r.get('best_f1_weighted', 0)
            print(f"  {position}. {name:20s}: {score*100:.2f}%")


RESULTS BY TASK

Multiclass Classification (18 fault types):
  1. TransKal            : 97.16%
  2. CNN-Transformer     : 94.91%
  3. LSTM                : 91.50%
  4. LSTM-FCN            : 91.46%
  5. XGBoost             : 87.07%

Binary Anomaly Detection (normal vs fault):
  1. LSTM-Autoencoder    : 96.40%
  2. Conv-Autoencoder    : 91.63%


In [9]:
# Closing banner marking the end of the summary notebook.
closing_rule = "=" * 70
print(f"\n{closing_rule}")
print("✓ Hyperparameter Summary Complete")
print(closing_rule)


✓ Hyperparameter Summary Complete
