# Hyperparameter Tuning Summary

This notebook aggregates and summarizes results from all hyperparameter tuning studies (notebooks 10-16).

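**Contents**:
1. Summary table of best results per model
2. Top 3 trials per model to assess optimization quality
3. Performance spread analysis (best, worst, mean ± std across trials)
4. Runtime comparison
5. Best hyperparameters for each model
6. Results grouped by task (multiclass vs. binary)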

**Outputs** (file names gain a `_quick` suffix when `QUICK_MODE` is enabled, e.g. `summary_quick.csv`):
- Summary table: `outputs/hyperparams/summary.csv`
- Top trials table: `outputs/hyperparams/top_trials.csv`

In [None]:
import os
import json
import pickle
from pathlib import Path
import pandas as pd
import numpy as np

# Opening banner for the run.
print("=" * 70, "Hyperparameter Tuning Summary", "=" * 70, sep="\n")

# Quick mode is opted into through the QUICK_MODE environment variable
# ("true", "1" or "yes", case-insensitive). It selects the "_quick" artifacts.
QUICK_MODE = os.environ.get('QUICK_MODE', 'False').lower() in {'true', '1', 'yes'}
MODE_SUFFIX = "_quick" if QUICK_MODE else ""
print("Mode: " + ("QUICK" if QUICK_MODE else "FULL"))

# Artifact locations, relative to the notebook's working directory.
OUTPUT_DIR = Path('../outputs')
HYPERPARAM_DIR = OUTPUT_DIR.joinpath('hyperparams')
STUDY_DIR = OUTPUT_DIR.joinpath('optuna_studies')

# One (display name, task, file-name stem) entry per tuned model.
_MODEL_SPECS = (
    ('XGBoost', 'multiclass', 'xgboost'),
    ('LSTM', 'multiclass', 'lstm'),
    ('LSTM-FCN', 'multiclass', 'lstm_fcn'),
    ('CNN-Transformer', 'multiclass', 'cnn_transformer'),
    ('TransKal', 'multiclass', 'transkal'),
    ('LSTM-Autoencoder', 'binary', 'lstm_autoencoder'),
    ('Conv-Autoencoder', 'binary', 'conv_autoencoder'),
)

# Model configurations: task type plus the best-result JSON and study pickle
# file names, both carrying the mode suffix.
MODELS = {
    display_name: {
        'task': task,
        'file': f'{stem}_best{MODE_SUFFIX}.json',
        'study': f'{stem}_study{MODE_SUFFIX}.pkl',
    }
    for display_name, task, stem in _MODEL_SPECS
}

print(f"Looking for {len(MODELS)} model results...")
print("=" * 70)

Hyperparameter Tuning Summary
Mode: FULL
Looking for 7 model results...


## Load Results

In [None]:
# Load the best-result JSON and, when present, the pickled study for every
# configured model.
#   results: model name -> parsed contents of the model's "*_best" JSON file
#   studies: model name -> unpickled study object (only if the pickle exists)
#   missing: names of models whose JSON file was not found
results = {}
studies = {}
missing = []

for model_name, config in MODELS.items():
    json_path = HYPERPARAM_DIR / config['file']
    study_path = STUDY_DIR / config['study']

    if not json_path.exists():
        missing.append(model_name)
        print(f"✗ Missing {model_name} ({config['file']})")
        continue

    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # default encoding when reading it.
    with open(json_path, encoding='utf-8') as f:
        results[model_name] = json.load(f)
    print(f"✓ Loaded {model_name}")

    if study_path.exists():
        # SECURITY: pickle.load can execute arbitrary code. Only load study
        # files produced by this project's own tuning runs.
        with open(study_path, 'rb') as f:
            studies[model_name] = pickle.load(f)
    else:
        # A model without a study is left out of the trial-level sections
        # below; say so here rather than dropping it silently.
        print(f"  ⚠ No study file for {model_name} ({config['study']}); trial-level analysis will skip it")

print(f"\nLoaded {len(results)}/{len(MODELS)} models")
if missing:
    print(f"Missing: {', '.join(missing)}")

✓ Loaded XGBoost


✓ Loaded LSTM
✓ Loaded LSTM-FCN
✓ Loaded CNN-Transformer
✓ Loaded TransKal
✓ Loaded LSTM-Autoencoder
✓ Loaded Conv-Autoencoder

Loaded 7/7 models


## Summary Table

In [None]:
# Build one summary row per loaded model, print the table ordered by best F1
# and save it as CSV. The CSV keeps F1 as a raw fraction; only the printed
# copy formats it as a percentage.
if results:
    summary_data = []

    for model_name, result in results.items():
        timing = result.get('timing', {})
        # Handle both run_fraction (new) and data_fraction (legacy)
        fraction = result.get('run_fraction', result.get('data_fraction', 0))
        summary_data.append({
            'Model': model_name,
            'Task': result.get('task', 'unknown'),
            'Best F1 (weighted)': result.get('best_f1_weighted', np.nan),
            'Trials': result.get('num_trials', 0),
            'Run Fraction': f"{fraction*100:.0f}%",
            'Max Epochs': result.get('max_epochs', 'N/A'),
            'Optimization Time': f"{result.get('optimization_time_seconds', 0):.1f}s",
            'Total Runtime': timing.get('total_runtime_formatted', 'N/A'),
        })

    summary_df = pd.DataFrame(summary_data)
    summary_df = summary_df.sort_values('Best F1 (weighted)', ascending=False)

    print("\n" + "="*70)
    print("SUMMARY: Best Results by Model")
    print("="*70)

    # Format F1 as percentage for display. A model whose JSON has no
    # best_f1_weighted is shown as 'N/A' (like the other optional columns)
    # instead of the meaningless "nan%".
    display_df = summary_df.copy()
    display_df['Best F1 (weighted)'] = display_df['Best F1 (weighted)'].map(
        lambda score: f"{score*100:.2f}%" if pd.notna(score) else 'N/A'
    )
    print(display_df.to_string(index=False))

    # Save to CSV (path built once so the file written and the path reported
    # cannot diverge)
    summary_path = HYPERPARAM_DIR / f'summary{MODE_SUFFIX}.csv'
    summary_df.to_csv(summary_path, index=False)
    print(f"\n✓ Saved to {summary_path}")
else:
    print("No results to summarize.")


SUMMARY: Best Results by Model
           Model       Task Best F1 (weighted)  Trials Run Fraction Max Epochs Optimization Time Total Runtime
Conv-Autoencoder     binary             99.37%      50          50%         50           4248.6s       70m 54s
        LSTM-FCN multiclass             98.96%      50          50%         50          39642.2s      660m 52s
 CNN-Transformer multiclass             98.90%      50          50%         50          30135.2s      502m 29s
            LSTM multiclass             98.78%      50          50%         50          23823.3s      397m 21s
        TransKal multiclass             98.78%      50          50%         50          43541.0s      725m 52s
LSTM-Autoencoder     binary             98.17%      50          50%         50           9221.7s      153m 47s
         XGBoost multiclass             92.36%      50          50%        N/A          26863.1s      447m 53s

✓ Saved to ../outputs/hyperparams/summary.csv


## Top 3 Trials per Model

Lists the three highest-scoring trials for each model, together with each runner-up's gap to the best trial, to show whether the best configuration is a clear winner or one of several comparably good ones.

In [None]:
# Rank the trials of every loaded study by objective value, keep the best
# three per model, print them with their gap to the best trial and save the
# table as CSV.
if studies:
    top_trials_columns = ['Model', 'Rank', 'F1 Score', 'Trial #']
    top_trials_data = []

    for model_name, study in studies.items():
        # Keep only trials that ran to completion. Testing `value is not None`
        # alone is not enough: the studies are presumably Optuna studies (they
        # are loaded from the optuna_studies directory), and Optuna stores the
        # last *intermediate* value as the value of a PRUNED trial, which is
        # not a final F1 score. The state is compared by name so no extra
        # import is needed.
        completed_trials = [
            t for t in study.trials
            if t.state.name == 'COMPLETE' and t.value is not None
        ]
        sorted_trials = sorted(completed_trials, key=lambda t: t.value, reverse=True)

        # Get top 3
        for rank, trial in enumerate(sorted_trials[:3], 1):
            top_trials_data.append({
                'Model': model_name,
                'Rank': rank,
                'F1 Score': trial.value,
                'Trial #': trial.number,
            })

    # Explicit columns keep the frame (and the CSV header) well-formed even
    # when no trial qualified; otherwise the 'Model' lookup below would raise
    # KeyError on a column-less frame.
    top_trials_df = pd.DataFrame(top_trials_data, columns=top_trials_columns)

    print("\n" + "="*70)
    print("TOP 3 TRIALS PER MODEL")
    print("="*70)

    # sort=False keeps the models in the order they were loaded; models with
    # no qualifying trial simply produce no group.
    for model_name, model_trials in top_trials_df.groupby('Model', sort=False):
        print(f"\n{model_name}:")
        best = model_trials['F1 Score'].iloc[0]
        for rank, f1_score, trial_number in zip(
            model_trials['Rank'], model_trials['F1 Score'], model_trials['Trial #']
        ):
            diff = (best - f1_score) * 100
            note = f'(-{diff:.2f}%)' if rank > 1 else '(best)'
            print(f"  #{rank}: F1={f1_score*100:.2f}% (trial {trial_number}) {note}")

    # Save to CSV
    top_trials_path = HYPERPARAM_DIR / f'top_trials{MODE_SUFFIX}.csv'
    top_trials_df.to_csv(top_trials_path, index=False)
    print(f"\n✓ Saved to {top_trials_path}")
else:
    print("No Optuna studies found.")


TOP 3 TRIALS PER MODEL

XGBoost:
  #1: F1=92.36% (trial 38) (best)
  #2: F1=92.35% (trial 41) (-0.02%)
  #3: F1=92.33% (trial 39) (-0.03%)

LSTM:
  #1: F1=98.78% (trial 47) (best)
  #2: F1=98.74% (trial 15) (-0.04%)
  #3: F1=98.72% (trial 27) (-0.06%)

LSTM-FCN:
  #1: F1=98.96% (trial 32) (best)
  #2: F1=98.94% (trial 31) (-0.02%)
  #3: F1=98.84% (trial 34) (-0.12%)

CNN-Transformer:
  #1: F1=98.90% (trial 47) (best)
  #2: F1=98.86% (trial 46) (-0.04%)
  #3: F1=98.85% (trial 44) (-0.05%)

TransKal:
  #1: F1=98.78% (trial 31) (best)
  #2: F1=98.77% (trial 17) (-0.00%)
  #3: F1=98.77% (trial 33) (-0.00%)

LSTM-Autoencoder:
  #1: F1=98.17% (trial 33) (best)
  #2: F1=98.00% (trial 26) (-0.17%)
  #3: F1=97.89% (trial 25) (-0.29%)

Conv-Autoencoder:
  #1: F1=99.37% (trial 49) (best)
  #2: F1=99.27% (trial 46) (-0.10%)
  #3: F1=99.20% (trial 44) (-0.17%)

✓ Saved to ../outputs/hyperparams/top_trials.csv


## Performance Spread Analysis

Analyzes how much variation exists between the best and worst trials, and the distribution of results.

In [None]:
# Summarize, per model, how widely the objective value varied across trials:
# best, worst, mean, standard deviation and best-minus-worst spread.
if studies:
    spread_columns = [
        'Model', 'Best F1', 'Worst F1', 'Mean F1', 'Std F1',
        'Spread (Best-Worst)', 'Total Trials',
    ]
    spread_data = []

    for model_name, study in studies.items():
        # Use finished trials only. The studies are presumably Optuna studies
        # (loaded from the optuna_studies directory), where a PRUNED trial
        # still has a non-None value: its last intermediate score. Counting
        # those would drag down the worst/mean figures with partial results.
        # The state is compared by name so no extra import is needed.
        values = [
            t.value for t in study.trials
            if t.state.name == 'COMPLETE' and t.value is not None
        ]
        if values:
            best_value, worst_value = max(values), min(values)
            spread_data.append({
                'Model': model_name,
                'Best F1': best_value,
                'Worst F1': worst_value,
                'Mean F1': np.mean(values),
                # np.std defaults to ddof=0, i.e. the population standard deviation
                'Std F1': np.std(values),
                'Spread (Best-Worst)': best_value - worst_value,
                'Total Trials': len(values),
            })

    # Explicit columns so that sorting does not raise KeyError when no study
    # contributed any values.
    spread_df = pd.DataFrame(spread_data, columns=spread_columns)
    spread_df = spread_df.sort_values('Best F1', ascending=False)

    print("\n" + "="*70)
    print("PERFORMANCE SPREAD ANALYSIS")
    print("="*70)

    for row in spread_df.to_dict('records'):
        print(f"\n{row['Model']}:")
        print(f"  Best:  {row['Best F1']*100:.2f}%")
        print(f"  Worst: {row['Worst F1']*100:.2f}%")
        print(f"  Mean:  {row['Mean F1']*100:.2f}% ± {row['Std F1']*100:.2f}%")
        print(f"  Spread: {row['Spread (Best-Worst)']*100:.2f}% over {row['Total Trials']} trials")
else:
    print("No Optuna studies found.")


PERFORMANCE SPREAD ANALYSIS

Conv-Autoencoder:
  Best:  99.37%
  Worst: 94.13%
  Mean:  98.04% ± 1.43%
  Spread: 5.24% over 50 trials

LSTM-FCN:
  Best:  98.96%
  Worst: 97.31%
  Mean:  98.52% ± 0.29%
  Spread: 1.65% over 50 trials

CNN-Transformer:
  Best:  98.90%
  Worst: 0.58%
  Mean:  94.79% ± 16.90%
  Spread: 98.31% over 50 trials

LSTM:
  Best:  98.78%
  Worst: 97.54%
  Mean:  98.39% ± 0.28%
  Spread: 1.24% over 50 trials

TransKal:
  Best:  98.78%
  Worst: 42.02%
  Mean:  96.15% ± 8.66%
  Spread: 56.75% over 50 trials

LSTM-Autoencoder:
  Best:  98.17%
  Worst: 78.69%
  Mean:  94.79% ± 4.73%
  Spread: 19.48% over 50 trials

XGBoost:
  Best:  92.36%
  Worst: 74.34%
  Mean:  91.21% ± 2.85%
  Spread: 18.02% over 50 trials


## Runtime Comparison

In [None]:
# Compare total tuning runtime and average time per trial across models,
# slowest first, followed by the combined runtime of all models.
if not results:
    print("No results to analyze.")
else:
    runtime_data = []

    for model_name, result in results.items():
        total_seconds = result.get('timing', {}).get('total_runtime_seconds', 0)
        n_trials = result.get('num_trials', 1)
        # Guard the average against a zero (or negative) trial count
        per_trial_seconds = total_seconds / n_trials if n_trials > 0 else 0

        runtime_data.append({
            'Model': model_name,
            'Total Runtime (s)': total_seconds,
            'Trials': n_trials,
            'Time per Trial (s)': per_trial_seconds,
        })

    runtime_df = pd.DataFrame(runtime_data).sort_values('Total Runtime (s)', ascending=False)

    print("\n" + "="*70)
    print("RUNTIME COMPARISON")
    print("="*70)

    total_time = runtime_df['Total Runtime (s)'].sum()

    for model_label, runtime_seconds, seconds_per_trial in zip(
        runtime_df['Model'],
        runtime_df['Total Runtime (s)'],
        runtime_df['Time per Trial (s)'],
    ):
        # Split the runtime into whole minutes and leftover seconds
        mins, secs = divmod(runtime_seconds, 60)
        print(f"  {model_label:20s}: {int(mins):3d}m {int(secs):02d}s ({seconds_per_trial:.1f}s/trial)")

    total_mins, total_secs = divmod(total_time, 60)
    print(f"\n  {'TOTAL':20s}: {int(total_mins):3d}m {int(total_secs):02d}s")


RUNTIME COMPARISON
  TransKal            : 725m 52s (871.0s/trial)
  LSTM-FCN            : 660m 52s (793.1s/trial)
  CNN-Transformer     : 502m 29s (603.0s/trial)
  XGBoost             : 447m 53s (537.5s/trial)
  LSTM                : 397m 21s (476.8s/trial)
  LSTM-Autoencoder    : 153m 47s (184.6s/trial)
  Conv-Autoencoder    :  70m 54s (85.1s/trial)

  TOTAL               : 2959m 11s


## Best Hyperparameters per Model

In [None]:
# Print each model's best hyperparameters in alphabetical order. Float values
# are shortened to six significant digits; everything else is shown as stored.
if not results:
    print("No results to display.")
else:
    print("\n" + "="*70)
    print("BEST HYPERPARAMETERS")
    print("="*70)

    for model_name, result in results.items():
        best_params = result.get('best_params', {})
        best_f1 = result.get('best_f1_weighted', 0)

        print(f"\n{model_name} (F1={best_f1*100:.2f}%):")
        for param in sorted(best_params):
            value = best_params[param]
            shown = f"{value:.6g}" if isinstance(value, float) else value
            print(f"  {param}: {shown}")


BEST HYPERPARAMETERS

XGBoost (F1=92.36%):
  colsample_bytree: 0.841271
  gamma: 0.56356
  learning_rate: 0.195773
  max_depth: 6
  min_child_weight: 8
  n_estimators: 499
  reg_alpha: 0.329311
  reg_lambda: 0.0630955
  subsample: 0.9669

LSTM (F1=98.78%):
  batch_size: 32
  dropout: 0.0662255
  hidden_size: 32
  learning_rate: 0.00195641
  num_layers: 3
  sequence_length: 39

LSTM-FCN (F1=98.96%):
  batch_size: 64
  dropout: 0.436581
  learning_rate: 0.00139245
  lstm_hidden: 24
  lstm_layers: 1
  sequence_length: 40

CNN-Transformer (F1=98.90%):
  batch_size: 32
  conv_filters: 32
  d_model: 32
  dim_feedforward: 128
  dropout: 0.239729
  kernel_size: 3
  learning_rate: 0.00240574
  nhead: 4
  num_encoder_layers: 1
  sequence_length: 39

TransKal (F1=98.78%):
  batch_size: 64
  d_model: 32
  dropout: 0.195007
  kalman_Q: 5.03691e-05
  kalman_R: 0.0222507
  learning_rate: 0.000427603
  nhead: 4
  num_layers: 2
  sequence_length: 40

LSTM-Autoencoder (F1=98.17%):
  batch_size: 32
  dr

## Results by Task

In [None]:
# Rank the models within each task (multiclass classification, binary anomaly
# detection) by their best weighted F1.
if results:
    print("\n" + "="*70)
    print("RESULTS BY TASK")
    print("="*70)

    # (task key, section heading) in display order; one loop serves both
    # tasks instead of two copies of the same ranking code.
    task_sections = [
        ('multiclass', "Multiclass Classification (18 fault types):"),
        ('binary', "Binary Anomaly Detection (normal vs fault):"),
    ]

    for task, heading in task_sections:
        # Prefer the task recorded in the result file. If it is absent, fall
        # back to the task declared in MODELS so the model is not silently
        # left out of both rankings.
        members = [
            (name, r) for name, r in results.items()
            if (r.get('task') or MODELS.get(name, {}).get('task')) == task
        ]
        if not members:
            continue

        print(f"\n{heading}")
        ranked = sorted(members, key=lambda item: item[1].get('best_f1_weighted', 0), reverse=True)
        for rank, (name, r) in enumerate(ranked, 1):
            f1 = r.get('best_f1_weighted', 0)
            print(f"  {rank}. {name:20s}: {f1*100:.2f}%")
else:
    print("No results to display.")


RESULTS BY TASK

Multiclass Classification (18 fault types):
  1. LSTM-FCN            : 98.96%
  2. CNN-Transformer     : 98.90%
  3. LSTM                : 98.78%
  4. TransKal            : 98.78%
  5. XGBoost             : 92.36%

Binary Anomaly Detection (normal vs fault):
  1. Conv-Autoencoder    : 99.37%
  2. LSTM-Autoencoder    : 98.17%


In [None]:
# Closing banner: a blank line, then the completion message framed by rules.
print("\n" + "=" * 70, "✓ Hyperparameter Summary Complete", "=" * 70, sep="\n")


✓ Hyperparameter Summary Complete
