# TransKal Hyperparameter Tuning

This notebook performs hyperparameter optimization for TransKal (Transformer + Kalman Filter) using Optuna.

**Task**: Multiclass fault classification (18 classes)

**Architecture**: Transformer classifier with Kalman filter post-processing for smoothed predictions.

**Data Handling**:
- Windows are created within simulation runs only (no cross-run windows)
- Subsampling is done by simulation runs, not individual rows

**Outputs**:
- Best hyperparameters: `outputs/hyperparams/transkal_best.json`
- Optuna study: `outputs/optuna_studies/transkal_study.pkl`

In [1]:
import os
import sys
import time
import json
import pickle
from pathlib import Path

start_time = time.time()
print("="*60)
print("TransKal Hyperparameter Tuning")
print("="*60)
print(f"Started at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")

QUICK_MODE = os.getenv('QUICK_MODE', 'False').lower() in ('true', '1', 'yes')

if QUICK_MODE:
    RUN_FRACTION = 0.01
    MIN_RUNS_PER_CLASS = 5
    N_TRIALS = 5
    MAX_EPOCHS = 10
    PATIENCE = 3
    print("ðŸš€ QUICK MODE (1% runs, min 5/class, 5 trials, max 10 epochs)")
else:
    RUN_FRACTION = 0.50
    MIN_RUNS_PER_CLASS = 5
    N_TRIALS = 50
    MAX_EPOCHS = 50
    PATIENCE = 5
    print("ðŸ”¬ TUNING MODE (50% runs, 50 trials, max 50 epochs)")

DATA_DIR = Path('../data')
OUTPUT_DIR = Path('../outputs')
HYPERPARAM_DIR = OUTPUT_DIR / 'hyperparams'
STUDY_DIR = OUTPUT_DIR / 'optuna_studies'
PROGRESS_FILE = OUTPUT_DIR / 'transkal_progress.log'

HYPERPARAM_DIR.mkdir(parents=True, exist_ok=True)
STUDY_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42

print(f"Run fraction: {RUN_FRACTION*100}%")
print(f"Trials: {N_TRIALS}, Max epochs: {MAX_EPOCHS}, Patience: {PATIENCE}")
print("="*60)

def log_progress(message):
    print(message, flush=True)
    with open(PROGRESS_FILE, 'a') as f:
        f.write(f"{message}\n")
        f.flush()

PROGRESS_FILE.write_text("")

TransKal Hyperparameter Tuning
Started at: 2026-01-03 23:06:23
ðŸš€ QUICK MODE (1% runs, min 5/class, 5 trials, max 10 epochs)
Run fraction: 1.0%
Trials: 5, Max epochs: 10, Patience: 3


0

In [2]:
log_progress("\n[Step 1/6] Loading libraries...")
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
import optuna
from optuna.pruners import MedianPruner
import math
import warnings
warnings.filterwarnings('ignore')

torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
log_progress(f"âœ“ Using device: {device}")


[Step 1/6] Loading libraries...


âœ“ Using device: cuda


In [3]:
log_progress("\n[Step 2/6] Loading data...")

train = pd.read_csv(DATA_DIR / 'multiclass_train.csv')
val = pd.read_csv(DATA_DIR / 'multiclass_val.csv')

log_progress(f"âœ“ Full data - Train: {train.shape}, Val: {val.shape}")

def sample_by_runs(df, fraction, seed, min_runs=5):
    """Sample complete simulation runs, preserving temporal structure."""
    runs = df.groupby(['faultNumber', 'simulationRun']).size().reset_index()[['faultNumber', 'simulationRun']]
    
    def sample_class(x):
        n_total = len(x)
        n_sample = max(min_runs, int(n_total * fraction))
        n_sample = min(n_sample, n_total)
        return x.sample(n=n_sample, random_state=seed)
    
    sampled_runs = runs.groupby('faultNumber', group_keys=False).apply(sample_class)
    df_sampled = df.merge(sampled_runs, on=['faultNumber', 'simulationRun'])
    return df_sampled.sort_values(['faultNumber', 'simulationRun', 'sample']).reset_index(drop=True)

train_sampled = sample_by_runs(train, RUN_FRACTION, RANDOM_SEED, MIN_RUNS_PER_CLASS)
val_sampled = sample_by_runs(val, RUN_FRACTION, RANDOM_SEED, MIN_RUNS_PER_CLASS)

n_train_runs = train_sampled.groupby(['faultNumber', 'simulationRun']).ngroups
n_val_runs = val_sampled.groupby(['faultNumber', 'simulationRun']).ngroups

log_progress(f"âœ“ Sampled - Train: {train_sampled.shape} ({n_train_runs} runs), Val: {val_sampled.shape} ({n_val_runs} runs)")

features = [col for col in train.columns if 'xmeas' in col or 'xmv' in col]
num_features = len(features)

scaler = StandardScaler()
scaler.fit(train_sampled[features])

label_encoder = LabelEncoder()
label_encoder.fit(train_sampled['faultNumber'])
num_classes = len(label_encoder.classes_)

log_progress(f"âœ“ Features: {num_features}, Classes: {num_classes}")


[Step 2/6] Loading data...


âœ“ Full data - Train: (864000, 57), Val: (432000, 57)


âœ“ Sampled - Train: (43200, 57) (90 runs), Val: (43200, 57) (90 runs)


âœ“ Features: 52, Classes: 18


In [4]:
log_progress("\n[Step 3/6] Defining TransKal model...")

class SimulationRunDataset(Dataset):
    """Dataset that creates windows WITHIN simulation runs only."""
    def __init__(self, df, features, label_col, scaler, label_encoder, sequence_length=10):
        self.seq_len = sequence_length
        self.windows = []
        self.labels = []
        
        for (fault, run), group in df.groupby(['faultNumber', 'simulationRun']):
            group = group.sort_values('sample')
            X = scaler.transform(group[features].values)
            y = label_encoder.transform(group['faultNumber'].values)
            
            for i in range(len(X) - sequence_length + 1):
                self.windows.append(X[i:i+sequence_length])
                self.labels.append(y[i+sequence_length-1])
        
        self.windows = np.array(self.windows, dtype=np.float32)
        self.labels = np.array(self.labels, dtype=np.int64)
    
    def __len__(self):
        return len(self.windows)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.windows[idx]), torch.tensor(self.labels[idx])

class SequentialRunDataset:
    """
    Dataset that maintains run structure for sequential Kalman filtering.
    Returns data organized by (faultNumber, simulationRun) for proper temporal evaluation.
    """
    def __init__(self, df, features, scaler, label_encoder, sequence_length=10):
        self.seq_len = sequence_length
        self.runs = []  # List of (windows, labels) per run
        
        for (fault, run), group in df.groupby(['faultNumber', 'simulationRun']):
            group = group.sort_values('sample')
            X = scaler.transform(group[features].values)
            y = label_encoder.transform(group['faultNumber'].values)
            
            run_windows = []
            run_labels = []
            for i in range(len(X) - sequence_length + 1):
                run_windows.append(X[i:i+sequence_length])
                run_labels.append(y[i+sequence_length-1])
            
            if run_windows:
                self.runs.append({
                    'windows': np.array(run_windows, dtype=np.float32),
                    'labels': np.array(run_labels, dtype=np.int64),
                    'fault': fault,
                    'run': run
                })
    
    def __len__(self):
        return len(self.runs)
    
    def __iter__(self):
        return iter(self.runs)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class TransformerClassifier(nn.Module):
    """Transformer classifier for time series."""
    def __init__(self, input_dim, num_classes, d_model=32, nhead=2, 
                 num_layers=1, dropout=0.3):
        super().__init__()
        self.d_model = d_model
        
        self.embedding = nn.Sequential(
            nn.Linear(input_dim, d_model),
            nn.LayerNorm(d_model),
            nn.Dropout(dropout)
        )
        
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, 
            dim_feedforward=d_model * 2,
            dropout=dropout, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, num_classes)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.pos_encoder(x)
        x = self.transformer(x)
        x = x.mean(dim=1)
        x = self.dropout(x)
        return self.fc(x)

class KalmanFilter:
    """Simple Kalman filter for smoothing predictions within a single run."""
    def __init__(self, Q=1e-5, R=0.1):
        self.Q = Q
        self.R = R
        self.reset()
        
    def reset(self):
        """Reset filter state for a new run."""
        self.x = None
        self.P = 1.0
        
    def update(self, z):
        if self.x is None:
            self.x = z
            return z
        
        self.P = self.P + self.Q
        K = self.P / (self.P + self.R)
        self.x = self.x + K * (z - self.x)
        self.P = (1 - K) * self.P
        
        return self.x

log_progress("âœ“ TransKal model defined")


[Step 3/6] Defining TransKal model...


âœ“ TransKal model defined


In [5]:
log_progress("\n[Step 4/6] Setting up optimization...")

def objective(trial):
    sequence_length = trial.suggest_int('sequence_length', 5, 20)
    
    nhead = trial.suggest_categorical('nhead', [2, 4])
    d_model = trial.suggest_categorical('d_model', [32, 64, 128])
    d_model = (d_model // nhead) * nhead
    
    num_layers = trial.suggest_int('num_layers', 1, 3)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    
    kalman_Q = trial.suggest_float('kalman_Q', 1e-6, 1e-3, log=True)
    kalman_R = trial.suggest_float('kalman_R', 0.01, 1.0, log=True)
    
    # Training dataset (shuffled for training)
    train_dataset = SimulationRunDataset(
        train_sampled, features, 'faultNumber', scaler, label_encoder, sequence_length
    )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    # Validation dataset organized by runs for proper Kalman filtering
    val_runs = SequentialRunDataset(
        val_sampled, features, scaler, label_encoder, sequence_length
    )
    
    model = TransformerClassifier(
        num_features, num_classes, d_model, nhead, num_layers, dropout
    ).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # Early stopping variables
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    # Training loop with early stopping
    for epoch in range(MAX_EPOCHS):
        # Training phase
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
        
        # Validation phase for early stopping (without Kalman for speed)
        model.eval()
        val_loss = 0.0
        n_samples = 0
        with torch.no_grad():
            for run_data in val_runs:
                windows = torch.from_numpy(run_data['windows']).to(device)
                labels = torch.from_numpy(run_data['labels']).to(device)
                outputs = model(windows)
                val_loss += criterion(outputs, labels).item() * len(labels)
                n_samples += len(labels)
        val_loss /= n_samples
        
        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                break
    
    # Restore best model
    if best_model_state is not None:
        model.load_state_dict({k: v.to(device) for k, v in best_model_state.items()})
    
    # Evaluation with per-run Kalman filtering
    model.eval()
    all_preds, all_labels = [], []
    
    with torch.no_grad():
        for run_data in val_runs:
            # Reset Kalman filter for each new run
            kalman = KalmanFilter(Q=kalman_Q, R=kalman_R)
            
            windows = torch.from_numpy(run_data['windows']).to(device)
            labels = run_data['labels']
            
            # Process windows in temporal order within this run
            outputs = model(windows)
            probs = torch.softmax(outputs, dim=1).cpu().numpy()
            
            # Apply Kalman filter sequentially within this run
            for i in range(len(probs)):
                pred_class = probs[i].argmax()
                filtered = kalman.update(pred_class)
                all_preds.append(int(round(filtered)))
                all_labels.append(labels[i])
    
    all_preds = np.clip(all_preds, 0, num_classes - 1)
    
    return f1_score(all_labels, all_preds, average='weighted')

log_progress("âœ“ Objective function defined")


[Step 4/6] Setting up optimization...


âœ“ Objective function defined


In [6]:
log_progress(f"\n{'='*60}")
log_progress(f"[Step 5/6] Starting optimization")
log_progress(f"{'='*60}\n")

optuna_start = time.time()
optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study(
    direction='maximize',
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=5),
    study_name='transkal_multiclass'
)

log_progress(f"Running {N_TRIALS} trials...")

for trial_num in range(N_TRIALS):
    study.optimize(objective, n_trials=1, show_progress_bar=False)
    trial = study.trials[-1]
    log_progress(f"Trial {trial_num + 1}/{N_TRIALS}: F1={trial.value:.4f} (best={study.best_value:.4f})")

optuna_time = time.time() - optuna_start

log_progress(f"\n{'='*60}")
log_progress("âœ“ Optimization complete!")
log_progress(f"Total time: {optuna_time:.2f}s")




[Step 5/6] Starting optimization





Running 5 trials...


Trial 1/5: F1=0.4782 (best=0.4782)


Trial 2/5: F1=0.4462 (best=0.4782)


Trial 3/5: F1=0.4543 (best=0.4782)


Trial 4/5: F1=0.7632 (best=0.7632)


Trial 5/5: F1=0.6890 (best=0.7632)





âœ“ Optimization complete!


Total time: 369.60s


In [7]:
end_time = time.time()
total_runtime = end_time - start_time

log_progress("\n[Step 6/6] Saving results...")

results = {
    'model': 'TransKal',
    'task': 'multiclass',
    'best_params': study.best_params,
    'best_f1_weighted': float(study.best_value),
    'num_trials': N_TRIALS,
    'run_fraction': RUN_FRACTION,
    'quick_mode': QUICK_MODE,
    'max_epochs': MAX_EPOCHS,
    'early_stopping_patience': PATIENCE,
    'optimization_time_seconds': optuna_time,
    'random_seed': RANDOM_SEED,
    'timing': {
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)),
        'end_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time)),
        'total_runtime_seconds': float(total_runtime),
        'total_runtime_formatted': f"{int(total_runtime // 60)}m {int(total_runtime % 60)}s"
    }
}

mode_suffix = "_quick" if QUICK_MODE else ""
json_path = HYPERPARAM_DIR / f'transkal_best{mode_suffix}.json'
study_path = STUDY_DIR / f'transkal_study{mode_suffix}.pkl'

with open(json_path, 'w') as f:
    json.dump(results, f, indent=2)
log_progress(f"âœ“ Saved to {json_path}")

with open(study_path, 'wb') as f:
    pickle.dump(study, f)

log_progress(f"\n{'='*60}")
log_progress("âœ“ TransKal Hyperparameter Tuning Complete!")
log_progress(f"Runtime: {results['timing']['total_runtime_formatted']}")
log_progress(f"Best F1: {study.best_value:.4f}")
log_progress(f"{'='*60}")


[Step 6/6] Saving results...


âœ“ Saved to ../outputs/hyperparams/transkal_best_quick.json





âœ“ TransKal Hyperparameter Tuning Complete!


Runtime: 6m 17s


Best F1: 0.7632


