# **Kaggle Challenge: Pirate Pain Dataset 🏴‍☠️ (v13: Final Version)**

This notebook integrates the findings from our detailed data analysis to build a robust training and evaluation pipeline. It corrects the preprocessing strategy and implements a proper hold-out test set for unbiased evaluation.

**Final Strategy:**
1.  **Hold-Out Test Set:** The data is immediately split into a training set (80%) and a hold-out test set (20%). All HPO and K-Fold training is performed *only* on the training set. The test set is used just once for final, unbiased evaluation.
2.  **StandardScaler:** Based on our analysis that extreme spikes ('rare jewels') are the primary signal for pain, we have replaced `RobustScaler` and `PowerTransformer` with `StandardScaler`. This preserves the relative magnitude of these critical events.
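3.  **One-Cycle LR & Compiled Model:** We retain the use of `OneCycleLR` for fast convergence and wrap every model with `torch.compile()`. Note that the code below passes `backend="eager"`, which captures the graph but does not generate optimized kernels; switch to the default Inductor backend to actually obtain significant speed improvements.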
4.  **Focal Loss:** We continue to use Focal Loss to handle the severe class imbalance identified in the analysis.
5.  **Final Evaluation:** After K-Fold training, the ensemble of models is evaluated on the unseen test set to generate a final, unbiased Classification Report and Confusion Matrix.

## ⚙️ 1. Setup & Libraries

In [None]:
# Global random seed. This line only defines the constant; it is reused below for
# CUDA seeding and as `random_state` of the scikit-learn splitters.
SEED = 1234

# Import necessary libraries
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import gc
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch

# Suppress warnings
import warnings
# Hide FutureWarning messages.
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE: `Warning` is the base class of every warning category, so this filter
# silences ALL warnings and makes the FutureWarning filter above redundant.
warnings.simplefilter(action='ignore', category=Warning)

# --- Setup Directories & Device ---
# Output folders for model checkpoints and Kaggle submission files.
os.makedirs("models", exist_ok=True)
os.makedirs("submissions", exist_ok=True)

# Seed every RNG the pipeline relies on (Python, NumPy, PyTorch CPU) so that
# weight initialisation, shuffling and dropout are repeatable on any device.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    # cuDNN autotuning speeds up fixed-shape workloads; it may pick different
    # kernels between runs, so results are not guaranteed bit-identical on GPU.
    torch.backends.cudnn.benchmark = True
    print("\n--- Using GPU ---")
else:
    device = torch.device("cpu")
    print("\n--- Using CPU ---")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Configure plot display settings
sns.set_theme(font_scale=1.4)
sns.set_style('whitegrid')
plt.rc('font', size=14)

## 🔄 2. Data Loading, Feature Engineering & Train-Test Split

In [None]:
print("--- 1. Loading Data ---")
# All competition CSV files are expected inside this directory.
DATA_DIR = "data"
X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_KAGGLE_PATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "pirate_pain_train.csv",
        "pirate_pain_train_labels.csv",
        "pirate_pain_test.csv",
    )
)

# Training features, training labels and the unlabeled Kaggle test features, in that order.
features_long_df, labels_df, X_test_kaggle_long_df = map(
    pd.read_csv, (X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_KAGGLE_PATH)
)

print("--- 2. Reshaping and Feature Engineering ---")
# Number of timesteps every sample is reshaped to.
N_TIMESTEPS = 160
# joint_00 ... joint_29 (joint_30 is intentionally not part of the feature set).
JOINT_FEATURES = [f"joint_{i:02d}" for i in range(30)]
PAIN_FEATURES = ["pain_survey_%d" % i for i in (1, 2, 3, 4)]
TIME_FEATURE = ['time']
# Column order of the feature axis: 30 joints, 4 pain surveys, then time.
FEATURES = [*JOINT_FEATURES, *PAIN_FEATURES, *TIME_FEATURE]
LABEL_MAPPING = dict(no_pain=0, low_pain=1, high_pain=2)
N_CLASSES = len(LABEL_MAPPING)

def reshape_data(df, features_list, n_timesteps):
    """Turn a long-format frame into a (samples, timesteps, features) array.

    Args:
        df: DataFrame with one row per (sample_index, time) pair.
        features_list: Columns to extract; their order defines the feature axis.
        n_timesteps: Number of distinct time values per sample.

    Returns:
        np.ndarray of shape (n_samples, n_timesteps, len(features_list)).
    """
    # Pivoting yields one row per sample with (feature, time) columns, feature-major.
    wide = df.pivot(index='sample_index', columns='time', values=features_list)
    flat = wide.to_numpy()
    cube = flat.reshape(flat.shape[0], len(features_list), n_timesteps)
    # Move time in front of the feature axis.
    return np.swapaxes(cube, 1, 2)

# Keep only the rows of samples that have a label, then build the 3-D arrays.
labelled_ids = labels_df['sample_index'].unique()
has_label = features_long_df['sample_index'].isin(labelled_ids)
X_full = reshape_data(features_long_df[has_label], FEATURES, N_TIMESTEPS)
X_test_kaggle = reshape_data(X_test_kaggle_long_df, FEATURES, N_TIMESTEPS)

# Labels are ordered by sample_index, the same key the pivot in reshape_data uses for its rows.
y_full_df = labels_df.sort_values(by='sample_index')
# NOTE: LabelEncoder sorts its classes, so the encoded ids are alphabetical
# (high_pain=0, low_pain=1, no_pain=2) and do NOT match the integers in LABEL_MAPPING.
le = LabelEncoder()
le.fit(list(LABEL_MAPPING.keys()))
y_full = le.transform(y_full_df['label'])

def engineer_pirate_feature(X_3d, long_df):
    """Append a constant-per-sample ``is_pirate`` channel to a 3-D feature array.

    A sample counts as a pirate when any of its static attributes equals
    'one+peg_leg' (n_legs), 'one+hook_hand' (n_hands) or 'one+eye_patch' (n_eyes).

    Args:
        X_3d: Array of shape (n_samples, n_timesteps, n_features); rows are
            assumed to follow ascending ``sample_index`` order of ``long_df``.
        long_df: Long-format frame holding sample_index, n_legs, n_hands, n_eyes.

    Returns:
        Array of shape (n_samples, n_timesteps, n_features + 1) whose last
        channel is 1 for pirate samples and 0 otherwise.
    """
    per_sample = (
        long_df[['sample_index', 'n_legs', 'n_hands', 'n_eyes']]
        .drop_duplicates()
        .set_index('sample_index')
    )
    has_peg_leg = per_sample['n_legs'] == 'one+peg_leg'
    has_hook = per_sample['n_hands'] == 'one+hook_hand'
    has_patch = per_sample['n_eyes'] == 'one+eye_patch'
    pirate_ids = per_sample[has_peg_leg | has_hook | has_patch].index

    # One 0/1 flag per sample, in ascending sample_index order.
    ordered_ids = np.sort(long_df['sample_index'].unique())
    flags = np.isin(ordered_ids, pirate_ids).astype(int)

    # Repeat the flag along the time axis so it can be stacked as an extra channel.
    channel = np.repeat(flags[:, None, None], X_3d.shape[1], axis=1)
    return np.concatenate((X_3d, channel), axis=2)

# X_full was built from label-filtered rows only, so the pirate flag must be derived
# from the same subset; otherwise unlabeled samples would shift or break the alignment.
labelled_long_df = features_long_df[features_long_df['sample_index'].isin(labels_df['sample_index'].unique())]
X_full_engineered = engineer_pirate_feature(X_full, labelled_long_df)
X_test_kaggle_engineered = engineer_pirate_feature(X_test_kaggle, X_test_kaggle_long_df)
print(f"Engineered training data shape: {X_full_engineered.shape}")

print("\n--- 3. Creating Stratified Train-Test Split ---")
# Single stratified 80/20 split at sample level; the 20% part is the hold-out test set.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
train_indices, test_indices = next(splitter.split(X_full_engineered, y_full))

X_train, y_train = X_full_engineered[train_indices], y_full[train_indices]
X_test, y_test = X_full_engineered[test_indices], y_full[test_indices]
print(f"Train set shape: {X_train.shape}, {y_train.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

print("\n--- 4. Calculating Alpha Weights for Focal Loss (from training set only) ---")
# Inverse-frequency class weights, normalised to sum to 1.
class_counts = np.bincount(y_train)
class_weights_tensor = 1.0 / torch.tensor(class_counts, dtype=torch.float)
alpha_tensor = (class_weights_tensor / class_weights_tensor.sum()).to(device)
print(f"Train class counts (0, 1, 2): {class_counts}")
print(f"Calculated alpha weights: {alpha_tensor}")

## 🛠️ 3. Helper Functions & Custom Loss

In [None]:
class FocalLoss(nn.Module):
    """Multi-class focal loss built on top of cross-entropy.

    Each sample's cross-entropy is scaled by ``(1 - p_t) ** gamma`` where ``p_t`` is the
    predicted probability of the true class, and optionally by a per-class weight.

    Args:
        alpha: Optional per-class weights (tensor, list or array indexed by class id).
        gamma: Focusing exponent; 0 reduces the loss to (alpha-weighted) cross-entropy.
        reduction: 'mean', 'sum', or anything else to get the per-sample losses.
    """
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        if alpha is None or torch.is_tensor(alpha):
            self.alpha = alpha
        else:
            # Accept plain sequences / NumPy arrays as class weights.
            self.alpha = torch.as_tensor(alpha, dtype=torch.float32)
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """Return the focal loss for ``logits`` (N, C) against class ids ``targets`` (N,)."""
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        # exp(-CE) recovers the probability assigned to the true class.
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            # Move the weights to the targets' device BEFORE indexing; indexing a tensor
            # with indices that live on another device raises a runtime error.
            alpha_t = self.alpha.to(targets.device)[targets].to(focal_loss.device)
            focal_loss = alpha_t * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

def create_sliding_windows(X_3d, y=None, window_size=10, stride=2):
    """Cut every sample into overlapping fixed-length windows along the time axis.

    Args:
        X_3d: Array of shape (n_samples, n_timesteps, n_features).
        y: Optional per-sample labels; each window inherits its sample's label.
        window_size: Number of timesteps per window (must be positive).
        stride: Step between consecutive window starts (must be positive).

    Returns:
        ``(windows, labels, sample_ids)`` when ``y`` is given, else ``(windows, sample_ids)``.
        ``sample_ids[k]`` is the row of ``X_3d`` that window ``k`` was taken from.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is not positive (a non-positive
            stride would otherwise never advance).
    """
    if window_size <= 0 or stride <= 0:
        raise ValueError(
            f"window_size and stride must be positive, got window_size={window_size}, stride={stride}"
        )

    n_samples, n_timesteps, _ = X_3d.shape
    # Start offsets of all windows that fit completely inside a sequence.
    starts = range(0, n_timesteps - window_size + 1, stride)

    new_X, window_indices = [], []
    for i in range(n_samples):
        for start in starts:
            new_X.append(X_3d[i, start:start + window_size, :])
            window_indices.append(i)

    if y is not None:
        new_y = [y[i] for i in window_indices]
        return np.array(new_X), np.array(new_y), np.array(window_indices)
    return np.array(new_X), np.array(window_indices)

def make_loader(ds, batch_size, shuffle, drop_last):
    """Build a DataLoader over ``ds`` with the notebook's fixed worker settings.

    ``batch_size`` is cast to ``int`` before being handed to the loader; two
    persistent worker processes and pinned memory are always requested.
    """
    loader_options = dict(
        batch_size=int(batch_size),
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=2,
        pin_memory=True,
        persistent_workers=True,
    )
    return DataLoader(ds, **loader_options)

## 🧠 4. Model & Training Engine

In [None]:
class Attention(nn.Module):
    """Learned weighted average of RNN outputs over the time axis.

    Each timestep is scored by a tanh-activated linear layer followed by a
    bias-free projection to a scalar; the softmax of these scores over time
    weights the sum of the RNN outputs.
    """
    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, rnn_outputs):
        """Map ``rnn_outputs`` (batch, time, hidden) to a context vector (batch, hidden)."""
        scores = self.v(self.attn(rnn_outputs).tanh()).squeeze(2)
        weights = scores.softmax(dim=1)
        # (batch, 1, time) @ (batch, time, hidden) -> (batch, 1, hidden)
        pooled = torch.bmm(weights[:, None, :], rnn_outputs)
        return pooled[:, 0, :]

class RecurrentClassifier(nn.Module):
    """Conv1d -> GRU/LSTM -> attention pooling -> linear classifier for windowed sequences.

    The input's last axis is expected to hold ``num_continuous_features`` continuous
    columns followed by five categorical columns: four pain-survey codes (embedded with
    vocabulary size 3 each) and one binary pirate flag (vocabulary size 2).

    Args:
        hidden_size: Hidden size of the recurrent layers.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output logits.
        conv_out_channels: Output channels of the 1-D convolution (= RNN input size).
        conv_kernel_size: Kernel size of the 1-D convolution ('same' padding).
        bidirectional: Whether the RNN is bidirectional.
        dropout_rate: Inter-layer RNN dropout (only applied when ``num_layers > 1``).
        feature_dropout_rate: Dropout applied to the convolution output.
        rnn_type: 'GRU' or 'LSTM'.
        num_continuous_features: Number of leading continuous columns
            (default 31 = 30 joints + 1 time).

    Raises:
        ValueError: If ``rnn_type`` is not 'GRU' or 'LSTM'.
    """
    def __init__(self, hidden_size, num_layers, num_classes,
                 conv_out_channels, conv_kernel_size, bidirectional,
                 dropout_rate, feature_dropout_rate, rnn_type='GRU',
                 num_continuous_features=31):
        super().__init__()
        rnn_cells = {'GRU': nn.GRU, 'LSTM': nn.LSTM}
        if rnn_type not in rnn_cells:
            # Fail at construction time instead of leaving self.rnn undefined.
            raise ValueError(f"Unsupported rnn_type {rnn_type!r}; expected one of {sorted(rnn_cells)}")

        self.rnn_type, self.num_layers, self.hidden_size, self.bidirectional = \
            rnn_type, num_layers, hidden_size, bidirectional
        self.num_continuous_features = num_continuous_features

        rnn_hidden_dim = hidden_size * 2 if bidirectional else hidden_size

        self.pain_embed_dim, self.pirate_embed_dim = 4, 4
        self.pain_embeddings = nn.ModuleList([nn.Embedding(3, self.pain_embed_dim) for _ in range(4)])
        self.pirate_embedding = nn.Embedding(2, self.pirate_embed_dim)

        # Conv input = continuous columns + concatenated categorical embeddings.
        total_embedding_dim = (4 * self.pain_embed_dim) + self.pirate_embed_dim
        conv_input_size = num_continuous_features + total_embedding_dim

        self.conv1d = nn.Conv1d(in_channels=conv_input_size, out_channels=conv_out_channels,
                                kernel_size=conv_kernel_size, padding='same')
        self.conv_activation = nn.ReLU()
        self.feature_dropout = nn.Dropout(feature_dropout_rate)

        # GRU and LSTM share the same constructor arguments.
        self.rnn = rnn_cells[rnn_type](
            input_size=conv_out_channels, hidden_size=hidden_size,
            num_layers=num_layers, batch_first=True, bidirectional=bidirectional,
            dropout=dropout_rate if num_layers > 1 else 0)

        self.attention = Attention(rnn_hidden_dim)
        self.classifier = nn.Linear(rnn_hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits (batch, num_classes) for ``x`` of shape (batch, time, features)."""
        n_cont = self.num_continuous_features
        x_continuous = x[:, :, :n_cont]
        # Categorical codes arrive as floats in the same tensor; embeddings need integer ids.
        x_categorical = x[:, :, n_cont:].long()

        embedded_cats = [self.pain_embeddings[i](x_categorical[:, :, i]) for i in range(4)] \
                      + [self.pirate_embedding(x_categorical[:, :, 4])]
        x_combined = torch.cat([x_continuous] + embedded_cats, dim=2)
        # Conv1d expects (batch, channels, time).
        x_permuted = x_combined.permute(0, 2, 1)
        x_conv = self.conv_activation(self.conv1d(x_permuted))
        x_conv_permuted = x_conv.permute(0, 2, 1)
        x_dropped = self.feature_dropout(x_conv_permuted)
        rnn_outputs, _ = self.rnn(x_dropped)
        context_vector = self.attention(rnn_outputs)
        return self.classifier(context_vector)

def train_one_epoch(model, loader, criterion, optimizer, scaler, scheduler, device):
    """Run one training pass over ``loader`` with AMP (on CUDA) and gradient clipping.

    Args:
        model: Network to train (set to train mode here).
        loader: Iterable yielding ``(x, y)`` batches.
        criterion: Loss taking ``(logits, y)``.
        optimizer: Optimizer updated once per batch.
        scaler: ``torch.amp.GradScaler`` used for loss scaling.
        scheduler: LR scheduler stepped once per batch.
        device: ``torch.device`` the batches are moved to.

    Returns:
        tuple: (mean loss per processed sample, weighted F1 over the processed samples).

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.train()
    use_amp = device.type == 'cuda'
    total_loss, n_seen, all_preds, all_targets = 0.0, 0, [], []
    for x, y in loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            logits = model(x)
            loss = criterion(logits, y)
        scaler.scale(loss).backward()
        # Unscale first so clipping operates on the true gradient magnitudes.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        batch_len = x.size(0)
        total_loss += loss.item() * batch_len
        n_seen += batch_len
        all_preds.append(logits.argmax(dim=1).cpu().numpy())
        all_targets.append(y.cpu().numpy())

    if n_seen == 0:
        raise ValueError("train_one_epoch received a loader that yielded no batches")
    # Average over the samples actually processed: with drop_last=True the loader skips
    # the trailing partial batch, so the dataset length would bias the mean downwards.
    return total_loss / n_seen, f1_score(np.concatenate(all_targets), np.concatenate(all_preds), average='weighted')

def validate_one_epoch(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Network to evaluate (set to eval mode here).
        loader: Iterable yielding ``(x, y)`` batches.
        criterion: Loss taking ``(logits, y)``.
        device: ``torch.device`` the batches are moved to.

    Returns:
        tuple: (mean loss per evaluated sample, weighted F1 over the evaluated samples).

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    use_amp = device.type == 'cuda'
    total_loss, n_seen, all_preds, all_targets = 0.0, 0, [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                logits = model(x)
                loss = criterion(logits, y)
            batch_len = x.size(0)
            total_loss += loss.item() * batch_len
            n_seen += batch_len
            all_preds.append(logits.argmax(dim=1).cpu().numpy())
            all_targets.append(y.cpu().numpy())

    if n_seen == 0:
        raise ValueError("validate_one_epoch received a loader that yielded no batches")
    # Counting samples keeps the mean correct for any dataset type, not just TensorDataset.
    return total_loss / n_seen, f1_score(np.concatenate(all_targets), np.concatenate(all_preds), average='weighted')

def fit(model, train_loader, val_loader, epochs, criterion, optimizer, scheduler, scaler, device, patience, experiment_name):
    """Train with early stopping on validation F1 and return the best checkpoint's model.

    The best weights (highest validation F1 so far) are written to
    ``models/{experiment_name}_best_model.pt``; training stops once ``patience``
    consecutive epochs bring no improvement.

    Args:
        model: Model to train; may be wrapped by ``torch.compile``.
        train_loader, val_loader: Loaders passed to the per-epoch helpers.
        epochs: Maximum number of epochs.
        criterion, optimizer, scheduler, scaler: Training components.
        device: ``torch.device`` used for training and for reloading the checkpoint.
        patience: Number of non-improving epochs tolerated before stopping.
        experiment_name: Prefix of the checkpoint file name.

    Returns:
        The un-compiled model with the best checkpoint loaded.
    """
    model_path = f"models/{experiment_name}_best_model.pt"
    best_f1 = -1
    patience_counter = 0
    # torch.compile wraps the module; the original lives in `_orig_mod` and owns the
    # state-dict keys without the wrapper prefix.
    uncompiled_model = model._orig_mod if hasattr(model, '_orig_mod') else model

    print(f"--- Starting Training: {experiment_name} ---")
    for epoch in range(1, epochs + 1):
        train_one_epoch(model, train_loader, criterion, optimizer, scaler, scheduler, device)
        _, val_f1 = validate_one_epoch(model, val_loader, criterion, device)

        improved = val_f1 > best_f1
        if improved:
            best_f1, patience_counter = val_f1, 0
            torch.save(uncompiled_model.state_dict(), model_path)
        else:
            patience_counter += 1

        # Log after the bookkeeping so "Best Val F1" already reflects this epoch.
        if epoch % 5 == 0:
            print(f"Epoch {epoch:3d}/{epochs} | Val F1: {val_f1:.4f} | Best Val F1: {best_f1:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")

        if not improved and patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}. Best F1: {best_f1:.4f}")
            break

    print(f"--- Finished Training --- Best F1: {best_f1:.4f}")
    # map_location keeps the reload valid even if the checkpoint was saved from another device.
    uncompiled_model.load_state_dict(torch.load(model_path, map_location=device))
    return uncompiled_model

## 🧪 5. Phase 1: Hyperparameter Search

We will run HPO on a validation split taken from our new, smaller training set.

In [None]:
def objective_function(config, X_train, y_train, alpha_tensor):
    """Ray Tune trainable: train one configuration and report validation F1 every epoch.

    Args:
        config: Hyperparameter dict sampled from the search space.
        X_train: Array (samples, timesteps, 36) in the original column layout
            (0-29 joints, 30-33 pain surveys, 34 time, 35 is_pirate).
        y_train: Integer class labels, one per sample.
        alpha_tensor: Per-class weights handed to ``FocalLoss``.
    """
    # --- Hold out 20% of the training samples as this trial's validation set ---
    holdout = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    fit_rows, val_rows = next(holdout.split(X_train, y_train))
    y_fit, y_val = y_train[fit_rows], y_train[val_rows]

    # --- Put continuous columns first, categorical columns last ---
    continuous_cols = list(range(30)) + [34]       # 30 joints + 1 time
    categorical_cols = list(range(30, 34)) + [35]  # 4 pain surveys + 1 is_pirate

    def _reordered(rows):
        block = X_train[rows]
        return np.concatenate([block[:, :, continuous_cols], block[:, :, categorical_cols]], axis=2)

    def _scaled(transform, block):
        # The scaler works on 2-D data: flatten (samples, time) and restore afterwards.
        n, t, f = block.shape
        return transform(block.reshape(n * t, f)).reshape(n, t, f)

    # StandardScaler on the 31 leading continuous columns only; statistics come from the fit part.
    preprocessor = ColumnTransformer([('scaler', StandardScaler(), list(range(31)))], remainder='passthrough')
    X_fit_scaled = _scaled(preprocessor.fit_transform, _reordered(fit_rows))
    X_val_scaled = _scaled(preprocessor.transform, _reordered(val_rows))

    # --- Windowing and loaders ---
    window_size, stride = config['window_size'], config['stride']
    X_fit_w, y_fit_w, _ = create_sliding_windows(X_fit_scaled, y_fit, window_size, stride)
    X_val_w, y_val_w, _ = create_sliding_windows(X_val_scaled, y_val, window_size, stride)

    fit_ds = TensorDataset(torch.from_numpy(X_fit_w).float(), torch.from_numpy(y_fit_w).long())
    val_ds = TensorDataset(torch.from_numpy(X_val_w).float(), torch.from_numpy(y_val_w).long())
    train_loader = make_loader(fit_ds, config["batch_size"], True, True)
    val_loader = make_loader(val_ds, config["batch_size"], False, False)

    # --- Model: every config entry that is not a training/data knob is a constructor argument ---
    non_model_keys = ['window_size', 'stride', 'lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma']
    model_config = {key: value for key, value in config.items() if key not in non_model_keys}
    model = torch.compile(
        RecurrentClassifier(**model_config, num_classes=N_CLASSES).to(device),
        backend="eager",
    )

    # --- Optimisation components ---
    EPOCHS_HPO = 100
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["l2_lambda"])
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=config["lr"], epochs=EPOCHS_HPO, steps_per_epoch=len(train_loader))
    scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))
    criterion = FocalLoss(alpha=alpha_tensor, gamma=config['focal_loss_gamma'])

    # --- Train, report each epoch, stop after 25 epochs without a new best ---
    best_val_f1, stale_epochs, hpo_patience = -1.0, 0, 25
    for _ in range(EPOCHS_HPO):
        train_one_epoch(model, train_loader, criterion, optimizer, scaler, scheduler, device)
        _, val_f1 = validate_one_epoch(model, val_loader, criterion, device)
        tune.report({"val_f1": val_f1})

        if val_f1 > best_val_f1:
            best_val_f1, stale_epochs = val_f1, 0
            continue
        stale_epochs += 1
        if stale_epochs >= hpo_patience:
            break

In [None]:
# Hyperparameter search space. Keys other than window_size/stride/lr/batch_size/
# l2_lambda/focal_loss_gamma are forwarded to RecurrentClassifier as constructor arguments.
search_space = {
    "rnn_type": tune.choice(['GRU', 'LSTM']),
    "focal_loss_gamma": tune.uniform(0.5, 3.0),
    "lr": tune.loguniform(1e-4, 1e-2),
    "batch_size": tune.choice([64, 128]),
    "hidden_size": tune.choice([256, 384, 512]),
    "num_layers": tune.choice([2, 3]),
    "dropout_rate": tune.uniform(0.1, 0.5),
    "feature_dropout_rate": tune.uniform(0.1, 0.5),
    "bidirectional": tune.choice([True, False]),
    "l2_lambda": tune.loguniform(1e-8, 1e-5),
    "conv_out_channels": tune.choice([64, 128]),
    "conv_kernel_size": tune.choice([3, 5]),
    "window_size": tune.choice([10]), # Fixed based on analysis
    "stride": tune.choice([2])      # Fixed based on analysis
}

# Size the Ray cluster and the per-trial request from the hardware that is actually
# present: a trial asking for more CPUs than exist is never scheduled, and a GPU must
# not be advertised when CUDA is unavailable.
n_cpus = os.cpu_count() or 1
has_gpu = torch.cuda.is_available()
cpus_per_trial = max(1, min(4, n_cpus))

if ray.is_initialized(): ray.shutdown()
ray.init(num_cpus=n_cpus, num_gpus=1 if has_gpu else 0, ignore_reinit_error=True, log_to_driver=False)

print("--- Starting HPO ---")
analysis = tune.run(
    tune.with_parameters(objective_function, X_train=X_train, y_train=y_train, alpha_tensor=alpha_tensor),
    resources_per_trial={"cpu": cpus_per_trial, "gpu": 0.25 if has_gpu else 0},
    config=search_space,
    num_samples=50, # Number of trials to run
    search_alg=OptunaSearch(metric="val_f1", mode="max"),
    scheduler=ASHAScheduler(metric="val_f1", mode="max", grace_period=25, reduction_factor=2),
    name="pirate_pain_final_search",
    verbose=1
)

# scope="all" picks the trial whose best reported val_f1 (at any epoch) is highest.
best_trial = analysis.get_best_trial(metric="val_f1", mode="max", scope="all")
FINAL_CONFIG = best_trial.config
print("\n--- Best Hyperparameters Found ---")
print(FINAL_CONFIG)

## 🏆 6. Phase 2: K-Fold Ensemble Training

Now we train our final ensemble of models using the best hyperparameters on our full training set with K-Fold cross-validation.

In [None]:
N_SPLITS = 5
FINAL_EXPERIMENT_NAME = f"Final-{FINAL_CONFIG['rnn_type']}_H{FINAL_CONFIG['hidden_size']}_L{FINAL_CONFIG['num_layers']}_v13"

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
fold_val_f1_list = []

# Reorder the full training data once before the loop: continuous columns first
# (30 joints + time), categorical columns last (4 pain surveys + is_pirate).
continuous_indices_orig = list(range(30)) + [34]
categorical_indices_orig = list(range(30, 34)) + [35]
X_train_reordered = np.concatenate([X_train[:, :, continuous_indices_orig], X_train[:, :, categorical_indices_orig]], axis=2)

# Loop-invariant settings, built once instead of once per fold.
continuous_indices_reordered = list(range(31))
model_config_kfold = {k: v for k, v in FINAL_CONFIG.items() if k not in ['window_size', 'stride', 'lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma']}
EPOCHS_K_FOLD = 150

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_reordered, y_train)):
    fold_name = f"{FINAL_EXPERIMENT_NAME}_fold_{fold+1}"
    print(f"\n--- Fold {fold+1}/{N_SPLITS} --- ({fold_name}) ---")

    X_train_fold, y_train_fold = X_train_reordered[train_idx], y_train[train_idx]
    X_val_fold, y_val_fold = X_train_reordered[val_idx], y_train[val_idx]

    # Scaler statistics come from this fold's training part only.
    preprocessor_fold = ColumnTransformer([('scaler', StandardScaler(), continuous_indices_reordered)], remainder='passthrough')

    ns, ts, f = X_train_fold.shape
    X_train_scaled = preprocessor_fold.fit_transform(X_train_fold.reshape(ns*ts, f)).reshape(ns, ts, f)
    ns_val, ts_val, f_val = X_val_fold.shape
    X_val_scaled = preprocessor_fold.transform(X_val_fold.reshape(ns_val*ts_val, f_val)).reshape(ns_val, ts_val, f_val)

    X_train_w, y_train_w, _ = create_sliding_windows(X_train_scaled, y_train_fold, FINAL_CONFIG['window_size'], FINAL_CONFIG['stride'])
    X_val_w, y_val_w, _ = create_sliding_windows(X_val_scaled, y_val_fold, FINAL_CONFIG['window_size'], FINAL_CONFIG['stride'])

    train_loader = make_loader(TensorDataset(torch.from_numpy(X_train_w).float(), torch.from_numpy(y_train_w).long()), FINAL_CONFIG['batch_size'], True, True)
    val_loader = make_loader(TensorDataset(torch.from_numpy(X_val_w).float(), torch.from_numpy(y_val_w).long()), FINAL_CONFIG['batch_size'], False, False)

    model_fold = RecurrentClassifier(**model_config_kfold, num_classes=N_CLASSES).to(device)
    model_fold = torch.compile(model_fold, backend="eager")

    optimizer = torch.optim.AdamW(model_fold.parameters(), lr=FINAL_CONFIG['lr'], weight_decay=FINAL_CONFIG['l2_lambda'])
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=FINAL_CONFIG['lr'], epochs=EPOCHS_K_FOLD, steps_per_epoch=len(train_loader))
    scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))
    criterion = FocalLoss(alpha=alpha_tensor, gamma=FINAL_CONFIG['focal_loss_gamma'])

    # fit() saves the best checkpoint to models/{fold_name}_best_model.pt and returns it loaded.
    model_fold_uncompiled = fit(model_fold, train_loader, val_loader, EPOCHS_K_FOLD, criterion, optimizer, scheduler, scaler, device, 50, fold_name)

    _, val_f1 = validate_one_epoch(model_fold_uncompiled, val_loader, criterion, device)
    fold_val_f1_list.append(val_f1)
    print(f"Fold {fold+1} Final Val F1: {val_f1:.4f}")

    # Release this fold's loaders (persistent worker processes), windowed arrays and model
    # before the next fold allocates its own; the checkpoint on disk is all later cells need.
    del train_loader, val_loader, X_train_w, y_train_w, X_val_w, y_val_w
    del model_fold, model_fold_uncompiled, optimizer, scheduler
    gc.collect()
    if device.type == 'cuda':
        torch.cuda.empty_cache()

print(f"\n--- üèÜ K-Fold Training Complete --- Average CV F1: {np.mean(fold_val_f1_list):.4f} ---")

## 📊 7. Phase 3: Final Evaluation on Hold-Out Test Set

This is the most important step. We will now evaluate our trained K-Fold ensemble on the test set that we held out at the very beginning. This provides a final, unbiased measure of our model's performance on unseen data.

In [None]:
print("--- Starting Final Evaluation on the Hold-Out Test Set ---")

# --- 1. Preprocess the Hold-Out Test Data ---
# A scaler fitted on the ENTIRE training split is used to transform the test set.
# NOTE(review): each fold model was trained on data scaled by its own fold-level scaler,
# so the statistics applied here differ slightly from what the models saw in training.
n_train_samples, n_reordered_features = X_train_reordered.shape[0], X_train_reordered.shape[2]
X_train_reordered_flat = X_train_reordered.reshape(n_train_samples * N_TIMESTEPS, n_reordered_features)
final_preprocessor = ColumnTransformer(
    [('scaler', StandardScaler(), continuous_indices_reordered)],
    remainder='passthrough',
).fit(X_train_reordered_flat)

# Same column reordering as for training (continuous first, categorical last), then scale.
X_test_reordered = np.concatenate(
    [X_test[:, :, continuous_indices_orig], X_test[:, :, categorical_indices_orig]],
    axis=2,
)
X_test_flat = X_test_reordered.reshape(X_test.shape[0] * N_TIMESTEPS, X_test.shape[2])
X_test_scaled = final_preprocessor.transform(X_test_flat).reshape(X_test.shape)

# Window the test sequences; test_window_indices maps every window back to its sample.
X_test_w, y_test_w, test_window_indices = create_sliding_windows(
    X_test_scaled, y_test, FINAL_CONFIG['window_size'], FINAL_CONFIG['stride'])
test_windows_ds = TensorDataset(torch.from_numpy(X_test_w).float())
test_loader = DataLoader(test_windows_ds, batch_size=FINAL_CONFIG['batch_size'], shuffle=False)
print("Test data preprocessed and ready for evaluation.")

# --- 2. Generate Ensemble Predictions ---
all_fold_probabilities_test = []
training_only_keys = ['window_size', 'stride', 'lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma']
model_config_eval = {k: v for k, v in FINAL_CONFIG.items() if k not in training_only_keys}

for fold in range(N_SPLITS):
    fold_name = f"{FINAL_EXPERIMENT_NAME}_fold_{fold+1}"
    model_path = f"models/{fold_name}_best_model.pt"
    print(f"Loading model {fold+1}/{N_SPLITS} from {model_path}...")

    model = RecurrentClassifier(**model_config_eval, num_classes=N_CLASSES).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = torch.compile(model, backend="eager")
    model.eval()

    # Window-level class probabilities for this fold, in loader (unshuffled) order.
    with torch.no_grad(), torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
        fold_probs = [
            torch.softmax(model(inputs.to(device)), dim=1).cpu().numpy()
            for (inputs,) in test_loader
        ]
    all_fold_probabilities_test.append(np.concatenate(fold_probs))

# Average over folds first, then over all windows belonging to the same sample.
mean_probabilities_test = np.mean(all_fold_probabilities_test, axis=0)
df_probs_test = pd.DataFrame(mean_probabilities_test).assign(original_index=test_window_indices)
agg_probs_test = df_probs_test.groupby('original_index').mean().values
final_predictions_test = agg_probs_test.argmax(axis=1)
print("\nEnsemble predictions generated for the test set.")

# --- 3. Display Performance Metrics ---
banner = "*" * 60
print("\n" + banner)
print("         FINAL UNBIASED PERFORMANCE ON HOLD-OUT TEST SET")
print(banner + "\n")

overall_weighted_f1 = f1_score(y_test, final_predictions_test, average='weighted')
print(f"Overall Weighted F1-Score: {overall_weighted_f1:.4f}\n")

class_names = le.classes_
report = classification_report(y_test, final_predictions_test, target_names=class_names)
print("Classification Report:")
print(report)

cm = confusion_matrix(y_test, final_predictions_test)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Final Confusion Matrix on Unseen Test Data', fontsize=16)
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()

## 📬 8. Phase 4: Generate Kaggle Submission

Finally, we use our trained ensemble to make predictions on the official Kaggle test set and generate the submission file.

In [None]:
print("--- Preparing Kaggle test dataset for submission ---")
submission_filename_base = f"submission_{FINAL_EXPERIMENT_NAME}.csv"

# Reorder and scale the Kaggle test data using the same preprocessor fitted on our training data
X_test_kaggle_reordered = np.concatenate([X_test_kaggle_engineered[:, :, continuous_indices_orig], X_test_kaggle_engineered[:, :, categorical_indices_orig]], axis=2)
X_test_kaggle_scaled = final_preprocessor.transform(X_test_kaggle_reordered.reshape(X_test_kaggle_reordered.shape[0] * N_TIMESTEPS, X_test_kaggle_reordered.shape[2])).reshape(X_test_kaggle_reordered.shape)

X_test_kaggle_w, test_kaggle_window_indices = create_sliding_windows(X_test_kaggle_scaled, y=None, window_size=FINAL_CONFIG['window_size'], stride=FINAL_CONFIG['stride'])
# Keyword arguments are required here: DataLoader's 4th positional parameter is `sampler`,
# not `drop_last`, so passing (batch_size, False, False) positionally would install
# `sampler=False` and fail as soon as the loader is iterated.
test_kaggle_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_test_kaggle_w).float()),
    batch_size=FINAL_CONFIG['batch_size'],
    shuffle=False,
    drop_last=False,
)

all_fold_probabilities_kaggle = []

for fold in range(N_SPLITS):
    fold_name = f"{FINAL_EXPERIMENT_NAME}_fold_{fold+1}"
    model_path = f"models/{fold_name}_best_model.pt"
    print(f"Loading model {fold+1}/{N_SPLITS} from {model_path} for Kaggle submission...")

    model = RecurrentClassifier(**model_config_eval, num_classes=N_CLASSES).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = torch.compile(model, backend="eager")
    model.eval()

    fold_preds_kaggle = []
    with torch.no_grad():
        for (inputs,) in test_kaggle_loader:
            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                probs = torch.softmax(model(inputs.to(device)), dim=1)
                fold_preds_kaggle.append(probs.cpu().numpy())
    all_fold_probabilities_kaggle.append(np.concatenate(fold_preds_kaggle))

# Average over folds, then over all windows of the same sample, and decode to label names.
mean_probabilities_kaggle = np.mean(all_fold_probabilities_kaggle, axis=0)
df_probs_kaggle = pd.DataFrame(mean_probabilities_kaggle)
df_probs_kaggle['original_index'] = test_kaggle_window_indices
agg_probs_kaggle = df_probs_kaggle.groupby('original_index').mean().values
final_predictions_kaggle = le.inverse_transform(np.argmax(agg_probs_kaggle, axis=1))

# groupby sorts by window-owner position, which follows ascending sample_index order.
submission_df = pd.DataFrame({'sample_index': sorted(X_test_kaggle_long_df['sample_index'].unique()), 'label': final_predictions_kaggle})
submission_df['sample_index'] = submission_df['sample_index'].apply(lambda x: f"{x:03d}")
submission_filepath = os.path.join("submissions", submission_filename_base)
submission_df.to_csv(submission_filepath, index=False)

print(f"\nSuccessfully saved submission to {submission_filepath}!")
print(submission_df.head())