# **Kaggle Challenge: Pirate Pain Dataset 🏴‍☠️ (v17: Final Version - Mean Aggregation)**

This notebook represents the definitive final strategy. It **reverts the submission logic back to the original, more robust `mean` windowing aggregation**, as the `Top-K` strategy proved to be ineffective on the leaderboard. 

**🔥 Summary of Final Strategy:**

1.  **✅ `Mean` Aggregation Restored:** The final prediction is made by averaging the probabilities from **all** sliding windows. This is a more robust strategy that considers the full context of the time series.
2.  **Stable, Aligned HPO:** The HPO process is correctly aligned with the training pipeline (augmentation, weighted sampling) and is stable on Windows (`num_workers=0`, short trial names).
3.  **Hold-Out Test Set (10%):** A 10% test set is used for a final, unbiased evaluation before submission.
4.  **Full Suite of Balancing Techniques:**
    - **Feature Cleaning:** `joint_30` is removed.
    - **Data Augmentation:** Noise injection for minority classes.
    - **WeightedRandomSampler:** Balanced batches during training.
    - **Focal Loss:** Advanced loss function.
5.  **Optimized Performance:** Uses the `cudagraphs` backend for fast, stable training.

## ⚙️ 1. Setup & Libraries

In [None]:
# Global seed value shared by the RNG seeding and the stratified shuffle
# splits further down in this notebook.
SEED = 1234

# Import necessary libraries
import os
import logging
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import copy
from itertools import product
import time
import gc

# Point matplotlib at a config directory under the current working directory.
# NOTE(review): matplotlib and seaborn are already imported above, so matplotlib
# has presumably resolved its config/cache directory by this point and this
# assignment may have no effect - confirm, or move it above those imports.
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'
import warnings
# `Warning` is the base class of every warning category, so the second filter
# silences all warnings and already covers the FutureWarning filter.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from functools import partial

# Output folders used by later cells: checkpoints, submission CSVs, logs.
logs_dir = "tensorboard"
os.makedirs("models", exist_ok=True)
os.makedirs("submissions", exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

# Seed every RNG this notebook draws from. Previously only the CUDA generators
# were seeded, so NumPy-based noise augmentation and CPU-side weight
# initialisation differed from run to run despite SEED being defined.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    # Lets cuDNN auto-tune kernels; this trades bit-exact repeatability for speed.
    torch.backends.cudnn.benchmark = True
    print("\n--- Using GPU ---")
else:
    device = torch.device("cpu")
    print("\n--- Using CPU ---")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Plot styling shared by all figures in the notebook.
sns.set_theme(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline

## 🔄 2. Data Loading & Pre-Split

In [None]:
print("--- 1. Loading Data ---")
# Input CSVs are expected under ./data relative to the working directory.
DATA_DIR = "data"
X_TRAIN_PATH = os.path.join(DATA_DIR, "pirate_pain_train.csv")
Y_TRAIN_PATH = os.path.join(DATA_DIR, "pirate_pain_train_labels.csv")
X_TEST_PATH = os.path.join(DATA_DIR, "pirate_pain_test.csv")

# The feature frames are used below in long format: they are pivoted on
# ('sample_index', 'time') to obtain one time series per sample.
features_long_df = pd.read_csv(X_TRAIN_PATH)
labels_df = pd.read_csv(Y_TRAIN_PATH)
X_test_long_df = pd.read_csv(X_TEST_PATH)

# Drop 'joint_30' from both frames (the message below describes it as zero-variance).
# NOTE(review): only the training frame is checked for the column; the test
# frame is assumed to contain it as well.
if 'joint_30' in features_long_df.columns:
    features_long_df = features_long_df.drop(columns=['joint_30'])
    X_test_long_df = X_test_long_df.drop(columns=['joint_30'])
    print("Removed zero-variance feature: 'joint_30'")

# Number of time steps per sample assumed by reshape_data.
N_TIMESTEPS = 160
# Feature order defines the last-axis layout of the 3D arrays built below:
# joint_00..joint_29 (0-29), pain_survey_1..4 (30-33), time (34).
JOINT_FEATURES = [f"joint_{i:02d}" for i in range(30)]
PAIN_FEATURES = [f"pain_survey_{i}" for i in range(1, 5)]
TIME_FEATURE = ['time']
FEATURES = JOINT_FEATURES + PAIN_FEATURES + TIME_FEATURE
# NOTE(review): in this file only the keys (and their count) of this mapping are
# used. The LabelEncoder fitted on the keys assigns codes by sorted class order,
# so the integer values here are not the codes stored in the label arrays.
LABEL_MAPPING = {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}
N_CLASSES = len(LABEL_MAPPING)

def reshape_data(df, features_list, n_timesteps):
    """Convert a long-format frame into a (samples, timesteps, features) array.

    Args:
        df: Frame with 'sample_index' and 'time' columns plus every column
            named in ``features_list``; each (sample_index, time) pair must
            occur exactly once.
        features_list: Column names to extract; their order becomes the order
            of the last axis.
        n_timesteps: Number of distinct time values per sample.

    Returns:
        Array of shape (n_samples, n_timesteps, len(features_list)), with
        samples ordered by the pivot's sample_index index.
    """
    wide = df.pivot(index='sample_index', columns='time', values=features_list)
    flat = wide.to_numpy()
    # Pivoted columns are feature-major (all time steps of feature 0, then
    # feature 1, ...), so unfold to (sample, feature, time) first.
    n_features = len(features_list)
    cube = flat.reshape(flat.shape[0], n_features, n_timesteps)
    # Swap the last two axes to get (sample, time, feature).
    return np.swapaxes(cube, 1, 2)

# Keep only rows whose sample has a label, then build the (samples, time, features) array.
X_train_full = reshape_data(features_long_df[features_long_df['sample_index'].isin(labels_df['sample_index'].unique())], FEATURES, N_TIMESTEPS)
# Sort labels by sample_index so they line up with the pivot's sorted sample order.
y_train_full_df = labels_df.sort_values(by='sample_index')
# NOTE(review): LabelEncoder assigns codes by sorted class order, so with these
# keys the codes are high_pain=0, low_pain=1, no_pain=2 - not the values listed
# in LABEL_MAPPING. Anything that hard-codes class indices should be checked
# against this ordering.
le = LabelEncoder().fit(list(LABEL_MAPPING.keys()))
y_train_full = le.transform(y_train_full_df['label'])

print("\n--- 2. Engineering 'is_pirate' Feature ---")
# One row per sample holding the static body-part descriptors.
# Assumes these columns are constant within a sample - TODO confirm.
static_cols = ['sample_index', 'n_legs', 'n_hands', 'n_eyes']
static_df = features_long_df[static_cols].drop_duplicates().set_index('sample_index')
# A sample is flagged as a pirate if any of the three descriptors has its prosthetic value.
pirate_filter = (static_df['n_legs'] == 'one+peg_leg') | (static_df['n_hands'] == 'one+hook_hand') | (static_df['n_eyes'] == 'one+eye_patch')
pirate_indices = static_df[pirate_filter].index
# Same labelled-sample filter and sorted order as used for X_train_full above.
sample_indices_ordered = sorted(features_long_df[features_long_df['sample_index'].isin(labels_df['sample_index'].unique())]['sample_index'].unique())
is_pirate_map = np.array([1 if idx in pirate_indices else 0 for idx in sample_indices_ordered])
# Repeat the per-sample flag along the time axis and append it as the last feature (index 35).
pirate_feature_broadcast = np.tile(is_pirate_map.reshape(-1, 1, 1), (1, N_TIMESTEPS, 1))
X_train_full_engineered = np.concatenate([X_train_full, pirate_feature_broadcast], axis=2)

print("\n--- 3. Creating 90/10 Train/Test Split ---")
# Single stratified split at sample level; the loop body runs once (n_splits=1).
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=SEED)
for train_indices, test_indices in splitter.split(X_train_full_engineered, y_train_full):
    X_main_train, y_main_train = X_train_full_engineered[train_indices], y_train_full[train_indices]
    X_holdout_test, y_holdout_test = X_train_full_engineered[test_indices], y_train_full[test_indices]

print(f"Main training set shape: {X_main_train.shape}")
print(f"Hold-out test set shape: {X_holdout_test.shape}")

print("\n--- 4. Calculating Alpha Weights for Focal Loss (from main training set) ---")
# Alpha is the inverse class frequency, normalised to sum to 1, indexed by encoded class id.
class_counts = np.bincount(y_main_train)
class_weights_tensor = 1.0 / torch.tensor(class_counts, dtype=torch.float)
alpha_tensor = (class_weights_tensor / class_weights_tensor.sum()).to(device)
print(f"Class counts (0, 1, 2) in main training set: {class_counts}")
print(f"Calculated alpha weights: {alpha_tensor}")

## 🛠️ 3. Helper Functions & Custom Loss

In [None]:
class FocalLoss(nn.Module):
    """Multi-class focal loss computed from softmax cross-entropy.

    Per sample the loss is ``alpha[target] * (1 - p_t) ** gamma * CE``, where
    ``p_t = exp(-CE)`` is the predicted probability of the true class.

    Args:
        alpha: Optional per-class weights indexed by class id. May be a tensor
            on any device, or any sequence accepted by ``torch.as_tensor``.
            ``None`` disables class weighting.
        gamma: Focusing exponent; ``0`` reduces the unweighted loss to plain
            cross-entropy.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample losses.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """Return the focal loss for ``logits`` (N, C) and integer ``targets`` (N,)."""
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss
        if self.alpha is not None:
            alpha = self.alpha
            if not torch.is_tensor(alpha):
                alpha = torch.as_tensor(alpha, dtype=focal_loss.dtype)
            # Move alpha to the loss device BEFORE indexing: indexing a CPU
            # tensor with CUDA targets raises, so moving the result is too late.
            alpha_t = alpha.to(focal_loss.device)[targets]
            focal_loss = alpha_t * focal_loss
        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

def create_sliding_windows(X_3d, y=None, window_size=100, stride=20):
    """Cut every sample of a (samples, timesteps, features) array into windows.

    Windows start at 0, stride, 2*stride, ... and only full-length windows are
    kept, so a trailing remainder shorter than ``window_size`` is dropped.

    Args:
        X_3d: Array of shape (n_samples, n_timesteps, n_features).
        y: Optional per-sample labels; each window inherits its sample's label.
        window_size: Window length in time steps (must be positive).
        stride: Step between consecutive window starts (must be positive).

    Returns:
        ``(windows, window_labels, window_indices)`` when ``y`` is given,
        otherwise ``(windows, window_indices)``. ``windows`` has shape
        (n_windows, window_size, n_features), and ``window_indices[k]`` is the
        row of ``X_3d`` that window ``k`` came from. If no window fits, the
        arrays are empty but keep that shape.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is not positive (a
            non-positive stride previously looped forever).
    """
    if window_size <= 0 or stride <= 0:
        raise ValueError(
            f"window_size and stride must be positive, got window_size={window_size}, stride={stride}")

    n_samples, n_timesteps, n_features = X_3d.shape
    starts = range(0, n_timesteps - window_size + 1, stride)

    # Sample-major order: all windows of sample 0, then sample 1, and so on.
    window_indices = np.repeat(np.arange(n_samples), len(starts))
    if window_indices.size:
        new_X = np.stack([X_3d[i, s:s + window_size, :]
                          for i in range(n_samples) for s in starts])
    else:
        # Keep a well-formed shape so downstream code sees 3 axes even when empty.
        new_X = np.empty((0, window_size, n_features), dtype=X_3d.dtype)

    if y is not None:
        return new_X, np.asarray(y)[window_indices], window_indices
    return new_X, window_indices

def make_loader(ds, batch_size, shuffle, drop_last, sampler=None, num_workers=4):
    """Build a DataLoader with this notebook's standard settings.

    Args:
        ds: Dataset to wrap.
        batch_size: Batch size; cast to ``int`` before use.
        shuffle: Whether to shuffle. Ignored (forced to False) when a
            ``sampler`` is supplied.
        drop_last: Whether to drop the final incomplete batch.
        sampler: Optional sampler that decides the sample order.
        num_workers: Worker process count; workers are kept alive between
            epochs whenever this is greater than zero.

    Returns:
        The configured ``DataLoader`` (``pin_memory`` is always enabled).
    """
    loader_kwargs = dict(
        batch_size=int(batch_size),
        # The sampler already decides the order, so shuffling is switched off with it.
        shuffle=False if sampler is not None else shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        pin_memory=True,
        persistent_workers=num_workers > 0,
        sampler=sampler,
    )
    return DataLoader(ds, **loader_kwargs)

def augment_minority_classes(X_w, y_w, continuous_feature_count, noise_level=0.01, aug_factor=1,
                             minority_classes=None):
    """Append noisy copies of minority-class windows to the data.

    Args:
        X_w: Windows of shape (n_windows, window_size, n_features); the first
            ``continuous_feature_count`` features are the continuous ones.
        y_w: Integer class id per window.
        continuous_feature_count: Number of leading features that receive
            noise; the remaining (categorical) features are copied untouched.
        noise_level: Standard deviation of the zero-mean Gaussian noise.
        aug_factor: Number of noisy copies appended per minority window.
        minority_classes: Optional iterable of class ids to augment. When
            ``None`` (default), every class except the most frequent one in
            ``y_w`` is treated as a minority class.

    Returns:
        ``(X, y)`` with the original windows first, followed by the augmented
        copies. The inputs are returned unchanged when there is nothing to
        augment.

    Note:
        The minority set used to be hard-coded as class ids 1 and 2. That only
        holds when class 0 is the majority, which depends on how labels were
        encoded; deriving it from the observed counts removes that coupling.
    """
    if minority_classes is None:
        labels, counts = np.unique(y_w, return_counts=True)
        if labels.size == 0:
            return X_w, y_w
        minority_mask = y_w != labels[np.argmax(counts)]
    else:
        minority_mask = np.isin(y_w, list(minority_classes))

    minority_indices = np.flatnonzero(minority_mask)
    if minority_indices.size == 0:
        return X_w, y_w

    X_minority = X_w[minority_indices]
    y_minority = y_w[minority_indices]
    X_aug_list, y_aug_list = [X_w], [y_w]
    for _ in range(aug_factor):
        X_augmented = X_minority.copy()
        # Basic slicing yields a view, so the in-place add perturbs X_augmented.
        continuous_view = X_augmented[:, :, :continuous_feature_count]
        # Noise is drawn only for the columns that actually receive it.
        continuous_view += np.random.normal(0, noise_level, continuous_view.shape)
        X_aug_list.append(X_augmented)
        y_aug_list.append(y_minority)
    return np.concatenate(X_aug_list, axis=0), np.concatenate(y_aug_list, axis=0)

## 🧠 4. Model & Training Engine

In [None]:
class Attention(nn.Module):
    """Additive attention pooling over the time axis of RNN outputs.

    Each time step is scored with ``v(tanh(attn(h_t)))``; the scores are
    softmax-normalised over time and used to form a weighted sum of the
    RNN outputs.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, rnn_outputs):
        """Pool ``rnn_outputs`` (batch, time, hidden) into (batch, hidden)."""
        # One scalar score per time step: (batch, time).
        scores = self.v(self.attn(rnn_outputs).tanh()).squeeze(2)
        weights = scores.softmax(dim=1)
        # (batch, 1, time) @ (batch, time, hidden) -> (batch, 1, hidden).
        pooled = torch.bmm(weights[:, None, :], rnn_outputs)
        return pooled[:, 0, :]

class RecurrentClassifier(nn.Module):
    """Conv1d -> GRU/LSTM -> attention pooling -> linear classifier.

    Expected input layout along the last axis: the first
    ``num_continuous_features`` columns are continuous, followed by four
    pain-survey columns and one is_pirate column holding integer category
    codes stored as floats. The embedding tables accept codes 0-2 for each
    pain survey and 0-1 for is_pirate.

    Args:
        hidden_size: Hidden size of the recurrent layers.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output logits.
        conv_out_channels: Output channels of the Conv1d front-end.
        conv_kernel_size: Kernel size of the Conv1d front-end ('same' padding).
        bidirectional: Whether the recurrent layers are bidirectional.
        dropout_rate: Inter-layer RNN dropout (only applied when num_layers > 1).
        feature_dropout_rate: Dropout applied to the conv features before the RNN.
        rnn_type: ``'GRU'`` selects ``nn.GRU``; any other value selects ``nn.LSTM``.
        num_continuous_features: Number of leading continuous input columns.
            Defaults to 31 (30 joints + time), the previously hard-coded value.
    """

    def __init__(self, hidden_size, num_layers, num_classes,
                 conv_out_channels, conv_kernel_size, bidirectional,
                 dropout_rate, feature_dropout_rate, rnn_type='GRU',
                 num_continuous_features=31):
        super().__init__()
        self.rnn_type, self.num_layers, self.hidden_size, self.bidirectional = \
            rnn_type, num_layers, hidden_size, bidirectional
        # Single source of truth for the continuous/categorical boundary,
        # shared by __init__ and forward.
        self.num_continuous_features = num_continuous_features
        self.num_pain_features = 4

        rnn_hidden_dim = hidden_size * 2 if bidirectional else hidden_size

        # Module creation order is kept unchanged so parameter initialisation
        # and state_dict keys match previously saved checkpoints.
        self.pain_embed_dim, self.pirate_embed_dim = 4, 4
        self.pain_embeddings = nn.ModuleList(
            [nn.Embedding(3, self.pain_embed_dim) for _ in range(self.num_pain_features)])
        self.pirate_embedding = nn.Embedding(2, self.pirate_embed_dim)

        total_embedding_dim = (self.num_pain_features * self.pain_embed_dim) + self.pirate_embed_dim
        conv_input_size = num_continuous_features + total_embedding_dim

        self.conv1d = nn.Conv1d(in_channels=conv_input_size, out_channels=conv_out_channels,
                                kernel_size=conv_kernel_size, padding='same')
        self.conv_activation = nn.ReLU()
        self.feature_dropout = nn.Dropout(feature_dropout_rate)

        rnn_class = nn.GRU if rnn_type == 'GRU' else nn.LSTM
        self.rnn = rnn_class(
            input_size=conv_out_channels, hidden_size=hidden_size,
            num_layers=num_layers, batch_first=True, bidirectional=bidirectional,
            dropout=dropout_rate if num_layers > 1 else 0)

        self.attention = Attention(rnn_hidden_dim)
        self.classifier = nn.Linear(rnn_hidden_dim, num_classes)

    def forward(self, x):
        """Map ``x`` (batch, time, features) to class logits (batch, num_classes)."""
        n_cont = self.num_continuous_features
        x_continuous = x[:, :, :n_cont]
        # Categorical codes arrive as floats; embeddings need integer indices.
        x_categorical = x[:, :, n_cont:].long()

        embedded_cats = [self.pain_embeddings[i](x_categorical[:, :, i])
                         for i in range(self.num_pain_features)] \
                      + [self.pirate_embedding(x_categorical[:, :, self.num_pain_features])]
        x_combined = torch.cat([x_continuous] + embedded_cats, dim=2)
        # Conv1d expects (batch, channels, time); permute in and back out.
        x_permuted = x_combined.permute(0, 2, 1)
        x_conv = self.conv_activation(self.conv1d(x_permuted))
        x_conv_permuted = x_conv.permute(0, 2, 1)
        x_dropped = self.feature_dropout(x_conv_permuted)
        rnn_outputs, _ = self.rnn(x_dropped)
        context_vector = self.attention(rnn_outputs)
        return self.classifier(context_vector)

def train_one_epoch(model, loader, criterion, optimizer, scaler, scheduler, device):
    """Run one training epoch with AMP, gradient clipping and per-batch LR steps.

    Args:
        model: Module to train (put into train mode here).
        loader: Iterable of ``(x, y)`` batches.
        criterion: Loss callable taking ``(logits, targets)``.
        optimizer: Optimizer updated once per batch.
        scaler: ``torch.amp.GradScaler`` wrapping backward and the optimizer step.
        scheduler: LR scheduler stepped once per batch.
        device: ``torch.device`` for the batches; autocast is enabled only on CUDA.

    Returns:
        ``(mean_loss, weighted_f1)`` over the samples actually processed.

    Raises:
        ValueError: If the loader yields no batches.
    """
    model.train()
    total_loss, num_samples = 0.0, 0
    all_preds, all_targets = [], []
    use_amp = device.type == 'cuda'
    for x, y in loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            logits = model(x)
            loss = criterion(logits, y)
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        batch_size = x.size(0)
        total_loss += loss.item() * batch_size
        # Count what was really seen: with drop_last=True the final partial
        # batch is skipped, so len(sampler) would overstate the denominator.
        num_samples += batch_size
        all_preds.append(logits.argmax(dim=1).cpu().numpy())
        all_targets.append(y.cpu().numpy())
    if num_samples == 0:
        raise ValueError("train_one_epoch received a loader that yielded no batches")
    return total_loss / num_samples, f1_score(np.concatenate(all_targets), np.concatenate(all_preds), average='weighted')

def validate_one_epoch(model, loader, criterion, device):
    """Evaluate the model on a loader without updating any weights.

    Args:
        model: Module to evaluate (put into eval mode here).
        loader: Iterable of ``(x, y)`` batches.
        criterion: Loss callable taking ``(logits, targets)``.
        device: ``torch.device`` for the batches; autocast is enabled only on CUDA.

    Returns:
        ``(mean_loss, weighted_f1)`` over all samples yielded by the loader.

    Raises:
        ValueError: If the loader yields no batches.
    """
    model.eval()
    total_loss, num_samples = 0.0, 0
    all_preds, all_targets = [], []
    use_amp = device.type == 'cuda'
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                logits = model(x)
                loss = criterion(logits, y)
            batch_size = x.size(0)
            total_loss += loss.item() * batch_size
            # Counting samples here removes the dependency on
            # `loader.dataset.tensors`, which only TensorDataset provides.
            num_samples += batch_size
            all_preds.append(logits.argmax(dim=1).cpu().numpy())
            all_targets.append(y.cpu().numpy())
    if num_samples == 0:
        raise ValueError("validate_one_epoch received a loader that yielded no batches")
    return total_loss / num_samples, f1_score(np.concatenate(all_targets), np.concatenate(all_preds), average='weighted')

def fit(model, train_loader, val_loader, epochs, criterion, optimizer, scheduler, scaler, device, patience, experiment_name):
    """Train with early stopping on validation F1 and return the best weights.

    The best-scoring weights are written to
    ``models/{experiment_name}_best_model.pt`` and reloaded into the
    uncompiled module before returning.

    Args:
        model: Module to train; may be a ``torch.compile`` wrapper.
        train_loader: Loader passed to ``train_one_epoch``.
        val_loader: Loader passed to ``validate_one_epoch``.
        epochs: Maximum number of epochs.
        criterion, optimizer, scheduler, scaler, device: Forwarded to the
            per-epoch helpers.
        patience: Number of consecutive non-improving epochs before stopping.
        experiment_name: Used in log messages and the checkpoint file name.

    Returns:
        The uncompiled module holding the best validation-F1 weights, or the
        current weights if no checkpoint was written during this call.
    """
    model_path = f"models/{experiment_name}_best_model.pt"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    # torch.compile wraps the module; the real parameters live on `_orig_mod`.
    uncompiled_model = model._orig_mod if hasattr(model, '_orig_mod') else model
    best_f1 = -1
    patience_counter = 0
    checkpoint_saved = False
    print(f"--- Starting Training: {experiment_name} ---")
    for epoch in range(1, epochs + 1):
        train_loss, train_f1 = train_one_epoch(model, train_loader, criterion, optimizer, scaler, scheduler, device)
        val_loss, val_f1 = validate_one_epoch(model, val_loader, criterion, device)
        if epoch % 10 == 0:
            print(f"Epoch {epoch:3d}/{epochs} | Val F1: {val_f1:.4f} | Train Loss: {train_loss:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")
        if val_f1 > best_f1:
            best_f1, patience_counter = val_f1, 0
            torch.save(uncompiled_model.state_dict(), model_path)
            checkpoint_saved = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}. Best F1: {best_f1:.4f}")
                break
    print(f"--- Finished Training --- Best F1: {best_f1:.4f}")
    if not checkpoint_saved:
        # Do not silently pick up a stale checkpoint left behind by an earlier run.
        print(f"WARNING: no checkpoint was written for {experiment_name}; returning current weights.")
        return uncompiled_model
    # map_location keeps loading device-agnostic; weights_only restricts
    # unpickling to tensor data, which is all a state_dict contains.
    uncompiled_model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
    return uncompiled_model

## 🧪 5. Phase 1: Aligned & Stable Hyperparameter Search

In [None]:
def objective_function(config, X_train_w_ref, y_train_w_ref, X_val_w_ref, y_val_w_ref, alpha_tensor, continuous_indices_reordered):
    """Ray Tune trainable: train one configuration and report validation F1.

    Mirrors the main training pipeline (minority augmentation, weighted
    sampling, focal loss, OneCycleLR), reports ``val_f1`` and ``train_loss``
    after every epoch, and stops after 20 epochs without improvement.

    Args:
        config: Hyperparameters sampled by Tune; the keys ``lr``,
            ``batch_size``, ``l2_lambda`` and ``focal_loss_gamma`` configure
            training, all remaining keys are passed to ``RecurrentClassifier``.
        X_train_w_ref, y_train_w_ref, X_val_w_ref, y_val_w_ref: Ray object
            references to the windowed train/validation arrays.
        alpha_tensor: Per-class alpha weights for the focal loss.
        continuous_indices_reordered: Indices of the continuous features;
            only its length is used here.
    """
    # Fetch the pre-windowed arrays from the Ray object store.
    X_train_w = ray.get(X_train_w_ref)
    y_train_w = ray.get(y_train_w_ref)
    X_val_w = ray.get(X_val_w_ref)
    y_val_w = ray.get(y_val_w_ref)

    max_epochs = 100
    early_stop_patience = 20
    training_only_keys = ('lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma')

    X_aug, y_aug = augment_minority_classes(
        X_train_w, y_train_w,
        continuous_feature_count=len(continuous_indices_reordered),
        aug_factor=1,
    )

    # Each window is drawn with probability inversely proportional to its class frequency.
    inverse_freq = 1. / (np.bincount(y_aug) + 1e-9)
    per_window_weight = inverse_freq[y_aug]
    sampler = WeightedRandomSampler(
        torch.from_numpy(per_window_weight).double(),
        num_samples=len(per_window_weight),
        replacement=True,
    )

    train_ds = TensorDataset(torch.from_numpy(X_aug).float(), torch.from_numpy(y_aug).long())
    val_ds = TensorDataset(torch.from_numpy(X_val_w).float(), torch.from_numpy(y_val_w).long())

    # num_workers=0 keeps data loading inside the trial process.
    train_loader = make_loader(train_ds, config["batch_size"], shuffle=False, drop_last=True,
                               sampler=sampler, num_workers=0)
    val_loader = make_loader(val_ds, config["batch_size"], False, False, num_workers=0)

    arch_kwargs = {name: value for name, value in config.items() if name not in training_only_keys}
    model = RecurrentClassifier(**arch_kwargs, num_classes=N_CLASSES).to(device)
    model = torch.compile(model, backend="cudagraphs")

    optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["l2_lambda"])
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config["lr"],
        epochs=max_epochs,
        steps_per_epoch=len(train_loader),
    )
    scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))
    criterion = FocalLoss(alpha=alpha_tensor, gamma=config['focal_loss_gamma'])

    best_val_f1 = -1.0
    epochs_without_gain = 0
    for _ in range(max_epochs):
        train_loss, _train_f1 = train_one_epoch(model, train_loader, criterion, optimizer, scaler, scheduler, device)
        _val_loss, val_f1 = validate_one_epoch(model, val_loader, criterion, device)
        tune.report({"val_f1": val_f1, "train_loss": train_loss})
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            epochs_without_gain = 0
            continue
        epochs_without_gain += 1
        if epochs_without_gain >= early_stop_patience:
            break

    # Release references before the next trial reuses this worker.
    del model, train_loader, val_loader
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
print("--- Preparing data for HPO ---")
# Sliding-window settings; reused later for final training and inference.
WINDOW_SIZE = 10
STRIDE = 2

# Last-axis layout of X_main_train: 0-29 joints, 30-33 pain surveys, 34 time,
# 35 is_pirate. Reorder so the 31 continuous columns come first and the 5
# categorical columns last, which is the layout RecurrentClassifier slices on.
continuous_indices_orig = list(range(30)) + [34]
categorical_indices_orig = list(range(30, 34)) + [35]
X_main_train_reordered = np.concatenate([
    X_main_train[:, :, continuous_indices_orig],
    X_main_train[:, :, categorical_indices_orig]], axis=2)

# Single stratified 80/20 split at sample level, so windows of one sample
# never end up on both sides. The loop body runs once (n_splits=1).
hpo_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
for hpo_train_idx, hpo_val_idx in hpo_splitter.split(X_main_train_reordered, y_main_train):
    X_hpo_train, y_hpo_train = X_main_train_reordered[hpo_train_idx], y_main_train[hpo_train_idx]
    X_hpo_val, y_hpo_val = X_main_train_reordered[hpo_val_idx], y_main_train[hpo_val_idx]

# Standardise only the continuous columns; categorical columns pass through.
# The scaler is fit on the HPO training part and merely applied to validation.
continuous_indices_reordered = list(range(31))
preprocessor_hpo = ColumnTransformer([('scaler', StandardScaler(), continuous_indices_reordered)], remainder='passthrough')
# Flatten (samples, time) into rows for scikit-learn, then restore the 3D shape.
ns, ts, f = X_hpo_train.shape
X_hpo_train_scaled = preprocessor_hpo.fit_transform(X_hpo_train.reshape(ns*ts, f)).reshape(ns, ts, f)
ns_val, ts_val, f_val = X_hpo_val.shape
X_hpo_val_scaled = preprocessor_hpo.transform(X_hpo_val.reshape(ns_val*ts_val, f_val)).reshape(ns_val, ts_val, f_val)

X_train_w, y_train_w, _ = create_sliding_windows(X_hpo_train_scaled, y_hpo_train, WINDOW_SIZE, STRIDE)
X_val_w, y_val_w, _ = create_sliding_windows(X_hpo_val_scaled, y_hpo_val, WINDOW_SIZE, STRIDE)

# Search space: lr, batch_size, l2_lambda and focal_loss_gamma configure
# training; every other key is forwarded to RecurrentClassifier.
search_space = {
    "rnn_type": tune.choice(['GRU', 'LSTM']),
    "focal_loss_gamma": tune.uniform(0.5, 3.0),
    "lr": tune.loguniform(1e-4, 1e-2),
    "batch_size": tune.choice([64, 128]),
    "hidden_size": tune.choice([256, 384, 512]), 
    "num_layers": tune.choice([2, 3]),
    "dropout_rate": tune.uniform(0, 0.5), 
    "feature_dropout_rate": tune.uniform(0, 0.5),
    "bidirectional": tune.choice([True, False]), 
    "l2_lambda": tune.loguniform(1e-8, 1e-5),
    "conv_out_channels": tune.choice([128]), 
    "conv_kernel_size": tune.choice([5])
}

# Restart Ray so this cell can be re-run.
# NOTE(review): num_gpus=1 is declared unconditionally, even when the setup
# cell selected the CPU device - confirm this is intended.
if ray.is_initialized(): ray.shutdown()
ray.init(num_cpus=os.cpu_count(), num_gpus=1, ignore_reinit_error=True, log_to_driver=False)

# Store the windowed arrays once in the Ray object store; trials receive the
# references and fetch the arrays with ray.get.
X_train_w_ref = ray.put(X_train_w)
y_train_w_ref = ray.put(y_train_w)
X_val_w_ref = ray.put(X_val_w)
y_val_w_ref = ray.put(y_val_w)

def short_trial_name(trial):
    """Return a compact trial directory name: ``<trainable_name>_<trial_id>``."""
    return "{}_{}".format(trial.trainable_name, trial.trial_id)

print("--- Starting Aligned & Stable HPO ---")
# Launch the search: Optuna proposes configurations and ASHA stops weak
# trials early (not before 15 reported epochs), both maximising val_f1.
# Each trial reserves 4 CPUs and a quarter of a GPU.
analysis = tune.run(
    partial(objective_function, 
            X_train_w_ref=X_train_w_ref, y_train_w_ref=y_train_w_ref,
            X_val_w_ref=X_val_w_ref, y_val_w_ref=y_val_w_ref,
            alpha_tensor=alpha_tensor,
            continuous_indices_reordered=continuous_indices_reordered),
    resources_per_trial={"cpu": 4, "gpu": 0.25},
    config=search_space, 
    num_samples=60,
    search_alg=OptunaSearch(metric="val_f1", mode="max"),
    scheduler=ASHAScheduler(metric="val_f1", mode="max", grace_period=15, reduction_factor=2),
    name="pirate_pain_aligned_hpo_v17",
    verbose=1,
    # Short directory names built from the trainable name and trial id.
    trial_dirname_creator=short_trial_name
)

In [None]:
print("\n--- Loading HPO Search Results ---")

# Pick the best configuration found by the search. Any failure here (including
# a missing `analysis` object) deliberately falls back to a known-good config.
try:
    best_trial = analysis.get_best_trial(metric="val_f1", mode="max", scope="all")
    if best_trial:
        # Copy the config: window settings are added below and must not
        # mutate the trial's own config dict.
        FINAL_CONFIG = dict(best_trial.config)
        # scope="all" ranks trials by their best-ever val_f1, so report that
        # value rather than whatever the trial happened to report last.
        last_val_f1 = best_trial.last_result.get("val_f1", 0.0)
        metric_stats = (getattr(best_trial, "metric_analysis", None) or {}).get("val_f1", {})
        FINAL_BEST_VAL_F1 = metric_stats.get("max", last_val_f1)
        print(f"Best validation F1 score from HPO: {FINAL_BEST_VAL_F1:.4f}")
        print("Best hyperparameters found:")
        print(FINAL_CONFIG)
    else: raise ValueError("No successful trials found.")
except Exception as e:
    print(f"\nWARNING: Could not load HPO analysis. The error was: {e}")
    print("\n--- USING FALLBACK CONFIGURATION ---")
    FINAL_CONFIG = {'rnn_type': 'GRU', 'focal_loss_gamma': 2.8, 'lr': 0.0015, 'batch_size': 128, 'hidden_size': 512, 'num_layers': 2, 'dropout_rate': 0.35, 'feature_dropout_rate': 0.33, 'bidirectional': False, 'l2_lambda': 4.2e-06, 'conv_out_channels': 128, 'conv_kernel_size': 5}
    FINAL_BEST_VAL_F1 = 0.93 # Fallback score reflects the harder, more realistic HPO task
    print("Best hyperparameters (fallback):\n", FINAL_CONFIG)

# Final training and inference reuse the window settings used during HPO.
FINAL_CONFIG['window_size'] = WINDOW_SIZE
FINAL_CONFIG['stride'] = STRIDE

# Free the HPO-only arrays and object references. pop() with a default keeps
# this cell re-runnable: a plain `del` raised NameError on the second run.
for _hpo_name in ("X_train_w_ref", "y_train_w_ref", "X_val_w_ref", "y_val_w_ref",
                  "X_hpo_train", "y_hpo_train", "X_hpo_val", "y_hpo_val",
                  "X_hpo_train_scaled", "X_hpo_val_scaled"):
    globals().pop(_hpo_name, None)
gc.collect()

## 🏆 6. Phase 2: K-Fold Ensemble Training on 90% Training Set

In [None]:
print("--- üèÜ Final Configuration Set for K-Fold Training --- ")
N_SPLITS = 5
FINAL_EXPERIMENT_NAME = f"StableFinalMean-{FINAL_CONFIG['rnn_type']}_H{FINAL_CONFIG['hidden_size']}_v17"
submission_filename_base = f"submission_{FINAL_EXPERIMENT_NAME}.csv"
print(f"Submission name will be: {submission_filename_base}")

In [None]:
# Stratified K-fold over samples (not windows), so all windows of a sample
# stay on the same side of each split.
# NOTE(review): random_state is the literal 42 here, not SEED - confirm intent.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
fold_val_f1_list = []
continuous_indices_reordered = list(range(31))
EPOCHS = 350
PATIENCE = 100

# Same continuous-first / categorical-last reordering as in the HPO cell.
X_main_train_reordered = np.concatenate([
    X_main_train[:, :, continuous_indices_orig],
    X_main_train[:, :, categorical_indices_orig]], axis=2)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_main_train_reordered, y_main_train)):
    fold_name = f"{FINAL_EXPERIMENT_NAME}_fold_{fold+1}"
    print(f"\n--- Fold {fold+1}/{N_SPLITS} --- ({fold_name}) ---")

    X_train_fold, y_train_fold = X_main_train_reordered[train_idx], y_main_train[train_idx]
    X_val_fold, y_val_fold = X_main_train_reordered[val_idx], y_main_train[val_idx]

    # Scaler is fit on this fold's training samples only, then applied to its validation samples.
    preprocessor_fold = ColumnTransformer([('s', StandardScaler(), continuous_indices_reordered)], remainder='passthrough')
    ns, ts, f = X_train_fold.shape
    X_train_scaled = preprocessor_fold.fit_transform(X_train_fold.reshape(ns*ts, f)).reshape(ns, ts, f)
    ns_val, ts_val, f_val = X_val_fold.shape
    X_val_scaled = preprocessor_fold.transform(X_val_fold.reshape(ns_val*ts_val, f_val)).reshape(ns_val, ts_val, f_val)

    X_train_w, y_train_w, _ = create_sliding_windows(X_train_scaled, y_train_fold, FINAL_CONFIG['window_size'], FINAL_CONFIG['stride'])
    X_val_w, y_val_w, _ = create_sliding_windows(X_val_scaled, y_val_fold, FINAL_CONFIG['window_size'], FINAL_CONFIG['stride'])

    # Only the training windows are augmented; validation windows stay untouched.
    X_train_w_aug, y_train_w_aug = augment_minority_classes(X_train_w, y_train_w, continuous_feature_count=len(continuous_indices_reordered), aug_factor=1)

    # Sampling weight per window = inverse frequency of its class.
    # NOTE(review): unlike objective_function there is no epsilon here, so a
    # class with zero windows would divide by zero.
    class_counts = np.bincount(y_train_w_aug)
    class_weights = 1. / class_counts
    sample_weights = np.array([class_weights[t] for t in y_train_w_aug])
    sampler = WeightedRandomSampler(torch.from_numpy(sample_weights).double(), num_samples=len(sample_weights), replacement=True)

    train_ds = TensorDataset(torch.from_numpy(X_train_w_aug).float(), torch.from_numpy(y_train_w_aug).long())
    val_ds = TensorDataset(torch.from_numpy(X_val_w).float(), torch.from_numpy(y_val_w).long())
    train_loader = make_loader(train_ds, FINAL_CONFIG['batch_size'], shuffle=False, drop_last=True, sampler=sampler, num_workers=4)
    val_loader = make_loader(val_ds, FINAL_CONFIG['batch_size'], shuffle=False, drop_last=False, num_workers=4)

    # Strip the keys that configure training or windowing; the rest are model constructor arguments.
    model_config_kfold = {k: v for k, v in FINAL_CONFIG.items() if k not in ['window_size', 'stride', 'lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma']}
    model_fold = RecurrentClassifier(**model_config_kfold, num_classes=N_CLASSES).to(device)
    model_fold = torch.compile(model_fold, backend="cudagraphs")

    optimizer = torch.optim.AdamW(model_fold.parameters(), lr=FINAL_CONFIG['lr'], weight_decay=FINAL_CONFIG['l2_lambda'])
    # The one-cycle schedule is sized for the full EPOCHS budget and stepped once per batch.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=FINAL_CONFIG['lr'], epochs=EPOCHS, steps_per_epoch=len(train_loader))
    scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))
    # alpha_tensor was computed from the whole main training set, not from this fold.
    criterion = FocalLoss(alpha=alpha_tensor, gamma=FINAL_CONFIG['focal_loss_gamma'])

    # fit() writes the best checkpoint to models/{fold_name}_best_model.pt and returns the uncompiled model.
    model_fold_uncompiled = fit(model_fold, train_loader, val_loader, EPOCHS, criterion, optimizer, scheduler, scaler, device, PATIENCE, fold_name)

    # Re-score the returned model on this fold's validation windows (window-level weighted F1).
    _, val_f1 = validate_one_epoch(model_fold_uncompiled, val_loader, criterion, device)
    fold_val_f1_list.append(val_f1)
    print(f"Fold {fold+1} Final Val F1: {val_f1:.4f}")

    gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()

print(f"\n--- üèÜ K-Fold Training Complete --- Average CV F1: {np.mean(fold_val_f1_list):.4f}")

## 📊 7. Final Model Evaluation on Hold-Out Set

In [None]:
print("--- Evaluating ensembled model on the 10% hold-out test set ---")

# Apply the same continuous-first / categorical-last column order used for training.
X_holdout_test_reordered = np.concatenate([
    X_holdout_test[:, :, continuous_indices_orig],
    X_holdout_test[:, :, categorical_indices_orig]], axis=2)

# One scaler fit on the whole main training set, shared by every fold model at inference.
# NOTE(review): each fold model was trained on data scaled by its own
# fold-specific scaler, so the statistics used here differ slightly from the
# ones seen during training.
final_preprocessor = ColumnTransformer([('scaler', StandardScaler(), continuous_indices_reordered)], remainder='passthrough')
ns, ts, f = X_main_train_reordered.shape
final_preprocessor.fit(X_main_train_reordered.reshape(ns * ts, f))

ns_test, ts_test, f_test = X_holdout_test_reordered.shape
X_holdout_scaled = final_preprocessor.transform(X_holdout_test_reordered.reshape(ns_test * ts_test, f_test)).reshape(ns_test, ts_test, f_test)
# holdout_window_indices maps every window back to the row of its source sample.
X_holdout_w, holdout_window_indices = create_sliding_windows(X_holdout_scaled, y=None, window_size=FINAL_CONFIG['window_size'], stride=FINAL_CONFIG['stride'])
# No shuffling and no dropped batch, so predictions stay aligned with holdout_window_indices.
holdout_loader = make_loader(TensorDataset(torch.from_numpy(X_holdout_w).float()), FINAL_CONFIG['batch_size'], False, False)

# Rebuild each fold's architecture, load its best checkpoint and collect per-window class probabilities.
model_config_final = {k: v for k, v in FINAL_CONFIG.items() if k not in ['window_size', 'stride', 'lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma']}
all_fold_probs = []
for fold in range(N_SPLITS):
    fold_name = f"{FINAL_EXPERIMENT_NAME}_fold_{fold+1}"
    model_path = f"models/{fold_name}_best_model.pt"
    model = RecurrentClassifier(**model_config_final, num_classes=N_CLASSES).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    fold_preds = []
    with torch.no_grad():
        for (inputs,) in holdout_loader:
            probs = torch.softmax(model(inputs.to(device)), dim=1)
            fold_preds.append(probs.cpu().numpy())
    all_fold_probs.append(np.concatenate(fold_preds))

# Two-stage mean aggregation: first across fold models (per window), then
# across all windows that belong to the same sample.
print("\n--- Aggregating predictions using Mean Aggregation ---")
mean_probs = np.mean(all_fold_probs, axis=0)
df_probs = pd.DataFrame(mean_probs, columns=[f"prob_{c}" for c in range(N_CLASSES)])
df_probs['original_index'] = holdout_window_indices

# groupby sorts by original_index, so row i of agg_probs corresponds to hold-out sample i.
agg_probs = df_probs.groupby('original_index')[[f"prob_{c}" for c in range(N_CLASSES)]].mean().values
final_holdout_preds = np.argmax(agg_probs, axis=1)

print(f"\n--- Classification Report on {len(y_holdout_test)}-Sample Hold-Out Set ---")
# le.classes_ supplies the class names in encoded-id order.
print(classification_report(y_holdout_test, final_holdout_preds, target_names=le.classes_))

print("--- Confusion Matrix ---")
cm = confusion_matrix(y_holdout_test, final_holdout_preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix on Hold-Out Set')
plt.show()

## 📬 8. Phase 3: Ensemble Submission

In [None]:
print("\n--- Preparing competition test dataset for submission ---")

# Build the competition test array with the same steps as the training data:
# pivot to 3D, derive is_pirate per sample, broadcast it over time and append it.
X_test_full = reshape_data(X_test_long_df, FEATURES, N_TIMESTEPS)
static_df_test = X_test_long_df[static_cols].drop_duplicates().set_index('sample_index')
pirate_filter_test = (static_df_test['n_legs'] == 'one+peg_leg') | (static_df_test['n_hands'] == 'one+hook_hand') | (static_df_test['n_eyes'] == 'one+eye_patch')
pirate_indices_test = static_df_test[pirate_filter_test].index
# Sorted sample ids; relied on to match the row order produced by the pivot in reshape_data.
sample_indices_test_ordered = sorted(X_test_long_df['sample_index'].unique())
is_pirate_map_test = np.array([1 if idx in pirate_indices_test else 0 for idx in sample_indices_test_ordered])
pirate_feature_broadcast_test = np.tile(is_pirate_map_test.reshape(-1, 1, 1), (1, N_TIMESTEPS, 1))
X_test_full_engineered = np.concatenate([X_test_full, pirate_feature_broadcast_test], axis=2)

# Continuous columns first, categorical columns last - the layout the model expects.
X_test_reordered = np.concatenate([
    X_test_full_engineered[:, :, continuous_indices_orig],
    X_test_full_engineered[:, :, categorical_indices_orig]], axis=2)

# Reuse the scaler fit on the main training set in the hold-out evaluation cell.
ns_test, ts_test, f_test = X_test_reordered.shape
X_test_scaled = final_preprocessor.transform(X_test_reordered.reshape(ns_test * ts_test, f_test)).reshape(ns_test, ts_test, f_test)

X_test_w, test_window_indices = create_sliding_windows(X_test_scaled, y=None, window_size=FINAL_CONFIG['window_size'], stride=FINAL_CONFIG['stride'])
# No shuffling and no dropped batch, so predictions stay aligned with test_window_indices.
submission_loader = make_loader(TensorDataset(torch.from_numpy(X_test_w).float()), FINAL_CONFIG['batch_size'], False, False)

all_fold_probs_submission = []
model_config_final = {k: v for k, v in FINAL_CONFIG.items() if k not in ['window_size', 'stride', 'lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma']}

# Per-window class probabilities from each fold model's best checkpoint.
for fold in range(N_SPLITS):
    fold_name = f"{FINAL_EXPERIMENT_NAME}_fold_{fold+1}"
    model_path = f"models/{fold_name}_best_model.pt"
    model = RecurrentClassifier(**model_config_final, num_classes=N_CLASSES).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    fold_preds = []
    with torch.no_grad():
        for (inputs,) in submission_loader:
            probs = torch.softmax(model(inputs.to(device)), dim=1)
            fold_preds.append(probs.cpu().numpy())
    all_fold_probs_submission.append(np.concatenate(fold_preds))

# Mean aggregation: average across fold models first, then across the windows of each sample.
mean_probs_sub = np.mean(all_fold_probs_submission, axis=0)
df_probs_sub = pd.DataFrame(mean_probs_sub, columns=[f"prob_{c}" for c in range(N_CLASSES)])
df_probs_sub['original_index'] = test_window_indices

agg_probs_sub = df_probs_sub.groupby('original_index')[[f"prob_{c}" for c in range(N_CLASSES)]].mean().values
# Map encoded class ids back to label strings with the same encoder used for training.
final_predictions = le.inverse_transform(np.argmax(agg_probs_sub, axis=1))

# Pair predictions with the sorted sample ids and write ids zero-padded to three digits.
submission_df = pd.DataFrame({'sample_index': sorted(X_test_long_df['sample_index'].unique()), 'label': final_predictions})
submission_df['sample_index'] = submission_df['sample_index'].apply(lambda x: f"{x:03d}")
submission_filepath = os.path.join("submissions", submission_filename_base)
submission_df.to_csv(submission_filepath, index=False)
print(f"\nSuccessfully saved to {submission_filepath}!")
print(submission_df.head())