# **Kaggle Challenge: Pirate Pain Dataset üè¥‚Äç‚ò†Ô∏è (v12: Compiled Model & OneCycleLR)**

This notebook targets a major training speedup by using PyTorch's JIT compiler. It also replaces the learning rate scheduler with a more aggressive and often faster-converging policy.

**Strategy Update:**
1.  **üî• PyTorch 2.0 Model Compilation:** The model is now wrapped with `torch.compile()`. This JIT-compiles the model into optimized, high-performance kernels for a significant training and inference speedup with no change to the model's logic.
2.  **One-Cycle LR Scheduler:** The Cosine Annealing scheduler has been replaced with `OneCycleLR`. This policy can lead to faster convergence and better final performance. The training loop has been modified to step the scheduler after each batch, as required.
3.  **Focal Loss (Cost-Sensitive Learning):** We retain the use of Focal Loss to dynamically focus training on hard-to-classify examples.
4.  **HPO for RNN Type:** The hyperparameter search continues to explore whether `GRU` or `LSTM` is the optimal choice for this dataset.

## ‚öôÔ∏è 1. Setup & Libraries

In [14]:
# Set seed for reproducibility
SEED = 1234

# Import necessary libraries
import os
import logging
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import copy
from itertools import product
import time
import gc

# Set environment variables before importing modules
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# --- PyTorch Imports ---
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader

# --- Sklearn Imports ---
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# --- Ray[tune] & Optuna Imports ---
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from functools import partial

# --- Setup Directories & Device ---
logs_dir = "tensorboard"
os.makedirs("models", exist_ok=True)
os.makedirs("submissions", exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True
    print("\n--- Using GPU ---")
else:
    device = torch.device("cpu")
    print("\n--- Using CPU ---")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Configure plot display settings
sns.set_theme(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline


--- Using GPU ---
PyTorch version: 2.5.1
Device: cuda


## üîÑ 2. Data Loading & Feature Engineering

In [15]:
print("--- 1. Loading Data ---")

# --- Define File Paths and Features ---
DATA_DIR = "data"
X_TRAIN_PATH = os.path.join(DATA_DIR, "pirate_pain_train.csv")
Y_TRAIN_PATH = os.path.join(DATA_DIR, "pirate_pain_train_labels.csv")
X_TEST_PATH = os.path.join(DATA_DIR, "pirate_pain_test.csv")
SUBMISSION_PATH = os.path.join(DATA_DIR, "sample_submission.csv")

try:
    features_long_df = pd.read_csv(X_TRAIN_PATH)
    labels_df = pd.read_csv(Y_TRAIN_PATH)
    X_test_long_df = pd.read_csv(X_TEST_PATH)
    
    N_TIMESTEPS = 160
    JOINT_FEATURES = [f"joint_{i:02d}" for i in range(31)]
    PAIN_FEATURES = [f"pain_survey_{i}" for i in range(1, 5)]
    TIME_FEATURE = ['time']
    FEATURES = JOINT_FEATURES + PAIN_FEATURES + TIME_FEATURE
    
    # === NEW CLEANING STEP: Remove Zero-Variance Column ===
    if 'joint_30' in FEATURES:
        FEATURES.remove('joint_30')
        print("REMOVED 'joint_30' from feature list.")
    # ========================================================
    
    LABEL_MAPPING = {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}
    N_CLASSES = len(LABEL_MAPPING)

    def reshape_data(df, features_list, n_timesteps):
        df_pivot = df.pivot(index='sample_index', columns='time', values=features_list)
        data_2d = df_pivot.values
        n_samples = data_2d.shape[0]
        data_3d = data_2d.reshape(n_samples, len(features_list), n_timesteps)
        return data_3d.transpose(0, 2, 1)

    X_train_full = reshape_data(features_long_df[features_long_df['sample_index'].isin(labels_df['sample_index'].unique())], FEATURES, N_TIMESTEPS)
    X_test_full = reshape_data(X_test_long_df, FEATURES, N_TIMESTEPS)
    y_train_full_df = labels_df.sort_values(by='sample_index')
    le = LabelEncoder().fit(list(LABEL_MAPPING.keys()))
    y_train_full = le.transform(y_train_full_df['label'])
    print(f"Loaded X_train_full (shape: {X_train_full.shape}) and y_train_full (shape: {y_train_full.shape})")
    print(f"Loaded X_test_full (shape: {X_test_full.shape})")

    print("\n--- 2. Engineering 'is_pirate' Feature ---")
    static_cols = ['sample_index', 'n_legs', 'n_hands', 'n_eyes']
    static_df = features_long_df[static_cols].drop_duplicates().set_index('sample_index')
    pirate_filter = (static_df['n_legs'] == 'one+peg_leg') | (static_df['n_hands'] == 'one+hook_hand') | (static_df['n_eyes'] == 'one+eye_patch')
    pirate_indices = static_df[pirate_filter].index
    sample_indices_ordered = sorted(features_long_df[features_long_df['sample_index'].isin(labels_df['sample_index'].unique())]['sample_index'].unique())
    is_pirate_map = np.array([1 if idx in pirate_indices else 0 for idx in sample_indices_ordered])
    pirate_feature_broadcast = np.tile(is_pirate_map.reshape(-1, 1, 1), (1, N_TIMESTEPS, 1))
    X_train_full_engineered = np.concatenate([X_train_full, pirate_feature_broadcast], axis=2)

    static_df_test = X_test_long_df[static_cols].drop_duplicates().set_index('sample_index')
    pirate_filter_test = (static_df_test['n_legs'] == 'one+peg_leg') | (static_df_test['n_hands'] == 'one+hook_hand') | (static_df_test['n_eyes'] == 'one+eye_patch')
    pirate_indices_test = static_df_test[pirate_filter_test].index
    sample_indices_test_ordered = sorted(X_test_long_df['sample_index'].unique())
    is_pirate_map_test = np.array([1 if idx in pirate_indices_test else 0 for idx in sample_indices_test_ordered])
    pirate_feature_broadcast_test = np.tile(is_pirate_map_test.reshape(-1, 1, 1), (1, N_TIMESTEPS, 1))
    X_test_full_engineered = np.concatenate([X_test_full, pirate_feature_broadcast_test], axis=2)
    
    N_FEATURES_NEW = X_train_full_engineered.shape[2]
    print(f"Created X_train_full_engineered (shape: {X_train_full_engineered.shape})")
    print(f"Created X_test_full_engineered (shape: {X_test_full_engineered.shape})")
    print(f"N_FEATURES is now: {N_FEATURES_NEW}")

    print("\n--- 3. Calculating Alpha Weights for Focal Loss ---")
    class_counts_series = labels_df['label'].value_counts()
    counts_ordered = class_counts_series.reindex(LABEL_MAPPING.keys()).values
    class_weights_tensor = 1.0 / torch.tensor(counts_ordered, dtype=torch.float)
    alpha_tensor = (class_weights_tensor / class_weights_tensor.sum()).to(device)
    print(f"Class counts (0, 1, 2): {counts_ordered}")
    print(f"Calculated alpha weights: {alpha_tensor}")

except Exception as e:
    print(f"An error occurred: {e}")

--- 1. Loading Data ---
REMOVED 'joint_30' from feature list.
Loaded X_train_full (shape: (661, 160, 35)) and y_train_full (shape: (661,))
Loaded X_test_full (shape: (1324, 160, 35))

--- 2. Engineering 'is_pirate' Feature ---
Created X_train_full_engineered (shape: (661, 160, 36))
Created X_test_full_engineered (shape: (1324, 160, 36))
N_FEATURES is now: 36

--- 3. Calculating Alpha Weights for Focal Loss ---
Class counts (0, 1, 2): [511  94  56]
Calculated alpha weights: tensor([0.0643, 0.3493, 0.5864], device='cuda:0')


## üõ†Ô∏è 3. Helper Functions & Custom Loss

In [16]:
class FocalLoss(nn.Module):
    """Implements Focal Loss for cost-sensitive learning."""
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            alpha_t = self.alpha[targets].to(focal_loss.device)
            focal_loss = alpha_t * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

def create_sliding_windows(X_3d, y=None, window_size=100, stride=20):
    new_X, new_y, window_indices = [], [], []
    n_samples, n_timesteps, _ = X_3d.shape
    for i in range(n_samples):
        idx = 0
        while (idx + window_size) <= n_timesteps:
            new_X.append(X_3d[i, idx:idx+window_size, :])
            window_indices.append(i)
            if y is not None: new_y.append(y[i])
            idx += stride
    if y is not None:
        return np.array(new_X), np.array(new_y), np.array(window_indices)
    return np.array(new_X), np.array(window_indices)

def make_loader(ds, batch_size, shuffle, drop_last):
    return DataLoader(ds, batch_size=int(batch_size), shuffle=shuffle, drop_last=drop_last, 
                      num_workers=2, pin_memory=True, persistent_workers=True)

## üß† 4. Model & Training Engine

In [None]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, rnn_outputs):
        energy = torch.tanh(self.attn(rnn_outputs))
        attn_scores = self.v(energy).squeeze(2)
        attn_weights = torch.softmax(attn_scores, dim=1)
        context_vector = torch.bmm(attn_weights.unsqueeze(1), rnn_outputs).squeeze(1)
        return context_vector

class RecurrentClassifier(nn.Module):
    def __init__(self, hidden_size, num_layers, num_classes,
                 conv_out_channels, conv_kernel_size, bidirectional,
                 dropout_rate, feature_dropout_rate, rnn_type='GRU'):
        super().__init__()
        self.rnn_type, self.num_layers, self.hidden_size, self.bidirectional = \
            rnn_type, num_layers, hidden_size, bidirectional
        
        rnn_hidden_dim = hidden_size * 2 if bidirectional else hidden_size

        self.pain_embed_dim, self.pirate_embed_dim = 4, 4
        self.pain_embeddings = nn.ModuleList([nn.Embedding(3, self.pain_embed_dim) for _ in range(4)])
        self.pirate_embedding = nn.Embedding(2, self.pirate_embed_dim)
        
        # --- MODIFIED: Changed from 32 to 31 (due to joint_30 removal) ---
        num_continuous_features = 31 
        total_embedding_dim = (4 * self.pain_embed_dim) + self.pirate_embed_dim
        conv_input_size = num_continuous_features + total_embedding_dim

        self.conv1d = nn.Conv1d(in_channels=conv_input_size, out_channels=conv_out_channels,
                                kernel_size=conv_kernel_size, padding='same')
        self.conv_activation = nn.ReLU()
        self.feature_dropout = nn.Dropout(feature_dropout_rate)

        if rnn_type == 'GRU':
            self.rnn = nn.GRU(
                input_size=conv_out_channels, hidden_size=hidden_size,
                num_layers=num_layers, batch_first=True, bidirectional=bidirectional,
                dropout=dropout_rate if num_layers > 1 else 0)
        elif rnn_type == 'LSTM':
            self.rnn = nn.LSTM(
                input_size=conv_out_channels, hidden_size=hidden_size,
                num_layers=num_layers, batch_first=True, bidirectional=bidirectional,
                dropout=dropout_rate if num_layers > 1 else 0)
        
        self.attention = Attention(rnn_hidden_dim)
        self.classifier = nn.Linear(rnn_hidden_dim, num_classes)

    def forward(self, x):
        # --- MODIFIED: Changed slice from :32 to :31 ---
        x_continuous = x[:, :, :31]
        x_categorical = x[:, :, 31:].long()
        
        embedded_cats = [self.pain_embeddings[i](x_categorical[:, :, i]) for i in range(4)] \
                      + [self.pirate_embedding(x_categorical[:, :, 4])]
        x_combined = torch.cat([x_continuous] + embedded_cats, dim=2)
        x_permuted = x_combined.permute(0, 2, 1)
        x_conv = self.conv_activation(self.conv1d(x_permuted))
        x_conv_permuted = x_conv.permute(0, 2, 1)
        x_dropped = self.feature_dropout(x_conv_permuted)
        rnn_outputs, _ = self.rnn(x_dropped)
        context_vector = self.attention(rnn_outputs)
        return self.classifier(context_vector)

# --- MODIFIED: Scheduler is now passed in and stepped per-batch ---
def train_one_epoch(model, loader, criterion, optimizer, scaler, scheduler, device):
    model.train()
    total_loss, all_preds, all_targets = 0, [], []
    for x, y in loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
            logits = model(x)
            loss = criterion(logits, y)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step() # <-- OneCycleLR is stepped each batch
        total_loss += loss.item() * x.size(0)
        all_preds.append(logits.argmax(dim=1).cpu().numpy())
        all_targets.append(y.cpu().numpy())
    return total_loss / len(loader.dataset.tensors[0]), f1_score(np.concatenate(all_targets), np.concatenate(all_preds), average='weighted')

def validate_one_epoch(model, loader, criterion, device):
    model.eval()
    total_loss, all_preds, all_targets = 0, [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                logits = model(x)
                loss = criterion(logits, y)
            total_loss += loss.item() * x.size(0)
            all_preds.append(logits.argmax(dim=1).cpu().numpy())
            all_targets.append(y.cpu().numpy())
    return total_loss / len(loader.dataset.tensors[0]), f1_score(np.concatenate(all_targets), np.concatenate(all_preds), average='weighted')

def objective_function(config, X_train_w, y_train_w, X_val_w, y_val_w, alpha_tensor):
    EPOCHS = 150
    train_loader = make_loader(TensorDataset(X_train_w, y_train_w), config["batch_size"], True, True)
    val_loader = make_loader(TensorDataset(X_val_w, y_val_w), config["batch_size"], False, False)

    model_config = {k: v for k, v in config.items() if k not in ['lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma']}
    model = RecurrentClassifier(**model_config, num_classes=N_CLASSES).to(device)
    model = torch.compile(model, backend="eager") # <-- SPEEDUP
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["l2_lambda"])
    # --- MODIFIED: Use OneCycleLR ---
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=config["lr"], epochs=EPOCHS, steps_per_epoch=len(train_loader)
    )
    scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))
    criterion = FocalLoss(alpha=alpha_tensor, gamma=config['focal_loss_gamma'])

    best_val_f1 = -1.0; patience_counter = 0; hpo_patience = 30
    
    for epoch in range(1, EPOCHS + 1):
        train_loss, _ = train_one_epoch(model, train_loader, criterion, optimizer, scaler, scheduler, device)
        _, val_f1 = validate_one_epoch(model, val_loader, criterion, device)
        tune.report({"val_f1": val_f1, "train_loss": train_loss})
        
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1; patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= hpo_patience: break

def fit(model, train_loader, val_loader, epochs, criterion, optimizer, scheduler, scaler, device, patience, experiment_name):
    model_path = f"models/{experiment_name}_best_model.pt"
    best_f1 = -1; patience_counter = 0
    print(f"--- Starting Training: {experiment_name} ---")
    for epoch in range(1, epochs + 1):
        # --- MODIFIED: Pass scheduler to training function ---
        train_loss, train_f1 = train_one_epoch(model, train_loader, criterion, optimizer, scaler, scheduler, device)
        val_loss, val_f1 = validate_one_epoch(model, val_loader, criterion, device)

        if epoch % 5 == 0: print(f"Epoch {epoch:3d}/{epochs} | Best Val F1: {best_f1:.4f} | Val F1: {val_f1:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")

        if val_f1 > best_f1:
            best_f1, patience_counter = val_f1, 0
            # When using torch.compile, it's better to save the state_dict of the original model
            torch.save(model._orig_mod.state_dict() if hasattr(model, '_orig_mod') else model.state_dict(), model_path)
        else:
            patience_counter += 1
            if patience_counter >= patience: print(f"Early stopping at epoch {epoch}. Best F1: {best_f1:.4f}"); break
    print(f"--- Finished Training --- Best F1: {best_f1:.4f}")
    # Load the best weights back into the uncompiled model for consistency
    uncompiled_model = model._orig_mod if hasattr(model, '_orig_mod') else model
    uncompiled_model.load_state_dict(torch.load(model_path))
    return uncompiled_model

## üß™ 5. Phase 1: Hyperparameter Search

In [18]:
# Define fixed windowing parameters
WINDOW_SIZE = 10
STRIDE = 2

# --- MODIFICATION START ---

# 1. DEFINE CORRECT FEATURE INDICES (POST-joint_30 REMOVAL)
# Original features in X_train_full_engineered (36 total):
# -> 0-29: joint_0-29 (30 features)
# -> 30-33: pain_survey_1-4 (4 features)
# -> 34: time (1 feature)
# -> 35: is_pirate (1 feature)

continuous_indices_orig = list(range(30)) + [34] # 30 joints + 1 time
categorical_indices_orig = list(range(30, 34)) + [35] # 4 pain surveys + 1 is_pirate

# Reorder columns to group continuous (31) and categorical (5) for easy scaling
X_train_full_reordered = np.concatenate([
    X_train_full_engineered[:, :, continuous_indices_orig],
    X_train_full_engineered[:, :, categorical_indices_orig]
], axis=2)
print(f"Reordered features. New shape: {X_train_full_reordered.shape}")

# 2. DEFINE PREPROCESSING PIPELINE
# In our reordered array (X_train_full_reordered):
# -> Indices 0-29 are joint_0-29
# -> Index 30 is 'time'
# -> Indices 31-35 are categorical (handled by remainder='passthrough')

# Define index lists for the ColumnTransformer
SKEWED_CONTINUOUS_INDICES = list(range(13, 26)) # joint_13 to joint_25
OTHER_CONTINUOUS_INDICES = list(range(13)) + list(range(26, 30)) + [30] # joint_0-12, joint_26-29, and 'time'

# Pipeline 1: For highly skewed data
# Apply Yeo-Johnson transformation, then scale using StandardScaler
skewed_pipeline = Pipeline([
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler()) 
])

# Pipeline 2: For other continuous data
# Just use StandardScaler (safer than StandardScaler)
other_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# 3. CREATE THE COLUMN TRANSFORMER
# This will apply the correct pipeline to each group and leave
# the categorical features (indices 31-35) untouched.
preprocessor_hpo = ColumnTransformer(
    transformers=[
        ('skewed', skewed_pipeline, SKEWED_CONTINUOUS_INDICES),
        ('other', other_pipeline, OTHER_CONTINUOUS_INDICES)
    ],
    remainder='passthrough' # Leaves categorical (indices 31-35) untouched
)
print("Created new PowerTransforming + RobustScaling preprocessor.")

# --- MODIFICATION END ---

print("--- Splitting data for HPO ---")
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
for train_idx, val_idx in sss.split(X_train_full_reordered, y_train_full):
    X_train_split, y_train_split = X_train_full_reordered[train_idx], y_train_full[train_idx]
    X_val_split, y_val_split = X_train_full_reordered[val_idx], y_train_full[val_idx]

print("--- Pre-scaling data for HPO efficiency ---")
# This section now uses our powerful new preprocessor_hpo
ns, ts, f = X_train_split.shape
X_train_split_scaled = preprocessor_hpo.fit_transform(X_train_split.reshape(ns*ts, f)).reshape(ns, ts, f)
ns_val, ts_val, f_val = X_val_split.shape
X_val_split_scaled = preprocessor_hpo.transform(X_val_split.reshape(ns_val*ts_val, f_val)).reshape(ns_val, ts_val, f_val)

print("--- Creating fixed sliding windows for HPO ---")
X_train_w, y_train_w, _ = create_sliding_windows(X_train_split_scaled, y_train_split, WINDOW_SIZE, STRIDE)
X_val_w, y_val_w, _ = create_sliding_windows(X_val_split_scaled, y_val_split, WINDOW_SIZE, STRIDE)

# Convert to tensors once before HPO
X_train_w_torch = torch.from_numpy(X_train_w).float()
y_train_w_torch = torch.from_numpy(y_train_w).long()
X_val_w_torch = torch.from_numpy(X_val_w).float()
y_val_w_torch = torch.from_numpy(y_val_w).long()

print(f"Created training windows of shape: {X_train_w_torch.shape}")

# Clean up memory
del X_train_split_scaled, X_val_split_scaled, X_train_w, y_train_w, X_val_w, y_val_w
gc.collect()

Reordered features. New shape: (661, 160, 36)
Created new PowerTransforming + RobustScaling preprocessor.
--- Splitting data for HPO ---
--- Pre-scaling data for HPO efficiency ---
--- Creating fixed sliding windows for HPO ---
Created training windows of shape: torch.Size([40128, 10, 36])


78

In [19]:
# --- MODIFIED: Search space with rnn_type and focal_loss_gamma ---
search_space = {
    "rnn_type": tune.choice(['GRU', 'LSTM']),
    "focal_loss_gamma": tune.uniform(0.5, 3.0),
    "lr": tune.loguniform(1e-4, 1e-2), # This will be the max_lr for OneCycleLR
    "batch_size": tune.choice([64, 128]),
    "hidden_size": tune.choice([256, 384, 512]), 
    "num_layers": tune.choice([2, 3]),
    "dropout_rate": tune.uniform(0, 0.5), 
    "feature_dropout_rate": tune.uniform(0, 0.5),
    "bidirectional": tune.choice([True, False]), 
    "l2_lambda": tune.loguniform(1e-8, 1e-5),
    "conv_out_channels": tune.choice([128]), 
    "conv_kernel_size": tune.choice([5])
}

def short_trial_name(trial): return f"{trial.trainable_name}_{trial.trial_id}"

if ray.is_initialized(): ray.shutdown()
ray.init(num_cpus=os.cpu_count(), num_gpus=1, ignore_reinit_error=True, log_to_driver=False)

print("--- Starting HPO with Focal Loss and RNN-Type search ---")
analysis = tune.run(
    tune.with_parameters(objective_function, 
                         X_train_w=X_train_w_torch, y_train_w=y_train_w_torch,
                         X_val_w=X_val_w_torch, y_val_w=y_val_w_torch,
                         alpha_tensor=alpha_tensor),
    resources_per_trial={"cpu": 4, "gpu": 0.25},
    config=search_space, 
    num_samples=10, 
    search_alg=OptunaSearch(metric="val_f1", mode="max"),
    scheduler=ASHAScheduler(metric="val_f1", mode="max", grace_period=25, reduction_factor=2),
    name="pirate_pain_focalloss_search_v12", 
    verbose=1,
    trial_dirname_creator=short_trial_name
)

0,1
Current time:,2025-11-15 21:01:13
Running for:,00:38:10.94
Memory:,9.3/13.9 GiB

Trial name,status,loc,batch_size,bidirectional,conv_kernel_size,conv_out_channels,dropout_rate,feature_dropout_rate,focal_loss_gamma,hidden_size,l2_lambda,lr,num_layers,rnn_type,iter,total time (s),val_f1,train_loss
objective_function_182014c1,TERMINATED,127.0.0.1:47696,128,True,5,128,0.448313,0.0329935,1.73333,512,3.7229e-08,0.000395751,3,LSTM,76,1546.89,0.940249,0.00150993
objective_function_04f94a5f,TERMINATED,127.0.0.1:47108,64,False,5,128,0.432408,0.007398,2.71729,384,1.29733e-08,0.000457615,3,LSTM,25,617.033,0.916688,0.00096299
objective_function_6c28069a,TERMINATED,127.0.0.1:47152,128,False,5,128,0.444522,0.21918,0.690762,256,8.30337e-08,0.00038123,3,LSTM,55,666.395,0.942149,0.000913154
objective_function_01d53a7a,TERMINATED,127.0.0.1:42144,128,False,5,128,0.332371,0.400343,1.88229,512,3.13271e-08,0.00345245,3,GRU,25,323.021,0.918116,0.00221385
objective_function_852b6d06,TERMINATED,127.0.0.1:29792,64,False,5,128,0.497002,0.0720787,2.41771,512,3.87069e-08,0.000417102,3,GRU,25,605.674,0.923981,0.00147654
objective_function_117f05f6,TERMINATED,127.0.0.1:42568,64,False,5,128,0.261547,0.0628552,2.80822,256,1.29973e-07,0.00061898,3,GRU,100,1629.8,0.955645,1.82617e-06
objective_function_15388e0c,TERMINATED,127.0.0.1:35268,128,False,5,128,0.489037,0.29658,2.37081,512,5.21076e-06,0.00411469,3,LSTM,68,853.726,0.942897,0.000346225
objective_function_3334736f,TERMINATED,127.0.0.1:45928,128,True,5,128,0.223899,0.399432,1.29826,256,6.81964e-08,0.000661341,3,LSTM,50,802.649,0.929455,0.00111905
objective_function_a8350bd7,TERMINATED,127.0.0.1:32092,128,False,5,128,0.442899,0.000439364,2.40641,512,3.53866e-07,0.000382274,2,GRU,25,210.438,0.934554,0.00153485
objective_function_f923cd06,TERMINATED,127.0.0.1:32900,128,False,5,128,0.483005,0.409517,1.21567,256,1.66162e-08,0.00668374,3,GRU,25,238.633,0.922421,0.00312497


2025-11-15 21:01:13,535	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/Karim Negm/ray_results/pirate_pain_focalloss_search_v12' in 0.0199s.
2025-11-15 21:01:13,553	INFO tune.py:1041 -- Total run time: 2291.00 seconds (2290.91 seconds for the tuning loop).


In [20]:
print("\n--- Loading Partial Search Results ---")

# --- 1. Define the path to your experiment results ---
# Ray's default is '~/ray_results/<experiment_name>'
# The traceback confirms your user folder, so this path should work.
experiment_path = os.path.expanduser("~/ray_results/pirate_pain_focalloss_search_v12")

# --- 2. Load the analysis from disk ---
print(f"Loading analysis from: {experiment_path}")
try:
    analysis = tune.ExperimentAnalysis(experiment_path)
except Exception as e:
    print(f"ERROR: Could not load analysis from {experiment_path}")
    print(f"Make sure the path is correct. The error was: {e}")
    # Raise the error to stop the notebook if the path is wrong
    raise e

# --- 3. Get the best trial from the *completed* runs ---
print("Getting best *completed* trial from partial analysis...")
best_trial = analysis.get_best_trial(metric="val_f1", mode="max", scope="all")

# --- 4. The rest of your original cell logic ---
if best_trial:
    FINAL_CONFIG = best_trial.config
    # Check if 'val_f1' exists, use a default if not
    FINAL_BEST_VAL_F1 = best_trial.last_result.get("val_f1", 0.0) 
    print(f"Best validation F1 score from completed trials: {FINAL_BEST_VAL_F1:.4f}")
    print("Best hyperparameters found:")
    print(FINAL_CONFIG)
else:
    print("ERROR: No trials completed successfully. Using a default config.")
    FINAL_CONFIG = {'rnn_type': 'GRU', 'focal_loss_gamma': 2.0, 'lr': 0.001, 'batch_size': 128, 'hidden_size': 256, 'num_layers': 2, 'dropout_rate': 0.3, 'feature_dropout_rate': 0.3, 'bidirectional': True, 'l2_lambda': 1e-06, 'conv_out_channels': 128, 'conv_kernel_size': 5}
    FINAL_BEST_VAL_F1 = 0.0

# Add the fixed windowing params to the final config for the next steps
# Make sure WINDOW_SIZE and STRIDE are defined (they should be from Cell 5)
FINAL_CONFIG['window_size'] = WINDOW_SIZE
FINAL_CONFIG['stride'] = STRIDE

# Clean up HPO data to save memory
try:
    del X_train_w_torch, y_train_w_torch, X_val_w_torch, y_val_w_torch
    del X_train_split, y_train_split, X_val_split, y_val_split
except NameError:
    print("Data already cleaned up or not in memory.")
gc.collect()

print("\n--- Ready to proceed to K-Fold Training ---")


--- Loading Partial Search Results ---
Loading analysis from: C:\Users\Karim Negm/ray_results/pirate_pain_focalloss_search_v12
Getting best *completed* trial from partial analysis...
Best validation F1 score from completed trials: 0.9556
Best hyperparameters found:
{'rnn_type': 'GRU', 'focal_loss_gamma': 2.8082203178049916, 'lr': 0.0006189800893397386, 'batch_size': 64, 'hidden_size': 256, 'num_layers': 3, 'dropout_rate': 0.26154739277453204, 'feature_dropout_rate': 0.06285520676214107, 'bidirectional': False, 'l2_lambda': 1.299730145545422e-07, 'conv_out_channels': 128, 'conv_kernel_size': 5}

--- Ready to proceed to K-Fold Training ---


## üèÜ 6. Phase 2: K-Fold Ensemble Training

In [21]:
print("--- üèÜ Final Configuration Set --- ")
print(f"Best Val F1 from HPO search: {FINAL_BEST_VAL_F1:.4f}")
print(FINAL_CONFIG)

N_SPLITS = 5
FINAL_EXPERIMENT_NAME = f"Compiled-FocalLoss-{FINAL_CONFIG['rnn_type']}_H{FINAL_CONFIG['hidden_size']}_L{FINAL_CONFIG['num_layers']}_"\
                      f"C{FINAL_CONFIG['conv_out_channels']}_K{FINAL_CONFIG['conv_kernel_size']}_v12"
submission_filename_base = f"submission_{FINAL_EXPERIMENT_NAME}.csv"
print(f"Submission name will be: {submission_filename_base}")

--- üèÜ Final Configuration Set --- 
Best Val F1 from HPO search: 0.9556
{'rnn_type': 'GRU', 'focal_loss_gamma': 2.8082203178049916, 'lr': 0.0006189800893397386, 'batch_size': 64, 'hidden_size': 256, 'num_layers': 3, 'dropout_rate': 0.26154739277453204, 'feature_dropout_rate': 0.06285520676214107, 'bidirectional': False, 'l2_lambda': 1.299730145545422e-07, 'conv_out_channels': 128, 'conv_kernel_size': 5, 'window_size': 10, 'stride': 2}
Submission name will be: submission_Compiled-FocalLoss-GRU_H256_L3_C128_K5_v12.csv


In [23]:
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
fold_val_f1_list = []
continuous_indices_reordered = list(range(31))
EPOCHS = 150

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_full_reordered, y_train_full)):
    fold_name = f"{FINAL_EXPERIMENT_NAME}_fold_{fold+1}"
    print(f"\n--- Fold {fold+1}/{N_SPLITS} --- ({fold_name}) ---")
    
    X_train_fold, y_train_fold = X_train_full_reordered[train_idx], y_train_full[train_idx]
    X_val_fold, y_val_fold = X_train_full_reordered[val_idx], y_train_full[val_idx]

    preprocessor_fold = ColumnTransformer([('s', StandardScaler(), continuous_indices_reordered)], remainder='passthrough')
    ns, ts, f = X_train_fold.shape
    X_train_scaled = preprocessor_fold.fit_transform(X_train_fold.reshape(ns*ts, f)).reshape(ns, ts, f)
    ns_val, ts_val, f_val = X_val_fold.shape
    X_val_scaled = preprocessor_fold.transform(X_val_fold.reshape(ns_val*ts_val, f_val)).reshape(ns_val, ts_val, f_val)
    
    X_train_w, y_train_w, _ = create_sliding_windows(X_train_scaled, y_train_fold, FINAL_CONFIG['window_size'], FINAL_CONFIG['stride'])
    X_val_w, y_val_w, _ = create_sliding_windows(X_val_scaled, y_val_fold, FINAL_CONFIG['window_size'], FINAL_CONFIG['stride'])
    
    train_loader = make_loader(TensorDataset(torch.from_numpy(X_train_w).float(), torch.from_numpy(y_train_w).long()), FINAL_CONFIG['batch_size'], True, True)
    val_loader = make_loader(TensorDataset(torch.from_numpy(X_val_w).float(), torch.from_numpy(y_val_w).long()), FINAL_CONFIG['batch_size'], False, False)

    model_config_kfold = {k: v for k, v in FINAL_CONFIG.items() if k not in ['window_size', 'stride', 'lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma']}
    model_fold = RecurrentClassifier(**model_config_kfold, num_classes=N_CLASSES).to(device)
    model_fold = torch.compile(model_fold, backend="eager") # <-- SPEEDUP
    
    optimizer = torch.optim.AdamW(model_fold.parameters(), lr=FINAL_CONFIG['lr'], weight_decay=FINAL_CONFIG['l2_lambda'])
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=FINAL_CONFIG['lr'], epochs=EPOCHS, steps_per_epoch=len(train_loader))
    scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))
    criterion = FocalLoss(alpha=alpha_tensor, gamma=FINAL_CONFIG['focal_loss_gamma'])

    # Note: `fit` returns the uncompiled model, which is what we want for saving/inference
    model_fold_uncompiled = fit(model_fold, train_loader, val_loader, EPOCHS, criterion, optimizer, scheduler, scaler, device, 100, fold_name)
    
    # Re-validate with the uncompiled model to get final score
    _, val_f1 = validate_one_epoch(model_fold_uncompiled, val_loader, criterion, device)
    fold_val_f1_list.append(val_f1)
    print(f"Fold {fold+1} Final Val F1: {val_f1:.4f}")

print(f"\n--- üèÜ K-Fold Training Complete --- Average F1: {np.mean(fold_val_f1_list):.4f}")


--- Fold 1/5 --- (Compiled-FocalLoss-GRU_H256_L3_C128_K5_v12_fold_1) ---
--- Starting Training: Compiled-FocalLoss-GRU_H256_L3_C128_K5_v12_fold_1 ---
Epoch   5/150 | Val F1: 0.9048 | LR: 0.000043
Epoch  10/150 | Val F1: 0.9296 | LR: 0.000094
Epoch  15/150 | Val F1: 0.9218 | LR: 0.000173
Epoch  20/150 | Val F1: 0.9303 | LR: 0.000270
Epoch  25/150 | Val F1: 0.9384 | LR: 0.000373
Epoch  30/150 | Val F1: 0.9366 | LR: 0.000470
Epoch  35/150 | Val F1: 0.9336 | LR: 0.000549
Epoch  40/150 | Val F1: 0.9432 | LR: 0.000601
Epoch  45/150 | Val F1: 0.9391 | LR: 0.000619
Epoch  50/150 | Val F1: 0.9377 | LR: 0.000616
Epoch  55/150 | Val F1: 0.9374 | LR: 0.000605
Epoch  60/150 | Val F1: 0.9351 | LR: 0.000588
Epoch  65/150 | Val F1: 0.9393 | LR: 0.000565
Epoch  70/150 | Val F1: 0.9405 | LR: 0.000536
Epoch  75/150 | Val F1: 0.9397 | LR: 0.000502
Epoch  80/150 | Val F1: 0.9400 | LR: 0.000464
Epoch  85/150 | Val F1: 0.9394 | LR: 0.000423
Epoch  90/150 | Val F1: 0.9462 | LR: 0.000378
Epoch  95/150 | Val F

## üì¨ 7. Phase 3: Ensemble Submission

In [26]:
print("\n--- Preparing test dataset for submission ---")
continuous_indices_orig = list(range(30)) + [34]
categorical_indices_orig = list(range(30, 34)) + [35]
X_test_full_reordered = np.concatenate([
    X_test_full_engineered[:, :, continuous_indices_orig],
    X_test_full_engineered[:, :, categorical_indices_orig]], axis=2)

continuous_indices_reordered = list(range(31))
preprocessor_final = ColumnTransformer([('scaler', StandardScaler(), continuous_indices_reordered)], remainder='passthrough')

ns, ts, f = X_train_full_reordered.shape
preprocessor_final.fit(X_train_full_reordered.reshape(ns * ts, f))

ns_test, ts_test, f_test = X_test_full_reordered.shape
X_test_scaled = preprocessor_final.transform(X_test_full_reordered.reshape(ns_test * ts_test, f_test)).reshape(ns_test, ts_test, f_test)
X_test_w, test_window_indices = create_sliding_windows(X_test_scaled, y=None, window_size=FINAL_CONFIG['window_size'], stride=FINAL_CONFIG['stride'])
test_loader = make_loader(TensorDataset(torch.from_numpy(X_test_w).float()), FINAL_CONFIG['batch_size'], False, False)

model_config_final = {k: v for k, v in FINAL_CONFIG.items() if k not in ['window_size', 'stride', 'lr', 'batch_size', 'l2_lambda', 'focal_loss_gamma']}
all_fold_probabilities = []

for fold in range(N_SPLITS-1):
    if fold == 1: continue  
    fold_name = f"{FINAL_EXPERIMENT_NAME}_fold_{fold+1}"
    model_path = f"models/{fold_name}_best_model.pt"
    print(f"Loading model {fold+1}/{N_SPLITS} from {model_path}...")
    # Load state_dict into the uncompiled model structure
    model_fold = RecurrentClassifier(**model_config_final, num_classes=N_CLASSES).to(device)
    model_fold.load_state_dict(torch.load(model_path, map_location=device))
    model_fold = torch.compile(model_fold, backend="eager") # <-- Compile for faster inference
    model_fold.eval()
    
    fold_preds = []
    with torch.no_grad():
        for (inputs,) in test_loader:
            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                probs = torch.softmax(model_fold(inputs.to(device)), dim=1)
                fold_preds.append(probs.cpu().numpy())
    all_fold_probabilities.append(np.concatenate(fold_preds))

mean_probabilities = np.mean(all_fold_probabilities, axis=0)
df_probs = pd.DataFrame(mean_probabilities, columns=[f"prob_{i}" for i in range(N_CLASSES)])
df_probs['original_index'] = test_window_indices
agg_probs = df_probs.groupby('original_index')[[f"prob_{i}" for i in range(N_CLASSES)]].mean().values
final_predictions = le.inverse_transform(np.argmax(agg_probs, axis=1))

submission_df = pd.DataFrame({'sample_index': sorted(X_test_long_df['sample_index'].unique()), 'label': final_predictions})
submission_df['sample_index'] = submission_df['sample_index'].apply(lambda x: f"{x:03d}")
submission_filepath = os.path.join("submissions", submission_filename_base)
submission_df.to_csv(submission_filepath, index=False)
print(f"\nSuccessfully saved to {submission_filepath}!")
print(submission_df.head())


--- Preparing test dataset for submission ---
Loading model 1/5 from models/Compiled-FocalLoss-GRU_H256_L3_C128_K5_v12_fold_1_best_model.pt...
Loading model 3/5 from models/Compiled-FocalLoss-GRU_H256_L3_C128_K5_v12_fold_3_best_model.pt...
Loading model 4/5 from models/Compiled-FocalLoss-GRU_H256_L3_C128_K5_v12_fold_4_best_model.pt...

Successfully saved to submissions\submission_Compiled-FocalLoss-GRU_H256_L3_C128_K5_v12.csv!
  sample_index    label
0          000  no_pain
1          001  no_pain
2          002  no_pain
3          003  no_pain
4          004  no_pain
