# **Kaggle Challenge: Pirate Pain Dataset üè¥‚Äç‚ò†Ô∏è (v6: Regularized Embeddings)**

This notebook implements a robust K-Fold Cross-Validation and Ensembling strategy. This version improves upon the embedding layer strategy with better regularization.

**Strategy:**
1.  **Feature Engineering & Splitting:** 
    * The 37 features are now split into two groups:
        * **32 Continuous Features:** `joint_` (31) and `time` (1). These will be scaled.
        * **5 Categorical Features:** `pain_survey_` (4) and `is_pirate` (1). These will be fed into `nn.Embedding` layers.
2.  **Hybrid Model Architecture:** The `RecurrentClassifier` handles both continuous and embedded categorical features, concatenating them before passing them to the GRU.
3.  **Enhanced Regularization:** A new, dedicated `nn.Dropout` layer is applied to the combined feature set *before* the GRU layer. This is crucial for preventing the more complex model from overfitting.
4.  **Hyperparameter Search:** Use Ray Tune & Optuna on a single 80/20 split to find the best hyperparameters, including the new `feature_dropout_rate`.
5.  **K-Fold Training:** Train `K=5` models on 5 different folds using the best configuration found.
6.  **Ensemble Prediction:** Load all 5 models, average their probabilities, and create the final submission.

## ‚öôÔ∏è 1. Setup & Libraries

In [3]:
# Set seed for reproducibility
SEED = 123

# Import necessary libraries
import os
import logging
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import copy
from itertools import product
import time

# Set environment variables before importing modules
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# --- PyTorch Imports ---
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader

# --- Sklearn Imports ---
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# --- Ray[tune] & Optuna Imports ---
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from functools import partial

# --- Setup Directories & Device ---
logs_dir = "tensorboard"
os.makedirs("models", exist_ok=True)
os.makedirs("submissions", exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True
    print("\n--- Using GPU ---")
else:
    device = torch.device("cpu")
    print("\n--- Using CPU ---")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Configure plot display settings
sns.set_theme(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline


--- Using GPU ---
PyTorch version: 2.5.1
Device: cuda


## üîÑ 2. Data Loading & Feature Engineering

In [4]:
print("--- 1. Loading Data ---")

# --- Define File Paths and Features ---
DATA_DIR = "data"
X_TRAIN_PATH = os.path.join(DATA_DIR, "pirate_pain_train.csv")
Y_TRAIN_PATH = os.path.join(DATA_DIR, "pirate_pain_train_labels.csv")
X_TEST_PATH = os.path.join(DATA_DIR, "pirate_pain_test.csv")
SUBMISSION_PATH = os.path.join(DATA_DIR, "sample_submission.csv")

try:
    # Load features and labels
    features_long_df = pd.read_csv(X_TRAIN_PATH)
    labels_df = pd.read_csv(Y_TRAIN_PATH)
    X_test_long_df = pd.read_csv(X_TEST_PATH)
    
    # --- Define constants ---
    N_TIMESTEPS = 160
    JOINT_FEATURES = [f"joint_{i:02d}" for i in range(31)]
    PAIN_FEATURES = [f"pain_survey_{i}" for i in range(1, 5)]
    TIME_FEATURE = ['time']
    
    FEATURES = JOINT_FEATURES + PAIN_FEATURES + TIME_FEATURE
    N_FEATURES_ORIGINAL = len(FEATURES) # 31 + 4 + 1 = 36
    LABEL_MAPPING = {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}
    N_CLASSES = len(LABEL_MAPPING)

    # --- Reshape function ---
    def reshape_data(df, features_list, n_timesteps):
        df_pivot = df.pivot(index='sample_index', columns='time', values=features_list)
        data_2d = df_pivot.values
        n_samples = data_2d.shape[0]
        data_3d = data_2d.reshape(n_samples, len(features_list), n_timesteps)
        return data_3d.transpose(0, 2, 1)

    # --- Load and reshape X_train_full (36 features) ---
    X_train_full = reshape_data(
        features_long_df[features_long_df['sample_index'].isin(labels_df['sample_index'].unique())], 
        FEATURES, 
        N_TIMESTEPS
    )
    
    # --- Load and reshape X_test (36 features) ---
    X_test_full = reshape_data(
        X_test_long_df, FEATURES, N_TIMESTEPS
    )

    # --- Load and prepare y_train_full ---
    y_train_full_df = labels_df.sort_values(by='sample_index')
    le = LabelEncoder()
    le.fit(list(LABEL_MAPPING.keys()))
    y_train_full = le.transform(y_train_full_df['label'])
    
    print(f"Loaded X_train_full (shape: {X_train_full.shape}) and y_train_full (shape: {y_train_full.shape})")
    print(f"Loaded X_test_full (shape: {X_test_full.shape})")

    # --- 2. Engineer 'is_pirate' Feature (for Train) ---
    print("\n--- 2. Engineering 'is_pirate' Feature ---")
    static_cols = ['sample_index', 'n_legs', 'n_hands', 'n_eyes']
    static_df = features_long_df[static_cols].drop_duplicates().set_index('sample_index')
    
    pirate_filter = (
        (static_df['n_legs'] == 'one+peg_leg') |
        (static_df['n_hands'] == 'one+hook_hand') |
        (static_df['n_eyes'] == 'one+eye_patch')
    )
    pirate_indices = static_df[pirate_filter].index
    sample_indices_ordered = sorted(features_long_df[features_long_df['sample_index'].isin(labels_df['sample_index'].unique())]['sample_index'].unique())
    is_pirate_map = np.array([1 if idx in pirate_indices else 0 for idx in sample_indices_ordered])
    pirate_feature_broadcast = np.tile(is_pirate_map.reshape(-1, 1, 1), (1, N_TIMESTEPS, 1))
    
    # Concatenate with X_train_full
    X_train_full_engineered = np.concatenate([X_train_full, pirate_feature_broadcast], axis=2)
    
    # --- 3. Engineer 'is_pirate' Feature (for Test) ---
    static_df_test = X_test_long_df[static_cols].drop_duplicates().set_index('sample_index')
    pirate_filter_test = (
        (static_df_test['n_legs'] == 'one+peg_leg') |
        (static_df_test['n_hands'] == 'one+hook_hand') |
        (static_df_test['n_eyes'] == 'one+eye_patch')
    )
    pirate_indices_test = static_df_test[pirate_filter_test].index
    sample_indices_test_ordered = sorted(X_test_long_df['sample_index'].unique())
    is_pirate_map_test = np.array([1 if idx in pirate_indices_test else 0 for idx in sample_indices_test_ordered])
    pirate_feature_broadcast_test = np.tile(is_pirate_map_test.reshape(-1, 1, 1), (1, N_TIMESTEPS, 1))
    
    # Concatenate with X_test_full
    X_test_full_engineered = np.concatenate([X_test_full, pirate_feature_broadcast_test], axis=2)
    
    N_FEATURES_NEW = X_train_full_engineered.shape[2] # This will be 37
    print(f"Created X_train_full_engineered (shape: {X_train_full_engineered.shape})")
    print(f"Created X_test_full_engineered (shape: {X_test_full_engineered.shape})")
    print(f"N_FEATURES is now: {N_FEATURES_NEW}")

    # --- 4. Calculate Class Weights ---
    print("\n--- 3. Calculating Class Weights ---")
    class_counts_series = labels_df['label'].value_counts()
    counts_ordered = class_counts_series.reindex(LABEL_MAPPING.keys()).values
    class_weights_tensor = 1.0 / torch.tensor(counts_ordered, dtype=torch.float)
    class_weights_tensor = class_weights_tensor / class_weights_tensor.sum() # Normalize weights
    class_weights_tensor = class_weights_tensor.to(device)
    
    print(f"Class counts (0, 1, 2): {counts_ordered}")
    print(f"Calculated class weights: {class_weights_tensor}")

except FileNotFoundError as e:
    print(f"Error: Could not find a required file. {e}")
except Exception as e:
    print(f"An error occurred: {e}")

--- 1. Loading Data ---
Loaded X_train_full (shape: (661, 160, 36)) and y_train_full (shape: (661,))
Loaded X_test_full (shape: (1324, 160, 36))

--- 2. Engineering 'is_pirate' Feature ---
Created X_train_full_engineered (shape: (661, 160, 37))
Created X_test_full_engineered (shape: (1324, 160, 37))
N_FEATURES is now: 37

--- 3. Calculating Class Weights ---
Class counts (0, 1, 2): [511  94  56]
Calculated class weights: tensor([0.0643, 0.3493, 0.5864], device='cuda:0')


## üõ†Ô∏è 3. Helper Functions

In [5]:
def create_sliding_windows(X_3d, y=None, window_size=100, stride=20):
    """
    Takes 3D data (n_samples, n_timesteps, n_features)
    and creates overlapping windows.
    """
    new_X = []
    new_y = []
    window_indices = [] 
    
    n_samples, n_timesteps, n_features = X_3d.shape
    
    for i in range(n_samples):
        sample = X_3d[i]
        idx = 0
        while (idx + window_size) <= n_timesteps:
            window = sample[idx : idx + window_size]
            new_X.append(window)
            window_indices.append(i)
            if y is not None: new_y.append(y[i])
            idx += stride
            
    if y is not None:
        return np.array(new_X), np.array(new_y), np.array(window_indices)
    else:
        return np.array(new_X), np.array(window_indices)
    
def make_loader(ds, batch_size, shuffle, drop_last):
    """Creates a PyTorch DataLoader with optimized settings."""
    return DataLoader(
        ds, batch_size=int(batch_size), shuffle=shuffle, drop_last=drop_last,
        num_workers=0, pin_memory=True,
        pin_memory_device="cuda" if torch.cuda.is_available() else "",
        prefetch_factor=None,
    )

## üß† 4. Model & Training Engine

In [6]:
class RecurrentClassifier(nn.Module):
    def __init__(
            self,
            hidden_size,
            num_layers,
            num_classes,
            rnn_type='GRU',
            bidirectional=False,
            dropout_rate=0.2, # Dropout for within the GRU
            feature_dropout_rate=0.5 # Dropout for the input features
            ):
        super().__init__()

        self.rnn_type = rnn_type
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        
        # --- Embedding Layers for Categorical Features ---
        self.pain_embed_dim = 4
        self.pirate_embed_dim = 4

        self.pain_embeddings = nn.ModuleList(
            [nn.Embedding(num_embeddings=3, embedding_dim=self.pain_embed_dim) for _ in range(4)]
        )
        self.pirate_embedding = nn.Embedding(num_embeddings=2, embedding_dim=self.pirate_embed_dim)

        # --- Calculate RNN Input Size ---
        num_continuous_features = 32 # 31 joints + 1 time feature
        total_embedding_dim = (4 * self.pain_embed_dim) + self.pirate_embed_dim
        rnn_input_size = num_continuous_features + total_embedding_dim

        # --- Regularization: Feature Dropout ---
        self.feature_dropout = nn.Dropout(feature_dropout_rate)
        
        # --- RNN Layer ---
        rnn_map = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        rnn_module = rnn_map[rnn_type]
        gru_dropout_val = dropout_rate if num_layers > 1 else 0

        self.rnn = rnn_module(
            input_size=rnn_input_size, hidden_size=hidden_size,
            num_layers=num_layers, batch_first=True, bidirectional=bidirectional,
            dropout=gru_dropout_val
        )
        
        # --- Classifier Head ---
        classifier_input_size = hidden_size * 2 if self.bidirectional else hidden_size
        self.classifier = nn.Linear(classifier_input_size, num_classes)

    def forward(self, x):
        """ x shape: (batch_size, seq_length, 37) """
        # 1. Split continuous and categorical features
        # Continuous: 31 joint + 1 time = 32 features (indices 0-31)
        # Categorical: 4 pain + 1 pirate = 5 features (indices 32-36)
        x_continuous = x[:, :, :32] 
        x_categorical = x[:, :, 32:].long()

        # 2. Apply embeddings to categorical features
        embedded_cats = []
        for i in range(4): # Pain surveys
            embedded_cats.append(self.pain_embeddings[i](x_categorical[:, :, i]))
        embedded_cats.append(self.pirate_embedding(x_categorical[:, :, 4])) # is_pirate

        # 3. Concatenate all features
        all_embeddings = torch.cat(embedded_cats, dim=2)
        x_combined = torch.cat([x_continuous, all_embeddings], dim=2)

        # 4. Apply Feature Dropout (Regularization)
        x_combined = self.feature_dropout(x_combined)
        
        # 5. Pass combined tensor through RNN
        rnn_out, hidden = self.rnn(x_combined)

        if self.rnn_type == 'LSTM': hidden = hidden[0]

        if self.bidirectional:
            hidden = hidden.view(self.num_layers, 2, -1, self.hidden_size)
            hidden_to_classify = torch.cat([hidden[-1, 0, :, :], hidden[-1, 1, :, :]], dim=1)
        else:
            hidden_to_classify = hidden[-1]

        logits = self.classifier(hidden_to_classify)
        return logits

def train_one_epoch(model, train_loader, criterion, optimizer, scaler, device):
    model.train()
    running_loss = 0.0
    all_predictions = []
    all_targets = []

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad(set_to_none=True)

        with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
            logits = model(inputs)
            loss = criterion(logits, targets)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item() * inputs.size(0)
        predictions = logits.argmax(dim=1)
        all_predictions.append(predictions.cpu().numpy())
        all_targets.append(targets.cpu().numpy())

    epoch_loss = running_loss / len(np.concatenate(all_targets))
    epoch_f1 = f1_score(np.concatenate(all_targets), np.concatenate(all_predictions), average='weighted')
    return epoch_loss, epoch_f1

def validate_one_epoch(model, val_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                logits = model(inputs)
                loss = criterion(logits, targets)
            running_loss += loss.item() * inputs.size(0)
            predictions = logits.argmax(dim=1)
            all_predictions.append(predictions.cpu().numpy())
            all_targets.append(targets.cpu().numpy())

    epoch_loss = running_loss / len(val_loader.dataset.tensors[1])
    epoch_f1 = f1_score(np.concatenate(all_targets), np.concatenate(all_predictions), average='weighted')
    return epoch_loss, epoch_f1

def objective_function(config, X_train, y_train, X_val, y_val, class_weights_tensor):
    # 1. --- Preprocessing ---
    # Indices for continuous features: 31 joint + 1 time = 32
    continuous_indices = list(range(32)) 
    preprocessor = ColumnTransformer(
        transformers=[('scaler', StandardScaler(), continuous_indices)],
        remainder='passthrough' # Leaves categorical features untouched
    )

    ns, ts, f = X_train.shape
    X_train_2d = X_train.reshape(ns * ts, f)
    ns_val, ts_val, f_val = X_val.shape
    X_val_2d = X_val.reshape(ns_val * ts_val, f_val)

    preprocessor.fit(X_train_2d)
    X_train_final = preprocessor.transform(X_train_2d).reshape(ns, ts, -1)
    X_val_final = preprocessor.transform(X_val_2d).reshape(ns_val, ts_val, -1)

    # 2. --- Windowing, Datasets, Dataloaders ---
    X_train_w, y_train_w, _ = create_sliding_windows(X_train_final, y_train, config["window_size"], config["stride"])
    X_val_w, y_val_w, _ = create_sliding_windows(X_val_final, y_val, config["window_size"], config["stride"])
    train_ds = TensorDataset(torch.from_numpy(X_train_w).float(), torch.from_numpy(y_train_w).long())
    val_ds = TensorDataset(torch.from_numpy(X_val_w).float(), torch.from_numpy(y_val_w).long())
    train_loader = make_loader(train_ds, config["batch_size"], shuffle=True, drop_last=True)
    val_loader = make_loader(val_ds, config["batch_size"], shuffle=False, drop_last=False)

    # 3. --- Model, Optimizer, etc. ---
    model = RecurrentClassifier(
        hidden_size=config["hidden_size"], num_layers=config["num_layers"], num_classes=3,
        dropout_rate=config["dropout_rate"], feature_dropout_rate=config["feature_dropout_rate"],
        bidirectional=config["bidirectional"], rnn_type=config["rnn_type"]
    ).to(device)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["l2_lambda"])
    scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

    # 4. --- Training & Reporting Loop ---
    for epoch in range(1, 151):
        train_loss, _ = train_one_epoch(model, train_loader, criterion, optimizer, scaler, device)
        _, val_f1 = validate_one_epoch(model, val_loader, criterion, device)
        tune.report({"val_f1": val_f1, "train_loss": train_loss})

def fit(model, train_loader, val_loader, epochs, criterion, optimizer, scaler, device, patience=0, 
        evaluation_metric="val_f1", mode='max', verbose=10, experiment_name=""):
    training_history = {'train_loss': [], 'val_loss': [], 'train_f1': [], 'val_f1': []}
    model_path = f"models/{experiment_name}_best_model.pt"
    best_metric = -1
    best_epoch = -1
    patience_counter = 0
    
    print(f"--- Starting Training: {experiment_name} ---")

    for epoch in range(1, epochs + 1):
        train_loss, train_f1 = train_one_epoch(model, train_loader, criterion, optimizer, scaler, device)
        val_loss, val_f1 = validate_one_epoch(model, val_loader, criterion, device)
        training_history['train_loss'].append(train_loss); training_history['val_loss'].append(val_loss)
        training_history['train_f1'].append(train_f1); training_history['val_f1'].append(val_f1)

        if verbose > 0 and (epoch % verbose == 0 or epoch == 1):
            print(f"Epoch {epoch:3d}/{epochs} | Train: Loss={train_loss:.4f}, F1={train_f1:.4f} | Val: Loss={val_loss:.4f}, F1={val_f1:.4f}")

        current_metric = val_f1
        if current_metric > best_metric:
            best_metric = current_metric
            best_epoch = epoch
            torch.save(model.state_dict(), model_path)
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\nEarly stopping triggered after {epoch} epochs.")
                break

    print(f"Restoring best model from epoch {best_epoch} with {evaluation_metric} {best_metric:.4f}")
    model.load_state_dict(torch.load(model_path))
    print(f"--- Finished Training: {experiment_name} ---")
    return model

## üß™ 5. Phase 1: Hyperparameter Search

### 5.1. Data Preparation for HPO

First, we re-order the columns in our dataset so they match what the new model expects:
1.  **32 Continuous Features First:** 31 `joint_` features, followed by the `time` feature.
2.  **5 Categorical Features Last:** 4 `pain_survey_` features, followed by the `is_pirate` feature.

Then, we perform a single, stratified 80/20 split on this re-ordered dataset. These raw, unscaled splits are passed to the HPO function.

In [7]:
# Re-order columns to group continuous and categorical features
# Continuous: joint_ (0-30), time (35)
# Categorical: pain_ (31-34), is_pirate (36)
continuous_indices_orig = list(range(31)) + [35]
categorical_indices_orig = list(range(31, 35)) + [36]

X_train_full_reordered = np.concatenate([
    X_train_full_engineered[:, :, continuous_indices_orig],
    X_train_full_engineered[:, :, categorical_indices_orig]
], axis=2)

print(f"Original feature order (example): joint_0...pain_survey_4...time...is_pirate")
print(f"Re-ordered X_train_full shape: {X_train_full_reordered.shape}")

# --- Split the re-ordered data ---
print("\n--- Splitting re-ordered, unscaled data for HPO ---")
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)

for train_idx, val_idx in sss.split(X_train_full_reordered, y_train_full):
    X_train_split = X_train_full_reordered[train_idx]
    y_train_split = y_train_full[train_idx]
    X_val_split = X_train_full_reordered[val_idx]
    y_val_split = y_train_full[val_idx]

print(f"  X_train_split: {X_train_split.shape}")
print(f"  X_val_split:   {X_val_split.shape}")

Original feature order (example): joint_0...pain_survey_4...time...is_pirate
Re-ordered X_train_full shape: (661, 160, 37)

--- Splitting re-ordered, unscaled data for HPO ---
  X_train_split: (528, 160, 37)
  X_val_split:   (133, 160, 37)


### 5.2. HPO Search Execution (Ray Tune + Optuna)

In [8]:
# --- 1. Define the Search Space --
search_space = {
    "window_size": tune.choice([5, 10, 20]),
    "stride": tune.choice([1, 2, 5]),
    "rnn_type": tune.choice(['GRU']),
    "lr": tune.loguniform(1e-4, 5e-3),
    "batch_size": tune.choice([64, 128]),
    "hidden_size": tune.choice([128, 256, 384]),
    "num_layers": tune.choice([2, 3]),
    "dropout_rate": tune.uniform(0.1, 0.5),         # GRU dropout
    "feature_dropout_rate": tune.uniform(0.1, 0.6), # Feature dropout
    "bidirectional": tune.choice([True, False]),
    "l2_lambda": tune.loguniform(1e-7, 1e-3)
}
# --- 2. Define the Optimizer and Scheduler ---
optuna_search = OptunaSearch(metric="val_f1", mode="max")
scheduler = ASHAScheduler(metric="val_f1", mode="max", grace_period=20, reduction_factor=2)

# --- 3. Initialize Ray ---
if ray.is_initialized(): ray.shutdown()
ray.init(num_cpus=16, num_gpus=1, ignore_reinit_error=True, log_to_driver=False)

# --- 4. Run the Tuner --
print("Starting hyperparameter search...")
objective_with_data = tune.with_parameters(
    objective_function, 
    X_train=X_train_split, y_train=y_train_split,
    X_val=X_val_split, y_val=y_val_split,
    class_weights_tensor=class_weights_tensor
)

analysis = tune.run(
    objective_with_data,
    resources_per_trial={"cpu": 4, "gpu": 0.25}, 
    config=search_space,
    num_samples=25, # Run a few more trials for the more complex space 
    search_alg=optuna_search,
    scheduler=scheduler,
    name="pirate_pain_reg_embedding_search_v6",
    verbose=1
)

print("\n--- Search Complete ---\n")

# --- 5. Get Best Results ---
print("Getting best trial from analysis...")
best_trial = analysis.get_best_trial(metric="val_f1", mode="max", scope="all")
if best_trial:
    FINAL_CONFIG = best_trial.config
    FINAL_BEST_VAL_F1 = best_trial.last_result["val_f1"]
    print(f"Best validation F1 score: {FINAL_BEST_VAL_F1:.4f}")
    print("Best hyperparameters found:")
    print(FINAL_CONFIG)
else:
    print("ERROR: No trials completed successfully. Using a default config.")
    FINAL_CONFIG = {
        'window_size': 20, 'stride': 1, 'rnn_type': 'GRU', 'lr': 0.0008, 'batch_size': 64,
        'hidden_size': 256, 'num_layers': 2, 'dropout_rate': 0.25, 'feature_dropout_rate': 0.4,
        'bidirectional': True, 'l2_lambda': 5e-06
    }
    FINAL_BEST_VAL_F1 = 0.0

del X_train_split, y_train_split, X_val_split, y_val_split, X_train_full_reordered

0,1
Current time:,2025-11-13 10:34:29
Running for:,01:14:10.17
Memory:,9.0/13.9 GiB

Trial name,status,loc,batch_size,bidirectional,dropout_rate,feature_dropout_rate,hidden_size,l2_lambda,lr,num_layers,rnn_type,stride,window_size,iter,total time (s),val_f1,train_loss
objective_function_738bf483,TERMINATED,127.0.0.1:19216,128,False,0.416345,0.181599,384,2.11221e-07,0.000839728,2,GRU,2,10,100,611.426,0.926955,0.000745785
objective_function_3148fef8,TERMINATED,127.0.0.1:432,128,True,0.460497,0.32166,128,3.50291e-07,0.00227908,3,GRU,2,20,20,143.022,0.910596,0.00523267
objective_function_3d6429db,TERMINATED,127.0.0.1:10828,64,True,0.238156,0.136527,256,0.000283498,0.000583365,3,GRU,2,10,40,886.56,0.924735,0.00233808
objective_function_6fc0b8d6,TERMINATED,127.0.0.1:13628,64,True,0.37448,0.122932,384,1.53886e-05,0.00230122,2,GRU,5,10,20,140.958,0.906881,0.0120187
objective_function_5e9b8053,TERMINATED,127.0.0.1:32780,64,True,0.192472,0.169223,384,0.000266938,0.00149425,2,GRU,2,10,40,669.224,0.924419,0.00493057
objective_function_67052d96,TERMINATED,127.0.0.1:37896,128,False,0.166377,0.497665,128,3.16406e-07,0.00163633,2,GRU,1,5,20,200.076,0.901503,0.0224814
objective_function_e234a5a6,TERMINATED,127.0.0.1:14228,128,False,0.266316,0.332446,128,1.18724e-05,0.00169372,3,GRU,1,5,80,905.931,0.925623,0.0087625
objective_function_43f8486b,TERMINATED,127.0.0.1:38220,64,True,0.393106,0.290163,384,1.02212e-07,0.000163902,2,GRU,1,20,80,3123.45,0.929029,0.000685631
objective_function_947c6964,TERMINATED,127.0.0.1:31456,64,True,0.151001,0.556569,384,6.59058e-06,0.000572588,3,GRU,5,10,40,392.741,0.919922,0.016442
objective_function_23e05c2e,TERMINATED,127.0.0.1:37532,64,False,0.448428,0.143195,256,5.89587e-07,0.000249997,3,GRU,5,10,20,136.235,0.895209,0.0210446


2025-11-13 10:34:29,226	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/Karim Negm/ray_results/pirate_pain_reg_embedding_search_v6' in 0.0447s.
2025-11-13 10:34:29,244	INFO tune.py:1041 -- Total run time: 4450.25 seconds (4450.13 seconds for the tuning loop).



--- Search Complete ---

Getting best trial from analysis...
Best validation F1 score: 0.9326
Best hyperparameters found:
{'window_size': 10, 'stride': 2, 'rnn_type': 'GRU', 'lr': 0.0009065748678215162, 'batch_size': 64, 'hidden_size': 384, 'num_layers': 2, 'dropout_rate': 0.30536040643808177, 'feature_dropout_rate': 0.3772620476473097, 'bidirectional': False, 'l2_lambda': 0.0007661770615057044}


## üèÜ 6. Phase 2: K-Fold Ensemble Training

In [9]:
# ===================================================================
# --- üèÜ FINAL MODEL CONFIGURATION üèÜ ---
# ===================================================================
print("--- üèÜ Final Configuration Set --- ")
print(f"Best Val F1 from HPO search: {FINAL_BEST_VAL_F1:.4f}")
print(FINAL_CONFIG)

N_SPLITS = 5
FINAL_EXPERIMENT_NAME = f"GRU_Optuna_KFold_Ensemble_v6_RegEmbed"
submission_filename_base = f"submission_{FINAL_EXPERIMENT_NAME}.csv"
print(f"Submission name will be: {submission_filename_base}")

--- üèÜ Final Configuration Set --- 
Best Val F1 from HPO search: 0.9326
{'window_size': 10, 'stride': 2, 'rnn_type': 'GRU', 'lr': 0.0009065748678215162, 'batch_size': 64, 'hidden_size': 384, 'num_layers': 2, 'dropout_rate': 0.30536040643808177, 'feature_dropout_rate': 0.3772620476473097, 'bidirectional': False, 'l2_lambda': 0.0007661770615057044}
Submission name will be: submission_GRU_Optuna_KFold_Ensemble_v6_RegEmbed.csv


In [10]:
# --- Re-order the full dataset for K-Fold training ---
continuous_indices_orig = list(range(31)) + [35]
categorical_indices_orig = list(range(31, 35)) + [36]
X_train_full_reordered = np.concatenate([
    X_train_full_engineered[:, :, continuous_indices_orig],
    X_train_full_engineered[:, :, categorical_indices_orig]
], axis=2)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED) 
print(f"--- Starting {N_SPLITS}-Fold CV Training ---")
fold_val_f1_list = []
continuous_indices_reordered = list(range(32))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_full_reordered, y_train_full)):
    fold_name = f"kfold_fold_{fold+1}"
    print(f"\n--- Fold {fold+1}/{N_SPLITS} --- ({fold_name}) ---")
    
    X_train_fold_full = X_train_full_reordered[train_idx]
    y_train_fold_full = y_train_full[train_idx]
    X_val_fold_full = X_train_full_reordered[val_idx]
    y_val_fold_full = y_train_full[val_idx]

    # --- Scale INSIDE the fold ---
    preprocessor_fold = ColumnTransformer(
        transformers=[('scaler', StandardScaler(), continuous_indices_reordered)],
        remainder='passthrough'
    )
    
    ns, ts, f = X_train_fold_full.shape
    X_train_2d = X_train_fold_full.reshape(ns * ts, f)
    ns_val, ts_val, f_val = X_val_fold_full.shape
    X_val_2d = X_val_fold_full.reshape(ns_val * ts_val, f_val)

    preprocessor_fold.fit(X_train_2d)
    X_train_fold_scaled = preprocessor_fold.transform(X_train_2d).reshape(ns, ts, -1)
    X_val_fold_scaled = preprocessor_fold.transform(X_val_2d).reshape(ns_val, ts_val, -1)

    # --- Create Sliding Windows & Dataloaders ---
    X_train_w, y_train_w, _ = create_sliding_windows(X_train_fold_scaled, y_train_fold_full, 
        window_size=FINAL_CONFIG['window_size'], stride=FINAL_CONFIG['stride'])
    X_val_w, y_val_w, _ = create_sliding_windows(X_val_fold_scaled, y_val_fold_full, 
        window_size=FINAL_CONFIG['window_size'], stride=FINAL_CONFIG['stride'])
    train_ds = TensorDataset(torch.from_numpy(X_train_w).float(), torch.from_numpy(y_train_w).long())
    val_ds = TensorDataset(torch.from_numpy(X_val_w).float(), torch.from_numpy(y_val_w).long())
    train_loader = make_loader(train_ds, FINAL_CONFIG['batch_size'], shuffle=True, drop_last=True)
    val_loader = make_loader(val_ds, FINAL_CONFIG['batch_size'], shuffle=False, drop_last=False)
    
    # --- Create & Train Model ---
    model_fold = RecurrentClassifier(
        hidden_size=FINAL_CONFIG["hidden_size"], num_layers=FINAL_CONFIG["num_layers"], num_classes=N_CLASSES,
        dropout_rate=FINAL_CONFIG["dropout_rate"], feature_dropout_rate=FINAL_CONFIG["feature_dropout_rate"],
        bidirectional=FINAL_CONFIG["bidirectional"], rnn_type=FINAL_CONFIG["rnn_type"]
    ).to(device)
    
    optimizer = torch.optim.AdamW(model_fold.parameters(), lr=FINAL_CONFIG['lr'], weight_decay=FINAL_CONFIG['l2_lambda'])
    scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
    
    model_fold = fit(
        model=model_fold, train_loader=train_loader, val_loader=val_loader, epochs=300,
        criterion=criterion, optimizer=optimizer, scaler=scaler, device=device,
        verbose=25, experiment_name=fold_name, patience=40 # Increase patience slightly
    )
    
    _, val_f1 = validate_one_epoch(model_fold, val_loader, criterion, device)
    fold_val_f1_list.append(val_f1)
    print(f"Fold {fold+1} Best Model Val F1: {val_f1:.4f}")

print(f"\n--- üèÜ K-Fold Training Complete ---")
print(f"Fold F1 scores: {[round(f, 4) for f in fold_val_f1_list]}")
print(f"Average F1 across folds: {np.mean(fold_val_f1_list):.4f}")

--- Starting 5-Fold CV Training ---

--- Fold 1/5 --- (kfold_fold_1) ---
--- Starting Training: kfold_fold_1 ---
Epoch   1/300 | Train: Loss=0.1443, F1=0.8687 | Val: Loss=0.3706, F1=0.8395
Epoch  25/300 | Train: Loss=0.0068, F1=0.9933 | Val: Loss=0.4545, F1=0.8906
Epoch  50/300 | Train: Loss=0.0047, F1=0.9957 | Val: Loss=0.4868, F1=0.8984
Epoch  75/300 | Train: Loss=0.0025, F1=0.9971 | Val: Loss=0.5258, F1=0.8911

Early stopping triggered after 75 epochs.
Restoring best model from epoch 35 with val_f1 0.9063
--- Finished Training: kfold_fold_1 ---
Fold 1 Best Model Val F1: 0.9063

--- Fold 2/5 --- (kfold_fold_2) ---
--- Starting Training: kfold_fold_2 ---
Epoch   1/300 | Train: Loss=0.1488, F1=0.8631 | Val: Loss=0.2583, F1=0.8978
Epoch  25/300 | Train: Loss=0.0076, F1=0.9905 | Val: Loss=0.3890, F1=0.9216
Epoch  50/300 | Train: Loss=0.0048, F1=0.9938 | Val: Loss=0.4694, F1=0.9169

Early stopping triggered after 67 epochs.
Restoring best model from epoch 27 with val_f1 0.9293
--- Finishe

## üì¨ 7. Phase 3: Ensemble Submission

In [13]:
print("\n--- Preparing test dataset for submission ---")

# --- 1. Re-order test data columns ---
X_test_full_reordered = np.concatenate([
    X_test_full_engineered[:, :, continuous_indices_orig],
    X_test_full_engineered[:, :, categorical_indices_orig]
], axis=2)

# --- 2. Prepare Final Preprocessor (Fit on ALL re-ordered training data) ---
preprocessor_final = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), continuous_indices_reordered)],
    remainder='passthrough'
)
ns, ts, f = X_train_full_reordered.shape
preprocessor_final.fit(X_train_full_reordered.reshape(ns * ts, f))

# --- 3. Scale, Window, and Load Test Data ---
ns_test, ts_test, f_test = X_test_full_reordered.shape
X_test_scaled_2d = preprocessor_final.transform(X_test_full_reordered.reshape(ns_test * ts_test, f_test))
X_test_final_scaled = X_test_scaled_2d.reshape(ns_test, ts_test, -1)

X_test_w, test_window_indices = create_sliding_windows(X_test_final_scaled, y=None, 
    window_size=FINAL_CONFIG['window_size'], stride=FINAL_CONFIG['stride'])

test_ds = TensorDataset(torch.from_numpy(X_test_w).float())
test_loader = make_loader(test_ds, batch_size=FINAL_CONFIG['batch_size'], shuffle=False, drop_last=False)
print(f"Final TestLoader created with {len(X_test_w)} windows.")

# --- 4. Generate Predictions from K-Fold Models ---
all_fold_probabilities = []
for fold in range(N_SPLITS):
    model_path = f"models/kfold_fold_{fold+1}_best_model.pt"
    print(f"Loading model {fold+1}/{N_SPLITS} from {model_path}...")
    model_fold = RecurrentClassifier(
        hidden_size=FINAL_CONFIG["hidden_size"], num_layers=FINAL_CONFIG["num_layers"], num_classes=N_CLASSES,
        dropout_rate=FINAL_CONFIG["dropout_rate"], feature_dropout_rate=FINAL_CONFIG["feature_dropout_rate"],
        bidirectional=FINAL_CONFIG["bidirectional"], rnn_type=FINAL_CONFIG["rnn_type"]
    ).to(device)
    model_fold.load_state_dict(torch.load(model_path, map_location=device))
    model_fold.eval()
    
    fold_predictions = []
    with torch.no_grad():
        for (inputs,) in test_loader:
            inputs = inputs.to(device)
            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                logits = model_fold(inputs)
                probs = torch.softmax(logits, dim=1)
                fold_predictions.append(probs.cpu().numpy())
    all_fold_probabilities.append(np.concatenate(fold_predictions))

# --- 5. Average, Aggregate, and Save Submission ---
mean_probabilities = np.mean(all_fold_probabilities, axis=0)
prob_cols = [f"prob_{i}" for i in range(N_CLASSES)]
df_probs = pd.DataFrame(mean_probabilities, columns=prob_cols)
df_probs['original_index'] = test_window_indices
agg_probs = df_probs.groupby('original_index')[prob_cols].mean().values
final_predictions = le.inverse_transform(np.argmax(agg_probs, axis=1))

submission_df = pd.DataFrame({
    'sample_index': sorted(X_test_long_df['sample_index'].unique()),
    'label': final_predictions
})
submission_df['sample_index'] = submission_df['sample_index'].apply(lambda x: f"{x:03d}")
submission_filepath = os.path.join("submissions", submission_filename_base)
submission_df.to_csv(submission_filepath, index=False)
print(f"\nSuccessfully saved to {submission_filepath}!")
print(submission_df.head())


--- Preparing test dataset for submission ---
Final TestLoader created with 100624 windows.
Loading model 1/5 from models/kfold_fold_1_best_model.pt...
Loading model 2/5 from models/kfold_fold_2_best_model.pt...
Loading model 3/5 from models/kfold_fold_3_best_model.pt...
Loading model 4/5 from models/kfold_fold_4_best_model.pt...
Loading model 5/5 from models/kfold_fold_5_best_model.pt...

Successfully saved to submissions\submission_GRU_Optuna_KFold_Ensemble_v6_RegEmbed.csv!
  sample_index    label
0          000  no_pain
1          001  no_pain
2          002  no_pain
3          003  no_pain
4          004  no_pain
