# **Kaggle Challenge: Pirate Pain Dataset üè¥‚Äç‚ò†Ô∏è (v2: K-Fold Ensemble)**

This notebook implements a robust K-Fold Cross-Validation and Ensembling strategy to improve on the baseline model. 

**Strategy:**
1.  **Hyperparameter Search:** Use Ray Tune & Optuna on a single 80/20 split to find a good set of hyperparameters (the `FINAL_CONFIG`).
2.  **K-Fold Training:** Instead of training one model on 100% of the data, we train `K` (e.g., 5) models on `K` different 80/20 splits ("folds"). We use early stopping to find the best model for each fold and save it to disk.
3.  **Ensemble Prediction:** To create the final submission, we load all `K` models. We get `K` different predictions for the test set, average their (softmax) probabilities, and then aggregate these averaged probabilities for the final submission. This is far more robust than training a single, "blind" model.

## ‚öôÔ∏è **1. Setup & Libraries**

In [13]:
# Set seed for reproducibility
SEED = 123

# Import necessary libraries
import os
import logging
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import copy
from itertools import product
import time

# Set environment variables before importing modules
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# --- PyTorch Imports ---
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader

# --- Sklearn Imports ---
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# --- Ray[tune] & Optuna Imports ---
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from functools import partial

# --- Setup Directories & Device ---
logs_dir = "tensorboard"
os.makedirs("models", exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True
    print("\n--- Using GPU (RTX 3070, here we come!) ---")
else:
    device = torch.device("cpu")
    print("\n--- Using CPU ---")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Configure plot display settings
sns.set_theme(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline


--- Using GPU (RTX 3070, here we come!) ---
PyTorch version: 2.5.1
Device: cuda


## üîÑ **2. Data Loading & Reshaping**

In [14]:
# --- 1. Define File Paths and Features ---
DATA_DIR = "data"
X_TRAIN_PATH = os.path.join(DATA_DIR, "pirate_pain_train.csv")
Y_TRAIN_PATH = os.path.join(DATA_DIR, "pirate_pain_train_labels.csv")
X_TEST_PATH = os.path.join(DATA_DIR, "pirate_pain_test.csv")
SUBMISSION_PATH = os.path.join(DATA_DIR, "sample_submission.csv")

# Define our time-series features
# We'll ignore static features (n_legs, etc.) for our baseline model
JOINT_FEATURES = [f"joint_{i:02d}" for i in range(31)]
PAIN_FEATURES = [f"pain_survey_{i}" for i in range(1, 5)]
FEATURES = JOINT_FEATURES + PAIN_FEATURES

N_FEATURES = len(FEATURES)
N_TIMESTEPS = 160 # Fixed from our earlier debugging

print(f"Using {N_FEATURES} features: {FEATURES[:3]}... to {FEATURES[-3:]}")

# --- 2. Create the Reshaping Function ---
def reshape_data(df, features_list, n_timesteps):
    """
    Pivots the long-format dataframe into a 3D NumPy array.
    Shape: (n_samples, n_timesteps, n_features)
    """
    df_pivot = df.pivot(index='sample_index', columns='time', values=features_list)
    data_2d = df_pivot.values
    n_samples = data_2d.shape[0]
    data_3d = data_2d.reshape(n_samples, len(features_list), n_timesteps)
    return data_3d.transpose(0, 2, 1)

def create_sliding_windows(X_3d, y=None, window_size=100, stride=20):
    """
    Takes 3D data (n_samples, n_timesteps, n_features)
    and creates overlapping windows.
    
    Returns:
    - new_X: (n_windows, window_size, n_features)
    - new_y (if y is provided): (n_windows,)
    - window_indices: (n_windows,) array tracking which original sample
                      (e.g., 0, 1, 2...) each window came from.
    """
    new_X = []
    new_y = []
    # This new array tracks which original sample each window came from.
    window_indices = [] 
    
    n_samples, n_timesteps, n_features = X_3d.shape
    
    # Iterate over each original sample
    for i in range(n_samples):
        sample = X_3d[i] # Shape (160, 35)
        
        # Slide a window over this sample
        idx = 0
        while (idx + window_size) <= n_timesteps:
            window = sample[idx : idx + window_size]
            new_X.append(window)
            window_indices.append(i) # Track the original sample index (0, 1, 2...)
            
            if y is not None:
                new_y.append(y[i]) # The label is the same for all windows
                
            idx += stride
            
    if y is not None:
        # Return new X, new y, and the index mapping
        return np.array(new_X), np.array(new_y), np.array(window_indices)
    else:
        # Return new X and the index mapping
        return np.array(new_X), np.array(window_indices)

# --- 3. Load and Reshape Data ---
print("Loading and reshaping training data...")
X_train_long = pd.read_csv(X_TRAIN_PATH)
X_train_full = reshape_data(X_train_long[X_train_long['sample_index'].isin(X_train_long['sample_index'].unique())], FEATURES, N_TIMESTEPS)

print("Loading and reshaping test data...")
X_test_long = pd.read_csv(X_TEST_PATH)
X_test = reshape_data(X_test_long, FEATURES, N_TIMESTEPS)

# Load labels
y_train_df = pd.read_csv(Y_TRAIN_PATH)
y_train_full_df = y_train_df.sort_values(by='sample_index')
y_train_labels_str = y_train_full_df['label'].values # Fixed from our debugging

print(f"X_train_full shape: {X_train_full.shape}")
print(f"y_train_labels_str shape: {y_train_labels_str.shape}")
print(f"X_test shape: {X_test.shape}")

del X_train_long, X_test_long, y_train_df

Using 35 features: ['joint_00', 'joint_01', 'joint_02']... to ['pain_survey_2', 'pain_survey_3', 'pain_survey_4']
Loading and reshaping training data...
Loading and reshaping test data...
X_train_full shape: (661, 160, 35)
y_train_labels_str shape: (661,)
X_test shape: (1324, 160, 35)


## üöß **3. Preprocessing: Split & Scale**

1.  **Encode Labels:** Convert `no_pain`, `low_pain`, `high_pain` to `0`, `1`, `2`.
2.  **Split Data:** Use `StratifiedShuffleSplit` to create a single 80/20 train/validation split **for the HPO phase**.
3.  **Scale Features:** Use `StandardScaler`. We `fit` it *only* on the training data and `transform` all sets (train, val, and test).

In [15]:
# --- 1. Encode Labels ---
LABEL_MAPPING = {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}
le = LabelEncoder()
le.fit(list(LABEL_MAPPING.keys()))
y_train_full = le.transform(y_train_labels_str)
N_CLASSES = len(LABEL_MAPPING)

print(f"Labels encoded. {N_CLASSES} classes: {LABEL_MAPPING}")

# 1. DEFINE YOUR WINDOW PARAMETERS
NEW_WINDOW_SIZE = 80  # Example: 80 timesteps
NEW_STRIDE = 20       # Example: 20 timesteps

print("--- Applying sliding window augmentation ---")
# 3. APPLY THE NEW WINDOWING FUNCTION
X_train_windowed, y_train_windowed, _ = create_sliding_windows(
    X_train_full, 
    y_train_full, 
    window_size=NEW_WINDOW_SIZE, 
    stride=NEW_STRIDE
)

print(f"Original X shape: {X_train_full.shape}")
print(f"Windowed X shape: {X_train_windowed.shape}")
print(f"Original y shape: {y_train_full.shape}")
print(f"Windowed y shape: {y_train_windowed.shape}")

# (You would also need to apply this to X_test for submission)
# (But NOT to y_train_full for the split)

# 4. USE THE *WINDOWED* DATA FOR YOUR SPLIT
print("\n--- Splitting windowed data ---")
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)

# IMPORTANT: You split the *new*, *larger* X and y arrays
for train_idx, val_idx in sss.split(X_train_windowed, y_train_windowed):
    X_train_split = X_train_windowed[train_idx]
    y_train_split = y_train_windowed[train_idx]
    X_val_split = X_train_windowed[val_idx]
    y_val_split = y_train_windowed[val_idx]

print(f"  X_train_split: {X_train_split.shape}")
print(f"  y_train_split: {y_train_split.shape}")
print(f"  X_val_split:   {X_val_split.shape}")
print(f"  y_val_split:   {y_val_split.shape}")
# --- 3. Scale Features (The "No-Cheating" Rule) ---
scaler = StandardScaler()
ns, ts, f = X_train_split.shape
X_train_2d = X_train_split.reshape(ns * ts, f)
print(f"Fitting Scaler on X_train_2d shape: {X_train_2d.shape}")
scaler.fit(X_train_2d)

X_train_scaled_2d = scaler.transform(X_train_2d)
X_train_scaled = X_train_scaled_2d.reshape(ns, ts, f)

ns_val, ts_val, f_val = X_val_split.shape
X_val_2d = X_val_split.reshape(ns_val * ts_val, f_val)
X_val_scaled_2d = scaler.transform(X_val_2d)
X_val_scaled = X_val_scaled_2d.reshape(ns_val, ts_val, f_val)

# We will scale the X_test data LATER, during the submission phase
# This ensures our K-Fold models use the correct scaler for their fold
# For now, we only need the HPO data scaled.
print("Scaling complete for HPO data.")
print(f"  X_train_scaled: {X_train_scaled.shape}")
print(f"  X_val_scaled:   {X_val_scaled.shape}")

del X_train_2d, X_val_2d, X_train_scaled_2d, X_val_scaled_2d

Labels encoded. 3 classes: {'no_pain': 0, 'low_pain': 1, 'high_pain': 2}
--- Applying sliding window augmentation ---
Original X shape: (661, 160, 35)
Windowed X shape: (3305, 80, 35)
Original y shape: (661,)
Windowed y shape: (3305,)

--- Splitting windowed data ---
  X_train_split: (2644, 80, 35)
  y_train_split: (2644,)
  X_val_split:   (661, 80, 35)
  y_val_split:   (661,)
Fitting Scaler on X_train_2d shape: (211520, 35)
Scaling complete for HPO data.
  X_train_scaled: (2644, 80, 35)
  X_val_scaled:   (661, 80, 35)


## üöö **4. PyTorch DataLoaders (for HPO)**

This section prepares the DataLoaders for the Ray Tune HPO phase.

In [16]:
# --- 1. Convert to Tensors ---
train_features = torch.from_numpy(X_train_scaled).float()
train_targets = torch.from_numpy(y_train_split).long()

val_features = torch.from_numpy(X_val_scaled).float()
val_targets = torch.from_numpy(y_val_split).long()

# test_features = torch.from_numpy(X_test_scaled).float() # We do this later

# --- 2. Create TensorDatasets ---
train_ds = TensorDataset(train_features, train_targets)
val_ds = TensorDataset(val_features, val_targets)
# test_ds = TensorDataset(test_features) # We do this later

# --- 3. Define make_loader function (from Lecture 4) ---
BATCH_SIZE = 128 # This will be our default, but Optuna can tune it

def make_loader(ds, batch_size, shuffle, drop_last):
    # Set num_workers=0 for Windows-friendly loading (from our debugging)
    num_workers = 0 
    
    # Create DataLoader with performance optimizations
    return DataLoader(
        ds,
        batch_size=int(batch_size), # Ensure batch_size is an int for the DataLoader
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        pin_memory=True,
        pin_memory_device="cuda" if torch.cuda.is_available() else "",
        prefetch_factor=None,
    )

# --- 4. Create DataLoaders ---
# We will create these *inside* the objective function now,
# as the batch size is a hyperparameter we want to tune.
print("DataLoaders will be created inside the tuning loop.")

# --- !! MODIFICATION !! ---
# We are KEEPING the numpy arrays for the K-Fold step later
print("Keeping HPO numpy arrays in memory for K-Fold training...")
# del X_train_scaled, X_val_scaled, test_features, train_features, val_features

DataLoaders will be created inside the tuning loop.
Keeping HPO numpy arrays in memory for K-Fold training...


## üõ†Ô∏è **5. Model & Training Engine**

These are the core components from Lecture 4, modified for Ray Tune and K-Fold.

-   `RecurrentClassifier`: Our flexible model (RNN, LSTM, GRU).
-   `train_one_epoch` / `validate_one_epoch`: Standard loops.
-   `objective_function`: The wrapper for Ray Tune's HPO.
-   `fit`: The **original** training loop from Lecture 4, **b-rought back** to handle our K-Fold training, complete with early stopping.

In [17]:
def recurrent_summary(model, input_size):
    """
    Custom summary function that emulates torchinfo's output while correctly
    counting parameters for RNN/GRU/LSTM layers.
    """

    output_shapes = {}
    hooks = []

    def get_hook(name):
        def hook(module, input, output):
            if isinstance(output, tuple):
                shape1 = list(output[0].shape)
                shape1[0] = -1  # Replace batch dimension with -1

                if isinstance(output[1], tuple):  # LSTM case: (h_n, c_n)
                    shape2 = list(output[1][0].shape)
                else:  # RNN/GRU case: h_n only
                    shape2 = list(output[1].shape)
                shape2[1] = -1
                output_shapes[name] = f"[{shape1}, {shape2}]"
            else:
                shape = list(output.shape)
                shape[0] = -1
                output_shapes[name] = f"{shape}"
        return hook

    try:
        device_summary = next(model.parameters()).device
    except StopIteration:
        device_summary = torch.device("cpu")

    dummy_input = torch.randn(1, *input_size).to(device_summary)

    for name, module in model.named_children():
        if isinstance(module, (nn.Linear, nn.RNN, nn.GRU, nn.LSTM)):
            hook_handle = module.register_forward_hook(get_hook(name))
            hooks.append(hook_handle)

    model.eval()
    with torch.no_grad():
        try:
            model(dummy_input)
        except Exception as e:
            print(f"Error during dummy forward pass: {e}")
            for h in hooks:
                h.remove()
            return

    for h in hooks:
        h.remove()

    print("-" * 79)
    print(f"{'Layer (type)':<25} {'Output Shape':<28} {'Param #':<18}")
    print("=" * 79)

    total_params = 0
    total_trainable_params = 0

    for name, module in model.named_children():
        if name in output_shapes:
            module_params = sum(p.numel() for p in module.parameters())
            trainable_params = sum(p.numel() for p in module.parameters() if p.requires_grad)

            total_params += module_params
            total_trainable_params += trainable_params

            layer_name = f"{name} ({type(module).__name__})"
            output_shape_str = str(output_shapes[name])
            params_str = f"{trainable_params:,}"

            print(f"{layer_name:<25} {output_shape_str:<28} {params_str:<15}")

    print("=" * 79)
    print(f"Total params: {total_params:,}")
    print(f"Trainable params: {total_trainable_params:,}")
    print(f"Non-trainable params: {total_params - total_trainable_params:,}")
    print("-" * 79)

In [18]:
class RecurrentClassifier(nn.Module):
    """
    Generic RNN classifier (RNN, LSTM, GRU) from Lecture 4.
    Uses the last hidden state for classification.
    """
    def __init__(
            self,
            input_size,
            hidden_size,
            num_layers,
            num_classes,
            rnn_type='GRU',        # 'RNN', 'LSTM', or 'GRU'
            bidirectional=False,
            dropout_rate=0.2
            ):
        super().__init__()

        self.rnn_type = rnn_type
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional

        rnn_map = {
            'RNN': nn.RNN,
            'LSTM': nn.LSTM,
            'GRU': nn.GRU
        }

        if rnn_type not in rnn_map:
            raise ValueError("rnn_type must be 'RNN', 'LSTM', or 'GRU'")

        rnn_module = rnn_map[rnn_type]

        # Dropout is only applied between layers (if num_layers > 1)
        dropout_val = dropout_rate if num_layers > 1 else 0

        self.rnn = rnn_module(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,       # Input shape: (batch, seq_len, features)
            bidirectional=bidirectional,
            dropout=dropout_val
        )

        if self.bidirectional:
            classifier_input_size = hidden_size * 2 # Concat fwd + bwd
        else:
            classifier_input_size = hidden_size

        self.classifier = nn.Linear(classifier_input_size, num_classes)

    def forward(self, x):
        """
        x shape: (batch_size, seq_length, input_size)
        """
        rnn_out, hidden = self.rnn(x)

        if self.rnn_type == 'LSTM':
            hidden = hidden[0]

        if self.bidirectional:
            hidden = hidden.view(self.num_layers, 2, -1, self.hidden_size)
            hidden_to_classify = torch.cat([hidden[-1, 0, :, :], hidden[-1, 1, :, :]], dim=1)
        else:
            hidden_to_classify = hidden[-1]

        logits = self.classifier(hidden_to_classify)
        return logits

In [19]:
# This cell contains all our training functions

def train_one_epoch(model, train_loader, criterion, optimizer, scaler, device, l1_lambda=0, l2_lambda=0):
    model.train()
    running_loss = 0.0
    all_predictions = []
    all_targets = []

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad(set_to_none=True)

        with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
            logits = model(inputs)
            loss = criterion(logits, targets)
            l1_norm = sum(p.abs().sum() for p in model.parameters())
            l2_norm = sum(p.pow(2).sum() for p in model.parameters())
            loss = loss + l1_lambda * l1_norm + l2_lambda * l2_norm

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item() * inputs.size(0)
        predictions = logits.argmax(dim=1)
        all_predictions.append(predictions.cpu().numpy())
        all_targets.append(targets.cpu().numpy())

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_f1 = f1_score(
        np.concatenate(all_targets),
        np.concatenate(all_predictions),
        average='weighted'
    )
    return epoch_loss, epoch_f1

def validate_one_epoch(model, val_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                logits = model(inputs)
                loss = criterion(logits, targets)

            running_loss += loss.item() * inputs.size(0)
            predictions = logits.argmax(dim=1)
            all_predictions.append(predictions.cpu().numpy())
            all_targets.append(targets.cpu().numpy())

    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_f1 = f1_score(
        np.concatenate(all_targets),
        np.concatenate(all_predictions),
        average='weighted'
    )
    return epoch_loss, epoch_f1

def log_metrics_to_tensorboard(writer, epoch, train_loss, train_f1, val_loss, val_f1, model):
    writer.add_scalar('Loss/Training', train_loss, epoch)
    writer.add_scalar('Loss/Validation', val_loss, epoch)
    writer.add_scalar('F1/Training', train_f1, epoch)
    writer.add_scalar('F1/Validation', val_f1, epoch)


# This is the objective_function for Ray Tune HPO
def objective_function(config, train_ds, val_ds):
    """
    This is the main function that Ray Tune will call for each trial.
    'config' is a dictionary of hyperparameters from Optuna.
    'train_ds' and 'val_ds' are our TensorDatasets.
    """
    
    # --- 1. Create DataLoaders with the tuned batch size ---
    train_loader = make_loader(train_ds, batch_size=config["batch_size"], shuffle=True, drop_last=True)
    val_loader = make_loader(val_ds, batch_size=config["batch_size"], shuffle=False, drop_last=False)
    
    # --- 2. Create Model --- 
    model = RecurrentClassifier(
        input_size=N_FEATURES,
        hidden_size=config["hidden_size"],
        num_layers=config["num_layers"],
        num_classes=N_CLASSES,
        dropout_rate=config["dropout_rate"],
        bidirectional=config["bidirectional"],
        rnn_type=config["rnn_type"]
    ).to(device)
    
    if torch.__version__[0] >= "2":
        model = torch.compile(model)
    
    # --- 3. Create Optimizer, Loss, Scaler ---
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["l2_lambda"])
    scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))

    # --- 4. The Training Loop (adapted from fit) ---
    # We loop for a fixed number of epochs (e.g., 200) and let Ray's
    # ASHA scheduler handle early stopping of bad trials.
    EPOCHS = 200 
    
    for epoch in range(1, EPOCHS + 1):
        train_loss, train_f1 = train_one_epoch(
            model, train_loader, criterion, optimizer, scaler, device, 0, config["l2_lambda"]
        )

        val_loss, val_f1 = validate_one_epoch(
            model, val_loader, criterion, device
        )
        
        # --- Send Results to Ray Tune --- 
        tune.report({
            "train_loss": train_loss,
            "train_f1": train_f1,
            "val_loss": val_loss,
            "val_f1": val_f1
        })

# --- !! MODIFICATION !! ---
# This is the original 'fit' function from Lecture 4, brought back.
# We will use THIS function for our K-Fold training, as it has early stopping.
def fit(model, train_loader, val_loader, epochs, criterion, optimizer, scaler, device,
        l1_lambda=0, l2_lambda=0, patience=0, evaluation_metric="val_f1", mode='max',
        restore_best_weights=True, writer=None, verbose=10, experiment_name=""):
    
    training_history = {
        'train_loss': [], 'val_loss': [],
        'train_f1': [], 'val_f1': []
    }
    
    model_path = f"models/{experiment_name}_best_model.pt"

    if patience > 0:
        patience_counter = 0
        best_metric = float('-inf') if mode == 'max' else float('inf')
        best_epoch = 0

    print(f"--- Starting Training: {experiment_name} ---")
    print(f"Will train for {epochs} epochs with patience={patience} monitoring {evaluation_metric}")

    for epoch in range(1, epochs + 1):
        train_loss, train_f1 = train_one_epoch(
            model, train_loader, criterion, optimizer, scaler, device, l1_lambda, l2_lambda
        )

        val_loss, val_f1 = validate_one_epoch(
            model, val_loader, criterion, device
        )

        training_history['train_loss'].append(train_loss)
        training_history['val_loss'].append(val_loss)
        training_history['train_f1'].append(train_f1)
        training_history['val_f1'].append(val_f1)

        if writer is not None:
            log_metrics_to_tensorboard(
                writer, epoch, train_loss, train_f1, val_loss, val_f1, model
            )

        if verbose > 0 and (epoch % verbose == 0 or epoch == 1):
            print(f"Epoch {epoch:3d}/{epochs} | "
                  f"Train: Loss={train_loss:.4f}, F1={train_f1:.4f} | "
                  f"Val: Loss={val_loss:.4f}, F1={val_f1:.4f}")

        if patience > 0:
            current_metric = training_history[evaluation_metric][-1]
            is_improvement = (current_metric > best_metric) if mode == 'max' else (current_metric < best_metric)

            if is_improvement:
                best_metric = current_metric
                best_epoch = epoch
                torch.save(model.state_dict(), model_path)
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"\nEarly stopping triggered after {epoch} epochs.")
                    break

    if restore_best_weights and patience > 0:
        print(f"Restoring best model from epoch {best_epoch} with {evaluation_metric} {best_metric:.4f}")
        model.load_state_dict(torch.load(model_path))

    if patience == 0:
        print("Training complete. Saving final model.")
        torch.save(model.state_dict(), model_path.replace("_best_model.pt", "_final_model.pt"))

    if writer is not None:
        writer.close()
    
    print(f"--- Finished Training: {experiment_name} ---")
    return model, training_history, best_epoch if 'best_epoch' in locals() else epochs

## üß™ **6. Phase 1: Hyperparameter Search**

This cell is identical to before. We run HPO on our single 80/20 split to find a good `FINAL_CONFIG` to use for our K-Fold training.

In [20]:
# --- 1. Define the Search Space for Optuna ---
search_space = {
    "rnn_type": tune.choice(['GRU', 'LSTM']),
    "lr": tune.loguniform(1e-5, 1e-2),           # Widen the learning rate
    "batch_size": tune.choice([64, 128, 256]),  
    "hidden_size": tune.choice([128, 256, 384]),# Let's try bigger models
    "num_layers": tune.choice([2, 3]),       # Let's try deeper models
    "dropout_rate": tune.uniform(0.1, 0.6),     # Widen the dropout range
    "bidirectional": tune.choice([True, False]),
    "l2_lambda": tune.loguniform(1e-7, 1e-3)      # Widen the L2 range
}

# --- 2. Define the Optimizer (Optuna) and Scheduler (ASHA) ---
optuna_search = OptunaSearch(
    metric="val_f1",
    mode="max"
)

scheduler = ASHAScheduler(
    metric="val_f1",
    mode="max",
    grace_period=20,  # Min epochs a trial must run
    reduction_factor=2  # How aggressively to stop trials
)

# --- 3. Initialize Ray (WITH THE BIG HAMMER FIX) ---
if ray.is_initialized():
    ray.shutdown()

ray_logs_path = os.path.abspath("./ray_results")
os.makedirs(ray_logs_path, exist_ok=True)
os.environ["RAY_TEMP_DIR"] = ray_logs_path

ray.init(
    num_cpus=16, 
    num_gpus=1, 
    ignore_reinit_error=True
)

def short_trial_name(trial):
    """Creates a short, unique name for each trial folder."""
    return f"{trial.trainable_name}_{trial.trial_id}"


# --- 4. Run the Tuner ---
print("Starting hyperparameter search (1 trial at a time)...")

analysis = tune.run(
    tune.with_parameters(objective_function, train_ds=train_ds, val_ds=val_ds),
    resources_per_trial={"cpu": 4, "gpu": 0.25}, 
    config=search_space,
    num_samples=20, # Number of different HPO trials to run
    search_alg=optuna_search,
    scheduler=scheduler,
    name="pirate_pain_optuna_search",
    storage_path=ray_logs_path,
    trial_dirname_creator=short_trial_name,
    log_to_file=True,
    verbose=1 # 0 = quiet, 1 = table, 2 = detailed
)

print("\n--- Search Complete ---")

# --- 5. Get Best Results (FIX 3) ---
print("Getting best trial from analysis...")
best_trial = analysis.get_best_trial(metric="val_f1", mode="max", scope="all")
if best_trial:
    best_config = best_trial.config
    best_val_f1 = best_trial.last_result["val_f1"]
    
    print(f"Best validation F1 score: {best_val_f1:.4f}")
    print("Best hyperparameters found:")
    print(best_config)
else:
    print("ERROR: No trials completed successfully. Check the 'ray_results' folder for logs.")
    best_config = None # Handle the case where all trials failed

0,1
Current time:,2025-11-08 22:43:58
Running for:,00:10:28.79
Memory:,9.8/13.9 GiB

Trial name,status,loc,batch_size,bidirectional,dropout_rate,hidden_size,l2_lambda,lr,num_layers,rnn_type,iter,total time (s),train_loss,train_f1,val_loss
objective_function_dd0b15d7,TERMINATED,127.0.0.1:6696,256,True,0.37759,384,3.51052e-05,2.57749e-05,2,GRU,100,116.595,0.229354,0.938132,0.170618
objective_function_405329c7,TERMINATED,127.0.0.1:46992,64,False,0.500568,384,2.87899e-05,0.00395307,2,LSTM,100,248.193,0.798879,0.944063,0.193177
objective_function_e8dbb73e,TERMINATED,127.0.0.1:52396,64,True,0.54854,384,2.57871e-07,0.00349848,3,LSTM,20,143.817,0.205152,0.938113,0.223358
objective_function_142bf406,TERMINATED,127.0.0.1:27616,64,True,0.490094,384,1.14063e-05,0.000228306,2,GRU,100,397.04,0.0276794,0.998476,0.0618401
objective_function_eaae3216,TERMINATED,127.0.0.1:51840,128,False,0.241778,384,4.51583e-06,0.00122211,2,LSTM,100,134.977,0.0356843,0.994916,0.0666771
objective_function_74ce3293,TERMINATED,127.0.0.1:6708,128,False,0.141629,256,0.000728551,0.00161677,3,GRU,100,163.886,0.119777,0.980728,0.0809032
objective_function_4feb27ba,TERMINATED,127.0.0.1:8436,256,False,0.258338,128,3.72887e-06,0.00429286,2,GRU,100,36.2272,0.00352432,1.0,0.09291
objective_function_954ea5f5,TERMINATED,127.0.0.1:40944,64,False,0.528355,384,0.000319292,0.000765373,2,GRU,80,168.726,0.123499,0.983529,0.109776
objective_function_935aae6a,TERMINATED,127.0.0.1:45928,64,False,0.593239,128,3.2435e-07,0.000844423,3,LSTM,20,31.0759,0.21506,0.93011,0.23864
objective_function_50fdc555,TERMINATED,127.0.0.1:20616,64,True,0.406444,384,0.000438249,0.0066905,3,LSTM,20,118.452,10.362,0.698501,0.840256


[36m(pid=gcs_server)[0m [2025-11-08 22:33:48,102 E 40516 6752] (gcs_server.exe) gcs_server.cc:302: Failed to establish connection to the event+metrics exporter agent. Events and metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
[33m(raylet)[0m [2025-11-08 22:33:52,235 E 49856 11732] (raylet.exe) main.cc:975: Failed to establish connection to the metrics exporter agent. Metrics will not be exported. Exporter agent status: RpcError: Running out of retries to initialize the metrics agent. rpc_code: 14
2025-11-08 22:43:58,622	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'c:/Users/Karim Negm/Documents/AN2DL/Challenge 1/ray_results/pirate_pain_optuna_search' in 0.0404s.
2025-11-08 22:43:58,645	INFO tune.py:1041 -- Total run time: 628.86 seconds (628.74 seconds for the tuning loop).



--- Search Complete ---
Getting best trial from analysis...
Best validation F1 score: 0.9789
Best hyperparameters found:
{'rnn_type': 'GRU', 'lr': 0.00022830551615609333, 'batch_size': 64, 'hidden_size': 384, 'num_layers': 2, 'dropout_rate': 0.4900942505159511, 'bidirectional': True, 'l2_lambda': 1.1406282234266435e-05}


## üèÜ **7. Final Model Configuration**

This cell now holds our winning configuration, ready for the K-Fold training.

In [21]:
# ===================================================================
# --- üèÜ FINAL MODEL CONFIGURATION üèÜ ---
# ===================================================================
FINAL_CONFIG = best_config
FINAL_BEST_VAL_F1 = best_val_f1

print("--- üèÜ Final Configuration Set --- ")
print(f"Best Val F1 from HPO search: {FINAL_BEST_VAL_F1:.4f}")
print(FINAL_CONFIG)

# --- Set variables for the K-Fold & submission cells ---
FINAL_MODEL_TYPE = FINAL_CONFIG["rnn_type"]
FINAL_HIDDEN_SIZE = FINAL_CONFIG["hidden_size"]
FINAL_HIDDEN_LAYERS = FINAL_CONFIG["num_layers"]
FINAL_BIDIRECTIONAL = FINAL_CONFIG["bidirectional"]
FINAL_DROPOUT_RATE = FINAL_CONFIG["dropout_rate"]
FINAL_LEARNING_RATE = FINAL_CONFIG["lr"]
FINAL_L2_LAMBDA = FINAL_CONFIG["l2_lambda"]
FINAL_BATCH_SIZE = FINAL_CONFIG["batch_size"]

FINAL_EXPERIMENT_NAME = f"{FINAL_MODEL_TYPE}_H{FINAL_HIDDEN_SIZE}_L{FINAL_HIDDEN_LAYERS}_B{FINAL_BIDIRECTIONAL}_Optuna_KFold_Ensemble"

print(f"Submission name will be: submission_{FINAL_EXPERIMENT_NAME}.csv")

--- üèÜ Final Configuration Set --- 
Best Val F1 from HPO search: 0.9789
{'rnn_type': 'GRU', 'lr': 0.00022830551615609333, 'batch_size': 64, 'hidden_size': 384, 'num_layers': 2, 'dropout_rate': 0.4900942505159511, 'bidirectional': True, 'l2_lambda': 1.1406282234266435e-05}
Submission name will be: submission_GRU_H384_L2_BTrue_Optuna_KFold_Ensemble.csv


## üõ°Ô∏è **8. Phase 2: K-Fold Ensemble Training**

This is the **NEW** robust training step.

Instead of finding one "best epoch," we will train 5 separate models on 5 different splits (folds) of the data. We use the `FINAL_CONFIG` from our HPO search as the configuration for all 5 models.

We use the `fit` function's early stopping to find the best version of each model and save its weights (e.g., `kfold_fold_1_best_model.pt`, `kfold_fold_2_best_model.pt`, etc.).

In [22]:
# --- !! NEW CELL: K-FOLD TRAINING !! ---

from sklearn.model_selection import StratifiedKFold
import numpy as np

N_SPLITS = 5  # 5-fold CV is a standard
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# --- 1. Reconstruct the full (windowed) training set ---
# We combine the HPO train/val splits to get our full dataset back
X_full_windowed = np.concatenate((X_train_scaled, X_val_scaled), axis=0)
y_full_windowed = np.concatenate((y_train_split, y_val_split), axis=0)

print(f"--- Starting {N_SPLITS}-Fold CV Training ---")
print(f"Full windowed training data shape: {X_full_windowed.shape}")

fold_val_f1_list = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_full_windowed, y_full_windowed)):
    fold_name = f"kfold_fold_{fold+1}"
    print(f"\n--- Fold {fold+1}/{N_SPLITS} --- ({fold_name}) ---")
    
    # --- 2. Create datasets for this fold ---
    X_train_fold = torch.from_numpy(X_full_windowed[train_idx]).float()
    y_train_fold = torch.from_numpy(y_full_windowed[train_idx]).long()
    X_val_fold = torch.from_numpy(X_full_windowed[val_idx]).float()
    y_val_fold = torch.from_numpy(y_full_windowed[val_idx]).long()

    train_ds_fold = TensorDataset(X_train_fold, y_train_fold)
    val_ds_fold = TensorDataset(X_val_fold, y_val_fold)
    
    train_loader_fold = make_loader(train_ds_fold, batch_size=FINAL_BATCH_SIZE, shuffle=True, drop_last=True)
    val_loader_fold = make_loader(val_ds_fold, batch_size=FINAL_BATCH_SIZE, shuffle=False, drop_last=False)
    
    # --- 3. Create a fresh model (using FINAL_CONFIG) ---
    model_fold = RecurrentClassifier(
        input_size=N_FEATURES,
        hidden_size=FINAL_HIDDEN_SIZE,
        num_layers=FINAL_HIDDEN_LAYERS,
        num_classes=N_CLASSES,
        dropout_rate=FINAL_DROPOUT_RATE,
        bidirectional=FINAL_BIDIRECTIONAL,
        rnn_type=FINAL_MODEL_TYPE
    ).to(device)
    
    if torch.__version__[0] >= "2": 
        model_fold = torch.compile(model_fold)

    optimizer_fold = torch.optim.AdamW(model_fold.parameters(), lr=FINAL_LEARNING_RATE, weight_decay=FINAL_L2_LAMBDA)
    scaler_fold = torch.amp.GradScaler(enabled=(device.type == 'cuda'))
    criterion_fold = nn.CrossEntropyLoss()
    
    # --- 4. Train this fold with early stopping ---
    # The 'fit' function will find the best epoch and save the model
    # to 'models/kfold_fold_{fold+1}_best_model.pt'
    model_fold, _, _ = fit(
        model=model_fold,
        train_loader=train_loader_fold,
        val_loader=val_loader_fold,
        epochs=300, # High epoch count, patience will stop it
        criterion=criterion_fold,
        optimizer=optimizer_fold,
        scaler=scaler_fold,
        device=device,
        writer=None, # No need to log to tensorboard
        verbose=25,
        experiment_name=fold_name, # This saves the model with a unique name
        patience=30 # 30-epoch patience
    )
    
    # --- 5. (Optional) Check this fold's final F1 score ---
    val_loss, val_f1 = validate_one_epoch(model_fold, val_loader_fold, criterion_fold, device)
    fold_val_f1_list.append(val_f1)
    print(f"Fold {fold+1} Best Model Val F1: {val_f1:.4f}")

print(f"\n--- üèÜ K-Fold Training Complete ---")
print(f"Fold F1 scores: {[round(f, 4) for f in fold_val_f1_list]}")
print(f"Average F1 across folds: {np.mean(fold_val_f1_list):.4f}")

--- Starting 5-Fold CV Training ---
Full windowed training data shape: (3305, 80, 35)

--- Fold 1/5 --- (kfold_fold_1) ---
--- Starting Training: kfold_fold_1 ---
Will train for 300 epochs with patience=30 monitoring val_f1
Epoch   1/300 | Train: Loss=0.6546, F1=0.7273 | Val: Loss=0.5451, F1=0.7644
Epoch  25/300 | Train: Loss=0.0194, F1=0.9950 | Val: Loss=0.1105, F1=0.9645
Epoch  50/300 | Train: Loss=0.0131, F1=0.9973 | Val: Loss=0.0917, F1=0.9631
Epoch  75/300 | Train: Loss=0.0020, F1=1.0000 | Val: Loss=0.0956, F1=0.9725
Epoch 100/300 | Train: Loss=0.0002, F1=1.0000 | Val: Loss=0.0992, F1=0.9787

Early stopping triggered after 100 epochs.
Restoring best model from epoch 70 with val_f1 0.9804
--- Finished Training: kfold_fold_1 ---
Fold 1 Best Model Val F1: 0.9804

--- Fold 2/5 --- (kfold_fold_2) ---
--- Starting Training: kfold_fold_2 ---
Will train for 300 epochs with patience=30 monitoring val_f1
Epoch   1/300 | Train: Loss=0.6439, F1=0.7524 | Val: Loss=0.5239, F1=0.7601
Epoch  25/3

## üì¨ **9. Phase 3: K-Fold Ensemble Submission**

This is the **NEW** robust submission step.

1.  Prepare the `X_test` data (scaling and windowing) exactly as before.
2.  Create the `test_loader`.
3.  Load each of our 5 saved fold-models.
4.  Get 5 sets of (softmax) probability predictions for the test set.
5.  Average these 5 probability sets into a single, robust probability matrix.
6.  Aggregate these mean probabilities (using mean) from windows to full samples.
7.  Take the `argmax` of the final aggregated probabilities to get the submission class.

In [23]:
# --- !! NEW CELL: K-FOLD ENSEMBLE SUBMISSION !! ---
from scipy.stats import mode 

print("\n--- Preparing full dataset for FINAL SCALER ---")

# --- 1. Prepare Full Training Set for Final Scaler ---
# We fit the scaler on ALL available training data (non-windowed)
scaler_final = StandardScaler()
ns, ts, f = X_train_full.shape
X_train_full_2d = X_train_full.reshape(ns * ts, f)

print(f"Fitting FINAL Scaler on X_train_full_2d shape: {X_train_full_2d.shape}")
scaler_final.fit(X_train_full_2d)

# --- 2. Scale and Window the TEST data ---
# Scale X_test using the scaler_final
ns_test, ts_test, f_test = X_test.shape
X_test_2d = X_test.reshape(ns_test * ts_test, f_test)
X_test_final_scaled_2d = scaler_final.transform(X_test_2d)
X_test_final_scaled = X_test_final_scaled_2d.reshape(ns_test, ts_test, f_test)

print("Final scaling of test set complete.")
print("--- Applying sliding windows to final test set ---")

X_test_final_windowed, test_window_indices = create_sliding_windows(
    X_test_final_scaled,
    y=None, # No labels for the test set
    window_size=NEW_WINDOW_SIZE,
    stride=NEW_STRIDE
)
print(f"Test windowed shape: {X_test_final_windowed.shape}")
print(f"Test window indices shape: {test_window_indices.shape}")

# --- 3. Create Final TestLoader ---
final_test_features = torch.from_numpy(X_test_final_windowed).float()
final_test_ds = TensorDataset(final_test_features) # No labels

def make_final_loader(ds, batch_size, shuffle, drop_last):
    return DataLoader(
        ds, batch_size=int(batch_size), shuffle=shuffle, drop_last=drop_last,
        num_workers=0, pin_memory=True, pin_memory_device="cuda", prefetch_factor=None
    )

test_loader = make_final_loader(final_test_ds, batch_size=FINAL_BATCH_SIZE, shuffle=False, drop_last=False)
print("Final TestLoader created.")

# --- 4. Get Predictions from all K-Fold Models ---
all_fold_probabilities = []
print(f"\n--- Generating predictions from {N_SPLITS} fold models ---")

for fold in range(N_SPLITS):
    fold_name = f"kfold_fold_{fold+1}"
    model_path = f"models/{fold_name}_best_model.pt"
    print(f"Loading model {fold+1}/{N_SPLITS} from {model_path}...")

    # 1. Create a fresh model shell
    model_fold = RecurrentClassifier(
        input_size=N_FEATURES,
        hidden_size=FINAL_HIDDEN_SIZE,
        num_layers=FINAL_HIDDEN_LAYERS,
        num_classes=N_CLASSES,
        dropout_rate=FINAL_DROPOUT_RATE,
        bidirectional=FINAL_BIDIRECTIONAL,
        rnn_type=FINAL_MODEL_TYPE
    ).to(device)
    
    # 2. Load the saved weights (no compilation needed for eval)
    # --- FIX: Load state_dict and clean keys ---
    # The saved model was compiled, so keys have a '_orig_mod.' prefix.
    # We must remove this prefix to load into a non-compiled model.
    state_dict = torch.load(model_path, map_location=device)
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith('_orig_mod.'):
            new_state_dict[k[len('_orig_mod.'):]] = v # Remove the prefix
        else:
            new_state_dict[k] = v
    
    model_fold.load_state_dict(new_state_dict) # Load the cleaned state_dict
    # --- END FIX ---
    model_fold.eval()

    fold_predictions = []
    with torch.no_grad():
        for (inputs,) in test_loader: 
            inputs = inputs.to(device)
            with torch.amp.autocast(device_type=device.type, enabled=(device.type == 'cuda')):
                logits = model_fold(inputs)
                # --- Get SOFTMAX probabilities, not argmax ---
                probs = torch.softmax(logits, dim=1)
                fold_predictions.append(probs.cpu().numpy())
    
    all_fold_probabilities.append(np.concatenate(fold_predictions))

# --- 5. Average the Probabilities ---
# 'all_fold_probabilities' is a list of 5 arrays, each of shape (n_windows, n_classes)
print(f"\nAveraging {len(all_fold_probabilities)} sets of probabilities...")
# Stack and average across the first dimension (the folds)
mean_probabilities = np.mean(all_fold_probabilities, axis=0)
print(f"Mean probability matrix shape: {mean_probabilities.shape}") # (n_windows, 3)

# --- 6. Aggregate Mean Probabilities (Mean) ---
print("Aggregating window probabilities to sample predictions...")

# Use pandas for easy aggregation
prob_cols = [f"prob_{i}" for i in range(N_CLASSES)]
df_probs = pd.DataFrame(mean_probabilities, columns=prob_cols)
df_probs['original_index'] = test_window_indices # Map from window to original sample

# Group by the original sample index and find the MEAN probability
agg_probs = df_probs.groupby('original_index')[prob_cols].mean().values

# 'agg_probs' now has one probability vector per original sample (shape 1324, 3)
print(f"Aggregated to {len(agg_probs)} final probability vectors.")

# --- NOW get the final class by taking argmax of the mean probabilities ---
final_predictions_numeric = np.argmax(agg_probs, axis=1)

# Inverse transform these aggregated predictions to labels
predicted_labels = le.inverse_transform(final_predictions_numeric)

# --- 7. Save Submission File ---
print("Loading sample submission file for correct formatting...")
X_test_long = pd.read_csv(X_TEST_PATH)
test_sample_indices = sorted(X_test_long['sample_index'].unique())

if len(predicted_labels) != len(test_sample_indices):
    print(f"ERROR: Prediction count mismatch! Predictions: {len(predicted_labels)}, Test Indices: {len(test_sample_indices)}")
else:
    print("Prediction count matches. Creating submission.")
    
    final_submission_df = pd.DataFrame({
        'sample_index': test_sample_indices,
        'label': predicted_labels 
    })
    
    final_submission_df['sample_index'] = final_submission_df['sample_index'].apply(lambda x: f"{x:03d}")

    SUBMISSIONS_DIR = "submissions"
    os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
    
    submission_filename = f"submission_{FINAL_EXPERIMENT_NAME}_w{NEW_WINDOW_SIZE}_s{NEW_STRIDE}.csv"
    submission_filepath = os.path.join(SUBMISSIONS_DIR, submission_filename)
    
    final_submission_df.to_csv(submission_filepath, index=False)

    print(f"\nSuccessfully saved to {submission_filepath}!")
    print("This file is correctly formatted for Kaggle:")
    print(final_submission_df.head())

del all_fold_probabilities, final_test_features, final_test_ds, test_loader


--- Preparing full dataset for FINAL SCALER ---
Fitting FINAL Scaler on X_train_full_2d shape: (105760, 35)
Final scaling of test set complete.
--- Applying sliding windows to final test set ---
Test windowed shape: (6620, 80, 35)
Test window indices shape: (6620,)
Final TestLoader created.

--- Generating predictions from 5 fold models ---
Loading model 1/5 from models/kfold_fold_1_best_model.pt...
Loading model 2/5 from models/kfold_fold_2_best_model.pt...
Loading model 3/5 from models/kfold_fold_3_best_model.pt...
Loading model 4/5 from models/kfold_fold_4_best_model.pt...
Loading model 5/5 from models/kfold_fold_5_best_model.pt...

Averaging 5 sets of probabilities...
Mean probability matrix shape: (6620, 3)
Aggregating window probabilities to sample predictions...
Aggregated to 1324 final probability vectors.
Loading sample submission file for correct formatting...
Prediction count matches. Creating submission.

Successfully saved to submissions\submission_GRU_H384_L2_BTrue_Optun