# Training & inference notebook
Credit to [Tarun Mishra](https://www.kaggle.com/tarundirector) – this code is heavily based on his [notebook](https://www.kaggle.com/code/tarundirector/sensor-pulse-viz-eda-for-bfrb-detection?scriptVersionId=243465321).

## Setup

### imports

In [1]:
import re
import os
import gc
import json 
import math
import shutil
import random
import warnings
from glob import glob
from os.path import join
from functools import partial
from datetime import datetime
from tqdm.notebook import tqdm
from collections import Counter
from operator import methodcaller
from os.path import join, realpath
from typing import Optional, Literal
from typing import Optional, Literal, Iterator
from itertools import pairwise, starmap, product

import torch
import optuna
import numpy as np
import pandas as pd
import polars as pl
from numpy import ndarray
from torch import nn, Tensor
from numpy.linalg import norm
import torch.nn.functional as F
from torch.optim import Optimizer
from pandas import DataFrame as DF
from optuna.trial import TrialState
from sklearn.metrics import f1_score
from kagglehub import competition_download
from torch.utils.data import TensorDataset
from scipy.spatial.transform import Rotation
import kaggle_evaluation.cmi_inference_server
from torch.utils.data import DataLoader as DL
from sklearn.model_selection import GroupKFold
from rich.progress import Progress, Task, track
from sklearn.model_selection import train_test_split
from numpy.lib.stride_tricks import sliding_window_view
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.utils.class_weight import compute_class_weight
from torch.optim.lr_scheduler import ConstantLR, LRScheduler, _LRScheduler

### Configs

In [2]:
# Dataset
COMPETITION_HANDLE = "cmi-detect-behavior-with-sensor-data"
TARGET_NAMES = sorted([
    "Above ear - pull hair",
    "Cheek - pinch skin",
    "Eyebrow - pull hair",
    "Eyelash - pull hair",
    "Feel around in tray and pull out an object",
    "Forehead - pull hairline",
    "Forehead - scratch",
    "Neck - pinch skin",
    "Neck - scratch",
    "Text on phone",
    "Wave hello",
    "Write name in air",
    "Write name on leg",
    "Drink from bottle/cup",
    "Pinch knee/leg skin",
    "Pull air toward your face",
    "Scratch knee/leg skin",
    "Glasses on/off"
])
BFRB_GESTURES = [
    'Above ear - pull hair',
    'Forehead - pull hairline',
    'Forehead - scratch',
    'Eyebrow - pull hair',
    'Eyelash - pull hair',
    'Neck - pinch skin',
    'Neck - scratch',
    'Cheek - pinch skin'
]
BFRB_INDICES = [idx for idx, gesture in enumerate(TARGET_NAMES) if gesture in BFRB_GESTURES]
IMU_FEATS_PREFIXES = (
    "acc",
    "linear_acc",
    "rot",
    "angular",
    "euler",
    "quat_rot_mag",
    "delta_rot_mag",
)
QUATERNION_COLS = ['rot_w', 'rot_x', 'rot_y', 'rot_z']
GRAVITY_WORLD = np.array([0, 0, 9.81], "float32")
RAW_ACCELRATION_COLS = ["acc_x", "acc_y", "acc_z"]
LINEAR_ACC_COLS = ["linear_" + col for col in RAW_ACCELRATION_COLS] # Acceleration without gravity
COMPETITION_HANDLE = "cmi-detect-behavior-with-sensor-data"
CATEGORY_COLUMNS = [
    'row_id',
    'sequence_type',
    'sequence_id',
    'subject',
    'orientation',
    'behavior',
    'phase',
    'gesture',
]
META_DATA_COLUMNS = [
    'row_id',
    'sequence_type',
    'sequence_id',
    'sequence_counter',
    'subject',
    'orientation',
    'behavior',
    'phase',
    'gesture',
]
DATASET_DF_DTYPES = {
    "acc_x": "float32", "acc_y": "float32", "acc_z": "float32",
    "thm_1":"float32", "thm_2":"float32", "thm_3":"float32", "thm_4":"float32", "thm_5":"float32",
    "sequence_counter": "int32",
    **{col: "category" for col in CATEGORY_COLUMNS},
    **{f"tof_{i_1}_v{i_2}": "float32" for i_1, i_2 in product(range(1, 5), range(64))},
}
PREPROCESSED_DATASET_HANDLE = "mauroabidalcarrer/prepocessed-cmi-2025"
# The quantile of the sequences len used to pad/truncate during preprocessing
SEQUENCE_NORMED_LEN_QUANTILE = 0.95
# SAMPLING_FREQUENCY = 10 #Hz
VALIDATION_FRACTION = 0.2
EPSILON=1e-8
DELTA_ROTATION_ANGULAR_VELOCITY_COLS = ["angular_vel_x", "angular_vel_y", "angular_vel_z"]
DELTA_ROTATION_AXES_COLS = ["rotation_axis_x", "rotation_axis_y", "rotation_axis_z"]
EULER_ANGLES_COLS = ["euler_x", "euler_y", "euler_z"]
pad_trunc_mode_type = Literal["pre", "center", "post"]
SEQ_PAD_TRUNC_MODE: pad_trunc_mode_type = "center"
DEFAULT_VERSION_NOTES = "Preprocessed Child Mind Institue 2025 competition preprocessed dataset."
NB_COLS_PER_TOF_SENSOR = 64
TOF_PATCH_SIZE = 2
assert ((NB_COLS_PER_TOF_SENSOR // 2) % TOF_PATCH_SIZE) == 0, "tof side len should be dividable by TOF_PATCH_SIZE!"
TOF_AGG_FUNCTIONS = [
    "mean",
    "std",
    "median",
    "min",
    "max",
]
# Data augmentation
JITTER = 0.25
SCALING = 0.2
MIXUP = 0.3
# Training loop
NB_CROSS_VALIDATIONS = 5
TRAIN_BATCH_SIZE = 256
VALIDATION_BATCH_SIZE = 4 * TRAIN_BATCH_SIZE
PATIENCE = 8
# Optimizer
WEIGHT_DECAY = 3e-3
# Scheduler
TRAINING_EPOCHS = 25 # Including warmup epochs
WARMUP_EPOCHS = 3
WARMUP_LR_INIT = 1.822126131809773e-05
MAX_TO_MIN_LR_DIV_FACTOR = 100
LR_CYCLE_FACTOR = 0.5
CYCLE_LENGTH_FACTOR = 0.9
INIT_CYCLE_EPOCHS = 6
# MIN_LR = 3.810323058740104e-09
# MAX_LR = 1e-3
# Mock training loop
MOCK_TRAINING_EPOCHS = 20
MOCK_TRAINING_GAMMA = 1.01
CHANNELS_DIMENSION = 1
SEED = 42

### Seed everything

In [3]:
def seed_everything(seed=42):
    """Set all random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    torch.use_deterministic_algorithms(True, warn_only=True)
seed_everything(seed=SEED)

### Supress performance warngings

In [4]:
warnings.filterwarnings(
    "ignore",
    message=(
        "DataFrame is highly fragmented.  This is usually the result of "
        "calling `frame.insert` many times.*"
    ),
    category=pd.errors.PerformanceWarning,
)

### device setup

In [5]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

## Dataset

### Preprocessing

In [6]:
def get_feature_cols(df:DF) -> list[str]:
    return sorted(list(set(df.columns) - set(META_DATA_COLUMNS) - set(TARGET_NAMES)))

# Missing ToF values are already imputed by -1 which is inconvinient since we want all missing values to be NaN.    
# So we replace them by NaN and then perform imputing.
def get_fillna_val_per_feature_col(df:DF) -> dict:
    return {col: 1.0 if col == 'rot_w' else 0 for col in get_feature_cols(df)}

def imputed_features(df:DF) -> DF:
    # Missing ToF values are already imputed by -1 which is inconvinient since we want all missing values to be NaN.    
    # So we replace them by NaN and then perform imputing.  
    tof_vals_to_nan = {col: -1.0 for col in df.columns if col.startswith("tof")}
    # fillna_val_per_col = {col: 1.0 if col == 'rot_w' else 0 for col in df.columns}

    df[get_feature_cols(df)] = (
        df
        .loc[:, get_feature_cols(df)]
        # df.replace with np.nan sets dtype to floar64 so we set it back to float32
        .replace(tof_vals_to_nan, value=np.nan)
        .astype("float32")
        .groupby(df["sequence_id"], observed=True, as_index=False)
        .ffill()
        .groupby(df["sequence_id"], observed=True, as_index=False)
        .bfill()
        # In case there are only nan in the column in the sequence
        .fillna(get_fillna_val_per_feature_col(df))
    )
    return df

def standardize_tof_cols_names(df: DF) -> DF:
    renamed_cols = {}
    pattern = re.compile(r"^(tof_\d_v)(\d)$")  # match 'tof_X_vY' where Y is a single digit

    for col in df.columns:
        match = pattern.match(col)
        if match:
            prefix, version = match.groups()
            new_col = f"{prefix}0{version}"
            renamed_cols[col] = new_col

    return df.rename(columns=renamed_cols)

def norm_quat_rotations(df:DF) -> DF:
    df[QUATERNION_COLS] /= np.linalg.norm(df[QUATERNION_COLS], axis=1, keepdims=True)
    return df

def add_linear_acc_cols(df:DF) -> DF:
    # Vectorized version of https://www.kaggle.com/code/wasupandceacar/lb-0-82-5fold-single-bert-model#Dataset `remove_gravity_from_acc`
    rotations:Rotation = Rotation.from_quat(df[QUATERNION_COLS])
    gravity_sensor_frame = rotations.apply(GRAVITY_WORLD, inverse=True).astype("float32")
    df[LINEAR_ACC_COLS] = df[RAW_ACCELRATION_COLS] - gravity_sensor_frame
    return df

def add_acc_magnitude(df:DF, acc_cols:list[str], acc_mag_col_name:str) -> DF:
    return df.assign(**{acc_mag_col_name: np.linalg.norm(df.loc[:, acc_cols], axis=1)})

def add_quat_angle_mag(df:DF) -> DF:
    return df.assign(quat_rot_mag=np.arccos(df["rot_w"]) * 2)

def add_angular_velocity_features(df:DF) -> DF:
    rotations = Rotation.from_quat(df[QUATERNION_COLS])
    delta_rotations = rotations[1:] * rotations[:-1].inv()
    delta_rot_velocity = delta_rotations.as_rotvec()
    # Add extra line to avoid shape mismatch
    delta_rot_velocity = np.vstack((np.zeros((1, 3)), delta_rot_velocity))
    delta_rot_magnitude = norm(delta_rot_velocity, axis=1, keepdims=True)
    delta_rot_axes = delta_rot_velocity / (delta_rot_magnitude + EPSILON)
    df[DELTA_ROTATION_ANGULAR_VELOCITY_COLS] = delta_rot_velocity
    df[DELTA_ROTATION_AXES_COLS] = delta_rot_axes
    df["delta_rot_mag"] = delta_rot_magnitude.squeeze()

    return df

def rot_euler_angles(df:DF) -> ndarray:
    df[EULER_ANGLES_COLS] = (
        Rotation
        .from_quat(df[QUATERNION_COLS])
        .as_euler("xyz")
        .squeeze()
    )
    return df

def agg_tof_patch(tof_views:np.ndarray, f_name:str) -> ndarray:
    views_agg_func = methodcaller(f_name, tof_views, axis=(1, 2))
    return (
        views_agg_func(np)
        .reshape(tof_views.shape[0], -1)
    )

def agg_tof_cols_per_sensor(df:DF) -> DF:
    """
    ## Description:
    Computes the sensor and patch sensor wise stats.
    ## Resturns:
    The dataframe with the added stats.
    """
    for tof_idx in tqdm(range(1, 6)):
        tof_name = f"tof_{tof_idx}"
        all_tof_cols = [f"{tof_name}_v{v_idx:02d}" for v_idx in range(64)]
        tof_feats = (
            df
            .loc[:, all_tof_cols]
            .values
            .reshape(-1, 8, 8)
        )
        agg_func = partial(df[all_tof_cols].agg, axis="columns")
        mk_fe_col_name = lambda f_name: tof_name + "_" + f_name
        engineered_feats = DF({mk_fe_col_name(f_name): agg_func(f_name) for f_name in TOF_AGG_FUNCTIONS})
        stats_cols_names = list(map(mk_fe_col_name, TOF_AGG_FUNCTIONS))
        # Patch Feature engineering
        tof_views:np.ndarray = sliding_window_view(tof_feats, (TOF_PATCH_SIZE, TOF_PATCH_SIZE), (1, 2))
        patch_fe = {}
        for f_name in TOF_AGG_FUNCTIONS:
            tof_patch_stats = agg_tof_patch(tof_views, f_name)
            for patch_idx in range(tof_patch_stats.shape[1]):
                key = mk_fe_col_name(f_name) + f"_{patch_idx:02d}"
                patch_fe[key] = tof_patch_stats[:, patch_idx]
        patch_df = DF(patch_fe)
        # concat results
        df = pd.concat(
            (
                df.drop(columns=filter(df.columns.__contains__, stats_cols_names)),
                engineered_feats,
                patch_df,
            ),
            axis="columns",
        )
    return df

def add_diff_features(df:DF) -> DF:
    return pd.concat(
        (
            df,
            (
                df
                .groupby("sequence_id", as_index=False, observed=True)
                [get_feature_cols(df)]
                .diff()
                .fillna(get_fillna_val_per_feature_col(df))
                .add_suffix("_diff")
            )
        ),
        axis="columns",
    )

def one_hot_encode_targets(df:DF) -> DF:
    one_hot_target = pd.get_dummies(df["gesture"])
    df[TARGET_NAMES] = one_hot_target[TARGET_NAMES]
    return df

def length_normed_sequence_feat_arr(
        sequence: DF,
        normed_sequence_len: int,
        SEQ_PAD_TRUNC_MODE:Literal["pre", "center", "post"]
    ) -> ndarray:
    features = (
        sequence
        .loc[:, get_feature_cols(sequence)]
        .values
    )
    len_diff = abs(normed_sequence_len - len(features))
    len_diff_h = len_diff // 2 # half len diff
    len_diff_r = len_diff % 2 # len diff remainder
    if len(features) < normed_sequence_len:
        padding_dict = {
            "pre": (len_diff, 0),
            "center": (len_diff_h + len_diff_r, len_diff_h),
            "post": (0, len_diff),
        }
        padded_features = np.pad(
            features,
            (padding_dict[SEQ_PAD_TRUNC_MODE], (0, 0)),
        )
        return padded_features
    elif len(features) > normed_sequence_len:
        truncating_dict = {
            "pre": slice(len_diff),
            "center": slice(len_diff_h, -len_diff_h),
            "post": slice(0, -len_diff),
        }
        return features[len_diff // 2:-len_diff // 2]
    else:
        return features

def df_to_ndarrays(df:DF, normed_sequence_len:int, seq_pad_trunc_mode:str) -> tuple[np.ndarray, np.ndarray]:
    sequence_it = df.groupby("sequence_id", observed=True, as_index=False)
    x = np.empty(
        shape=(len(sequence_it), normed_sequence_len, len(get_feature_cols(df))),
        dtype="float32"
    )
    y = np.empty(
        shape=(len(sequence_it), len(TARGET_NAMES)),
        dtype="float32"
    )
    for sequence_idx, (_, sequence) in tqdm(enumerate(sequence_it), total=len(sequence_it)):
        normed_seq_feat_arr = length_normed_sequence_feat_arr(sequence, normed_sequence_len, seq_pad_trunc_mode)
        x[sequence_idx] = normed_seq_feat_arr
        # Take the first value as they are(or at least should be) all the same in a single sequence
        y[sequence_idx] = sequence[TARGET_NAMES].iloc[0].values

    return x, y

def get_normed_seq_len(dataset:DF) -> int:
    return int(
        dataset
        .groupby("sequence_id", observed=True)
        .size()
        .quantile(SEQUENCE_NORMED_LEN_QUANTILE)
    )

def fold_dfs_to_ndarrays(train:DF, validation:DF, dataset_normed_seq_len:int, seq_pad_trunc_mode:str) -> tuple[ndarray, ndarray, ndarray, ndarray]:
    """
    Returns:
        (train X, train Y, validation X, validation Y)
    """
    # full_dataset_normed_seq_len = get_normed_seq_len(df)
    return (
        *df_to_ndarrays(train, dataset_normed_seq_len, seq_pad_trunc_mode),
        *df_to_ndarrays(validation, dataset_normed_seq_len, seq_pad_trunc_mode),
    )

In [7]:
def preprocess_competitino_dataset() -> DF:
    csv_path = competition_download(COMPETITION_HANDLE, path="train.csv")
    return (
        pd.read_csv(csv_path, dtype=DATASET_DF_DTYPES)
        .pipe(imputed_features)
        .pipe(standardize_tof_cols_names)
        .pipe(norm_quat_rotations)
        .pipe(add_linear_acc_cols)
        .pipe(add_acc_magnitude, RAW_ACCELRATION_COLS, "acc_mag")
        .pipe(add_acc_magnitude, LINEAR_ACC_COLS, "linear_acc_mag")
        .pipe(add_quat_angle_mag)
        .pipe(add_angular_velocity_features)
        .pipe(rot_euler_angles)
        .pipe(add_quat_angle_mag)
        .pipe(one_hot_encode_targets)
        .pipe(agg_tof_cols_per_sensor)
        .pipe(add_diff_features)
    )

def save_sequence_meta_data(df:DF) -> DF:
    seq_meta_data = (
        df
        .groupby("sequence_id", as_index=False, observed=True)
        [META_DATA_COLUMNS]
        .last()
    )
    seq_meta_data.to_parquet("preprocessed_dataset/sequences_meta_data.parquet")
    np.save(
        "preprocessed_dataset/auxialiary_Y.npy",
        pd.get_dummies(seq_meta_data["orientation"]).values,
    )

def save_df_meta_data(df:DF):
    full_dataset_meta_data = {
        "mean": df[get_feature_cols(df)].mean().astype("float32").to_dict(),
        "std": df[get_feature_cols(df)].std().astype("float32").to_dict(),
        "pad_seq_len": get_normed_seq_len(df),
        "feature_cols": get_feature_cols(df),
        "n_aux_classes": df["orientation"].nunique(),
    }
    with open("preprocessed_dataset/full_dataset_meta_data.json", "w") as fp:
        json.dump(full_dataset_meta_data, fp, indent=4)
    
def create_preprocessed_dataset():
    shutil.rmtree("preprocessed_dataset", ignore_errors=True)
    os.makedirs("preprocessed_dataset")
    df = preprocess_competitino_dataset()
    full_dataset_sequence_length_norm = get_normed_seq_len(df)
    full_x, full_y = df_to_ndarrays(df, full_dataset_sequence_length_norm, SEQ_PAD_TRUNC_MODE)
    np.save(join("preprocessed_dataset", "X.npy"), full_x, allow_pickle=False)
    np.save(join("preprocessed_dataset", "Y.npy"), full_y, allow_pickle=False)
    # Save meta data
    save_sequence_meta_data(df)
    save_df_meta_data(df)

In [8]:
# create_preprocessed_dataset()

### Dataset class

In [9]:
class CMIDataset(TensorDataset):
    def __init__(self):
        x = np.load(join("preprocessed_dataset", "X.npy")).swapaxes(1, 2)
        y = np.load(join("preprocessed_dataset", "Y.npy"))
        auxiliary_y = np.load(join("preprocessed_dataset", "auxialiary_Y.npy"))
        super().__init__(
            torch.from_numpy(x).to(device),
            torch.from_numpy(y).to(device),
            torch.from_numpy(auxiliary_y).to(device),
        )

In [10]:
auxiliary_y = np.load(join("preprocessed_dataset", "auxialiary_Y.npy"))
n_aux_classes = auxiliary_y.shape[1]

#### Meta data loading

In [11]:
meta_data_path = join(
    "preprocessed_dataset",
    "full_dataset_meta_data.json"
)
with open(meta_data_path, "r") as fp:
    meta_data = json.load(fp)
# Convert target names into a ndarray to index it batchwise.
def get_sensor_indices(sensor_prefix: str) -> list[int]:
    is_sensor_feat = methodcaller("startswith", sensor_prefix)
    return [feat_idx for feat_idx, feat in enumerate(meta_data["feature_cols"]) if is_sensor_feat(feat)]

tof_idx = get_sensor_indices("tof")
thm_idx = get_sensor_indices("thm")
imu_idx = list(filter(lambda idx: idx not in tof_idx + thm_idx, range(len(meta_data["feature_cols"]))))

## Model definition

In [12]:
class MultiScaleConvs(nn.Module):
    def __init__(self, in_channels:int, kernel_sizes:list[int]):
        super().__init__()
        def mk_conv_block(k_size) -> nn.Sequential:
            return nn.Sequential(
                nn.Conv1d(in_channels, in_channels, k_size, padding=k_size // 2, groups=in_channels),
                nn.BatchNorm1d(in_channels),
                nn.ReLU(),
            )
        self.convs = nn.ModuleList(map(mk_conv_block, kernel_sizes))

    def forward(self, x:Tensor) -> Tensor:
        yes = torch.cat([conv(x) for conv in self.convs] + [x], dim=1)
        # print("stem output shape:", yes.shape)
        return yes

class ImuFeatureExtractor(nn.Module):
    def __init__(self, in_channels:int, kernel_size:int=15):
        super().__init__()

        self.lpf = nn.Conv1d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            padding=kernel_size//2,
            groups=in_channels,
            bias=False,
        )
        nn.init.kaiming_uniform_(self.lpf.weight, a=math.sqrt(5))

    def forward(self, x:Tensor) -> Tensor:
        lpf_output = self.lpf(x)
        hpf_output = x - lpf_output
        return torch.cat((lpf_output, hpf_output, x), dim=1)  # (B, C_out, T)

class SqueezeExcitationBlock(nn.Module):
    # Copy/paste of https://www.kaggle.com/code/wasupandceacar/lb-0-82-5fold-single-bert-model#Model implementation
    def __init__(self, channels:int, reduction:int=8):
        super().__init__()
        self.fc1 = nn.Linear(channels, channels // reduction, bias=True)
        self.fc2 = nn.Linear(channels // reduction, channels, bias=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x: (B, C, L)
        se = F.adaptive_avg_pool1d(x, 1).squeeze(-1)      # -> (B, C)
        se = F.relu(self.fc1(se), inplace=True)          # -> (B, C//r)
        se = self.sigmoid(self.fc2(se)).unsqueeze(-1)    # -> (B, C, 1)
        return x * se

class ResidualBlock(nn.Module):
    def __init__(self, in_chns:int, out_chns:int, dropout_ratio:float=0.3, se_reduction:int=8, kernel_size:int=3):
        super().__init__()
        self.blocks = nn.Sequential(
            nn.Conv1d(in_chns, out_chns, kernel_size=kernel_size, padding=kernel_size // 2, bias=False),
            nn.BatchNorm1d(out_chns),
            nn.ReLU(),
            nn.Conv1d(out_chns, out_chns, kernel_size=kernel_size, padding=kernel_size // 2, bias=False),
            nn.BatchNorm1d(out_chns),
            SqueezeExcitationBlock(out_chns, se_reduction),
        )
        self.head = nn.Sequential(nn.ReLU(), nn.Dropout(dropout_ratio))
        if in_chns == out_chns:
            self.skip_connection = nn.Identity() 
        else:
            # TODO: set bias to False ?
            self.skip_connection = nn.Sequential(
                nn.Conv1d(in_chns, out_chns, 1, bias=False),
                nn.BatchNorm1d(out_chns)
            )
            self.head.insert(1, nn.MaxPool1d(2))

    def forward(self, x:Tensor) -> Tensor:
        activaition_maps = self.skip_connection(x) + self.blocks(x)
        return self.head(activaition_maps)

class MBConvBlock(nn.Module):
    # From this schema: https://media.licdn.com/dms/image/v2/D5612AQFjbDOm5uyxdw/article-inline_image-shrink_1500_2232/article-inline_image-shrink_1500_2232/0/1683677500817?e=1758153600&v=beta&t=n48_UW5TZTyDPhRFlJXSidUQQPQpuC756M0kNeKmYTY
    def __init__(self, in_chns:int, out_chns:int, se_reduction:int=8, expansion_ratio:int=4, dropout_ratio:float=0.3):
        super().__init__()
        expanded_channels = in_chns * expansion_ratio
        self.blocks = nn.Sequential(
            nn.Conv1d(in_chns, expanded_channels, kernel_size=1, bias=False),
            nn.BatchNorm1d(expanded_channels),
            nn.ReLU(),
            nn.Conv1d(
                expanded_channels,
                expanded_channels,
                kernel_size=3,
                padding=1,
                groups=expanded_channels,
                bias=False,
            ),
            nn.BatchNorm1d(expanded_channels),
            nn.ReLU(),
            SqueezeExcitationBlock(expanded_channels, se_reduction),
            nn.Conv1d(expanded_channels, out_chns, kernel_size=1, bias=False)
        )
        self.head = nn.Sequential(
            nn.BatchNorm1d(out_chns)
            # nn.ReLU(),
            # nn.Dropout(dropout_ratio),
        )
        if in_chns == out_chns:
            self.skip_connection = nn.Identity() 
        else:
            # TODO: set bias to False ?
            self.skip_connection = nn.Sequential(
                nn.Conv1d(in_chns, out_chns, 1, bias=False),
                nn.BatchNorm1d(out_chns)
            )
            self.head.add_module("max_pool", nn.MaxPool1d(2))
            
    def forward(self, x:Tensor) -> Tensor:
        activaition_maps = self.skip_connection(x) + self.blocks(x)
        return self.head(activaition_maps)

class AdditiveAttentionLayer(nn.Module):
    # Copied (and slightly modified) from https://www.kaggle.com/code/myso1987/cmi3-pyroch-baseline-model-add-aug-folds
    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1, bias=True)

    def forward(self, x: Tensor) -> Tensor:
        # x shape: (batch, channels, seq_len)
        x = x.swapaxes(1, 2)
        # x shape: (batch, seq_len, hidden_dim)
        scores = torch.tanh(self.attention(x))  # (batch, seq_len, 1)
        weights = F.softmax(scores.squeeze(-1), dim=1)  # (batch, seq_len)
        context = torch.sum(x * weights.unsqueeze(-1), dim=1)  # (batch, hidden_dim)
        return context

class AlexNet(nn.Sequential):
    def __init__(self, channels:list[int], dropout_ratio:float):
        def mk_conv_block(in_channels:int, out_channels:int) -> nn.Module:
            return nn.Sequential(
                nn.Conv1d(in_channels, out_channels, 3, padding=1, bias=False),
                nn.BatchNorm1d(out_channels),
                nn.MaxPool1d(2),
                nn.Dropout(dropout_ratio),
            )
        return super().__init__(*list(starmap(mk_conv_block, pairwise(channels))))

class CMIHARModule(nn.Module):
    def __init__(
            self,
            imu_idx:list[int],
            thm_idx:list[int],
            tof_idx:list[int],
            mlp_width:int,
            n_classes:int,
            n_aux_classes:Optional[int]=None,
            dataset_x:Optional[Tensor]=None,
            tof_dropout_ratio:float=0,
            thm_dropout_ratio:float=0,
        ):
        super().__init__()
        self.imu_idx = imu_idx
        self.tof_idx = tof_idx
        self.thm_idx = thm_idx
        if dataset_x is not None:
            x_mean = dataset_x.mean(dim=(0, 2), keepdim=True)
            x_std = dataset_x.std(dim=(0, 2), keepdim=True)
            self.register_buffer("x_mean", x_mean)
            self.register_buffer("x_std", x_std)
        else:
            x_stats_size = (1, len(meta_data["feature_cols"]), 1)
            self.register_buffer("x_mean", torch.empty(x_stats_size))
            self.register_buffer("x_std", torch.empty(x_stats_size))
        self.imu_branch = nn.Sequential(
            ResidualBlock(len(imu_idx), 219),
            ResidualBlock(219, 500),
        )
        self.tof_branch = AlexNet([len(tof_idx), 82, 500], tof_dropout_ratio)
        self.thm_branch = AlexNet([len(thm_idx), 82, 500], thm_dropout_ratio)
        self.rnn = nn.GRU(500 * 3, mlp_width // 2, bidirectional=True)
        self.attention = AdditiveAttentionLayer(mlp_width)
        self.meain_head = nn.Sequential(
            # Head
            nn.LazyLinear(mlp_width, bias=False),
            nn.BatchNorm1d(mlp_width),
            nn.ReLU(),
            nn.Linear(mlp_width, mlp_width // 2, bias=False),
            nn.BatchNorm1d(mlp_width // 2),
            nn.ReLU(),
            nn.Linear(mlp_width // 2, n_classes),
        )
        if n_aux_classes is not None:
            self.aux_head = nn.Sequential(
                # Head
                nn.LazyLinear(mlp_width, bias=False),
                nn.BatchNorm1d(mlp_width),
                nn.ReLU(),
                nn.Linear(mlp_width, mlp_width // 2, bias=False),
                nn.BatchNorm1d(mlp_width // 2),
                nn.ReLU(),
                nn.Linear(mlp_width // 2, n_aux_classes),
            )

    def forward(self, x:Tensor) -> Tensor:
        assert self.x_mean is not None and self.x_std is not None, f"Nor x_mean nor x_std should be None.\nx_std: {self.x_std}\nx_mean: {self.x_mean}"
        x = (x - self.x_mean) / self.x_std
        concatenated_activation_maps = torch.cat(
            (
                self.imu_branch(x[:, self.imu_idx]),
                self.thm_branch(x[:, self.thm_idx]),
                self.tof_branch(x[:, self.tof_idx]),
            ),
            dim=CHANNELS_DIMENSION,
        )
        lstm_output, _  = self.rnn(concatenated_activation_maps.swapaxes(1, 2))
        lstm_output = lstm_output.swapaxes(1, 2) # redundant
        attended = self.attention(lstm_output)
        if hasattr(self, "aux_head"):
            return self.meain_head(attended), self.aux_head(attended)
        return self.meain_head(attended)

### Create model function

In [13]:
def mk_model(
    dataset_x:Optional[Tensor]=None,
    n_aux_classes:Optional[int]=None,
) -> nn.Module:
    return (
        CMIHARModule(
            imu_idx=imu_idx,
            thm_idx=thm_idx,
            tof_idx=tof_idx,
            mlp_width=256,
            n_classes=18,
            dataset_x=dataset_x,
            n_aux_classes=n_aux_classes,
        )
        .to(device)
    )

display(mk_model(torch.arange(12).view(2, 2, -1).float()))
print("input channels:", len(meta_data["feature_cols"]))

CMIHARModule(
  (imu_branch): Sequential(
    (0): ResidualBlock(
      (blocks): Sequential(
        (0): Conv1d(46, 219, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
        (1): BatchNorm1d(219, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Conv1d(219, 219, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
        (4): BatchNorm1d(219, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): SqueezeExcitationBlock(
          (fc1): Linear(in_features=219, out_features=27, bias=True)
          (fc2): Linear(in_features=27, out_features=219, bias=True)
          (sigmoid): Sigmoid()
        )
      )
      (head): Sequential(
        (0): ReLU()
        (1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (2): Dropout(p=0.3, inplace=False)
      )
      (skip_connection): Sequential(
        (0): Conv1d(46, 219, kernel_size=(1,), stride=(1,), bias=False)
       

input channels: 946


## Training

In [14]:
class CosineAnnealingWarmupRestarts(_LRScheduler):
    def __init__(
        self,
        optimizer: Optimizer,
        warmup_steps: int,
        max_lr: float,
        min_lr: float,
        cycle_length: int,
        cycle_mult: float = 1.0,
        gamma: float = 1.0,
        last_epoch: int = -1,
    ) -> None:
        """
        Args:
            optimizer: Wrapped optimizer.
            warmup_steps: Number of steps for linear warmup.
            max_lr: Initial maximum learning rate.
            min_lr: Minimum learning rate after decay.
            cycle_length: Initial number of steps per cosine cycle.
            cycle_mult: Multiplicative factor for increasing cycle lengths.
            gamma: Multiplicative decay factor for max_lr after each cycle.
            last_epoch: The index of last epoch. Default: -1.
        """
        self.warmup_steps = warmup_steps
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.cycle_length = cycle_length
        self.cycle_mult = cycle_mult
        self.gamma = gamma

        self.current_cycle = 0
        self.cycle_step = 0
        self.lr = max_lr

        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> list[float]:
        if self.last_epoch < self.warmup_steps:
            # Linear warmup
            scale = (self.last_epoch + 1) / self.warmup_steps
            return [self.min_lr + scale * (self.max_lr - self.min_lr) for _ in self.base_lrs]

        # Adjust for post-warmup step index
        t = self.cycle_step
        T = self.cycle_length

        cosine_decay = 0.5 * (1 + math.cos(math.pi * t / T))
        lr = self.min_lr + (self.max_lr - self.min_lr) * cosine_decay

        return [lr for _ in self.base_lrs]

    def step(self, epoch: Optional[int] = None) -> None:
        if self.last_epoch >= self.warmup_steps:
            self.cycle_step += 1
            if self.cycle_step >= self.cycle_length:
                self.current_cycle += 1
                self.cycle_step = 0
                self.cycle_length = max(int(self.cycle_length * self.cycle_mult), 1)
                self.max_lr *= self.gamma
        super().step(epoch)

In [15]:
def mixup_data(
    x:Tensor,
    y:Tensor,
    aux_y:Optional[Tensor],
    alpha=0.2
) -> tuple[Tensor, Tensor] | tuple[Tensor, Tensor, Tensor]:
    """
    Return mixed inputs and mixed targets (one-hot) for mixup.
    x: Tensor of shape (batch_size, features, seq_len)
    y: Tensor of shape (batch_size, num_classes)
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.0
    batch_size = x.size(0)
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    mixed_y = lam * y + (1 - lam) * y[index, :]
    if aux_y is not None:
        mixed_aux_y = lam * aux_y + (1 - lam) * aux_y[index, :]
        return mixed_x, mixed_y, mixed_aux_y
    else:
        return mixed_x, mixed_y

In [16]:
def train_model(
        model:nn.Module,
        train_loader:DL,
        criterion:callable,
        optimizer:torch.optim.Optimizer,
        scheduler:_LRScheduler,
    ) -> dict:
    "Train model on a single epoch"
    train_metrics = {}
    model.train()
    train_metrics["train_loss"] = 0.0
    total = 0
    for batch_x, batch_y, batch_aux_y in train_loader:
        batch_aux_y = batch_aux_y.clone()
        batch_x = batch_x.to(device).clone()
        add_noise = torch.randn_like(batch_x, device=device) * 0.04
        scale_noise = torch.rand_like(batch_x, device=device) * (1.1 - 0.9) + 0.9
        batch_x = (add_noise + batch_x) * scale_noise
        batch_x[:TRAIN_BATCH_SIZE // 2, tof_idx + thm_idx] = 0.0
        batch_y = batch_y.to(device)
        batch_x = batch_x.float()
        
        batch_x, batch_y, batch_aux_y = mixup_data(batch_x, batch_y, batch_aux_y)

        optimizer.zero_grad()
        outputs, aux_output = model(batch_x)
        loss = criterion(outputs, batch_y) + criterion(aux_output, batch_aux_y)
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_metrics["train_loss"] += loss.item() * batch_x.size(0)
        total += batch_x.size(0)
    train_metrics["train_loss"] /= total

    return train_metrics

In [17]:
def evaluate_model(model:nn.Module, validation_loader:DL, criterion:callable) -> dict:
    model.eval()
    eval_metrics = {}
    eval_metrics["val_loss"] = 0.0
    total = 0
    all_true = []
    all_pred = []

    with torch.no_grad():
        for batch_x, batch_y, _ in validation_loader:
            batch_x = batch_x.to(device).clone()
            batch_y = batch_y.to(device)
            # batch_aux_y = batch_aux_y.to(device)
            batch_x[:VALIDATION_BATCH_SIZE // 2, tof_idx + thm_idx] = 0.0

            outputs, _ = model(batch_x)
            loss = criterion(outputs, batch_y)
            eval_metrics["val_loss"] += loss.item() * batch_x.size(0)
            total += batch_x.size(0)

            # Get predicted class indices
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            # Get true class indices from one-hot
            trues = torch.argmax(batch_y, dim=1).cpu().numpy()

            all_true.append(trues)
            all_pred.append(preds)

    eval_metrics["val_loss"] /= total
    all_true = np.concatenate(all_true)
    all_pred = np.concatenate(all_pred)

    # Compute competition metrics
    # Binary classification: BFRB (1) vs non-BFRB (0)
    binary_true = np.isin(all_true, BFRB_INDICES).astype(int)
    binary_pred = np.isin(all_pred, BFRB_INDICES).astype(int)
    eval_metrics["binary_f1"] = f1_score(binary_true, binary_pred)

    # Collapse non-BFRB gestures into a single class
    collapsed_true = np.where(
        np.isin(all_true, BFRB_INDICES),
        all_true,
        len(BFRB_GESTURES)  # Single non-BFRB class
    )
    collapsed_pred = np.where(
        np.isin(all_pred, BFRB_INDICES),
        all_pred,
        len(BFRB_GESTURES)  # Single non-BFRB class
    )

    # Macro F1 on collapsed classes
    eval_metrics["macro_f1"] = f1_score(collapsed_true, collapsed_pred, average='macro')
    eval_metrics["final_metric"] = (eval_metrics["binary_f1"] + eval_metrics["macro_f1"]) / 2

    return eval_metrics

In [18]:
def train_model_on_all_epochs(
        model:nn.Module,
        train_loader:DL,
        validation_loader:DL,
        criterion:callable,
        optimizer:torch.optim.Optimizer,
        scheduler:_LRScheduler,
        fold:int,
    ) -> DF:

    metrics:list[dict] = []
    # Early stopping
    best_metric = -np.inf
    best_binary_f1 = -np.inf
    best_macro_f1 = -np.inf
    epochs_no_improve = 0

    for epoch in range(1, TRAINING_EPOCHS + 1):
        train_metrics = train_model(model, train_loader, criterion, optimizer, scheduler)
        validation_metrics = evaluate_model(model, validation_loader, criterion)
        metrics.append({"fold": fold, "epoch": epoch} | train_metrics | validation_metrics)

        print(f"Epoch {epoch:02d}: Binary F1 = {validation_metrics['binary_f1']:.4f}, Macro F1 = {validation_metrics['macro_f1']:.4f}, Final Metric = {validation_metrics['final_metric']:.4f}")

        if validation_metrics["final_metric"] > best_metric:
            best_metric = validation_metrics["final_metric"]
            best_binary_f1 = validation_metrics["binary_f1"]
            best_macro_f1 = validation_metrics["macro_f1"]
            epochs_no_improve = 0
            best_model_state = model.state_dict()
            print(f"  New best metric! Saving model...")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"Early stopping triggered at epoch {epoch}")
                model.load_state_dict(best_model_state)
                break

    torch.save(best_model_state, f"best_model_fold{fold}.pth")

    return DF.from_records(metrics).set_index(["fold", "epoch"])

In [19]:
def sgkf_from_tensor_dataset(
    dataset: TensorDataset,
    n_splits: int = 5,
    shuffle: bool = True,
) -> Iterator[tuple[Subset, Subset]]:
    # Load sequence meta data to get classes and groups parameters
    seq_meta = pd.read_parquet("preprocessed_dataset/sequences_meta_data.parquet")
    X, *_ = dataset.tensors
    sgkf = StratifiedGroupKFold(
        n_splits=n_splits,
        shuffle=shuffle,
    )

    for train_idx, val_idx in sgkf.split(X.cpu().numpy(), seq_meta["gesture"], seq_meta["subject"]):
        yield Subset(dataset, train_idx), Subset(dataset, val_idx)

In [20]:
def train_on_all_folds(lr_scheduler_kw:dict, optimizer_kw:dict) -> tuple[float, DF]:
    seed_everything(seed=SEED)

    metrics:DF = DF()
    full_dataset = CMIDataset()
    folds_it = sgkf_from_tensor_dataset(full_dataset, NB_CROSS_VALIDATIONS)

    for fold_idx, (train_dataset, validation_dataset) in enumerate(folds_it):
        seed_everything(seed=SEED + fold_idx)
        # Debugging
        print(f"\n{'='*50}")
        print("training:", fold_idx + 1)
        print(f"Fold {fold_idx + 1}/{NB_CROSS_VALIDATIONS}")
        criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
        train_loader = DL(train_dataset, TRAIN_BATCH_SIZE, shuffle=True, drop_last=False)
        validation_loader = DL(validation_dataset, VALIDATION_BATCH_SIZE, shuffle=False, drop_last=False)
        print("train dataset indices:", len(train_dataset.indices))
        print("validation dataset indices:", len(validation_dataset.indices))
        all_train_x = train_dataset.dataset.tensors[0][train_dataset.indices]
        model = mk_model(all_train_x, meta_data["n_aux_classes"])

        # Optimizer et scheduler
        optimizer = torch.optim.AdamW(
            model.parameters(),
            WARMUP_LR_INIT,
            weight_decay=optimizer_kw["weight_decay"],
            betas=(optimizer_kw["beta_0"], optimizer_kw["beta_1"]),
        )
        steps_per_epoch = len(train_loader)
        scheduler = CosineAnnealingWarmupRestarts(
            optimizer,
            warmup_steps=lr_scheduler_kw["warmup_epochs"] * steps_per_epoch,
            cycle_mult=lr_scheduler_kw["cycle_mult"],
            max_lr=lr_scheduler_kw["max_lr"],
            min_lr=lr_scheduler_kw["max_lr"] / lr_scheduler_kw["max_to_min_div_factor"],
            cycle_length=lr_scheduler_kw["init_cycle_epochs"] * steps_per_epoch,
            gamma=lr_scheduler_kw["lr_cycle_factor"],
        ) 
        fold_metrics = train_model_on_all_epochs(
            model,
            train_loader,
            validation_loader,
            criterion,
            optimizer,
            scheduler,
            fold_idx,
        )
        # Free memory used by datasets and data loaders
        del train_dataset
        del validation_dataset
        del train_loader
        del validation_loader
        gc.collect()
        torch.cuda.empty_cache()

        best_fold_metrics = fold_metrics.loc[fold_metrics["final_metric"].idxmax()]
        final_fold_metrics = fold_metrics.iloc[-1]
        print(f"Best validation metrics - Binary F1: {best_fold_metrics['binary_f1']:.4f}, Macro F1: {best_fold_metrics['macro_f1']:.4f}, Final: {best_fold_metrics['final_metric']:.4f}")
        print(f"Final validation metrics - Binary F1: {final_fold_metrics['binary_f1']:.4f}, Macro F1: {final_fold_metrics['macro_f1']:.4f}, Final: {final_fold_metrics['final_metric']:.4f}")

        metrics = pd.concat((metrics, fold_metrics))

    print("\n" + "="*50)
    print("Cross-Validation Results")
    print("="*50)

    # Statistiques pour les meilleures métriques
    best_metrics:DF = (
        metrics
        .loc[:, ["binary_f1", "macro_f1", "final_metric"]]
        .groupby(level=0)
        .max()
    )

    print("\nBest Fold-wise Metrics:")
    display(best_metrics)
    
    print("\nGlobal Statistics (Best Metrics):")
    print(f"Mean Best Final Metric: {best_metrics['final_metric'].mean():.4f} ± {best_metrics['final_metric'].std():.4f}")
    print(f"Mean Best Binary F1: {best_metrics['binary_f1'].mean():.4f} ± {best_metrics['binary_f1'].std():.4f}")
    print(f"Mean Best Macro F1: {best_metrics['macro_f1'].mean():.4f} ± {best_metrics['macro_f1'].std():.4f}")
    
    return best_metrics["final_metric"].mean(), metrics

In [27]:
mean_best_cv_score, metrics = train_on_all_folds(
    lr_scheduler_kw={
        "warmup_epochs": 8,
        "cycle_mult": 0.7994284370327427,
        "max_lr": 0.005581907927062619,
        "max_to_min_div_factor": 275.0,
        "init_cycle_epochs": 5,
        "lr_cycle_factor": 0.5033112105827083,
    },
    optimizer_kw={
        "weight_decay": 0.0006702308864102119,
        "beta_0": 0.9089203414971434,
        "beta_1": 0.9969898035522793,
    }
)
print("mean best CV:", mean_best_cv_score)
display(metrics)


training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.7809, Macro F1 = 0.2075, Final Metric = 0.4942
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9115, Macro F1 = 0.3174, Final Metric = 0.6145
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9632, Macro F1 = 0.4072, Final Metric = 0.6852
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9673, Macro F1 = 0.4301, Final Metric = 0.6987
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.9177, Macro F1 = 0.4205, Final Metric = 0.6691
Epoch 06: Binary F1 = 0.9650, Macro F1 = 0.5193, Final Metric = 0.7422
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9595, Macro F1 = 0.4795, Final Metric = 0.7195
Epoch 08: Binary F1 = 0.9560, Macro F1 = 0.4625, Final Metric = 0.7092
Epoch 09: Binary F1 = 0.9774, Macro F1 = 0.5263, Final Metric = 0.7518
  New best metric! Saving model...
Epoch 10: Binary F1 = 0.9885, Macro F1 = 0.5786, Final Metric = 0.78

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.991123,0.669662,0.830139
1,0.968059,0.58992,0.77899
2,0.974815,0.605067,0.788758
3,0.945455,0.541383,0.741822
4,0.977409,0.572269,0.773951



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7827 ± 0.0318
Mean Best Binary F1: 0.9714 ± 0.0167
Mean Best Macro F1: 0.5957 ± 0.0477
mean best CV: 0.7827318390357785


Unnamed: 0_level_0,Unnamed: 1_level_0,train_loss,val_loss,binary_f1,macro_f1,final_metric
fold,epoch,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,3.855682,2.579842,0.780890,0.207507,0.494198
0,2,3.139727,2.067492,0.911458,0.317444,0.614451
0,3,2.791345,1.765862,0.963197,0.407174,0.685185
0,4,2.521093,1.760678,0.967300,0.430104,0.698702
0,5,2.689573,1.743890,0.917749,0.420530,0.669140
...,...,...,...,...,...,...
4,21,1.974361,1.416081,0.973223,0.566158,0.769690
4,22,1.941672,1.414669,0.972748,0.564224,0.768486
4,23,1.781919,1.401836,0.975452,0.563893,0.769672
4,24,1.891745,1.397817,0.975632,0.572269,0.773951


## Hyperparameter tuning

In [None]:
def objective(trial: optuna.trial.Trial) -> float:
    return train_on_all_folds(
        lr_scheduler_kw={
            "warmup_epochs": trial.suggest_int("warmup_epochs", 1, 10),
            "cycle_mult": trial.suggest_float("cycle_mult", 0.5, 2),
            "max_lr": trial.suggest_float("max_lr", 0.005581907927062619 / 3, 0.005581907927062619 * 3),
            "max_to_min_div_factor": trial.suggest_float("max_to_min_div_factor", 100, 300, step=25),
            "init_cycle_epochs": trial.suggest_int("init_cycle_epochs", 2, 10),
            "lr_cycle_factor": trial.suggest_float("lr_cycle_factor", 0.3, 1),
        },
        optimizer_kw={
            "weight_decay": trial.suggest_float("weight_decay", 5e-4, 1e-3),
            "beta_0":trial.suggest_float("beta_0", 0.8, 0.999),
            "beta_1":trial.suggest_float("beta_1", 0.99, 0.9999),
        }
    )[0]

In [23]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=60 * 45)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-08-15 15:51:57,807] A new study created in memory with name: no-name-5776b365-cfe2-4f37-9a86-ea822f6194e4



training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.8891, Macro F1 = 0.1515, Final Metric = 0.5203
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9392, Macro F1 = 0.2649, Final Metric = 0.6020
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9225, Macro F1 = 0.3405, Final Metric = 0.6315
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9466, Macro F1 = 0.4405, Final Metric = 0.6935
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.9613, Macro F1 = 0.4459, Final Metric = 0.7036
  New best metric! Saving model...
Epoch 06: Binary F1 = 0.9771, Macro F1 = 0.4832, Final Metric = 0.7301
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9822, Macro F1 = 0.5454, Final Metric = 0.7638
  New best metric! Saving model...
Epoch 08: Binary F1 = 0.9853, Macro F1 = 0.5641, Final Metric = 0.7747
  New best metric! Saving model...
Epoch 09: Binary F1 = 0.9821, Macro F1 = 0.5748, Final Metric = 0.778

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.98847,0.651793,0.820131
1,0.959681,0.569623,0.764652
2,0.967033,0.576029,0.770739
3,0.943898,0.527004,0.73534
4,0.967801,0.568388,0.768095


[I 2025-08-15 15:55:48,335] Trial 0 finished with value: 0.7717912879581916 and parameters: {'warmup_epochs': 1, 'cycle_mult': 0.6353026971600697, 'max_lr': 0.015328053293172283, 'max_to_min_div_factor': 125.0, 'init_cycle_epochs': 8, 'lr_cycle_factor': 0.6092692037813834, 'weight_decay': 0.0005176411985832053, 'beta_0': 0.854185488026505, 'beta_1': 0.9930227409050381}. Best is trial 0 with value: 0.7717912879581916.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7718 ± 0.0305
Mean Best Binary F1: 0.9654 ± 0.0161
Mean Best Macro F1: 0.5786 ± 0.0453

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.5612, Macro F1 = 0.1446, Final Metric = 0.3529
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.8731, Macro F1 = 0.2801, Final Metric = 0.5766
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9397, Macro F1 = 0.3311, Final Metric = 0.6354
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9563, Macro F1 = 0.3713, Final Metric = 0.6638
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.9523, Macro F1 = 0.4244, Final Metric = 0.6884
  New best metric! Saving model...
Epoch 06: Binary F1 = 0.9687, Macro F1 = 0.4695, Final Metric = 0.7191
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9722, Macro F1 = 0.4828, Final Metric = 0.7275
  New best metric! Saving model...
Epoch 08: Binary F1 = 0.972

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.982659,0.57839,0.780525
1,0.950525,0.522383,0.736454
2,0.963664,0.533164,0.747646
3,0.930278,0.459338,0.692157
4,0.957163,0.496312,0.726016


[I 2025-08-15 15:59:37,221] Trial 1 finished with value: 0.7365594304433494 and parameters: {'warmup_epochs': 2, 'cycle_mult': 0.6258297216760773, 'max_lr': 0.004168065124612039, 'max_to_min_div_factor': 200.0, 'init_cycle_epochs': 8, 'lr_cycle_factor': 0.4157179707240828, 'weight_decay': 0.0005513717880897944, 'beta_0': 0.9845895117264251, 'beta_1': 0.9993557116727305}. Best is trial 0 with value: 0.7717912879581916.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7366 ± 0.0322
Mean Best Binary F1: 0.9569 ± 0.0191
Mean Best Macro F1: 0.5179 ± 0.0442

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.7917, Macro F1 = 0.1942, Final Metric = 0.4929
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9468, Macro F1 = 0.3015, Final Metric = 0.6241
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.8298, Macro F1 = 0.3352, Final Metric = 0.5825
Epoch 04: Binary F1 = 0.8954, Macro F1 = 0.4116, Final Metric = 0.6535
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.9547, Macro F1 = 0.4337, Final Metric = 0.6942
  New best metric! Saving model...
Epoch 06: Binary F1 = 0.9687, Macro F1 = 0.4676, Final Metric = 0.7181
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9333, Macro F1 = 0.4409, Final Metric = 0.6871
Epoch 08: Binary F1 = 0.9697, Macro F1 = 0.5120, Final Metric = 0.7408
  New best metric! Saving 

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.990605,0.647174,0.817291
1,0.961078,0.584021,0.771456
2,0.974232,0.615783,0.795008
3,0.94649,0.543281,0.744885
4,0.975161,0.583095,0.779128


[I 2025-08-15 16:03:27,516] Trial 2 finished with value: 0.7815535501475724 and parameters: {'warmup_epochs': 9, 'cycle_mult': 1.6086338891157224, 'max_lr': 0.01247960649238369, 'max_to_min_div_factor': 125.0, 'init_cycle_epochs': 6, 'lr_cycle_factor': 0.9756645283103911, 'weight_decay': 0.0006426589350875416, 'beta_0': 0.8865596855893878, 'beta_1': 0.993067277153746}. Best is trial 2 with value: 0.7815535501475724.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7816 ± 0.0270
Mean Best Binary F1: 0.9695 ± 0.0166
Mean Best Macro F1: 0.5947 ± 0.0390

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.8340, Macro F1 = 0.1856, Final Metric = 0.5098
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9056, Macro F1 = 0.3099, Final Metric = 0.6078
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.8301, Macro F1 = 0.3451, Final Metric = 0.5876
Epoch 04: Binary F1 = 0.9566, Macro F1 = 0.3526, Final Metric = 0.6546
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.9661, Macro F1 = 0.4252, Final Metric = 0.6956
  New best metric! Saving model...
Epoch 06: Binary F1 = 0.9723, Macro F1 = 0.4843, Final Metric = 0.7283
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9429, Macro F1 = 0.4068, Final Metric = 0.6748
Epoch 08: Binary F1 = 0.9732, Macro F1 = 0.4978, Final Metric = 0.7355
  New best metric! Saving 

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.988002,0.65163,0.819816
1,0.964022,0.591337,0.777493
2,0.969031,0.606913,0.787972
3,0.950429,0.541598,0.737931
4,0.972401,0.564216,0.768308


[I 2025-08-15 16:07:16,714] Trial 3 finished with value: 0.7783039375749408 and parameters: {'warmup_epochs': 10, 'cycle_mult': 0.762726561732343, 'max_lr': 0.014580667385914434, 'max_to_min_div_factor': 300.0, 'init_cycle_epochs': 8, 'lr_cycle_factor': 0.8906999608327624, 'weight_decay': 0.0007128630395335294, 'beta_0': 0.8842250465210296, 'beta_1': 0.9909435144010375}. Best is trial 2 with value: 0.7815535501475724.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7783 ± 0.0298
Mean Best Binary F1: 0.9688 ± 0.0136
Mean Best Macro F1: 0.5911 ± 0.0421

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.3742, Macro F1 = 0.0863, Final Metric = 0.2302
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.5177, Macro F1 = 0.1358, Final Metric = 0.3268
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9030, Macro F1 = 0.2924, Final Metric = 0.5977
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9187, Macro F1 = 0.3052, Final Metric = 0.6119
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.9456, Macro F1 = 0.3740, Final Metric = 0.6598
  New best metric! Saving model...
Epoch 06: Binary F1 = 0.9567, Macro F1 = 0.3588, Final Metric = 0.6578
Epoch 07: Binary F1 = 0.9648, Macro F1 = 0.4058, Final Metric = 0.6853
  New best metric! Saving model...
Epoch 08: Binary F1 = 0.9649, Macro F1 = 0.4062, Final Metric 

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.984841,0.577105,0.779643
1,0.96289,0.537678,0.750284
2,0.961847,0.520251,0.739763
3,0.941752,0.472012,0.698082
4,0.957525,0.48597,0.718008


[I 2025-08-15 16:11:04,992] Trial 4 finished with value: 0.7371560124063384 and parameters: {'warmup_epochs': 1, 'cycle_mult': 1.2837976029113978, 'max_lr': 0.01577257224459994, 'max_to_min_div_factor': 125.0, 'init_cycle_epochs': 7, 'lr_cycle_factor': 0.7169950046636422, 'weight_decay': 0.0008831859207229212, 'beta_0': 0.9787264930356049, 'beta_1': 0.9919787184692516}. Best is trial 2 with value: 0.7815535501475724.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7372 ± 0.0311
Mean Best Binary F1: 0.9618 ± 0.0154
Mean Best Macro F1: 0.5186 ± 0.0419

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.8102, Macro F1 = 0.1604, Final Metric = 0.4853
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.8446, Macro F1 = 0.2179, Final Metric = 0.5312
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.8967, Macro F1 = 0.2801, Final Metric = 0.5884
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9455, Macro F1 = 0.3724, Final Metric = 0.6590
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.8889, Macro F1 = 0.3703, Final Metric = 0.6296
Epoch 06: Binary F1 = 0.9410, Macro F1 = 0.4115, Final Metric = 0.6763
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9645, Macro F1 = 0.4687, Final Metric = 0.7166
  New best metric! Saving model...
Epoch 08: Binary F1 = 0.9605, Macro F1 = 0.4490, Final Metric 

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.983795,0.605198,0.794318
1,0.956049,0.537274,0.746661
2,0.960638,0.537536,0.748867
3,0.931683,0.495933,0.709389
4,0.958487,0.518953,0.738337


[I 2025-08-15 16:14:53,202] Trial 5 finished with value: 0.7475144606986305 and parameters: {'warmup_epochs': 8, 'cycle_mult': 1.0348593345336696, 'max_lr': 0.0032811900954498206, 'max_to_min_div_factor': 175.0, 'init_cycle_epochs': 5, 'lr_cycle_factor': 0.30887681114422977, 'weight_decay': 0.0005035153518304227, 'beta_0': 0.9904763598444131, 'beta_1': 0.9911208116531066}. Best is trial 2 with value: 0.7815535501475724.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7475 ± 0.0305
Mean Best Binary F1: 0.9581 ± 0.0185
Mean Best Macro F1: 0.5390 ± 0.0408

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.8231, Macro F1 = 0.2415, Final Metric = 0.5323
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9358, Macro F1 = 0.3301, Final Metric = 0.6330
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9365, Macro F1 = 0.4049, Final Metric = 0.6707
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9591, Macro F1 = 0.4149, Final Metric = 0.6870
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.9318, Macro F1 = 0.4494, Final Metric = 0.6906
  New best metric! Saving model...
Epoch 06: Binary F1 = 0.9682, Macro F1 = 0.4723, Final Metric = 0.7202
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9718, Macro F1 = 0.5010, Final Metric = 0.7364
  New best metric! Saving model...
Epoch 08: Binary F1 = 0.974

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.991141,0.648107,0.818055
1,0.966894,0.598286,0.781409
2,0.972569,0.599816,0.786192
3,0.948533,0.552152,0.744161
4,0.970054,0.566491,0.767545


[I 2025-08-15 16:18:26,230] Trial 6 finished with value: 0.7794723052879817 and parameters: {'warmup_epochs': 6, 'cycle_mult': 1.5661702953682082, 'max_lr': 0.005861897036256811, 'max_to_min_div_factor': 250.0, 'init_cycle_epochs': 8, 'lr_cycle_factor': 0.9328620013647129, 'weight_decay': 0.0008260933013601367, 'beta_0': 0.8913836265588733, 'beta_1': 0.994569258531828}. Best is trial 2 with value: 0.7815535501475724.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7795 ± 0.0270
Mean Best Binary F1: 0.9698 ± 0.0152
Mean Best Macro F1: 0.5930 ± 0.0370

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.8062, Macro F1 = 0.2115, Final Metric = 0.5088
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9253, Macro F1 = 0.3049, Final Metric = 0.6151
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9473, Macro F1 = 0.4225, Final Metric = 0.6849
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9544, Macro F1 = 0.3365, Final Metric = 0.6455
Epoch 05: Binary F1 = 0.9313, Macro F1 = 0.4345, Final Metric = 0.6829
Epoch 06: Binary F1 = 0.9763, Macro F1 = 0.4915, Final Metric = 0.7339
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9672, Macro F1 = 0.5411, Final Metric = 0.7542
  New best metric! Saving model...
Epoch 08: Binary F1 = 0.9765, Macro F1 = 0.4901, Final Metric = 0.7333
Epoch 09: Binary F1 = 0.97

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.990089,0.639514,0.812201
1,0.96572,0.576298,0.767812
2,0.975345,0.588413,0.781003
3,0.950049,0.531806,0.738371
4,0.972057,0.581745,0.775484


[I 2025-08-15 16:21:54,678] Trial 7 finished with value: 0.7749740504738172 and parameters: {'warmup_epochs': 5, 'cycle_mult': 1.228131717418772, 'max_lr': 0.006840709697609272, 'max_to_min_div_factor': 200.0, 'init_cycle_epochs': 10, 'lr_cycle_factor': 0.7183829896649212, 'weight_decay': 0.0006099397330074811, 'beta_0': 0.9120918202125637, 'beta_1': 0.9920621369091511}. Best is trial 2 with value: 0.7815535501475724.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7750 ± 0.0265
Mean Best Binary F1: 0.9707 ± 0.0146
Mean Best Macro F1: 0.5836 ± 0.0384

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.8559, Macro F1 = 0.2278, Final Metric = 0.5418
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9412, Macro F1 = 0.3613, Final Metric = 0.6513
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.8015, Macro F1 = 0.3489, Final Metric = 0.5752
Epoch 04: Binary F1 = 0.9562, Macro F1 = 0.4158, Final Metric = 0.6860
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.8957, Macro F1 = 0.3934, Final Metric = 0.6445
Epoch 06: Binary F1 = 0.9681, Macro F1 = 0.4574, Final Metric = 0.7128
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9214, Macro F1 = 0.3870, Final Metric = 0.6542
Epoch 08: Binary F1 = 0.9741, Macro F1 = 0.5195, Final Metric = 0.7468
  New best metric! Saving model...
Epoch 09: Binary F1 = 0.98

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.989583,0.649215,0.819399
1,0.9619,0.595557,0.77582
2,0.975538,0.612961,0.79211
3,0.943953,0.534434,0.73798
4,0.974335,0.57112,0.771773


[I 2025-08-15 16:25:40,786] Trial 8 finished with value: 0.7794164368559062 and parameters: {'warmup_epochs': 6, 'cycle_mult': 0.6318786936708536, 'max_lr': 0.00965395527877553, 'max_to_min_div_factor': 100.0, 'init_cycle_epochs': 9, 'lr_cycle_factor': 0.7487843761252408, 'weight_decay': 0.0008385601664308573, 'beta_0': 0.8739520098362586, 'beta_1': 0.9915229462740467}. Best is trial 2 with value: 0.7815535501475724.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7794 ± 0.0298
Mean Best Binary F1: 0.9691 ± 0.0171
Mean Best Macro F1: 0.5927 ± 0.0432

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.7320, Macro F1 = 0.2036, Final Metric = 0.4678
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9255, Macro F1 = 0.3268, Final Metric = 0.6261
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9153, Macro F1 = 0.3690, Final Metric = 0.6421
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9614, Macro F1 = 0.4198, Final Metric = 0.6906
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.9685, Macro F1 = 0.4475, Final Metric = 0.7080
  New best metric! Saving model...
Epoch 06: Binary F1 = 0.9600, Macro F1 = 0.4621, Final Metric = 0.7110
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9679, Macro F1 = 0.5018, Final Metric = 0.7348
  New best metric! Saving model...
Epoch 08: Binary F1 = 0.964

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.99115,0.657579,0.823816
1,0.964462,0.593332,0.776607
2,0.978911,0.621274,0.799063
3,0.949219,0.559265,0.752539
4,0.974359,0.585613,0.779282


[I 2025-08-15 16:29:29,203] Trial 9 finished with value: 0.7862613762820982 and parameters: {'warmup_epochs': 10, 'cycle_mult': 1.7998502928518423, 'max_lr': 0.0025181502225046317, 'max_to_min_div_factor': 100.0, 'init_cycle_epochs': 3, 'lr_cycle_factor': 0.32010655568134283, 'weight_decay': 0.0008280764974713931, 'beta_0': 0.8135831041238173, 'beta_1': 0.9986389373043834}. Best is trial 9 with value: 0.7862613762820982.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7863 ± 0.0267
Mean Best Binary F1: 0.9716 ± 0.0158
Mean Best Macro F1: 0.6034 ± 0.0375

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.8275, Macro F1 = 0.2443, Final Metric = 0.5359
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9241, Macro F1 = 0.2561, Final Metric = 0.5901
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9459, Macro F1 = 0.3585, Final Metric = 0.6522
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9137, Macro F1 = 0.3799, Final Metric = 0.6468
Epoch 05: Binary F1 = 0.9519, Macro F1 = 0.4871, Final Metric = 0.7195
  New best metric! Saving model...
Epoch 06: Binary F1 = 0.9811, Macro F1 = 0.5589, Final Metric = 0.7700
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9753, Macro F1 = 0.5286, Final Metric = 0.7520
Epoch 08: Binary F1 = 0.9865, Macro F1 = 0.5794, Final Metric = 0.7830
  New best metric! Saving 

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.989034,0.656388,0.822465
1,0.964497,0.592564,0.77638
2,0.976517,0.5939,0.782339
3,0.948681,0.536492,0.737534
4,0.973096,0.572141,0.772309


[I 2025-08-15 16:33:11,695] Trial 10 finished with value: 0.7782052300540603 and parameters: {'warmup_epochs': 4, 'cycle_mult': 1.9965051802671627, 'max_lr': 0.008806833350084415, 'max_to_min_div_factor': 175.0, 'init_cycle_epochs': 2, 'lr_cycle_factor': 0.5135989644506006, 'weight_decay': 0.0009879945821523915, 'beta_0': 0.8031796450754048, 'beta_1': 0.9989271776177022}. Best is trial 9 with value: 0.7862613762820982.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7782 ± 0.0303
Mean Best Binary F1: 0.9704 ± 0.0150
Mean Best Macro F1: 0.5903 ± 0.0436

training: 1
Fold 1/5
train dataset indices: 6623
validation dataset indices: 1528
Epoch 01: Binary F1 = 0.7464, Macro F1 = 0.2342, Final Metric = 0.4903
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9489, Macro F1 = 0.3405, Final Metric = 0.6447
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9322, Macro F1 = 0.3260, Final Metric = 0.6291
Epoch 04: Binary F1 = 0.9290, Macro F1 = 0.4394, Final Metric = 0.6842
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.8188, Macro F1 = 0.4062, Final Metric = 0.6125
Epoch 06: Binary F1 = 0.9722, Macro F1 = 0.4634, Final Metric = 0.7178
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9465, Macro F1 = 0.4743, Final Metric = 0.7104
Epoch 08: Binary F1 = 0.9619, Macro F1 = 0.4350, Final Metric = 0.6985
Epoch 09: Binary F1 = 0.9630, Macro F1 = 0.5196, Final Metric

Unnamed: 0_level_0,binary_f1,macro_f1,final_metric
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.990586,0.652396,0.821241
1,0.966027,0.583506,0.770661
2,0.968579,0.612552,0.790115
3,0.955545,0.560526,0.751522
4,0.974453,0.572151,0.771315


[I 2025-08-15 16:36:59,380] Trial 11 finished with value: 0.7809707413941991 and parameters: {'warmup_epochs': 10, 'cycle_mult': 1.7629531553395195, 'max_lr': 0.012002107801761525, 'max_to_min_div_factor': 100.0, 'init_cycle_epochs': 4, 'lr_cycle_factor': 0.974071581828462, 'weight_decay': 0.0007025264535144536, 'beta_0': 0.8039366856216432, 'beta_1': 0.9972985976966278}. Best is trial 9 with value: 0.7862613762820982.



Global Statistics (Best Metrics):
Mean Best Final Metric: 0.7810 ± 0.0263
Mean Best Binary F1: 0.9710 ± 0.0129
Mean Best Macro F1: 0.5962 ± 0.0369
Study statistics: 
  Number of finished trials:  12
  Number of pruned trials:  0
  Number of complete trials:  12
Best trial:
  Value:  0.7862613762820982
  Params: 
    warmup_epochs: 10
    cycle_mult: 1.7998502928518423
    max_lr: 0.0025181502225046317
    max_to_min_div_factor: 100.0
    init_cycle_epochs: 3
    lr_cycle_factor: 0.32010655568134283
    weight_decay: 0.0008280764974713931
    beta_0: 0.8135831041238173
    beta_1: 0.9986389373043834


## Submission

### Reloading best models

In [24]:
model_ensemble = []
for fold in range(5):
    model = mk_model(n_aux_classes=meta_data["n_aux_classes"])
    checkpoint = torch.load(f"best_model_fold{fold}.pth", map_location=device, weights_only=True)
    model.load_state_dict(checkpoint)
    model.eval()
    model_ensemble.append(model)

### Define prediction function

In [25]:
def preprocess_sequence_at_inference(sequence_df:pl.DataFrame) -> ndarray:
    return (
        sequence_df                     
        .to_pandas()                            # Convert to pandas dataframe.
        .pipe(imputed_features)                 # Impute missing data.
        .pipe(standardize_tof_cols_names)
        .pipe(norm_quat_rotations)              # Norm quaternions
        .pipe(add_linear_acc_cols)              # Add gravity free acceleration.
        .pipe(add_acc_magnitude, RAW_ACCELRATION_COLS, "acc_mag")
        .pipe(add_acc_magnitude, LINEAR_ACC_COLS, "linear_acc_mag")
        .pipe(add_quat_angle_mag)
        .pipe(add_angular_velocity_features)
        .pipe(rot_euler_angles)                 # Add rotation acc expressed as euler angles.
        .pipe(agg_tof_cols_per_sensor)          # Aggregate ToF columns.
        .pipe(add_diff_features)                # 
        .loc[:, sorted(meta_data["feature_cols"])]      # Retain only the usefull columns a.k.a features.
        # .sub(meta_data["mean"])                 # Subtract features by their mean, std norm pt.1.
        # .div(meta_data["std"])                  # Divide by Standard deviation, std norm pt.2.
        .pipe(length_normed_sequence_feat_arr, meta_data["pad_seq_len"], SEQ_PAD_TRUNC_MODE)  # get feature ndarray of sequence.
        .T                                      # Transpose to swap channel and X dimensions.
    )

def predict(sequence: pl.DataFrame, _: pl.DataFrame) -> str:
    """
    Kaggle evaluation API will call this for each sequence.
    sequence: polars DataFrame for a single sequence
    demographics: unused in this model
    Returns: predicted gesture string
    """
    x_tensor = (
        torch.unsqueeze(Tensor(preprocess_sequence_at_inference(sequence)), dim=0)
        .float()
        .to(device)
    )
    print(x_tensor.shape)

    all_outputs = []
    with torch.no_grad():
        for model_idx, model in enumerate(model_ensemble): # Only take the first one bc it's the only one that takes in the correct input shape
            outputs, _ = model(x_tensor)
            all_outputs.append(outputs)

    avg_outputs = torch.mean(torch.stack(all_outputs), dim=0)
    pred_idx = torch.argmax(avg_outputs, dim=1).item()

    return str(TARGET_NAMES[pred_idx])

### Run inference server

In [26]:
inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    competition_dataset_path = competition_download(COMPETITION_HANDLE)
    inference_server.run_local_gateway(
        data_paths=(
            join(competition_dataset_path, 'test.csv'),
            join(competition_dataset_path, 'test_demographics.csv'),
        )
    )
    inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)

                This exceeds the startup time limit of 900 seconds that the gateway will enforce
                during the rerun on the hidden test set. Start the server before performing any time consuming steps.


  0%|          | 0/5 [00:00<?, ?it/s]

torch.Size([1, 946, 127])


  0%|          | 0/5 [00:00<?, ?it/s]

torch.Size([1, 946, 127])
