# Training & inference notebook
Credit to [Tarun Mishra](https://www.kaggle.com/tarundirector) – this code is heavily based on his [notebook](https://www.kaggle.com/code/tarundirector/sensor-pulse-viz-eda-for-bfrb-detection?scriptVersionId=243465321).

## Setup

### imports

#### Training imports

In [26]:
import os
import gc
import json
import math
from glob import glob
from functools import partial
from datetime import datetime
from collections import Counter
from os.path import join, realpath
from typing import Optional, Literal
from itertools import pairwise, starmap

import torch
import kagglehub
import numpy as np
import pandas as pd
import plotly.express as px
from torch import nn, Tensor
from pandas import DataFrame as DF
from torch.utils.data import TensorDataset
from torch.optim import Optimizer
# from timm.scheduler import CosineLRScheduler
from torch.utils.data import DataLoader as DL
from rich.progress import Progress, Task, track
from torch.optim.lr_scheduler import ConstantLR, LRScheduler, _LRScheduler
metric_package = kagglehub.package_import('wasupandceacar/cmi-metric', bypass_confirmation=True)

#### inference imports

In [27]:
import os
import json
import warnings
from os.path import join
from tqdm.notebook import tqdm
from itertools import pairwise, product

import torch
import numpy as np
import pandas as pd
import polars as pl
from numpy import ndarray
from torch import nn, Tensor
from numpy.linalg import norm
from pandas import DataFrame as DF
from scipy.spatial.transform import Rotation
# from kagglehub import competition_download, dataset_download, model_download
import kagglehub
metric_package = kagglehub.package_import('wasupandceacar/cmi-metric', bypass_confirmation=True)

import training
import kaggle_evaluation.cmi_inference_server

#### kaggle notbook Imports

In [28]:
import random
import numpy as np
import torch
import os

def seed_everything(seed=42):
    """Set all random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    torch.use_deterministic_algorithms(True, warn_only=True)

SEED = 42
seed_everything(seed=SEED)

import pandas as pd
import polars as pl
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import kaggle_evaluation.cmi_inference_server
from matplotlib import pyplot as plt

### Configs

#### Training config

In [29]:
# Dataset
DATASET_HANDLE = "mauroabidalcarrer/prepocessed-cmi-2025/versions/34"
COMPETITION_HANDLE = "cmi-detect-behavior-with-sensor-data"
TARGET_NAMES = sorted([
    "Above ear - pull hair",
    "Cheek - pinch skin",
    "Eyebrow - pull hair",
    "Eyelash - pull hair",
    "Feel around in tray and pull out an object",
    "Forehead - pull hairline",
    "Forehead - scratch",
    "Neck - pinch skin",
    "Neck - scratch",
    "Text on phone",
    "Wave hello",
    "Write name in air",
    "Write name on leg",
    "Drink from bottle/cup",
    "Pinch knee/leg skin",
    "Pull air toward your face",
    "Scratch knee/leg skin",
    "Glasses on/off"
])
IMU_FEATS_PREFIXES = (
    "acc",
    "linear_acc",
    "rot",
    "angular",
    "euler",
    "quat_rot_mag",
    "delta_rot_mag",
)
# Data augmentation
JITTER = 0.25
SCALING = 0.2
MIXUP = 0.3
# Training loop
NB_CROSS_VALIDATIONS = 5
TRAIN_BATCH_SIZE = 256
VALIDATION_BATCH_SIZE = 4 * TRAIN_BATCH_SIZE
PATIENCE = 8
# Optimizer
WEIGHT_DECAY = 3e-3
# Scheduler
TRAINING_EPOCHS = 25 # Including warmup epochs
WARMUP_EPOCHS = 3
WARMUP_LR_INIT = 1.822126131809773e-05
MAX_TO_MIN_LR_DIV_FACTOR = 100
LR_CYCLE_FACTOR = 0.5
CYCLE_LENGTH_FACTOR = 0.9
INIT_CYCLE_EPOCHS = 6
# MIN_LR = 3.810323058740104e-09
# MAX_LR = 1e-3
# Mock training loop
MOCK_TRAINING_EPOCHS = 15
MOCK_TRAINING_GAMMA = 1.01

#### Preprocessing (for inference) config 

In [30]:
QUATERNION_COLS = ['rot_w', 'rot_x', 'rot_y', 'rot_z']
GRAVITY_WORLD = np.array([0, 0, 9.81], "float32")
RAW_ACCELRATION_COLS = ["acc_x", "acc_y", "acc_z"]
LINEAR_ACC_COLS = ["linear_" + col for col in RAW_ACCELRATION_COLS] # Acceleration without gravity
COMPETITION_HANDLE = "cmi-detect-behavior-with-sensor-data"
CATEGORY_COLUMNS = [
    'row_id',
    'sequence_type',
    'sequence_id',
    'subject',
    'orientation',
    'behavior',
    'phase',
    'gesture',
]
META_DATA_COLUMNS = [
    'row_id',
    'sequence_type',
    'sequence_id',
    'sequence_counter',
    'subject',
    'orientation',
    'behavior',
    'phase',
    'gesture',
]
DATASET_DF_DTYPES = {
    "acc_x": "float32", "acc_y": "float32", "acc_z": "float32",
    "thm_1":"float32", "thm_2":"float32", "thm_3":"float32", "thm_4":"float32", "thm_5":"float32",
    "sequence_counter": "int32",
    **{col: "category" for col in CATEGORY_COLUMNS},
    **{f"tof_{i_1}_v{i_2}": "float32" for i_1, i_2 in product(range(1, 5), range(64))},
}
PREPROCESSED_DATASET_HANDLE = "mauroabidalcarrer/prepocessed-cmi-2025"
# The quantile of the sequences len used to pad/truncate during preprocessing
SEQUENCE_NORMED_LEN_QUANTILE = 0.95
# SAMPLING_FREQUENCY = 10 #Hz
N_FOLDS = 5
VALIDATION_FRACTION = 0.2
TARGET_NAMES = sorted([
    "Above ear - pull hair",
    "Cheek - pinch skin",
    "Eyebrow - pull hair",
    "Eyelash - pull hair",
    "Feel around in tray and pull out an object",
    "Forehead - pull hairline",
    "Forehead - scratch",
    "Neck - pinch skin",
    "Neck - scratch",
    "Text on phone",
    "Wave hello",
    "Write name in air",
    "Write name on leg",
    "Drink from bottle/cup",
    "Pinch knee/leg skin",
    "Pull air toward your face",
    "Scratch knee/leg skin",
    "Glasses on/off"
])
EPSILON=1e-8
DELTA_ROTATION_ANGULAR_VELOCITY_COLS = ["angular_vel_x", "angular_vel_y", "angular_vel_z"]
DELTA_ROTATION_AXES_COLS = ["rotation_axis_x", "rotation_axis_y", "rotation_axis_z"]
EULER_ANGLES_COLS = ["euler_x", "euler_y", "euler_z"]
TOF_AGG_FUNCS = [
    "mean",
    "std",
    "min",
    "max",
    "median",
]

### Define function to get the feature columns
Feature columns change over time so it's better to have a function to get them than manually update a variable every time we add/remove features.

In [31]:
def get_feature_cols(df:DF) -> list[str]:
    return sorted(list(set(df.columns) - set(META_DATA_COLUMNS) - set(TARGET_NAMES)))

### Supress performance warngings

In [32]:
warnings.filterwarnings(
    "ignore",
    message=(
        "DataFrame is highly fragmented.  This is usually the result of "
        "calling `frame.insert` many times.*"
    ),
    category=pd.errors.PerformanceWarning,
)

### device setup

In [33]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Dataset Setup

#### Dataset class

In [34]:
class CMIDataset(TensorDataset):
    def __init__(
        self,
        parent_dir: str,
        split: Optional[Literal["train", "validation"]]=None,
        subset: Optional[int]=None,
        force_download=False
    ):
        dataset_path = kagglehub.dataset_download(DATASET_HANDLE, force_download)
        parent_dir = join(dataset_path, "preprocessed_dataset", parent_dir)
        split = "" if split is None else split + "_"
        x = np.load(join(parent_dir, f"{split}X.npy")).swapaxes(1, 2)
        y = np.load(join(parent_dir, f"{split}Y.npy"))
        if subset is not None:
            x = x[:subset]
            y = y[:subset]
        super().__init__(
            torch.from_numpy(x).to(device),
            torch.from_numpy(y).to(device),
        )

#### Meta data loading

In [35]:
dataset_path = kagglehub.dataset_download(DATASET_HANDLE)
meta_data_path = join(
    dataset_path,
    "preprocessed_dataset",
    "full_dataset_meta_data.json"
)
with open(meta_data_path, "r") as fp:
    meta_data = json.load(fp)
# Convert target names into a ndarray to index it batchwise.
is_thm_tof_feat = lambda feat: feat.startswith(("thm", "tof"))
non_imu_feats_idx = [feat_idx for feat_idx, feat in enumerate(meta_data["feature_cols"]) if is_thm_tof_feat(feat)]
imu_feats_idx = [feat_idx for feat_idx, feat in enumerate(meta_data["feature_cols"]) if not is_thm_tof_feat(feat)]
print(meta_data["feature_cols"])
print("non_imu_feats_idx:", non_imu_feats_idx)
print("imu_feats_idx:", imu_feats_idx)

['acc_mag', 'acc_mag_diff', 'acc_x', 'acc_x_diff', 'acc_y', 'acc_y_diff', 'acc_z', 'acc_z_diff', 'angular_vel_x', 'angular_vel_x_diff', 'angular_vel_y', 'angular_vel_y_diff', 'angular_vel_z', 'angular_vel_z_diff', 'delta_rot_mag', 'delta_rot_mag_diff', 'euler_x', 'euler_x_diff', 'euler_y', 'euler_y_diff', 'euler_z', 'euler_z_diff', 'linear_acc_mag', 'linear_acc_mag_diff', 'linear_acc_x', 'linear_acc_x_diff', 'linear_acc_y', 'linear_acc_y_diff', 'linear_acc_z', 'linear_acc_z_diff', 'quat_rot_mag', 'quat_rot_mag_diff', 'rot_w', 'rot_w_diff', 'rot_x', 'rot_x_diff', 'rot_y', 'rot_y_diff', 'rot_z', 'rot_z_diff', 'rotation_axis_x', 'rotation_axis_x_diff', 'rotation_axis_y', 'rotation_axis_y_diff', 'rotation_axis_z', 'rotation_axis_z_diff', 'thm_1', 'thm_1_diff', 'thm_2', 'thm_2_diff', 'thm_3', 'thm_3_diff', 'thm_4', 'thm_4_diff', 'thm_5', 'thm_5_diff', 'tof_1_v0', 'tof_1_v0_diff', 'tof_1_v1', 'tof_1_v10', 'tof_1_v10_diff', 'tof_1_v11', 'tof_1_v11_diff', 'tof_1_v12', 'tof_1_v12_diff', 'tof_1_

#### Compute class weights.

In [36]:
def compute_weighted_cross_entropy_loss(
    dataset: Dataset[tuple[torch.Tensor, torch.Tensor]]
) -> nn.CrossEntropyLoss:
    """
    Computes class weights from a dataset with one-hot encoded targets and returns a CrossEntropyLoss with those weights.

    Args:
        dataset: A PyTorch Dataset that yields (x, y) where y is a one-hot encoded tensor of shape (num_classes,)

    Returns:
        A torch.nn.CrossEntropyLoss object with class weights based on inverse class frequency.
    """
    class_counts: Counter = Counter()
    num_samples = 0

    for _, y in dataset:
        class_idx = y.argmax().item()
        class_counts[class_idx] += 1
        num_samples += 1

    num_classes = len(class_counts)
    weights = torch.tensor(
        [num_samples / class_counts[i] for i in range(num_classes)],
        dtype=torch.float32,
    )

    # Optional: normalize weights so they sum to 1
    weights = weights / weights.sum()

    return nn.CrossEntropyLoss(weight=weights.to(device))


### BFRBs indices

In [37]:
print("Loading datasets...")
competition_dataset_path = kagglehub.competition_download(COMPETITION_HANDLE)
train_df = pd.read_csv(join(competition_dataset_path, "train.csv"))
train_dem_df = pd.read_csv(join(competition_dataset_path, "train_demographics.csv"))
test_df = pd.read_csv(join(competition_dataset_path, "test.csv"))
test_dem_df = pd.read_csv(join(competition_dataset_path, "test_demographics.csv"))
print(f"Train rows: {len(train_df)}, Test rows: {len(test_df)}")

# Encode labels
label_encoder = LabelEncoder()
train_df['gesture'] = label_encoder.fit_transform(train_df['gesture'].astype(str))
gesture_classes = label_encoder.classes_

bfrb_gestures = [
    'Above ear - pull hair',
    'Forehead - pull hairline',
    'Forehead - scratch',
    'Eyebrow - pull hair',
    'Eyelash - pull hair',
    'Neck - pinch skin',
    'Neck - scratch',
    'Cheek - pinch skin'
]
bfrb_indices = label_encoder.transform(bfrb_gestures)

Loading datasets...
Train rows: 574945, Test rows: 107


### Scheduler

In [38]:
class CosineAnnealingWarmupRestarts(_LRScheduler):
    def __init__(
        self,
        optimizer: Optimizer,
        warmup_steps: int,
        max_lr: float,
        min_lr: float,
        cycle_length: int,
        cycle_mult: float = 1.0,
        gamma: float = 1.0,
        last_epoch: int = -1,
    ) -> None:
        """
        Args:
            optimizer: Wrapped optimizer.
            warmup_steps: Number of steps for linear warmup.
            max_lr: Initial maximum learning rate.
            min_lr: Minimum learning rate after decay.
            cycle_length: Initial number of steps per cosine cycle.
            cycle_mult: Multiplicative factor for increasing cycle lengths.
            gamma: Multiplicative decay factor for max_lr after each cycle.
            last_epoch: The index of last epoch. Default: -1.
        """
        self.warmup_steps = warmup_steps
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.cycle_length = cycle_length
        self.cycle_mult = cycle_mult
        self.gamma = gamma

        self.current_cycle = 0
        self.cycle_step = 0
        self.lr = max_lr

        super().__init__(optimizer, last_epoch)

    def get_lr(self) -> list[float]:
        if self.last_epoch < self.warmup_steps:
            # Linear warmup
            scale = (self.last_epoch + 1) / self.warmup_steps
            return [self.min_lr + scale * (self.max_lr - self.min_lr) for _ in self.base_lrs]

        # Adjust for post-warmup step index
        t = self.cycle_step
        T = self.cycle_length

        cosine_decay = 0.5 * (1 + math.cos(math.pi * t / T))
        lr = self.min_lr + (self.max_lr - self.min_lr) * cosine_decay

        return [lr for _ in self.base_lrs]

    def step(self, epoch: Optional[int] = None) -> None:
        if self.last_epoch >= self.warmup_steps:
            self.cycle_step += 1
            if self.cycle_step >= self.cycle_length:
                self.current_cycle += 1
                self.cycle_step = 0
                self.cycle_length = max(int(self.cycle_length * self.cycle_mult), 1)
                self.max_lr *= self.gamma
        super().step(epoch)


## Model definition

In [39]:
class MultiScaleConvs(nn.Module):
    def __init__(self, in_channels:int, kernel_sizes:list[int]):
        super().__init__()
        def mk_conv_block(k_size) -> nn.Sequential:
            return nn.Sequential(
                nn.Conv1d(in_channels, in_channels, k_size, padding=k_size // 2, groups=in_channels),
                nn.BatchNorm1d(in_channels),
                nn.ReLU(),
            )
        self.convs = nn.ModuleList(map(mk_conv_block, kernel_sizes))

    def forward(self, x:Tensor) -> Tensor:
        yes = torch.cat([conv(x) for conv in self.convs] + [x], dim=1)
        # print("stem output shape:", yes.shape)
        return yes

class ImuFeatureExtractor(nn.Module):
    def __init__(self, in_channels:int, kernel_size:int=15):
        super().__init__()

        self.lpf = nn.Conv1d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            padding=kernel_size//2,
            groups=in_channels,
            bias=False,
        )
        nn.init.kaiming_uniform_(self.lpf.weight, a=math.sqrt(5))

    def forward(self, x:Tensor) -> Tensor:
        lpf_output = self.lpf(x)
        hpf_output = x - lpf_output
        return torch.cat((lpf_output, hpf_output, x), dim=1)  # (B, C_out, T)

class SqueezeExcitationBlock(nn.Module):
    # Copy/paste of https://www.kaggle.com/code/wasupandceacar/lb-0-82-5fold-single-bert-model#Model implementation
    def __init__(self, channels:int, reduction:int=8):
        super().__init__()
        self.fc1 = nn.Linear(channels, channels // reduction, bias=True)
        self.fc2 = nn.Linear(channels // reduction, channels, bias=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x: (B, C, L)
        se = F.adaptive_avg_pool1d(x, 1).squeeze(-1)      # -> (B, C)
        se = F.relu(self.fc1(se), inplace=True)          # -> (B, C//r)
        se = self.sigmoid(self.fc2(se)).unsqueeze(-1)    # -> (B, C, 1)
        return x * se

class ResidualBlock(nn.Module):
    def __init__(self, in_chns:int, out_chns:int, dropout_ratio:float=0.3, se_reduction:int=8, kernel_size:int=3):
        super().__init__()
        self.blocks = nn.Sequential(
            nn.Conv1d(in_chns, out_chns, kernel_size=kernel_size, padding=kernel_size // 2, bias=False),
            nn.BatchNorm1d(out_chns),
            nn.ReLU(),
            nn.Conv1d(out_chns, out_chns, kernel_size=kernel_size, padding=kernel_size // 2, bias=False),
            nn.BatchNorm1d(out_chns),
            SqueezeExcitationBlock(out_chns, se_reduction),
        )
        self.head = nn.Sequential(nn.ReLU(), nn.Dropout(dropout_ratio))
        if in_chns == out_chns:
            self.skip_connection = nn.Identity() 
        else:
            # TODO: set bias to False ?
            self.skip_connection = nn.Sequential(
                nn.Conv1d(in_chns, out_chns, 1, bias=False),
                nn.BatchNorm1d(out_chns)
            )
            self.head.insert(1, nn.MaxPool1d(2))

    def forward(self, x:Tensor) -> Tensor:
        activaition_maps = self.skip_connection(x) + self.blocks(x)
        return self.head(activaition_maps)

class MBConvBlock(nn.Module):
    # From this schema: https://media.licdn.com/dms/image/v2/D5612AQFjbDOm5uyxdw/article-inline_image-shrink_1500_2232/article-inline_image-shrink_1500_2232/0/1683677500817?e=1758153600&v=beta&t=n48_UW5TZTyDPhRFlJXSidUQQPQpuC756M0kNeKmYTY
    def __init__(self, in_chns:int, out_chns:int, se_reduction:int=8, expansion_ratio:int=4, dropout_ratio:float=0.3):
        super().__init__()
        expanded_channels = in_chns * expansion_ratio
        self.blocks = nn.Sequential(
            nn.Conv1d(in_chns, expanded_channels, kernel_size=1, bias=False),
            nn.BatchNorm1d(expanded_channels),
            nn.ReLU(),
            nn.Conv1d(
                expanded_channels,
                expanded_channels,
                kernel_size=3,
                padding=1,
                groups=expanded_channels,
                bias=False,
            ),
            nn.BatchNorm1d(expanded_channels),
            nn.ReLU(),
            SqueezeExcitationBlock(expanded_channels, se_reduction),
            nn.Conv1d(expanded_channels, out_chns, kernel_size=1, bias=False)
        )
        self.head = nn.Sequential(
            nn.BatchNorm1d(out_chns)
            # nn.ReLU(),
            # nn.Dropout(dropout_ratio),
        )
        if in_chns == out_chns:
            self.skip_connection = nn.Identity() 
        else:
            # TODO: set bias to False ?
            self.skip_connection = nn.Sequential(
                nn.Conv1d(in_chns, out_chns, 1, bias=False),
                nn.BatchNorm1d(out_chns)
            )
            self.head.add_module("max_pool", nn.MaxPool1d(2))
            
    def forward(self, x:Tensor) -> Tensor:
        activaition_maps = self.skip_connection(x) + self.blocks(x)
        return self.head(activaition_maps)

class AdditiveAttentionLayer(nn.Module):
    # Copied (and slightly modified) from https://www.kaggle.com/code/myso1987/cmi3-pyroch-baseline-model-add-aug-folds
    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1, bias=True)

    def forward(self, x: Tensor) -> Tensor:
        # x shape: (batch, channels, seq_len)
        x = x.swapaxes(1, 2)
        # x shape: (batch, seq_len, hidden_dim)
        scores = torch.tanh(self.attention(x))  # (batch, seq_len, 1)
        weights = F.softmax(scores.squeeze(-1), dim=1)  # (batch, seq_len)
        context = torch.sum(x * weights.unsqueeze(-1), dim=1)  # (batch, hidden_dim)
        return context

class CMIHARModule(nn.Module):
    def __init__(
            self,
            imu_idx:list[int],
            tof_thm_idx:list[int],
            mlp_width:int,
            n_class:int,            
            tof_thm_dropout_ratio:float=0,
        ):
        super().__init__()
        self.imu_idx = imu_idx
        self.tof_thm_idx = tof_thm_idx
        self.imu_branch = nn.Sequential(
            ImuFeatureExtractor(len(imu_idx)),
            ResidualBlock(len(imu_idx) * 3, 64),
            ResidualBlock(64, 128),
        )
        self.tof_and_thm_branch = nn.Sequential(
            nn.Conv1d(len(tof_thm_idx), 64, 3, padding=1, bias=False),
            nn.BatchNorm1d(64),
            nn.MaxPool1d(2),
            nn.Dropout(tof_thm_dropout_ratio),
            nn.Conv1d(64, 128, 3, padding=1, bias=False),
            nn.BatchNorm1d(128),
            nn.MaxPool1d(2),
            nn.Dropout(tof_thm_dropout_ratio),
        )
        self.lstm = nn.GRU(128 * 2, mlp_width // 2, bidirectional=True)
        self.attention = AdditiveAttentionLayer(mlp_width)
        self.head = nn.Sequential(
            # Head
            nn.LazyLinear(mlp_width, bias=False),
            nn.BatchNorm1d(mlp_width),
            nn.ReLU(),
            nn.Linear(mlp_width, mlp_width // 2, bias=False),
            nn.BatchNorm1d(mlp_width // 2),
            nn.ReLU(),
            nn.Linear(mlp_width // 2, n_class),
        )

    def forward(self, x:Tensor) -> Tensor:
        imu_activation_maps = self.imu_branch(x[:, self.imu_idx])
        tof_thm_activation_maps = self.tof_and_thm_branch(x[:, self.tof_thm_idx])
        concatenated_activation_maps = torch.cat((imu_activation_maps, tof_thm_activation_maps), 1)
        lstm_output, _  = self.lstm(concatenated_activation_maps.swapaxes(1, 2))
        lstm_output = lstm_output.swapaxes(1, 2) # redundant
        attended = self.attention(lstm_output)
        return self.head(attended)

In [40]:
len(imu_feats_idx)

46

### Create model function

In [41]:
def mk_model() -> nn.Module:
    return (
        CMIHARModule(
            imu_idx=imu_feats_idx,
            tof_thm_idx=non_imu_feats_idx,
            mlp_width=256,
            n_class=18,
        )
        .to(device)
    )

display(mk_model())
print("input channels:", len(meta_data["feature_cols"]))

CMIHARModule(
  (imu_branch): Sequential(
    (0): ImuFeatureExtractor(
      (lpf): Conv1d(46, 46, kernel_size=(15,), stride=(1,), padding=(7,), groups=46, bias=False)
    )
    (1): ResidualBlock(
      (blocks): Sequential(
        (0): Conv1d(138, 64, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
        (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
        (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): SqueezeExcitationBlock(
          (fc1): Linear(in_features=64, out_features=8, bias=True)
          (fc2): Linear(in_features=8, out_features=64, bias=True)
          (sigmoid): Sigmoid()
        )
      )
      (head): Sequential(
        (0): ReLU()
        (1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (2): Dropout(p=0.3, inplace=False)

input channels: 696


## Training loop

In [42]:
def fit(epochs:int,
        model: nn.Module,
        scheduler: LRScheduler,
        optimizer: torch.optim.Optimizer,
        train_loader: DL,
        criterion: callable=nn.L1Loss(),
        evaluation_func: callable=None,
        validation_loader: DL=None,
        save_checkpoints=True,
    ) -> tuple[DF, str]:
    """
    Returns:
        (training_metrics, path_to_checkpoints)
    """
    # Setup
    metrics: list[dict] = []
    step = 0
    model_device = next(model.parameters()).device
    last_epoch_metric = {}
    # Training loop
    with Progress() as progress:
        task: Task = progress.add_task(
            "training...",
            total=len(train_loader),
        )
        for epoch in range(epochs):
            progress.update(
                task,
                description=f"epoch: {epoch}",
                completed=0,
            )
            total_epoch_loss = 0
            total_accuracy = 0
            for batch_idx, (x, y) in enumerate(train_loader):
                # forward
                x = x.to(model_device)
                y = y.to(model_device)
                model.train()
                optimizer.zero_grad()
                y_pred: Tensor = model(x)
                loss_value = criterion(y_pred, y)
                # Verify loss value
                if torch.isnan(loss_value).any().item():
                    progress.print("Warning: Got NaN loss, stopped training.")
                    return DF.from_records(metrics)
                if torch.isinf(loss_value).any().item():
                    progress.print("Warning: Got infinite loss, stopped training.")
                    return DF.from_records(metrics)
                # TODO: Use gradient clipping?
                loss_value.backward()
                optimizer.step()
                if step > 0: # If it's not the first training step, idk why it throws an error otherwise
                    scheduler.step()
                # metrics
                total_epoch_loss += loss_value.item()
                metrics.append({
                    "step": step,
                    "epoch": epoch,
                    "batch_train_loss": loss_value.item(),
                    "lr": optimizer.state_dict()["param_groups"][-1]["lr"],
                })
                step += 1
                if "validation_accuracy" in last_epoch_metric:
                    last_validation_acc = "%.2f" % last_epoch_metric["validation_accuracy"]
                    val_acc_str = "val. acc: " + last_validation_acc
                else:
                    val_acc_str = ""
                progress.update(
                    task,
                    advance=1,
                    description=f"epoch: {epoch}, batch_loss: {(total_epoch_loss / (batch_idx+1)):.2f}, {val_acc_str}"
                )
            # Post epoch evalution
            metrics[-1]["train_epoch_loss"] = total_epoch_loss / len(train_loader)
            metrics[-1]["train_epoch_accuracy"] = total_accuracy / len(train_loader)
            if evaluation_func:
                progress.update(
                    task,
                    completed=0,
                    description=f"epoch: {epoch}, evaluating..."
                )
                eval_metrics = evaluation_func(model, criterion, validation_loader)
                metrics[-1].update(eval_metrics)
            last_epoch_metric = metrics[-1]

    return DF.from_records(metrics)

### Create model and train model

In [43]:
def mk_model_and_fit(
        train_loader:DL,
        mk_scheduler:callable,
        epochs:int,
        validation_loader:Optional[DL]=None,
        save_checkpoints=False,
        criterion=nn.CrossEntropyLoss()
    ) -> tuple[nn.Module, DF, list[str]]:
    model = mk_model()
    optimizer = torch.optim.AdamW(model.parameters(), WARMUP_LR_INIT)
    lr_scheduler = mk_scheduler(optimizer)
    training_metrics = fit(
        epochs=epochs,
        model=model,
        scheduler=lr_scheduler,
        optimizer=optimizer,
        train_loader=train_loader,
        criterion=criterion,
        # evaluation_func=evaluate_model if validation_loader else None,
        validation_loader=validation_loader,
        save_checkpoints=save_checkpoints,
    )

    return model, training_metrics

## Search max learning rate

In [44]:
def post_process_mock_training_metrics(training_metrics:DF) -> DF:
    training_metrics = (
        training_metrics
        .query("batch_train_loss.notna()")
        .set_index("lr", drop=False)
        .sort_index()
    )
    training_metrics["ewm_batch_train_loss"] = (
        training_metrics
        .ewm(com=30, ignore_na=False)
        ["batch_train_loss"]
        .mean()
    )
    training_metrics["ewm_batch_train_loss_diff"] = training_metrics["ewm_batch_train_loss"].diff()
    return training_metrics

In [45]:
def plt_lr_search_training_metrics(training_metrics:DF):
    (    
        px.line(
            (
                training_metrics
                .reset_index(drop=True)
                .melt(
                    id_vars="lr",
                    value_vars=[
                        "batch_train_loss",
                        "ewm_batch_train_loss",
                        # "ewm_batch_train_loss_diff",
                    ],
                )
            ),
            x="lr",
            facet_row="variable",
            y="value",
            log_x=True,
            log_y=True,
            height=750,
        )
        .update_yaxes(matches=None)
        .show()
    )

In [46]:
train_dataset = CMIDataset("full_dataset")
full_dataset_loader = DL(train_dataset, TRAIN_BATCH_SIZE, shuffle=True)
_, mock_training_metrics = mk_model_and_fit(
    full_dataset_loader,
    partial(torch.optim.lr_scheduler.ExponentialLR, gamma=MOCK_TRAINING_GAMMA),
    MOCK_TRAINING_EPOCHS,
    criterion= nn.CrossEntropyLoss(label_smoothing=0.1),
)
mock_training_metrics = post_process_mock_training_metrics(mock_training_metrics)
plt_lr_search_training_metrics(mock_training_metrics)
max_lr = mock_training_metrics["ewm_batch_train_loss"].idxmin()
print("Maximum learning rate:", max_lr)

Output()

Maximum learning rate: 0.002056984330154654


## Training loop

In [47]:
def mixup_data(x, y, alpha=0.2):
    """
    Return mixed inputs and mixed targets (one-hot) for mixup.
    x: Tensor of shape (batch_size, features, seq_len)
    y: Tensor of shape (batch_size, num_classes)
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1.0
    batch_size = x.size(0)
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    mixed_y = lam * y + (1 - lam) * y[index, :]
    return mixed_x, mixed_y

In [None]:
seed_everything(seed=SEED)

n_splits = 5

fold_metrics = []
best_fold_metrics = []
best_models = []

fold_patterns = join(dataset_path, "preprocessed_dataset", "fold*")
fold_pths = glob(fold_patterns)
all_training_metrics = {}

for fold, fold_pth in enumerate(fold_pths):
    print("training:", fold + 1)
    train_dataset = CMIDataset(fold_pth, "train")
    # criterion = compute_weighted_cross_entropy_loss(train_dataset)
    criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
    train_loader = DL(train_dataset, TRAIN_BATCH_SIZE, shuffle=True, drop_last=True)
    validation_dataset = CMIDataset(fold_pth, "validation")
    validation_loader = DL(validation_dataset, VALIDATION_BATCH_SIZE, shuffle=False, drop_last=True)
    print(f"\n{'='*50}")
    print(f"Fold {fold + 1}/{n_splits}")

    seed_everything(seed=SEED + fold)
    model = mk_model()

    # Optimizer et scheduler
    min_lr = max_lr / 100
    optimizer = torch.optim.AdamW(
        model.parameters(),
        WARMUP_LR_INIT,
        weight_decay=WEIGHT_DECAY,
    )
    steps_per_epoch = len(train_loader)
    scheduler = CosineAnnealingWarmupRestarts(
        optimizer,
        warmup_steps=WARMUP_EPOCHS * steps_per_epoch,
        cycle_mult=CYCLE_LENGTH_FACTOR,
        max_lr = max_lr,
        min_lr = max_lr / MAX_TO_MIN_LR_DIV_FACTOR,
        cycle_length=INIT_CYCLE_EPOCHS * steps_per_epoch,
        gamma=LR_CYCLE_FACTOR,
    ) 


    # Early stopping
    best_metric = -np.inf
    best_binary_f1 = -np.inf
    best_macro_f1 = -np.inf
    epochs_no_improve = 0

    for epoch in range(1, TRAINING_EPOCHS + 1):
        # Training phase
        model.train()
        train_loss = 0.0
        total = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device).clone()
            add_noise = torch.randn_like(batch_x, device=device) * 0.04
            scale_noise = torch.rand_like(batch_x, device=device) * (1.1 - 0.9) + 0.9
            batch_x = (add_noise + batch_x) * scale_noise
            batch_x[:TRAIN_BATCH_SIZE // 2, non_imu_feats_idx] = 0.0
            batch_y = batch_y.to(device)
            batch_x = batch_x.float()
            
            batch_x, batch_y = mixup_data(batch_x, batch_y)
           
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss += loss.item() * batch_x.size(0)
            total += batch_x.size(0)
        train_loss /= total

        # Validation phase
        model.eval()
        val_loss = 0.0
        total = 0
        all_true = []
        all_pred = []

        with torch.no_grad():
            for batch_x, batch_y in validation_loader:
                batch_x = batch_x.to(device).clone()
                batch_y = batch_y.to(device)
                batch_x[:VALIDATION_BATCH_SIZE // 2, non_imu_feats_idx] = 0.0

                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item() * batch_x.size(0)
                total += batch_x.size(0)

                # Get predicted class indices
                preds = torch.argmax(outputs, dim=1).cpu().numpy()
                # Get true class indices from one-hot
                trues = torch.argmax(batch_y, dim=1).cpu().numpy()

                all_true.append(trues)
                all_pred.append(preds)

        val_loss /= total
        all_true = np.concatenate(all_true)
        all_pred = np.concatenate(all_pred)

        # Compute competition metrics
        # Binary classification: BFRB (1) vs non-BFRB (0)
        binary_true = np.isin(all_true, bfrb_indices).astype(int)
        binary_pred = np.isin(all_pred, bfrb_indices).astype(int)
        binary_f1 = f1_score(binary_true, binary_pred)

        # Collapse non-BFRB gestures into a single class
        collapsed_true = np.where(
            np.isin(all_true, bfrb_indices),
            all_true,
            len(bfrb_gestures)  # Single non-BFRB class
        )
        collapsed_pred = np.where(
            np.isin(all_pred, bfrb_indices),
            all_pred,
            len(bfrb_gestures)  # Single non-BFRB class
        )

        # Macro F1 on collapsed classes
        macro_f1 = f1_score(collapsed_true, collapsed_pred, average='macro')
        final_metric = (binary_f1 + macro_f1) / 2

        print(f"Epoch {epoch:02d}: Binary F1 = {binary_f1:.4f}, Macro F1 = {macro_f1:.4f}, Final Metric = {final_metric:.4f}")

        if final_metric > best_metric:
            best_metric = final_metric
            best_binary_f1 = binary_f1
            best_macro_f1 = macro_f1
            epochs_no_improve = 0
            best_model_state = model.state_dict()
            print(f"  New best metric! Saving model...")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print(f"Early stopping triggered at epoch {epoch}")
                model.load_state_dict(best_model_state)
                break

    torch.save(best_model_state, f"best_model_fold{fold}.pth")
    # Free memory used by datasets and data loaders
    del train_dataset
    del validation_dataset
    del train_loader
    del validation_loader
    gc.collect()
    torch.cuda.empty_cache()

    best_models.append(best_model_state)
    fold_metrics.append({
        'binary_f1': binary_f1,
        'macr, drop_last=Trueo_f1': macro_f1,
        'final_metric': final_metric
    })
    
    best_fold_metrics.append({
        'binary_f1': best_binary_f1,
        'macro_f1': best_macro_f1,
        'final_metric': best_metric
    })
    
    print(f"\nFold {fold + 1} completed.")
    print(f"Final validation metrics - Binary F1: {binary_f1:.4f}, Macro F1: {macro_f1:.4f}, Final: {final_metric:.4f}")
    print(f"Best validation metrics - Binary F1: {best_binary_f1:.4f}, Macro F1: {best_macro_f1:.4f}, Final: {best_metric:.4f}")

print("\n" + "="*50)
print("Cross-Validation Results")
print("="*50)

# Statistiques pour les meilleures métriques
best_binary_f1 = [m['binary_f1'] for m in best_fold_metrics]
best_macro_f1 = [m['macro_f1'] for m in best_fold_metrics]
best_metrics = [m['final_metric'] for m in best_fold_metrics]

print("\nBest Fold-wise Metrics:")
for i, (bf1, mf1, fm) in enumerate(zip(best_binary_f1, best_macro_f1, best_metrics)):
    print(f"Fold {i+1}: Binary F1 = {bf1:.4f}, Macro F1 = {mf1:.4f}, Final = {fm:.4f}")

print("\nGlobal Statistics (Best Metrics):")
print(f"Mean Best Final Metric: {np.mean(best_metrics):.4f} ± {np.std(best_metrics):.4f}")
print(f"Mean Best Binary F1: {np.mean(best_binary_f1):.4f} ± {np.std(best_binary_f1):.4f}")
print(f"Mean Best Macro F1: {np.mean(best_macro_f1):.4f} ± {np.std(best_macro_f1):.4f}")

training: 1

Fold 1/5
Epoch 01: Binary F1 = 0.8023, Macro F1 = 0.2454, Final Metric = 0.5239
  New best metric! Saving model...
Epoch 02: Binary F1 = 0.9262, Macro F1 = 0.3530, Final Metric = 0.6396
  New best metric! Saving model...
Epoch 03: Binary F1 = 0.9594, Macro F1 = 0.4270, Final Metric = 0.6932
  New best metric! Saving model...
Epoch 04: Binary F1 = 0.9328, Macro F1 = 0.4936, Final Metric = 0.7132
  New best metric! Saving model...
Epoch 05: Binary F1 = 0.9680, Macro F1 = 0.5231, Final Metric = 0.7456
  New best metric! Saving model...
Epoch 06: Binary F1 = 0.9720, Macro F1 = 0.5569, Final Metric = 0.7645
  New best metric! Saving model...
Epoch 07: Binary F1 = 0.9711, Macro F1 = 0.5365, Final Metric = 0.7538
Epoch 08: Binary F1 = 0.9711, Macro F1 = 0.5711, Final Metric = 0.7711
  New best metric! Saving model...
Epoch 09: Binary F1 = 0.9734, Macro F1 = 0.5808, Final Metric = 0.7771
  New best metric! Saving model...
Epoch 10: Binary F1 = 0.9750, Macro F1 = 0.5592, Final Metr

## Submission

### Reloading best model

In [None]:
model_ensemble = []
for fold in range(5):
    model = mk_model()
    checkpoint = torch.load(f"best_model_fold{fold}.pth", map_location=device, weights_only=True)
    model.load_state_dict(checkpoint)
    model.eval()
    model_ensemble.append(model)

RuntimeError: Error(s) in loading state_dict for CMIHARModule:
	Missing key(s) in state_dict: "imu_branch.0.lpf.weight", "imu_branch.2.blocks.0.weight", "imu_branch.2.blocks.1.weight", "imu_branch.2.blocks.1.bias", "imu_branch.2.blocks.1.running_mean", "imu_branch.2.blocks.1.running_var", "imu_branch.2.blocks.3.weight", "imu_branch.2.blocks.4.weight", "imu_branch.2.blocks.4.bias", "imu_branch.2.blocks.4.running_mean", "imu_branch.2.blocks.4.running_var", "imu_branch.2.blocks.5.fc1.weight", "imu_branch.2.blocks.5.fc1.bias", "imu_branch.2.blocks.5.fc2.weight", "imu_branch.2.blocks.5.fc2.bias", "imu_branch.2.skip_connection.0.weight", "imu_branch.2.skip_connection.1.weight", "imu_branch.2.skip_connection.1.bias", "imu_branch.2.skip_connection.1.running_mean", "imu_branch.2.skip_connection.1.running_var", "head.1.weight", "head.1.bias", "head.1.running_mean", "head.1.running_var", "head.3.weight", "head.4.running_mean", "head.4.running_var", "head.6.weight", "head.6.bias". 
	Unexpected key(s) in state_dict: "imu_branch.0.blocks.0.weight", "imu_branch.0.blocks.0.bias", "imu_branch.0.blocks.1.weight", "imu_branch.0.blocks.1.bias", "imu_branch.0.blocks.1.running_mean", "imu_branch.0.blocks.1.running_var", "imu_branch.0.blocks.1.num_batches_tracked", "imu_branch.0.blocks.3.weight", "imu_branch.0.blocks.3.bias", "imu_branch.0.blocks.4.weight", "imu_branch.0.blocks.4.bias", "imu_branch.0.blocks.4.running_mean", "imu_branch.0.blocks.4.running_var", "imu_branch.0.blocks.4.num_batches_tracked", "imu_branch.0.blocks.5.fc1.weight", "imu_branch.0.blocks.5.fc1.bias", "imu_branch.0.blocks.5.fc2.weight", "imu_branch.0.blocks.5.fc2.bias", "imu_branch.0.skip_connection.0.weight", "imu_branch.0.skip_connection.0.bias", "imu_branch.0.skip_connection.1.weight", "imu_branch.0.skip_connection.1.bias", "imu_branch.0.skip_connection.1.running_mean", "imu_branch.0.skip_connection.1.running_var", "imu_branch.0.skip_connection.1.num_batches_tracked", "imu_branch.1.blocks.0.bias", "imu_branch.1.blocks.3.bias", "imu_branch.1.skip_connection.0.bias", "head.0.bias", "head.2.weight", "head.2.bias". 
	size mismatch for imu_branch.1.blocks.0.weight: copying a param with shape torch.Size([128, 64, 3]) from checkpoint, the shape in current model is torch.Size([64, 138, 3]).
	size mismatch for imu_branch.1.blocks.1.weight: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.blocks.1.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.blocks.1.running_mean: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.blocks.1.running_var: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.blocks.3.weight: copying a param with shape torch.Size([128, 128, 3]) from checkpoint, the shape in current model is torch.Size([64, 64, 3]).
	size mismatch for imu_branch.1.blocks.4.weight: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.blocks.4.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.blocks.4.running_mean: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.blocks.4.running_var: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.blocks.5.fc1.weight: copying a param with shape torch.Size([16, 128]) from checkpoint, the shape in current model is torch.Size([8, 64]).
	size mismatch for imu_branch.1.blocks.5.fc1.bias: copying a param with shape torch.Size([16]) from checkpoint, the shape in current model is torch.Size([8]).
	size mismatch for imu_branch.1.blocks.5.fc2.weight: copying a param with shape torch.Size([128, 16]) from checkpoint, the shape in current model is torch.Size([64, 8]).
	size mismatch for imu_branch.1.blocks.5.fc2.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.skip_connection.0.weight: copying a param with shape torch.Size([128, 64, 1]) from checkpoint, the shape in current model is torch.Size([64, 138, 1]).
	size mismatch for imu_branch.1.skip_connection.1.weight: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.skip_connection.1.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.skip_connection.1.running_mean: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for imu_branch.1.skip_connection.1.running_var: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
	size mismatch for head.4.weight: copying a param with shape torch.Size([18, 128]) from checkpoint, the shape in current model is torch.Size([128]).
	size mismatch for head.4.bias: copying a param with shape torch.Size([18]) from checkpoint, the shape in current model is torch.Size([128]).

### Define preprocessing function

In [None]:
def get_fillna_val_per_feature_col(df:DF) -> dict:
    return {col: 1.0 if col == 'rot_w' else 0 for col in get_feature_cols(df)}

def imputed_features(df:DF) -> DF:
    # Missing ToF values are already imputed by -1 which is inconvinient since we want all missing values to be NaN.    
    # So we replace them by NaN and then perform imputing.  
    tof_vals_to_nan = {col: -1.0 for col in df.columns if col.startswith("tof")}

    df[get_feature_cols(df)] = (
        df
        .loc[:, get_feature_cols(df)]
        # df.replace with np.nan sets dtype to floar64 so we set it back to float32
        .replace(tof_vals_to_nan, value=np.nan)
        .astype("float32")
        .groupby(df["sequence_id"], observed=True, as_index=False)
        .ffill()
        .groupby(df["sequence_id"], observed=True, as_index=False)
        .bfill()
        # In case there are only nan in the column in the sequence
        .fillna(get_fillna_val_per_feature_col(df))
    )
    return df

def norm_quat_rotations(df:DF) -> DF:
    df[QUATERNION_COLS] /= np.linalg.norm(df[QUATERNION_COLS], axis=1, keepdims=True)
    return df

def add_linear_acc_cols(df:DF) -> DF:
    # Vectorized version of https://www.kaggle.com/code/wasupandceacar/lb-0-82-5fold-single-bert-model#Dataset `remove_gravity_from_acc`
    rotations:Rotation = Rotation.from_quat(df[QUATERNION_COLS])
    gravity_sensor_frame = rotations.apply(GRAVITY_WORLD, inverse=True).astype("float32")
    df[LINEAR_ACC_COLS] = df[RAW_ACCELRATION_COLS] - gravity_sensor_frame
    return df

def add_acc_magnitude(df:DF, acc_cols:list[str], acc_mag_col_name:str) -> DF:
    return df.assign(**{acc_mag_col_name: np.linalg.norm(df.loc[:, acc_cols], axis=1)})

def add_quat_angle_mag(df:DF) -> DF:
    return df.assign(quat_rot_mag=np.arccos(df["rot_w"]) * 2)

def add_angular_velocity_features(df:DF) -> DF:
    rotations = Rotation.from_quat(df[QUATERNION_COLS])
    delta_rotations = rotations[1:] * rotations[:-1].inv()
    delta_rot_velocity = delta_rotations.as_rotvec()
    # Add extra line to avoid shape mismatch
    delta_rot_velocity = np.vstack((np.zeros((1, 3)), delta_rot_velocity))
    delta_rot_magnitude = norm(delta_rot_velocity, axis=1, keepdims=True)
    delta_rot_axes = delta_rot_velocity / (delta_rot_magnitude + EPSILON)
    df[DELTA_ROTATION_ANGULAR_VELOCITY_COLS] = delta_rot_velocity
    df[DELTA_ROTATION_AXES_COLS] = delta_rot_axes
    df["delta_rot_mag"] = delta_rot_magnitude.squeeze()

    return df

def rot_euler_angles(df:DF) -> ndarray:
    df[EULER_ANGLES_COLS] = (
        Rotation
        .from_quat(df[QUATERNION_COLS])
        .as_euler("xyz")
        .squeeze()
    )
    return df

def agg_tof_cols_per_sensor(df:DF) -> DF:
    for tof_idx in tqdm(range(1, 6)):
        tof_name = f"tof_{tof_idx}"
        tof_cols = [f"{tof_name}_v{v_idx}" for v_idx in range(64)]
        if any(map(lambda col: col not in df.columns, tof_cols)):
            print(f"Some (or) all ToF {tof_idx} columns are not in the df. Maybe you already ran this cell?")
            continue
        df = pd.concat(
            (
                df.drop(columns=tof_cols),
                # For some reasons, it's faster to call all the aggregation functions seperatly than agg(list of functions)
                df[tof_cols].mean(axis="columns").to_frame(tof_name + "_mean"),
                df[tof_cols].std(axis="columns").to_frame(tof_name + "_std"),
                df[tof_cols].median(axis="columns").to_frame(tof_name + "_median"),
                df[tof_cols].min(axis="columns").to_frame(tof_name + "_min"),
                df[tof_cols].max(axis="columns").to_frame(tof_name + "_max"),
            ),
            axis="columns",
        )
    return df

def add_diff_features(df:DF) -> DF:
    return pd.concat(
        (
            df,
            df
            .groupby("sequence_id", as_index=False, observed=True)
            [get_feature_cols(df)]
            .diff()
            .fillna(get_fillna_val_per_feature_col(df))
            .add_suffix("_diff")
        ),
        axis="columns",
    )

def length_normed_sequence_feat_arr(sequence: DF) -> ndarray:
    features = (
        sequence
        .loc[:, meta_data["feature_cols"]]
        .values
    )
    normed_sequence_len = meta_data["pad_seq_len"]
    len_diff = abs(normed_sequence_len - len(features))
    if len(features) < normed_sequence_len:
        padded_features = np.pad(
            features,
            ((len_diff // 2 + len_diff % 2, len_diff // 2), (0, 0)),
        )
        return padded_features
    elif len(features) > normed_sequence_len:
        return features[len_diff // 2:-len_diff // 2]
    else:
        return features

def preprocess_sequence(sequence_df:pl.DataFrame) -> ndarray:
    return (
        sequence_df                     
        .to_pandas()                            # Convert to pandas dataframe.
        .pipe(imputed_features)                 # Impute missing data.
        .pipe(norm_quat_rotations)              # Norm quaternions
        .pipe(add_linear_acc_cols)              # Add gravity free acceleration.
        .pipe(add_acc_magnitude, RAW_ACCELRATION_COLS, "acc_mag")
        .pipe(add_acc_magnitude, LINEAR_ACC_COLS, "linear_acc_mag")
        .pipe(add_quat_angle_mag)
        .pipe(add_angular_velocity_features)
        .pipe(rot_euler_angles)                 # Add rotation acc expressed as euler angles.
        # .pipe(agg_tof_cols_per_sensor)          # Aggregate ToF columns.
        .pipe(add_diff_features)                # 
        .loc[:, sorted(meta_data["feature_cols"])]      # Retain only the usefull columns a.k.a features.
        .sub(meta_data["mean"])                 # Subtract features by their mean, std norm pt.1.
        .div(meta_data["std"])                  # Divide by Standard deviation, std norm pt.2.
        .pipe(length_normed_sequence_feat_arr)  # get feature ndarray of sequence.
        .T                                      # Transpose to swap channel and X dimensions.
    )

### Define prediction function

In [None]:
def predict(sequence: pl.DataFrame, _: pl.DataFrame) -> str:
    """
    Kaggle evaluation API will call this for each sequence.
    sequence: polars DataFrame for a single sequence
    demographics: unused in this model
    Returns: predicted gesture string
    """
    x_tensor = (
        torch.unsqueeze(Tensor(preprocess_sequence(sequence)), dim=0)
        .float()
        .to(device)
    )
    print(x_tensor.shape)

    all_outputs = []
    with torch.no_grad():
        for model_idx, model in enumerate(model_ensemble): # Only take the first one bc it's the only one that takes in the correct input shape
            outputs = model(x_tensor)
            all_outputs.append(outputs)

    avg_outputs = torch.mean(torch.stack(all_outputs), dim=0)
    pred_idx = torch.argmax(avg_outputs, dim=1).item()

    return str(gesture_classes[pred_idx])

### Run inference server

In [None]:
inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        data_paths=(
            join(competition_dataset_path, 'test.csv'),
            join(competition_dataset_path, 'test_demographics.csv'),
        )
    )
    inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)
    inference_server.run_local_gateway(
        data_paths=(
            join(competition_dataset_path, 'train.csv'),
            join(competition_dataset_path, 'train_demographics.csv'),
        )
    )