In [1]:
%load_ext autoreload
%autoreload 2
import sys, os, glob
import gc
import time, math, random
import ast
from itertools import product, combinations

import numpy as np
import pandas as pd
import pandas.api.types
from pandas.api.types import CategoricalDtype
import polars as pl
import polars.selectors as cs
import mlflow
import mlflow.xgboost

import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim
from torch import Tensor


from sklearn.preprocessing import TargetEncoder, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import QuantileTransformer, PowerTransformer, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, StratifiedGroupKFold, KFold, GroupShuffleSplit, GroupKFold, ParameterGrid
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import validation_curve, ValidationCurveDisplay
import joblib
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from scipy.stats import rankdata
from scipy.optimize import dual_annealing

from tqdm.auto import tqdm
sys.path.insert(0, os.path.abspath('../scripts'))
from cmi_2025 import score

  import pkg_resources


In [2]:
seed = 0
random.seed(seed)
np.random.seed(seed + 1)
torch.manual_seed(seed + 2);

In [3]:
torch.cuda.is_available()

True

In [4]:
BASE = ".."
COMP_DATA_BASE = os.path.join(BASE, "data", "raw")
PREP_DATA_BASE = os.path.join(BASE, "data", "processed")
FIGURES_BASE = os.path.join(BASE, "figures")

TRAIN_PATH = os.path.join(COMP_DATA_BASE, "train.csv")
TRAIN_DEMO_PATH = os.path.join(COMP_DATA_BASE, "train_demographics.csv")
TEST_PATH = os.path.join(COMP_DATA_BASE, "test.csv")
TEST_DEMO_PATH = os.path.join(COMP_DATA_BASE, "test_demographics.csv")

features = list(pl.read_csv(TRAIN_PATH).select(pl.all().exclude("ID")).columns)
train_ds = pl.read_csv(TRAIN_PATH)
train_demo_ds = pl.read_csv(TRAIN_DEMO_PATH)

In [5]:
print(f"Test columns")
test_cols = list(pl.read_csv(TEST_PATH).columns)
id_cols = test_cols[:4]
acc_cols = [col  for col in test_cols if col.startswith("acc")]
rot_cols = [col  for col in test_cols if col.startswith("rot")]
thm_cols = [col  for col in test_cols if col.startswith("thm")]
tof_cols = [[col  for col in test_cols if col.startswith(f"tof_{i+1}")] for i in range(5)]
tof_cols_all = [col for cl in tof_cols for col in cl]
target_cols = [col for col in train_ds.columns if col not in test_cols]
features = acc_cols+rot_cols+thm_cols+[c for cl in tof_cols for c in cl]

demo_features = ['adult_child', 'age', 'sex', 'handedness', 'height_cm', 'shoulder_to_wrist_cm', 'elbow_to_wrist_cm']

# target
gestures = ['Pull air toward your face', 'Feel around in tray and pull out an object', 'Neck - scratch', 'Pinch knee/leg skin', 
            'Forehead - scratch', 'Eyelash - pull hair', 'Drink from bottle/cup', 'Wave hello', 'Cheek - pinch skin', 
            'Forehead - pull hairline', 'Text on phone', 'Write name in air', 'Scratch knee/leg skin', 'Neck - pinch skin', 
            'Write name on leg', 'Above ear - pull hair', 'Eyebrow - pull hair', 'Glasses on/off']
le = LabelEncoder()
le.fit(gestures)
train_ds = train_ds.with_columns(pl.Series(name="gesture_id", values=le.transform(train_ds.select("gesture"))))

print(f"id-cols: {id_cols}")
print(f"acc: {acc_cols}")
print(f"rot: {rot_cols}")
print(f"thm: {thm_cols}")
for i in range(5):
    print(f"tof_{i+1}: {tof_cols[i]}")
    
print(f"Extra train columns")
print(f"{target_cols}")
for col in target_cols:
    print(f"{col+':':15} {train_ds.select(col).unique().to_series().to_list()}")

Test columns


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


id-cols: ['row_id', 'sequence_id', 'sequence_counter', 'subject']
acc: ['acc_x', 'acc_y', 'acc_z']
rot: ['rot_w', 'rot_x', 'rot_y', 'rot_z']
thm: ['thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5']
tof_1: ['tof_1_v0', 'tof_1_v1', 'tof_1_v2', 'tof_1_v3', 'tof_1_v4', 'tof_1_v5', 'tof_1_v6', 'tof_1_v7', 'tof_1_v8', 'tof_1_v9', 'tof_1_v10', 'tof_1_v11', 'tof_1_v12', 'tof_1_v13', 'tof_1_v14', 'tof_1_v15', 'tof_1_v16', 'tof_1_v17', 'tof_1_v18', 'tof_1_v19', 'tof_1_v20', 'tof_1_v21', 'tof_1_v22', 'tof_1_v23', 'tof_1_v24', 'tof_1_v25', 'tof_1_v26', 'tof_1_v27', 'tof_1_v28', 'tof_1_v29', 'tof_1_v30', 'tof_1_v31', 'tof_1_v32', 'tof_1_v33', 'tof_1_v34', 'tof_1_v35', 'tof_1_v36', 'tof_1_v37', 'tof_1_v38', 'tof_1_v39', 'tof_1_v40', 'tof_1_v41', 'tof_1_v42', 'tof_1_v43', 'tof_1_v44', 'tof_1_v45', 'tof_1_v46', 'tof_1_v47', 'tof_1_v48', 'tof_1_v49', 'tof_1_v50', 'tof_1_v51', 'tof_1_v52', 'tof_1_v53', 'tof_1_v54', 'tof_1_v55', 'tof_1_v56', 'tof_1_v57', 'tof_1_v58', 'tof_1_v59', 'tof_1_v60', 'tof_1_v61', 'tof

# Preprocessing

In [None]:
def preprocess(data):
    return (data
            .sort(by="row_id")
            .with_columns(pl.all().fill_null(strategy="forward").over("sequence_id"))
            .with_columns(pl.all().fill_null(strategy="backward").over("sequence_id"))
            .with_columns(pl.col(tof_cols_all).fill_null(-1)) 
            .with_columns(pl.col(thm_cols).fill_null(strategy="mean").over("sequence_id"))
            .with_columns(pl.all().fill_null(0))
            
            .filter(pl.col("sequence_id") != 'SEQ_011975')
            )
train_ds_prep = preprocess(train_ds)


In [None]:
# Standardize
def split_data(data, data_demo):
    sequences = (data
                .group_by(["sequence_id", "subject"])
                .agg(pl.col("gesture").first())
                )
    sgkf = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=42)
    sgkf2 = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=42)
    train_index, test_index = next(sgkf.split(sequences, 
                                            sequences.select("gesture").to_series(), 
                                            sequences.select("subject").to_series() ))
    train_index2, test_index2 = next(sgkf2.split(sequences[test_index], 
                                            sequences[test_index].select("gesture").to_series(), 
                                            sequences[test_index].select("subject").to_series() ))
    
    data_dict = {}
    
    for part_name, part_index in zip(["train", "val", "test"], [train_index, train_index2, test_index2]):
        data_dict[part_name] = data.filter(pl.col("sequence_id").is_in(sequences[part_index].select("sequence_id").to_series().implode()))
        data_dict[part_name + "_demo"] = data_demo.filter(pl.col("subject").is_in(sequences[part_index].select("subject").to_series().implode()))
        
    return data_dict

   
# train, test = split_dataset(train_ds_prep, with_val=False)
# data_dict = split_dataset(train_ds_prep, with_val=True)



def create_folds(data, data_demo, num_folds=5):
    data = data.with_columns(pl.lit(-1).alias("FOLD"))
    data_demo = data_demo.with_columns(pl.lit(-1).alias("FOLD"))
    subjects = data.select("subject").unique()
    kf = KFold(n_splits=num_folds, random_state=24, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf.split(subjects)):
        data = data.with_columns(pl.when(pl.col("subject").is_in(subjects[test_index].to_series().implode()))
                                 .then(pl.lit(i))
                                 .otherwise(pl.col("FOLD"))
                                 .alias("FOLD"))
        
        data_demo = data_demo.with_columns(pl.when(pl.col("subject").is_in(subjects[test_index].to_series().implode()))
                                 .then(pl.lit(i))
                                 .otherwise(pl.col("FOLD"))
                                 .alias("FOLD"))
    return data, data_demo
# train_ds_2 = pl.read_csv(TRAIN_PATH)
# create_folds(train_ds_2)


In [8]:
def preprocess(data_dict, features=features):
    ct = ColumnTransformer(
            [('std', StandardScaler(), features)],
            verbose_feature_names_out=False, 
            remainder="passthrough"
        )
    ct.set_output(transform="polars")

    data_dict["train"] = ct.fit_transform(data_dict["train"])
    if "val" in data_dict and data_dict["val"] is not None:
        data_dict["val"] = ct.transform(data_dict["val"])
    
    if "test" in data_dict and  data_dict["test"] is not None:
        data_dict["test"] = ct.transform(data_dict["test"])
    return data_dict

# data_dict = preprocess(data_dict)

In [9]:
# group_kfold = GroupKFold(n_splits=5, shuffle=True, random_state=24)
# for i, (train_index, test_index) in enumerate(group_kfold.split(train, groups=train.select("subject"))):
#     print(f"Fold {i}:")
#     train_folds = train.with_row_index().filter(pl.col("index").is_in(train_index))
#     test_folds = train.with_row_index().filter(pl.col("index").is_in(test_index))
    
#     ct = ColumnTransformer(
#         [('std', StandardScaler(), features)],
#         verbose_feature_names_out=False, 
#         remainder="passthrough"
#     )
#     ct.set_output(transform="polars")
#     train_folds = ct.fit_transform(train_folds)
#     test_folds = ct.transform(test_folds)
    
#     X_train = train_folds.select(features)
#     y_train = train_folds.select("gesture")
    
    
#     print(f"  Train: index={train_folds.shape[0]}, group={train_folds.select("subject").unique().shape}")
#     print(f"  Test:  index={test_folds.shape[0]}, group={test_folds.select("subject").unique().shape}")


# LSTM

In [10]:
def create_dataset(train, val, test, target="gesture"):
    
    def perpare_part(part, tail_length=75):
        sequences = {col_name: [] for col_name in ["acc", "rot", "thm", "tof", "target"]}
        for name, data in (part
                            .sort(by=['sequence_id', 'sequence_counter'])
                            .group_by("sequence_id")
                            .tail(tail_length)
                            .group_by("sequence_id")
                            ):
            
            for col_name, cols in zip(["acc", "rot", "thm", "tof"], [acc_cols, rot_cols, thm_cols, tof_cols_all ]):
                array = data.select(cols).to_numpy()
                if array.shape[0] < tail_length:
                    # TODO: Better imputation
                    padding = np.zeros((tail_length -  array.shape[0], array.shape[1]) , dtype=float)
                    array = np.vstack((padding, array))
                    
                sequences[col_name].append(array)
            
            sequences["target"].append(data.select("gesture_id").tail(1).item())
            
        return sequences

    data = {}
    for part_name, part in zip(["train", "val", "test"], [train, val, test]):
        if part is not None:
            part_data = perpare_part(part)
            data[part_name] = {'x_acc': np.array(part_data["acc"]).astype(np.float32), 
                               'x_rot': np.array(part_data["rot"]).astype(np.float32), 
                               'x_thm': np.array(part_data["thm"]).astype(np.float32), 
                               'x_tof': np.array(part_data["tof"]).astype(np.float32), 
                               'y': np.array(part_data["target"]),
                               }
            
    return data


# train_ds_prep =  clean_dataset(train_ds)
# data_dict = split_dataset(train_ds_prep, with_val=True)
# data_dict = preprocess(data_dict)

# data_prep = create_dataset(data_dict["train"], data_dict["val"], data_dict["test"])


def perpare_part(part, part_demo, tail_length=75):
    sequences = {col_name: [] for col_name in ["acc", "rot", "thm", "tof", "target", "subject", "demo"]}
    for name, data in (part
                        .sort(by=['sequence_id', 'sequence_counter'])
                        .group_by("sequence_id")
                        .tail(tail_length)
                        .group_by("sequence_id")
                        ):
        
        for col_name, cols in zip(["acc", "rot", "thm", "tof"], [acc_cols, rot_cols, thm_cols, tof_cols_all ]):
            array = data.select(cols).to_numpy()
            if array.shape[0] < tail_length:
                # TODO: Better imputation
                padding = np.zeros((tail_length -  array.shape[0], array.shape[1]) , dtype=float)
                array = np.vstack((padding, array))
                
            sequences[col_name].append(array)
        
        
        sequences["target"].append(data.select("gesture_id").tail(1).item())
        subject = data.select("subject").tail(1).item()

        sequences["subject"].append(subject)
        sequences["demo"].append(part_demo.filter(pl.col("subject") == subject).select(demo_features).to_numpy())
        
    return sequences

def create_cv_datasets(_data, _data_demo, num_folds=5, tail_length=75):
    data_prep = clean_dataset(_data)
    data_prep, data_demo_prep = create_folds(data_prep, _data_demo, num_folds=num_folds)
    
    for i in range(num_folds):
        data_train = data_prep.filter(pl.col("FOLD") != i)
        data_val = data_prep.filter(pl.col("FOLD") == i)
        data_dict = preprocess({"train": data_train, "val": data_val})
        
        data_demo_train = data_demo_prep.filter(pl.col("FOLD") != i)
        data_demo_val = data_demo_prep.filter(pl.col("FOLD") == i)
        data_demo_dict = preprocess({"train": data_demo_train, "val": data_demo_val}, features=demo_features)
        
        data = {}
        for part_name, part in data_dict.items():
            part_data = perpare_part(part, data_demo_dict[part_name], tail_length=tail_length)
            data[part_name] = {'x_acc': np.array(part_data["acc"]).astype(np.float32), 
                            'x_rot': np.array(part_data["rot"]).astype(np.float32), 
                            'x_thm': np.array(part_data["thm"]).astype(np.float32), 
                            'x_tof': np.array(part_data["tof"]).astype(np.float32), 
                            'demo': np.array(part_data["demo"]).astype(np.float32),
                            'y': np.array(part_data["target"]),
                            }
        
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # stack it together for now
        data_torch = {
                part: {'X': torch.as_tensor((np.dstack([(data[part][mtype]) for mtype in ["x_acc", "x_rot", "x_thm", "x_tof"]])), device=device),
                       'X_demo': torch.as_tensor(data[part]["demo"], device=device),
                       'y': torch.as_tensor(data[part]["y"], device=device) }
                for part in data
        }
        
        
        def save_fold(data_torch, fold, dir=PREP_DATA_BASE, prefix=""):
            for part, d in data_torch.items():
                for d_part, tensor in d.items():
                    filename = os.path.join(dir, f"{prefix}_FOLD_{i}_{part}_{d_part}.pt")
                    torch.save(tensor, filename)
        
        save_fold(data_torch, i, prefix="20250628_v2")
        for part in data_torch.values():
            part.clear()
        for part in data.values():
            part.clear()
        gc.collect()
        torch.cuda.empty_cache()
            


def load_fold(fold, dir=PREP_DATA_BASE, prefix=""):
    data_dict = {}
    for part in ("train", "val"):
        data_dict[part] = {}
        for d_part in ("X", "X_demo" , "y"):
            filename = os.path.join(dir, f"{prefix}_FOLD_{i}_{part}_{d_part}.pt")
            data_dict[part][d_part] = torch.load(filename)
    return data_dict
        

In [11]:
# create_cv_datasets(train_ds, train_demo_ds, tail_length=100)

In [12]:
# a = load_fold(3, prefix="20250628_v2")
# print(a["train"]["X"].shape)
# print(a["train"]["X_demo"].shape)
# print(a["val"]["X"].shape)
# print(a["val"]["X_demo"].shape)

In [14]:
def test_model():
    
        
    class LSTMClassifier(nn.Module):

        def __init__(self, input_dim, input_demo_dim, hidden_dim, classes_dim, num_layers, dropout_rate=0):
            super(LSTMClassifier, self).__init__()
            self.hidden_dim = hidden_dim
            self.input_dim = input_dim
            
            self.rnn = nn.RNN(12+input_demo_dim, 12, num_layers=1, batch_first=True)
            
            # conv layer 1: K: 64x5, b: 64
            self.features_maps = [64, 64, 64, 64]
            self.kernel_sizes = [4,4,4,4]
            # self.conv1 = nn.Conv1d(input_dim, input_dim*self.features_maps[0], self.kernel_sizes[0], stride=1, groups=input_dim)
            self.conv1 = nn.Conv1d(1, self.features_maps[0], self.kernel_sizes[0], stride=1, groups=1)
            
            # conv layer 2-4:  K: 64x64x5, b: 64
            self.conv2 = nn.Conv2d(1, self.features_maps[1], (self.features_maps[0], self.kernel_sizes[1]), stride=1, groups=1)
            self.conv3 = nn.Conv2d(1, self.features_maps[2], (self.features_maps[1], self.kernel_sizes[1]), stride=1, groups=1)
            self.conv4 = nn.Conv2d(1, self.features_maps[3], (self.features_maps[2], self.kernel_sizes[1]), stride=1, groups=1)
            
            # tof conv layers:
            self.tof_features_maps = [8,16,32]
            self.tof_conv1 = nn.Conv3d(1, self.tof_features_maps[0], (5,3,3), stride=1)
            self.tof_conv2 = nn.Conv3d(self.tof_features_maps[0], self.tof_features_maps[1], (5,3,3), stride=1)
            self.tof_conv3 = nn.Conv3d(self.tof_features_maps[1], self.tof_features_maps[2], (5,3,3), stride=1)
            self.tof_maxp = nn.MaxPool3d(kernel_size=(1,2,2))
            
            # The LSTM takes word embeddings as inputs, and outputs hidden states
            # with dimensionality hidden_dim.
            self.lstm = nn.LSTM(12*self.features_maps[3] + 5*self.tof_features_maps[2], hidden_dim, num_layers, dropout=dropout_rate, batch_first=True)

            # The linear layer that maps from hidden state space to tag space
            self.hidden2class = nn.Linear(hidden_dim+input_demo_dim, classes_dim)

        # x: ( BATCH_SIZE, time, features)
        # x_demo: (BATCH_SIZE, 1, demo features)
        def forward(self, x, x_demo):

            # 1.1 only acc, rot and thm (body heat) through rnn
            
            # demo input to (time, batch_size, demo features) and repeat in time dimension
            x_demo_repeated = x_demo.repeat(1, x.shape[1], 1)
            x_in = torch.cat([x[:, :, :12], x_demo_repeated], dim=2)
            rnn_out, _ = self.rnn(x_in)
            
            # swap channels and time dimension
            # (BATCH_SIZE, channels, time)
            rnn_out = rnn_out.transpose(1,2)
            # channels to batch dimensions to apply same convolution per channel
            # (BATCH_SIZE * channels, time)
            rnn_out = rnn_out.view(-1, 1, rnn_out.shape[-1])
            
            
            # 1.2 rnn output through CNN
            x_conv = nn.functional.relu(self.conv1(rnn_out))
            
            for i, conv in enumerate([self.conv2, self.conv3, self.conv4]):
                # print(f"Before conv {i+2} ", x_conv.shape)
                x_conv = nn.functional.relu(conv(x_conv.squeeze().unsqueeze(1)))
                # print(f"After conv {i+2} ", x_conv.shape)
            
            # After convolutions we get tensor of shape (BATCH_SIZE * channels, last-feature-map, 1, time)
            x_conv = x_conv.squeeze(2)
            # separate channels from batch dimenison
            x_conv = x_conv.view(-1, 12, x_conv.shape[1], x_conv.shape[2])
            # merge feature map dimension into channels dimension 
            x_conv = x_conv.view(x_conv.shape[0],-1, x_conv.shape[3])
            # Transformed tensor to shape (BATCH_SIZE, channels * last-feature-map, time)
            # -> 64*12=768 Features
            
            # 2.1 tof features
            x_tof = x[:, :, 12:].view(x.shape[0],x.shape[1], 5, 8, 8)
            
            # 2.2. tof Features through CNN
            # (BATCH_SIZE, time, Depth, H, W) -> (BATCH_SIZE, Depth, time, H, W)
            x_tof = x_tof.transpose(1,2)
            # (BATCH_SIZE, Depth, time, H, W) -> (BATCH_SIZE * Depth, time, H, W)
            x_tof = x_tof.reshape(-1,1, x_tof.shape[2], x_tof.shape[3], x_tof.shape[4])
            
            # 8x8 -> 4x4x8 -> 2x2x16 -> 1x1x32 -> 32*5=160 features
            x_tof = nn.functional.relu(self.tof_conv1(x_tof))
            x_tof = nn.functional.relu(self.tof_conv2(x_tof))
            x_tof = nn.functional.relu(self.tof_maxp(self.tof_conv3(x_tof)))
            
            # Separate Batchsize from Depth dimension
            x_tof = x_tof.squeeze()
            # Separate batch and channels (BATCH_SIZE, Channels, feature maps, time)
            x_tof = x_tof.view(-1, 5, x_tof.shape[1], x_tof.shape[2])
            # Flatten channel and feature maps dim (BATCH_SIZE, Channels*feature maps, time)
            x_tof = x_tof.view(x_tof.shape[0],-1, x_tof.shape[3])
            
            # Cat acc, rot, thm and tof feature maps
            lstm_in = torch.cat([x_conv, x_tof], dim=1)
            lstm_out, _ = self.lstm(lstm_in.transpose(1,2))

            x_cat = torch.cat([lstm_out[:,-1,:], x_demo.squeeze(1)], 1)
            class_space = self.hidden2class(x_cat)
            class_scores = F.softmax(class_space, dim=1)
            return class_scores


    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    NUM_LAYERS = 2
    HIDDEN_SIZE = 128
    model = LSTMClassifier(332,7, hidden_dim=HIDDEN_SIZE, classes_dim=len(le.classes_), num_layers=NUM_LAYERS , dropout_rate=0).to(device)


    BATCH_SIZE = 256
    
    data = load_fold(0, prefix="20250628_v2")
    data["test"] = data["val"]
    idx = np.random.randint(low=0, high=5000, size=BATCH_SIZE)
    print(data["train"]["X"][idx].shape)
    print(data["train"]["X_demo"][idx].shape)
    
    # output (L,N,D∗H): L=sequence length,  N=Batch size, H=Hiddensize
    # hidden size (D∗num_layers,N,H): num_layers, N=Batch Size, H=hidden_size
    def apply_model(part: str, idx: Tensor) -> Tensor:
        return (
            model(
                data[part]['X'][idx],
                data[part]['X_demo'][idx]
            )
        )
    out = apply_model("train", idx)
    print(out.shape)
    print(out)

 
test_model()   

torch.Size([256, 100, 332])
torch.Size([256, 1, 7])
torch.Size([256, 18])
tensor([[0.0527, 0.0635, 0.0545,  ..., 0.0628, 0.0535, 0.0558],
        [0.0484, 0.0610, 0.0597,  ..., 0.0671, 0.0556, 0.0587],
        [0.0479, 0.0574, 0.0661,  ..., 0.0617, 0.0462, 0.0472],
        ...,
        [0.0428, 0.0718, 0.0677,  ..., 0.0776, 0.0619, 0.0640],
        [0.0460, 0.0606, 0.0619,  ..., 0.0688, 0.0578, 0.0601],
        [0.0741, 0.0654, 0.0432,  ..., 0.0488, 0.0505, 0.0459]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)


In [11]:


CONFIG = {
    "compile_model": False,
    
    "n_features": 332,
    "n_demo_features": 7,
    "n_classes": len(le.classes_),
    "target": "gesture_id"
}

hyper_params = {
    "lstm_layers": 2,
    "hidden_size": 128,
    'dropout': 0.5,
    "learning_rate": 10e-3, 
    "weight_decay": 0.9,
}

class LSTMClassifier(nn.Module):

    def __init__(self, input_dim, input_demo_dim, hidden_dim, classes_dim, num_layers, dropout_rate=0):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        
        self.rnn = nn.RNN(input_dim+input_demo_dim, input_dim, num_layers=1, batch_first=True)
        
        # conv layer 1: K: 64x5, b: 64
        self.features_maps = [64, 64, 64, 64]
        self.kernel_sizes = [5,5,5,5]
        # self.conv1 = nn.Conv1d(input_dim, input_dim*self.features_maps[0], self.kernel_sizes[0], stride=1, groups=input_dim)
        self.conv1 = nn.Conv1d(1, self.features_maps[0], self.kernel_sizes[0], stride=1, groups=1)
        
        # conv layer 2-4:  K: 64x64x5, b: 64
        self.conv2 = nn.Conv2d(1, self.features_maps[1], (self.features_maps[0], self.kernel_sizes[1]), stride=1, groups=1)
        self.conv3 = nn.Conv2d(1, self.features_maps[2], (self.features_maps[1], self.kernel_sizes[1]), stride=1, groups=1)
        self.conv4 = nn.Conv2d(1, self.features_maps[3], (self.features_maps[2], self.kernel_sizes[1]), stride=1, groups=1)
        
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(input_dim*self.features_maps[3], hidden_dim, num_layers, dropout=dropout_rate, batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2class = nn.Linear(hidden_dim+input_demo_dim, classes_dim)

    # x: ( BATCH_SIZE, time, features)
    # x_demo: (BATCH_SIZE, 1, demo features)
    def forward(self, x, x_demo):
        # demo input to (time, batch_size, demo features) and repeat in time dimension
        x_demo_repeated = x_demo.repeat(1, x.shape[1], 1)
        x_in = torch.cat([x, x_demo_repeated], dim=2)
        rnn_out, _ = self.rnn(x_in)
        
        # swap channels and time dimension
        # (BATCH_SIZE, channels, time)
        rnn_out = rnn_out.transpose(1,2)
        # channels to batch dimensions to apply same convolution per channel
        # (BATCH_SIZE * channels, time)
        rnn_out = rnn_out.view(-1, 1, rnn_out.shape[-1])
        
        x_conv = nn.functional.relu(self.conv1(rnn_out))
        
        for i, conv in enumerate([self.conv2, self.conv3, self.conv4]):
            # print(f"Before conv {i+2} ", x_conv.shape)
            x_conv = nn.functional.relu(conv(x_conv.squeeze().unsqueeze(1)))
            # print(f"After conv {i+2} ", x_conv.shape)
        
        # After convolutions we get tensor of shape (BATCH_SIZE * channels, last-feature-map, 1, time)
        x_conv = x_conv.squeeze()
        # separate channels from batch dimenison
        x_conv = x_conv.view(-1, self.input_dim, x_conv.shape[1], x_conv.shape[2])
        # merge feature map dimension into channels dimension 
        x_conv = x_conv.view(x_conv.shape[0],-1, x_conv.shape[3])
        # Transformed tensor to shape (BATCH_SIZE, channels * last-feature-map, time)
        
        # x_conv = x_conv.reshape((x_conv.shape[0], self.input_dim, -1, x_conv.shape[2]))
        # print(x_conv.shape)
        # print(x_conv[0, 0,0,:])
        lstm_out, _ = self.lstm(x_conv.transpose(1,2))
        
        x_cat = torch.cat([lstm_out[:,-1,:], x_demo.squeeze(1)], 1)
        class_space = self.hidden2class(x_cat)
        class_scores = F.softmax(class_space, dim=1)
        return class_scores


def prepare_model(config):
    # Device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Automatic mixed precision (AMP)
    # torch.float16 is implemented for completeness,
    # but it was not tested in the project,
    # so torch.bfloat16 is used by default.
    amp_dtype = (
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float16
        if torch.cuda.is_available()
        else None
    )
    # Changing False to True will result in faster training on compatible hardware.
    amp_enabled = False and amp_dtype is not None
    grad_scaler = torch.amp.GradScaler("cuda") if amp_dtype is torch.float16 else None  # type: ignore

    # torch.compile
    compile_model = config["compile_model"]

    # fmt: off
    print(
        f'Device:        {device.type.upper()}'
        f'\nAMP:           {amp_enabled} (dtype: {amp_dtype})'
        f'\ntorch.compile: {compile_model}'
    )
    
    # Choose one of the two configurations below.
    # TODO
    model = LSTMClassifier(config["n_features"], config["n_demo_features"], config["hidden_size"], config["n_classes"], config["lstm_layers"], config["dropout"]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])

    if compile_model:
        # NOTE
        # `torch.compile` is intentionally called without the `mode` argument
        # (mode="reduce-overhead" caused issues during training with torch==2.0.1).
        model = torch.compile(model)
        evaluation_mode = torch.no_grad
    else:
        evaluation_mode = torch.inference_mode
        
    model_dict = {"model": model,
                  "eval_mode": evaluation_mode,
                  "optimizer": optimizer,
                  "device": device,
                  "grad_scaler": grad_scaler,
                  "amp_enabled": amp_enabled,
                  "amp_dtype": amp_dtype,
                  "target": config["target"]
                  }
        
    return model_dict

prepare_model(CONFIG | hyper_params)

Device:        CUDA
AMP:           False (dtype: torch.bfloat16)
torch.compile: False


{'model': LSTMClassifier(
   (rnn): RNN(339, 332, batch_first=True)
   (conv1): Conv1d(1, 64, kernel_size=(5,), stride=(1,))
   (conv2): Conv2d(1, 64, kernel_size=(64, 5), stride=(1, 1))
   (conv3): Conv2d(1, 64, kernel_size=(64, 5), stride=(1, 1))
   (conv4): Conv2d(1, 64, kernel_size=(64, 5), stride=(1, 1))
   (lstm): LSTM(21248, 128, num_layers=2, batch_first=True, dropout=0.5)
   (hidden2class): Linear(in_features=135, out_features=18, bias=True)
 ),
 'eval_mode': torch.autograd.grad_mode.inference_mode,
 'optimizer': AdamW (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.01
     maximize: False
     weight_decay: 0.9
 ),
 'device': device(type='cuda', index=0),
 'grad_scaler': None,
 'amp_enabled': False,
 'amp_dtype': torch.bfloat16,
 'target': 'gesture_id'}

In [None]:


def train(model_dict, data, config, verbose=False):
    model = model_dict["model"]
    optimizer = model_dict["optimizer"]
    evaluation_mode = model_dict["eval_mode"]
    device = model_dict["device"]
    grad_scaler = model_dict["grad_scaler"]
    amp_enabled = model_dict["amp_enabled"]
    amp_dtype = model_dict["amp_dtype"]
    target = model_dict["target"]
    
    
    @torch.autocast(device.type, enabled=amp_enabled, dtype=amp_dtype)  # type: ignore[code]
    def apply_model(part: str, idx: Tensor) -> Tensor:
        return (
            model(
                data[part]['X'][idx],
                data[part]['X_demo'][idx]
            )
        )

    task_type = "classification"
    base_loss_fn = F.mse_loss if task_type == 'regression' else F.cross_entropy


    def loss_fn(y_pred: Tensor, y_true: Tensor) -> Tensor:
        return base_loss_fn(y_pred, y_true)

    def score_fn(y_true, y_pred):
        sol = pd.DataFrame({"gesture": le.inverse_transform(y_true)}).reset_index(names=["id"])
        sub = pd.DataFrame({"gesture": le.inverse_transform(y_pred)}).reset_index(names=["id"])
        return score(sol, sub, row_id_column_name='id')

    @evaluation_mode()
    def evaluate(part: str) -> tuple[float, float]:
        model.eval()

        # When using torch.compile, you may need to reduce the evaluation batch size.
        eval_batch_size = 8096
        y_pred = (
            torch.cat(
                [
                    apply_model(part, idx)
                    for idx in torch.arange(len(data[part]['y']), device=device).split(
                        eval_batch_size
                    )
                ]
            )
        )


        loss = loss_fn(y_pred, data[part]["y"]).detach().cpu().numpy()

        if task_type != 'regression':
            # For classification, the mean must be computed in the probabily space.
            y_pred = F.softmax(y_pred, dim=1).cpu().numpy()

        y_true = data[part]['y'].cpu().numpy()
        
        sc = (
            score_fn(y_true, y_pred.argmax(1))
        )
        return float(sc), float(loss)  # The higher -- the better.

    if verbose:
        print(f'Test score before training: {evaluate("test")[0]:.4f}')
    
    # For demonstration purposes (fast training and bad performance),
    # one can set smaller values:
    # n_epochs = 20
    # patience = 2
    n_epochs = 1_000_000_000
    if "n_epochs" in config:
        n_epochs =  config["n_epochs"]
    
    # Early stopping: the training stops when
    # there are more than `patience` consequtive bad updates.
    patience = 10
    if "patience" in config:
        patience =  config["patience"]
    

    batch_size = 256
    epoch_size = math.ceil(len(data["train"]["X"]) / batch_size)
    best = {
        'val': -math.inf,
        'test': -math.inf,
        'epoch': -1,
    }
    
    remaining_patience = patience

    if verbose:
        print('-' * 88 + '\n')
    
    with mlflow.start_run():
        mlflow.log_params(config)
        
        
        for epoch in range(n_epochs):
            pred_train = torch.zeros((len(data["train"]["X"]), config["n_classes"]), device=device)
            for batch_idx in tqdm(
                torch.randperm(len(data['train']['y']), device=device).split(batch_size),
                desc=f'Epoch {epoch}',
                total=epoch_size,
                disable=not verbose
            ):
                model.train()
                optimizer.zero_grad()
                pred = apply_model('train', batch_idx)
                loss = loss_fn(pred, data["train"]["y"][batch_idx])
                pred_train[batch_idx] = pred.detach()
                if grad_scaler is None:
                    loss.backward()
                    optimizer.step()
                else:
                    grad_scaler.scale(loss).backward()  # type: ignore
                    grad_scaler.step(optimizer)
                    grad_scaler.update()
                    
            train_loss = loss_fn(pred_train, data["train"]["y"]).cpu().numpy()
            train_score = float(score_fn(data["train"]["y"].cpu().numpy(), 
                                         F.softmax(pred_train, dim=1).cpu().numpy().argmax(1)) )
            
            val_score, val_loss = evaluate('val')
            test_score, test_loss = evaluate('test')
            if verbose:
                print(f'(val) {val_score:.4f} (test) {test_score:.4f}')

            mlflow.log_metrics({"train_loss": float(train_loss), "val_loss": val_loss, "test_loss": test_loss,
                                "train_f1-score": train_score, "val_f1-score": val_score, "test_f1-score": test_score,  
                                }, step=epoch)
            
            
            # if patience is set to 0, don't do early stopping
            if (val_score > best['val']) or (patience == 0):
                if verbose:
                    print('🌸 New best epoch! 🌸')
                best = {'train': train_score, 'val': val_score, 'test': test_score, 'epoch': epoch}
                
                # mlflow.pytorch.log_model(pytorch_model=model_dict["model"], 
                #                          artifact_path="", 
                #                          registered_model_name=f"model_{epoch}",
                #                          input_example=data["train"]["X"][0,:,:].cpu().numpy())
                
                
                remaining_patience = patience
            else:
                remaining_patience -= 1

            if remaining_patience < 0:
                break
            
            if verbose:
                print()
        
        mlflow.log_metrics({ "best_train_score": best["train"], "best_val_score": best["val"], "best_test_score": best["test"], 
                              "best_epoch": best["epoch"] })
        if verbose:
            print('\n\nResult:')
            print(best)
    return best


mlflow.set_experiment(f"CMI LSTM Experiment {pd.Timestamp.now()}")
metrics = pl.DataFrame()
for i in range(5):
    print(f"FOLD {i}")
    data = load_fold(i, prefix="20250628_v2")
    data["test"] = data["val"]

    model_dict = prepare_model(config=CONFIG | hyper_params)

    best = train(model_dict, data, CONFIG | hyper_params, verbose=True)
    print(best)
    metrics = pl.concat([metrics, pl.DataFrame(best)])
    
    data.clear()
    model_dict.clear() 
    gc.collect()
    torch.cuda.empty_cache()

    
    
    
print(metrics)
print(metrics.mean())

2025/06/29 21:53:22 INFO mlflow.tracking.fluent: Experiment with name 'CMI LSTM Experiment 2025-06-29 21:53:22.915970' does not exist. Creating a new experiment.


FOLD 0


In [None]:
# Quick grid search 
hyper_params = {
    "lstm_layers": 1,
    "hidden_size": 32,
    'dropout': 0.1,
    "learning_rate": 2e-3, 
    "weight_decay": 3e-4,
}

param_grid = {
    "lstm_layers": [1,2,4,8,16],
    "hidden_size": [32, 64, 128, 256],
    "dropout": [0, 0.1, 0.25, 0.5],
    "learning_rate": [1e-1, 1e-2, 1e-3]
}

mlflow.set_experiment(f"CMI LSTM Gridsearch {pd.Timestamp.now()}")
gs_results = pd.DataFrame()

for params in tqdm(ParameterGrid(param_grid)):
    hp_run = hyper_params.copy()
    hp_run.update(params)
    model_dict, data = prepare_model(data_prep, config=CONFIG | hp_run)
    
    final_scores = train(model_dict, data, CONFIG | hp_run, verbose=False)
    hp_run.update(final_scores)
    gs_results = pd.concat([gs_results, pd.DataFrame(hp_run, index=[0])], axis=0)


In [19]:
gs_results.sort_values(by=["val"], ascending=False)

Unnamed: 0,lstm_layers,hidden_size,dropout,learning_rate,weight_decay,train,val,test,epoch
0,1,256,0.10,0.001,0.0003,0.997903,0.741873,0.694525,41
0,2,256,0.00,0.001,0.0003,0.927747,0.741161,0.698365,19
0,4,128,0.25,0.001,0.0003,0.940990,0.737523,0.692909,41
0,2,256,0.50,0.001,0.0003,0.953921,0.737216,0.711294,23
0,4,256,0.50,0.001,0.0003,0.944009,0.736636,0.680967,35
...,...,...,...,...,...,...,...,...,...
0,8,256,0.50,0.100,0.0003,0.381525,0.394450,0.393623,5
0,16,256,0.50,0.001,0.0003,0.412665,0.394450,0.393623,0
0,8,256,0.50,0.010,0.0003,0.409881,0.394450,0.393623,4
0,8,128,0.00,0.100,0.0003,0.396385,0.394378,0.393623,2


In [None]:
hyper_params = {
    "lstm_layers": 2,
    "hidden_size": 256,
    'dropout': 0.5,
    "learning_rate": 0.001, 
    "weight_decay": 3e-4,
    "n_epochs": 23,
    "patience": 0
}

# Feature Engineering