In [1]:
%load_ext autoreload
%autoreload 2
import sys, os, glob
import gc
import time, math, random
import ast
from collections import namedtuple

import numpy as np
import pandas as pd
import pandas.api.types
from pandas.api.types import CategoricalDtype
import polars as pl
import polars.selectors as cs
import mlflow
import mlflow.xgboost

import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim
from torch import Tensor


from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import QuantileTransformer, PowerTransformer, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, StratifiedGroupKFold, KFold, GroupShuffleSplit, GroupKFold, ParameterGrid
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import validation_curve, ValidationCurveDisplay
import joblib
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from scipy.stats import rankdata
from scipy.optimize import dual_annealing

from tqdm.auto import tqdm

sys.path.insert(0, os.path.abspath('../scripts'))
from cmi_2025 import score
from lstm_cnn import LSTMClassifier, LSTMClassifierIMUonly, prepare_model

  import pkg_resources


In [2]:
seed = 0
random.seed(seed)
np.random.seed(seed + 1)
torch.manual_seed(seed + 2);

In [3]:
torch.cuda.is_available()

True

In [4]:
BASE = ".."
COMP_DATA_BASE = os.path.join(BASE, "data", "raw")
PREP_DATA_BASE = os.path.join(BASE, "data", "processed")
FIGURES_BASE = os.path.join(BASE, "figures")

TRAIN_PATH = os.path.join(COMP_DATA_BASE, "train.csv")
TRAIN_DEMO_PATH = os.path.join(COMP_DATA_BASE, "train_demographics.csv")
TEST_PATH = os.path.join(COMP_DATA_BASE, "test.csv")
TEST_DEMO_PATH = os.path.join(COMP_DATA_BASE, "test_demographics.csv")

features = list(pl.read_csv(TRAIN_PATH).select(pl.all().exclude("ID")).columns)
train_ds = pl.read_csv(TRAIN_PATH)
train_demo_ds = pl.read_csv(TRAIN_DEMO_PATH)

In [5]:
print(f"Test columns")
test_cols = list(pl.read_csv(TEST_PATH).columns)
id_cols = test_cols[:4]
acc_cols = [col  for col in test_cols if col.startswith("acc")]
rot_cols = [col  for col in test_cols if col.startswith("rot")]
thm_cols = [col  for col in test_cols if col.startswith("thm")]
tof_cols = [[col  for col in test_cols if col.startswith(f"tof_{i+1}")] for i in range(5)]
tof_cols_all = [col for cl in tof_cols for col in cl]
target_cols = [col for col in train_ds.columns if col not in test_cols]
features = acc_cols+rot_cols+thm_cols+[c for cl in tof_cols for c in cl]

demo_features = ['adult_child', 'age', 'sex', 'handedness', 'height_cm', 'shoulder_to_wrist_cm', 'elbow_to_wrist_cm']

# target
gestures = ['Pull air toward your face', 'Feel around in tray and pull out an object', 'Neck - scratch', 'Pinch knee/leg skin', 
            'Forehead - scratch', 'Eyelash - pull hair', 'Drink from bottle/cup', 'Wave hello', 'Cheek - pinch skin', 
            'Forehead - pull hairline', 'Text on phone', 'Write name in air', 'Scratch knee/leg skin', 'Neck - pinch skin', 
            'Write name on leg', 'Above ear - pull hair', 'Eyebrow - pull hair', 'Glasses on/off']
le = LabelEncoder()
le.fit(gestures)
train_ds = train_ds.with_columns(pl.Series(name="gesture_id", values=le.transform(train_ds.select("gesture"))))

print(f"id-cols: {id_cols}")
print(f"acc: {acc_cols}")
print(f"rot: {rot_cols}")
print(f"thm: {thm_cols}")
for i in range(5):
    print(f"tof_{i+1}: {tof_cols[i]}")
    
print(f"Extra train columns")
print(f"{target_cols}")
for col in target_cols:
    print(f"{col+':':15} {train_ds.select(col).unique().to_series().to_list()}")

Test columns


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


id-cols: ['row_id', 'sequence_id', 'sequence_counter', 'subject']
acc: ['acc_x', 'acc_y', 'acc_z']
rot: ['rot_w', 'rot_x', 'rot_y', 'rot_z']
thm: ['thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5']
tof_1: ['tof_1_v0', 'tof_1_v1', 'tof_1_v2', 'tof_1_v3', 'tof_1_v4', 'tof_1_v5', 'tof_1_v6', 'tof_1_v7', 'tof_1_v8', 'tof_1_v9', 'tof_1_v10', 'tof_1_v11', 'tof_1_v12', 'tof_1_v13', 'tof_1_v14', 'tof_1_v15', 'tof_1_v16', 'tof_1_v17', 'tof_1_v18', 'tof_1_v19', 'tof_1_v20', 'tof_1_v21', 'tof_1_v22', 'tof_1_v23', 'tof_1_v24', 'tof_1_v25', 'tof_1_v26', 'tof_1_v27', 'tof_1_v28', 'tof_1_v29', 'tof_1_v30', 'tof_1_v31', 'tof_1_v32', 'tof_1_v33', 'tof_1_v34', 'tof_1_v35', 'tof_1_v36', 'tof_1_v37', 'tof_1_v38', 'tof_1_v39', 'tof_1_v40', 'tof_1_v41', 'tof_1_v42', 'tof_1_v43', 'tof_1_v44', 'tof_1_v45', 'tof_1_v46', 'tof_1_v47', 'tof_1_v48', 'tof_1_v49', 'tof_1_v50', 'tof_1_v51', 'tof_1_v52', 'tof_1_v53', 'tof_1_v54', 'tof_1_v55', 'tof_1_v56', 'tof_1_v57', 'tof_1_v58', 'tof_1_v59', 'tof_1_v60', 'tof_1_v61', 'tof

# Preprocessing

In [6]:
def clean_data(data, config, features_dict):
    
    # Remove Sequence without gesture phase
    data = data.filter(pl.col("sequence_id") != 'SEQ_011975')
    
    # if using all features, remove sequences without thm and tof data (96 in training set)
    if not config["imu_only"]:
        imu_only_sequences = (data
                              .filter(pl.all_horizontal(pl.col(features_dict["tof"] + features_dict["thm"]).is_null()))
                              .group_by("sequence_id")
                              .agg(pl.col("row_id").len())
                              .select("sequence_id")
                              .unique())
        data = data.filter(~pl.col("sequence_id").is_in(imu_only_sequences.to_series().implode()))
    
    return data



In [7]:
def split_data(data, data_demo):
    sequences = (data
                .group_by(["sequence_id", "subject"])
                .agg(pl.col("gesture").first())
                )
    sgkf = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=42)
    sgkf2 = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=42)
    train_index, test_index = next(sgkf.split(sequences, 
                                            sequences.select("gesture").to_series(), 
                                            sequences.select("subject").to_series() ))
    train_index2, test_index2 = next(sgkf2.split(sequences[test_index], 
                                            sequences[test_index].select("gesture").to_series(), 
                                            sequences[test_index].select("subject").to_series() ))
    
    data_dict = {}
    
    for part_name, part_index in zip(["train", "val", "test"], [train_index, train_index2, test_index2]):
        data_dict[part_name] = data.filter(pl.col("sequence_id").is_in(sequences[part_index].select("sequence_id").to_series().implode()))
        data_dict[part_name + "_demo"] = data_demo.filter(pl.col("subject").is_in(sequences[part_index].select("subject").to_series().implode()))
        if part_name != 'train':
            data_dict[part_name] = data.filter(pl.col("sequence_id").is_in(sequences[test_index][part_index].select("sequence_id").to_series().implode()))
            data_dict[part_name + "_demo"] = data_demo.filter(pl.col("subject").is_in(sequences[test_index][part_index].select("subject").to_series().implode()))
        
        
        
    return data_dict
    
# train, test = split_dataset(train_ds_prep, with_val=False)
# data_dict = split_dataset(train_ds_prep, train_demo_ds_prep)


In [8]:
def impute(data):
    return (data
            .sort(by="row_id")
            .with_columns(pl.all().fill_null(strategy="forward").over("sequence_id"))
            .with_columns(pl.all().fill_null(strategy="backward").over("sequence_id"))
            .with_columns(pl.col(tof_cols_all).fill_null(-1)) 
            .with_columns(pl.col(thm_cols).fill_null(strategy="mean").over("sequence_id"))
            .with_columns(pl.all().fill_null(0))
            )
    

Stats = namedtuple('Stats', ['mean', 'std'])

def standardize(data_dict, features=features, demo=False):
    train_part, val_part, test_part = "train", "val", "test"
    if demo:
        train_part, val_part, test_part = "train_demo", "val_demo", "test_demo"

    means = data_dict[train_part].select(pl.col(features)).mean().to_dicts()[0]
    std = data_dict[train_part].select(pl.col(features)).std().to_dicts()[0]
    scaling_dict = {feat: Stats(means[feat], std[feat])   for feat in features}
    data_dict[train_part] = data_dict[train_part].with_columns([(pl.col(col)-scaling_dict[col].mean) / scaling_dict[col].std  for col in features])
    
    if val_part in data_dict and data_dict[val_part] is not None:
        data_dict[val_part] = data_dict[val_part].with_columns([(pl.col(col)-scaling_dict[col].mean) / scaling_dict[col].std  for col in features])
    
    if test_part in data_dict and  data_dict[test_part] is not None:
        data_dict[test_part] = data_dict[test_part].with_columns([(pl.col(col)-scaling_dict[col].mean) / scaling_dict[col].std  for col in features])
    
    return data_dict, scaling_dict

# _data is a data dict with part and part_demo datasets
def preprocess(_data_dict, features_dict, tail_length=75):
    # impute
    for part_name in ["train", "val", "test"]:
        if part_name in _data_dict:
            _data_dict[part_name] = impute(_data_dict[part_name])

    # standardize
    _data_dict, scaling_dict = standardize(_data_dict, features=features_dict["all"] , demo=False)
    
    if "train_demo" in _data_dict:
        _data_dict, scaling_dict_demo = standardize(_data_dict, features=features_dict["demo"] , demo=True)
            
    return _data_dict, scaling_dict, scaling_dict_demo
    
    


In [9]:
def create_dataset(_data_dict, features_dict, tail_length=75):
    
    def perpare_part(part, part_demo, tail_length=75):
        sequences = {col_name: [] for col_name in ["acc", "rot", "thm", "tof", "target", "subject", "demo"]}
        for name, data in (part
                            .sort(by=['sequence_id', 'sequence_counter'])
                            .group_by("sequence_id")
                            .tail(tail_length)
                            .group_by("sequence_id")
                            ):
            
            
            # Take last tail_length sequence parts and impute with null if sequence is not long enough
            for col_name in ["acc", "rot", "thm", "tof"]:
                array = data.select(features_dict[col_name]).to_numpy()
                if array.shape[0] < tail_length:
                    padding = np.zeros((tail_length -  array.shape[0], array.shape[1]) , dtype=float)
                    array = np.vstack((padding, array))
                    
                sequences[col_name].append(array)
            
            # Prepare target, subject and demo features
            sequences["target"].append(data.select("gesture_id").tail(1).item())
            subject = data.select("subject").tail(1).item()
            
            sequences["subject"].append(subject)
            sequences["demo"].append(part_demo.filter(pl.col("subject") == subject).select(features_dict["demo"]).to_numpy())
            
        return sequences

    data = {}
    
    # Prepare each split separately
    for part_name in ["train", "val", "test"]:
        if part_name not in _data_dict:
            continue
        
        part_data = perpare_part(_data_dict[part_name], _data_dict[part_name + "_demo"], tail_length=tail_length)
        data[part_name] = {'x_acc': np.array(part_data["acc"]).astype(np.float32), 
                        'x_rot': np.array(part_data["rot"]).astype(np.float32), 
                        'x_thm': np.array(part_data["thm"]).astype(np.float32), 
                        'x_tof': np.array(part_data["tof"]).astype(np.float32), 
                        'demo': np.array(part_data["demo"]).astype(np.float32),
                        'y': np.array(part_data["target"]),
                        }
    
    # Data to torch tensors
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    data_torch = {
            part: {'X': torch.as_tensor((np.dstack([(data[part][mtype]) for mtype in ["x_acc", "x_rot", "x_thm", "x_tof"]])), device=device),
                    'X_demo': torch.as_tensor(data[part]["demo"], device=device),
                    'y': torch.as_tensor(data[part]["y"], device=device) }
            for part in data
    }
    return data_torch
        


In [10]:




def train(model_dict, data, config, verbose=False):
    model = model_dict["model"]
    optimizer = model_dict["optimizer"]
    evaluation_mode = model_dict["eval_mode"]
    device = model_dict["device"]
    grad_scaler = model_dict["grad_scaler"]
    amp_enabled = model_dict["amp_enabled"]
    amp_dtype = model_dict["amp_dtype"]
    target = model_dict["target"]
    le = config["le"]
    
    
    @torch.autocast(device.type, enabled=amp_enabled, dtype=amp_dtype)  # type: ignore[code]
    def apply_model(part: str, idx: Tensor) -> Tensor:
        return (
            model(
                data[part]['X'][idx],
                data[part]['X_demo'][idx]
            )
        )

    task_type = "classification"
    base_loss_fn = F.mse_loss if task_type == 'regression' else F.cross_entropy


    def loss_fn(y_pred: Tensor, y_true: Tensor) -> Tensor:
        return base_loss_fn(y_pred, y_true)

    def score_fn(y_true, y_pred):
        sol = pd.DataFrame({"gesture": le.inverse_transform(y_true)}).reset_index(names=["id"])
        sub = pd.DataFrame({"gesture": le.inverse_transform(y_pred)}).reset_index(names=["id"])
        return score(sol, sub, row_id_column_name='id')

    @evaluation_mode()
    def evaluate(part: str) -> tuple[float, float]:
        model.eval()

        # When using torch.compile, you may need to reduce the evaluation batch size.
        eval_batch_size = 8096
        y_pred = (
            torch.cat(
                [
                    apply_model(part, idx)
                    for idx in torch.arange(len(data[part]['y']), device=device).split(
                        eval_batch_size
                    )
                ]
            )
        )


        loss = loss_fn(y_pred, data[part]["y"]).detach().cpu().numpy()

        if task_type != 'regression':
            # For classification, the mean must be computed in the probabily space.
            y_pred = F.softmax(y_pred, dim=1).cpu().numpy()

        y_true = data[part]['y'].cpu().numpy()
        
        sc = (
            score_fn(y_true, y_pred.argmax(1))
        )
        return float(sc), float(loss)  # The higher -- the better.

    if verbose:
        print(f'Test score before training: {evaluate("test")[0]:.4f}')
    
    # For demonstration purposes (fast training and bad performance),
    # one can set smaller values:
    # n_epochs = 20
    # patience = 2
    n_epochs = 1_000_000_000
    if "n_epochs" in config:
        n_epochs =  config["n_epochs"]
    
    # Early stopping: the training stops when
    # there are more than `patience` consequtive bad updates.
    patience = 10
    if "patience" in config:
        patience =  config["patience"]
    

    batch_size = 256
    epoch_size = math.ceil(len(data["train"]["X"]) / batch_size)
    best = {
        'val': -math.inf,
        'test': -math.inf,
        'epoch': -1,
    }
    
    remaining_patience = patience

    if verbose:
        print('-' * 88 + '\n')
    
    with mlflow.start_run():
        mlflow.log_params(config)
        
        
        for epoch in range(n_epochs):
            # on cpu to save GPU RAM space?
            pred_train = torch.zeros((len(data["train"]["X"]), config["n_classes"]), device=device)
            for batch_idx in tqdm(
                torch.randperm(len(data['train']['y']), device=device).split(batch_size),
                desc=f'Epoch {epoch}',
                total=epoch_size,
                disable=not verbose
            ):
                model.train()
                optimizer.zero_grad()
                pred = apply_model('train', batch_idx)
                loss = loss_fn(pred, data["train"]["y"][batch_idx])
                pred_train[batch_idx] = pred.detach()
                if grad_scaler is None:
                    loss.backward()
                    optimizer.step()
                else:
                    grad_scaler.scale(loss).backward()  # type: ignore
                    grad_scaler.step(optimizer)
                    grad_scaler.update()
                    
            train_loss = loss_fn(pred_train, data["train"]["y"]).cpu().numpy()
            train_score = float(score_fn(data["train"]["y"].cpu().numpy(), 
                                         F.softmax(pred_train, dim=1).cpu().numpy().argmax(1)) )
            
            val_score, val_loss = evaluate('val')
            test_score, test_loss = evaluate('test')
            if verbose:
                print(f'(train) {train_score:.4f} (val) {val_score:.4f} (test) {test_score:.4f}')

            mlflow.log_metrics({"train_loss": float(train_loss), "val_loss": val_loss, "test_loss": test_loss,
                                "train_f1-score": train_score, "val_f1-score": val_score, "test_f1-score": test_score,  
                                }, step=epoch)
            
            
            # if patience is set to 0, don't do early stopping
            if (val_score > best['val']) or (patience == 0):
                if verbose:
                    print('🌸 New best epoch! 🌸')
                best = {'train': train_score, 'val': val_score, 'test': test_score, 'epoch': epoch}
                
                # mlflow.pytorch.log_model(pytorch_model=model_dict["model"], 
                #                          artifact_path="", 
                #                          registered_model_name=f"model_{epoch}",
                #                          input_example=data["train"]["X"][0,:,:].cpu().numpy())
                
                
                remaining_patience = patience
            else:
                remaining_patience -= 1

            if remaining_patience < 0:
                break
        
        mlflow.log_metrics({ "best_train_score": best["train"], "best_val_score": best["val"], "best_test_score": best["test"], 
                              "best_epoch": best["epoch"] })
        if verbose:
            print('\n\nResult:')
            print(best)
    return best




In [13]:

class LSTM_Validator():
    def __init__(self, train_ds, train_demo_ds, config, verbose=False):
        self.verbose = verbose
        self.model_dict = None
        train_cols = list(train_ds.columns)
        acc_cols = [col  for col in train_cols if col.startswith("acc")]
        rot_cols = [col  for col in train_cols if col.startswith("rot")]
        thm_cols = [col  for col in train_cols if col.startswith("thm")]
        tof_cols = [[col  for col in train_cols if col.startswith(f"tof_{i+1}")] for i in range(5)]
        tof_cols_all = [col for cl in tof_cols for col in cl]
        self.features_dict = {"acc":acc_cols, "rot":rot_cols, "thm":thm_cols, "tof":tof_cols_all}
        self.features = acc_cols+rot_cols+thm_cols+tof_cols_all
        gestures = ['Pull air toward your face', 'Feel around in tray and pull out an object', 'Neck - scratch', 'Pinch knee/leg skin', 
                'Forehead - scratch', 'Eyelash - pull hair', 'Drink from bottle/cup', 'Wave hello', 'Cheek - pinch skin', 
                'Forehead - pull hairline', 'Text on phone', 'Write name in air', 'Scratch knee/leg skin', 'Neck - pinch skin', 
                'Write name on leg', 'Above ear - pull hair', 'Eyebrow - pull hair', 'Glasses on/off']
        
        self.demo_features = ['adult_child', 'age', 'sex', 'handedness', 'height_cm', 'shoulder_to_wrist_cm', 'elbow_to_wrist_cm']
        self.features_dict["demo"] = self.demo_features
        self.features_dict["all"] = acc_cols+rot_cols+thm_cols+tof_cols_all
        
        self.le = LabelEncoder()
        self.le.fit(gestures)
        train_ds = train_ds.with_columns(pl.Series(name="gesture_id", values=self.le.transform(train_ds.select("gesture").to_series())))

        self.CONFIG = config
        
        self.CONFIG["n_demo_features"] = len(self.demo_features)
        self.CONFIG["n_classes"] = len(self.le.classes_)
        self.CONFIG["le"] = self.le
        
        
        if self.verbose:
            print("Cleaning...")
        data = clean_data(train_ds, self.CONFIG, self.features_dict)
        
        if self.verbose:
            print("Splitting data...")
        data_dict = split_data(data, train_demo_ds)
        
        if self.verbose:
            print("Preprocessing...")
        data_dict, self.ct, self.ct_demo = preprocess(data_dict, self.features_dict)

        if self.verbose:
            print("Preparing dataset...")
        self.data = create_dataset(data_dict, features_dict=self.features_dict, tail_length=self.CONFIG["tail_length"])
        
        

        
    # PATH: Path to save model to, e.g. 'model.pth'
    def train_model(self, hyper_params, PATH=None):
        self.hyper_params=hyper_params
        
        if self.model_dict:
            del self.model_dict["model"]
            gc.collect()
            torch.cuda.empty_cache() 
        
        if self.verbose:
            print("Preparing model...")
        self.model_dict = prepare_model(config=self.CONFIG | self.hyper_params)
        
        if self.verbose:
            print("Training model...")
            
        best = train(self.model_dict, self.data, self.CONFIG | self.hyper_params, verbose=self.verbose)
        
        if PATH is not None:
            if self.verbose:
                print("Saving model...")
            torch.save(self.model_dict["model"].state_dict(), PATH)
        
        return best
        
    # PATH: saved model path, e.g. 'model.pth'
    def load_model(self, PATH):
        if self.verbose:
            print("Loading model...")
        
        if not self.model_dict:
            self.model_dict = prepare_model(config=self.CONFIG | self.hyper_params)
        
        if self.model_dict["device"].type == 'cpu':  
            self.model_dict["model"].load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))
        else:
            self.model_dict["model"].load_state_dict(torch.load(PATH))
   
    

CONFIG = {
    "compile_model": False,
    
    "n_features": 332,
    "target": "gesture_id",
    "tail_length": 100,
    "imu_only": False
}

hyper_params = {
    "lstm_layers": 2,
    "hidden_size": 64,
    'dropout': 0.2,
    "learning_rate": 1e-3, 
    "weight_decay": 0.9,
    "n_epochs": 60,
    "patience": 84
}

lstm_val = LSTM_Validator(train_ds, train_demo_ds, CONFIG, verbose=True)
lstm_val.train_model(hyper_params)

Cleaning...
Splitting data...
Preprocessing...
Preparing dataset...
Preparing model...
Device:        CUDA
AMP:           False (dtype: torch.bfloat16)
torch.compile: False
Training model...
Test score before training: 0.2893
----------------------------------------------------------------------------------------



Epoch 0:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.4208 (val) 0.4495 (test) 0.4473
🌸 New best epoch! 🌸


Epoch 1:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.4828 (val) 0.4954 (test) 0.4814
🌸 New best epoch! 🌸


Epoch 2:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.4968 (val) 0.5061 (test) 0.4984
🌸 New best epoch! 🌸


Epoch 3:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.5116 (val) 0.5132 (test) 0.5138
🌸 New best epoch! 🌸


Epoch 4:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.5222 (val) 0.5185 (test) 0.5350
🌸 New best epoch! 🌸


Epoch 5:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.5342 (val) 0.5379 (test) 0.5545
🌸 New best epoch! 🌸


Epoch 6:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.5506 (val) 0.5618 (test) 0.5486
🌸 New best epoch! 🌸


Epoch 7:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.5630 (val) 0.5823 (test) 0.5630
🌸 New best epoch! 🌸


Epoch 8:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.5771 (val) 0.5894 (test) 0.5804
🌸 New best epoch! 🌸


Epoch 9:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.5866 (val) 0.5623 (test) 0.5459


Epoch 10:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.5829 (val) 0.5993 (test) 0.5662
🌸 New best epoch! 🌸


Epoch 11:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6036 (val) 0.6098 (test) 0.6026
🌸 New best epoch! 🌸


Epoch 12:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6095 (val) 0.6288 (test) 0.6129
🌸 New best epoch! 🌸


Epoch 13:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6243 (val) 0.6389 (test) 0.6141
🌸 New best epoch! 🌸


Epoch 14:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6191 (val) 0.6013 (test) 0.5660


Epoch 15:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6220 (val) 0.6308 (test) 0.6222


Epoch 16:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6404 (val) 0.6526 (test) 0.6251
🌸 New best epoch! 🌸


Epoch 17:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6402 (val) 0.6434 (test) 0.6094


Epoch 18:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6439 (val) 0.6508 (test) 0.6254


Epoch 19:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6510 (val) 0.6588 (test) 0.6284
🌸 New best epoch! 🌸


Epoch 20:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6622 (val) 0.6607 (test) 0.6409
🌸 New best epoch! 🌸


Epoch 21:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6558 (val) 0.6577 (test) 0.6341


Epoch 22:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6627 (val) 0.6605 (test) 0.6325


Epoch 23:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6593 (val) 0.6643 (test) 0.6354
🌸 New best epoch! 🌸


Epoch 24:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6694 (val) 0.6634 (test) 0.6294


Epoch 25:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6710 (val) 0.6809 (test) 0.6597
🌸 New best epoch! 🌸


Epoch 26:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6761 (val) 0.6895 (test) 0.6473
🌸 New best epoch! 🌸


Epoch 27:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6893 (val) 0.6924 (test) 0.6571
🌸 New best epoch! 🌸


Epoch 28:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6896 (val) 0.6822 (test) 0.6469


Epoch 29:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6852 (val) 0.6945 (test) 0.6499
🌸 New best epoch! 🌸


Epoch 30:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6718 (val) 0.6949 (test) 0.6369
🌸 New best epoch! 🌸


Epoch 31:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6806 (val) 0.6872 (test) 0.6486


Epoch 32:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6706 (val) 0.6889 (test) 0.6443


Epoch 33:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6985 (val) 0.6967 (test) 0.6365
🌸 New best epoch! 🌸


Epoch 34:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6867 (val) 0.6829 (test) 0.6516


Epoch 35:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6999 (val) 0.6982 (test) 0.6660
🌸 New best epoch! 🌸


Epoch 36:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7161 (val) 0.7103 (test) 0.6719
🌸 New best epoch! 🌸


Epoch 37:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7177 (val) 0.7013 (test) 0.6524


Epoch 38:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7139 (val) 0.6873 (test) 0.6575


Epoch 39:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6854 (val) 0.6901 (test) 0.6280


Epoch 40:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.6939 (val) 0.7066 (test) 0.6605


Epoch 41:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7174 (val) 0.7241 (test) 0.6815
🌸 New best epoch! 🌸


Epoch 42:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7303 (val) 0.7009 (test) 0.6507


Epoch 43:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7348 (val) 0.7002 (test) 0.6707


Epoch 44:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7402 (val) 0.7224 (test) 0.6772


Epoch 45:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7381 (val) 0.7196 (test) 0.6810


Epoch 46:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7548 (val) 0.7344 (test) 0.6889
🌸 New best epoch! 🌸


Epoch 47:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7448 (val) 0.7235 (test) 0.6955


Epoch 48:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7302 (val) 0.7304 (test) 0.6753


Epoch 49:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7395 (val) 0.7198 (test) 0.6721


Epoch 50:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7601 (val) 0.7317 (test) 0.6943


Epoch 51:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7616 (val) 0.7376 (test) 0.6883
🌸 New best epoch! 🌸


Epoch 52:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7514 (val) 0.7258 (test) 0.6841


Epoch 53:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7448 (val) 0.7232 (test) 0.6691


Epoch 54:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7513 (val) 0.7279 (test) 0.6789


Epoch 55:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7724 (val) 0.7300 (test) 0.6901


Epoch 56:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7714 (val) 0.7382 (test) 0.6872
🌸 New best epoch! 🌸


Epoch 57:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7709 (val) 0.7163 (test) 0.6601


Epoch 58:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7750 (val) 0.7398 (test) 0.6904
🌸 New best epoch! 🌸


Epoch 59:   0%|          | 0/24 [00:00<?, ?it/s]

(train) 0.7621 (val) 0.7277 (test) 0.6750


Result:
{'train': 0.7750122332451037, 'val': 0.7397669890235561, 'test': 0.6904332694601707, 'epoch': 58}


{'train': 0.7750122332451037,
 'val': 0.7397669890235561,
 'test': 0.6904332694601707,
 'epoch': 58}

In [None]:
mlflow.set_experiment(f"CMI LSTM Experiment {pd.Timestamp.now()}")
metrics = pl.DataFrame()
for i in range(5):
    print(f"FOLD {i}")
    data = load_fold(i, prefix="20250628_v2")
    data["test"] = data["val"]

    model_dict = prepare_model(config=CONFIG | hyper_params)

    best = train(model_dict, data, CONFIG | hyper_params, verbose=True)
    print(best)
    metrics = pl.concat([metrics, pl.DataFrame(best)])
    
    data.clear()
    model_dict.clear() 
    gc.collect()
    torch.cuda.empty_cache()

    
    
    
print(metrics)
print(metrics.mean())

# LSTM

In [14]:
# https://docs.pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

def test_LSTM():
    # The first axis is the sequence itself, 
    # the second indexes instances in the mini-batch, 
    # and the third indexes elements of the input.
    lstm = nn.LSTM(3, 3)
    # initialize the hidden state.
    hidden = (torch.randn(1, 2, 3),
            torch.randn(1, 2, 3))

    sample1 = [torch.randn(1, 3) for _ in range(5)] 
    sample2 = [torch.randn(1, 3) for _ in range(5)] 
    sample1 = torch.cat(sample1).view(len(sample1), 1, -1)
    sample2 = torch.cat(sample2).view(len(sample2), 1, -1)
    inputs = torch.hstack([sample1, sample2])
    print(inputs.shape)
    print(inputs)
    # inputs = torch.cat(inputs).view(len(inputs), 1, -1)




    out, hidden = lstm(inputs, hidden)
    display(out.shape)
    display(hidden)
    
# test_LSTM()

In [None]:
def test_model():

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # stack it together for now
    data = {
            part: {'X': torch.as_tensor((np.dstack([(data_prep["train"][mtype]) for mtype in ["x_acc", "x_rot", "x_thm", "x_tof"]])), device=device) }
            for part in data_prep
    }
    for part in data_prep:
        data[part]['y'] = torch.as_tensor(data_prep["train"]["y"], device=device) 
        
    # np.dstack([(data_prep["train"][mtype]) for mtype in ["x_acc", "x_rot", "x_thm", "x_tof"]])
    # data_prep["train"]["x_acc"]
    BATCH_SIZE = 10
    TAIL_LENGTH = 75
    idx = np.random.randint(low=0, high=5000, size=BATCH_SIZE)
    print(data["train"]["X"][idx].shape)

    print(torch.transpose(data["train"]["X"][idx], 0,1).shape)
    print(data["train"]["X"][idx].view(TAIL_LENGTH, BATCH_SIZE, -1).shape)
    print(torch.transpose(data["train"]["X"][idx], 0,1))
    print(torch.transpose(data["train"]["X"][idx], 1,0))
    print(data["train"]["X"][idx].view(TAIL_LENGTH, BATCH_SIZE, -1))
    
    NUM_LAYERS = 1
    HIDDEN_SIZE = 32


    class LSTMClassifier(nn.Module):

        def __init__(self, input_dim, hidden_dim, classes_dim, num_layers, dropout_rate=0):
            super(LSTMClassifier, self).__init__()
            self.hidden_dim = hidden_dim

            # The LSTM takes word embeddings as inputs, and outputs hidden states
            # with dimensionality hidden_dim.
            self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, dropout=dropout_rate)

            # The linear layer that maps from hidden state space to tag space
            self.hidden2class = nn.Linear(hidden_dim, classes_dim)

        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            class_space = self.hidden2class(lstm_out[-1,...])
            class_scores = F.softmax(class_space, dim=1)
            return class_scores


    model = LSTMClassifier(332 , hidden_dim=HIDDEN_SIZE, classes_dim=len(le.classes_), num_layers=NUM_LAYERS , dropout_rate=0)

    idx = np.random.randint(low=0, high=5000, size=BATCH_SIZE)
    # output (L,N,D∗H): L=sequence length,  N=Batch size, H=Hiddensize
    # hidden size (D∗num_layers,N,H): num_layers, N=Batch Size, H=hidden_size
    def apply_model(part: str, idx: Tensor) -> Tensor:
        return (
            model(
                torch.transpose(data[part]['X'][idx],0,1)
            )
        )
    print(apply_model("train", idx).shape)
    print(apply_model("train", idx))

    # torch.cat([torch.as_tensor(data_prep["train"][mtype][idx]) for mtype in ["x_acc", "x_rot", "x_thm", "x_tof"]])
 
# test_model()   

In [16]:
NUM_LAYERS = 1
HIDDEN_SIZE = 32


CONFIG = {
    "compile_model": False,
    
    "n_features": 332,
    "n_classes": len(le.classes_),
    "target": "gesture_id"
}

hyper_params = {
    "lstm_layers": 1,
    "hidden_size": 32,
    'dropout': 0.1,
    "learning_rate": 2e-3, 
    "weight_decay": 3e-4,
}

class LSTMClassifier(nn.Module):

    def __init__(self, input_dim, hidden_dim, classes_dim, num_layers, dropout_rate=0):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, dropout=dropout_rate)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2class = nn.Linear(hidden_dim, classes_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        class_space = self.hidden2class(lstm_out[-1,...])
        # class_scores = F.softmax(class_space, dim=1)
        return class_space
    


def prepare_model(data_prep, config):
    # Device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Convert data to tensors
    # stack it together for now
    data = {
        part: {'X': torch.as_tensor((np.dstack([(data_prep[part][mtype]) for mtype in ["x_acc", "x_rot", "x_thm", "x_tof"]])), device=device) }
        for part in data_prep
    }

    for part in data_prep:
        data[part]['y'] = torch.as_tensor(data_prep[part]["y"], device=device) 


    # Automatic mixed precision (AMP)
    # torch.float16 is implemented for completeness,
    # but it was not tested in the project,
    # so torch.bfloat16 is used by default.
    amp_dtype = (
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float16
        if torch.cuda.is_available()
        else None
    )
    # Changing False to True will result in faster training on compatible hardware.
    amp_enabled = False and amp_dtype is not None
    grad_scaler = torch.amp.GradScaler("cuda") if amp_dtype is torch.float16 else None  # type: ignore

    # torch.compile
    compile_model = config["compile_model"]

    # fmt: off
    print(
        f'Device:        {device.type.upper()}'
        f'\nAMP:           {amp_enabled} (dtype: {amp_dtype})'
        f'\ntorch.compile: {compile_model}'
    )
    
    # Choose one of the two configurations below.
    # TODO
    model = LSTMClassifier(config["n_features"], config["hidden_size"], config["n_classes"], config["lstm_layers"], config["dropout"]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])

    if compile_model:
        # NOTE
        # `torch.compile` is intentionally called without the `mode` argument
        # (mode="reduce-overhead" caused issues during training with torch==2.0.1).
        model = torch.compile(model)
        evaluation_mode = torch.no_grad
    else:
        evaluation_mode = torch.inference_mode
        
    model_dict = {"model": model,
                  "eval_mode": evaluation_mode,
                  "optimizer": optimizer,
                  "device": device,
                  "grad_scaler": grad_scaler,
                  "amp_enabled": amp_enabled,
                  "amp_dtype": amp_dtype,
                  "target": config["target"]
                  }
        
    return model_dict, data

prepare_model(data_prep, CONFIG | hyper_params)

Device:        CUDA
AMP:           False (dtype: torch.bfloat16)
torch.compile: False




({'model': LSTMClassifier(
    (lstm): LSTM(332, 32, dropout=0.1)
    (hidden2class): Linear(in_features=32, out_features=18, bias=True)
  ),
  'eval_mode': torch.autograd.grad_mode.inference_mode,
  'optimizer': AdamW (
  Parameter Group 0
      amsgrad: False
      betas: (0.9, 0.999)
      capturable: False
      differentiable: False
      eps: 1e-08
      foreach: None
      fused: None
      lr: 0.002
      maximize: False
      weight_decay: 0.0003
  ),
  'device': device(type='cuda', index=0),
  'grad_scaler': None,
  'amp_enabled': False,
  'amp_dtype': torch.bfloat16,
  'target': 'gesture_id'},
 {'train': {'X': tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
            [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
            [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
            ...,
            [ 0.6403,  0.7623,  1.0088,  ...,  1.1836,  1.1813,  1.1942],
            [ 0.7338,  0.7855,  0.8770,  ...,  1.1346,  1.146

In [17]:


def train(model_dict, data, config, verbose=False):
    model = model_dict["model"]
    optimizer = model_dict["optimizer"]
    evaluation_mode = model_dict["eval_mode"]
    device = model_dict["device"]
    grad_scaler = model_dict["grad_scaler"]
    amp_enabled = model_dict["amp_enabled"]
    amp_dtype = model_dict["amp_dtype"]
    target = model_dict["target"]
    
    
    @torch.autocast(device.type, enabled=amp_enabled, dtype=amp_dtype)  # type: ignore[code]
    def apply_model(part: str, idx: Tensor) -> Tensor:
        return (
            model(
                torch.transpose(data[part]['X'][idx],0,1)
            )
        )

    task_type = "classification"
    base_loss_fn = F.mse_loss if task_type == 'regression' else F.cross_entropy


    def loss_fn(y_pred: Tensor, y_true: Tensor) -> Tensor:
        return base_loss_fn(y_pred, y_true)

    # TODO: Replace with competition metric
    def score_fn(y_true, y_pred):
        sol = pd.DataFrame({"gesture": le.inverse_transform(y_true)}).reset_index(names=["id"])
        sub = pd.DataFrame({"gesture": le.inverse_transform(y_pred)}).reset_index(names=["id"])
        return score(sol, sub, row_id_column_name='id')

    @evaluation_mode()
    def evaluate(part: str) -> tuple[float, float]:
        model.eval()

        # When using torch.compile, you may need to reduce the evaluation batch size.
        eval_batch_size = 8096
        y_pred = (
            torch.cat(
                [
                    apply_model(part, idx)
                    for idx in torch.arange(len(data[part]['y']), device=device).split(
                        eval_batch_size
                    )
                ]
            )
        )


        loss = loss_fn(y_pred, data[part]["y"]).detach().cpu().numpy()

        if task_type != 'regression':
            # For classification, the mean must be computed in the probabily space.
            y_pred = F.softmax(y_pred, dim=1).cpu().numpy()

        y_true = data[part]['y'].cpu().numpy()
        
        sc = (
            score_fn(y_true, y_pred.argmax(1))
        )
        return float(sc), float(loss)  # The higher -- the better.

    if verbose:
        print(f'Test score before training: {evaluate("test")[0]:.4f}')
    
    # For demonstration purposes (fast training and bad performance),
    # one can set smaller values:
    # n_epochs = 20
    # patience = 2
    n_epochs = 1_000_000_000
    if "n_epochs" in config:
        n_epochs =  config["n_epochs"]
    
    # Early stopping: the training stops when
    # there are more than `patience` consequtive bad updates.
    patience = 10
    if "patience" in config:
        patience =  config["patience"]
    

    batch_size = 256
    epoch_size = math.ceil(len(data["train"]["X"]) / batch_size)
    best = {
        'val': -math.inf,
        'test': -math.inf,
        'epoch': -1,
    }
    
    remaining_patience = patience

    if verbose:
        print('-' * 88 + '\n')
    
    with mlflow.start_run():
        mlflow.log_params(config)
        
        
        for epoch in range(n_epochs):
            pred_train = torch.zeros((len(data["train"]["X"]), config["n_classes"]), device=device)
            for batch_idx in tqdm(
                torch.randperm(len(data['train']['y']), device=device).split(batch_size),
                desc=f'Epoch {epoch}',
                total=epoch_size,
                disable=not verbose
            ):
                model.train()
                optimizer.zero_grad()
                pred = apply_model('train', batch_idx)
                loss = loss_fn(pred, data["train"]["y"][batch_idx])
                pred_train[batch_idx] = pred.detach()
                if grad_scaler is None:
                    loss.backward()
                    optimizer.step()
                else:
                    grad_scaler.scale(loss).backward()  # type: ignore
                    grad_scaler.step(optimizer)
                    grad_scaler.update()
                    
            train_loss = loss_fn(pred_train, data["train"]["y"]).cpu().numpy()
            train_score = float(score_fn(data["train"]["y"].cpu().numpy(), 
                                         F.softmax(pred_train, dim=1).cpu().numpy().argmax(1)) )
            
            val_score, val_loss = evaluate('val')
            test_score, test_loss = evaluate('test')
            if verbose:
                print(f'(val) {val_score:.4f} (test) {test_score:.4f}')

            mlflow.log_metrics({"train_loss": float(train_loss), "val_loss": val_loss, "test_loss": test_loss,
                                "train_f1-score": train_score, "val_f1-score": val_score, "test_f1-score": test_score,  
                                }, step=epoch)
            
            
            # if patience is set to 0, don't do early stopping
            if (val_score > best['val']) or (patience == 0):
                if verbose:
                    print('🌸 New best epoch! 🌸')
                best = {'train': train_score, 'val': val_score, 'test': test_score, 'epoch': epoch}
                
                # mlflow.pytorch.log_model(pytorch_model=model_dict["model"], 
                #                          artifact_path="", 
                #                          registered_model_name=f"model_{epoch}",
                #                          input_example=data["train"]["X"][0,:,:].cpu().numpy())
                
                
                remaining_patience = patience
            else:
                remaining_patience -= 1

            if remaining_patience < 0:
                break
            
            if verbose:
                print()
        
        mlflow.log_metrics({ "best_train_score": best["train"], "best_val_score": best["val"], "best_test_score": best["test"], 
                              "best_epoch": best["epoch"] })
        if verbose:
            print('\n\nResult:')
            print(best)
    return best

model_dict, data = prepare_model(data_prep, config=CONFIG | hyper_params)
mlflow.set_experiment(f"CMI LSTM Experiment {pd.Timestamp.now()}")
train(model_dict, data, CONFIG | hyper_params, verbose=False)

2025/06/23 19:59:45 INFO mlflow.tracking.fluent: Experiment with name 'CMI LSTM Experiment 2025-06-23 19:59:45.708743' does not exist. Creating a new experiment.


Device:        CUDA
AMP:           False (dtype: torch.bfloat16)
torch.compile: False


{'train': 0.8777034604580647,
 'val': 0.6870461894341058,
 'test': 0.6734010885145207,
 'epoch': 28}

In [None]:
# Quick grid search 
hyper_params = {
    "lstm_layers": 1,
    "hidden_size": 32,
    'dropout': 0.1,
    "learning_rate": 2e-3, 
    "weight_decay": 3e-4,
}

param_grid = {
    "lstm_layers": [1,2,4,8,16],
    "hidden_size": [32, 64, 128, 256],
    "dropout": [0, 0.1, 0.25, 0.5],
    "learning_rate": [1e-1, 1e-2, 1e-3]
}

mlflow.set_experiment(f"CMI LSTM Gridsearch {pd.Timestamp.now()}")
gs_results = pd.DataFrame()

for params in tqdm(ParameterGrid(param_grid)):
    hp_run = hyper_params.copy()
    hp_run.update(params)
    model_dict, data = prepare_model(data_prep, config=CONFIG | hp_run)
    
    final_scores = train(model_dict, data, CONFIG | hp_run, verbose=False)
    hp_run.update(final_scores)
    gs_results = pd.concat([gs_results, pd.DataFrame(hp_run, index=[0])], axis=0)


In [19]:
gs_results.sort_values(by=["val"], ascending=False)

Unnamed: 0,lstm_layers,hidden_size,dropout,learning_rate,weight_decay,train,val,test,epoch
0,1,256,0.10,0.001,0.0003,0.997903,0.741873,0.694525,41
0,2,256,0.00,0.001,0.0003,0.927747,0.741161,0.698365,19
0,4,128,0.25,0.001,0.0003,0.940990,0.737523,0.692909,41
0,2,256,0.50,0.001,0.0003,0.953921,0.737216,0.711294,23
0,4,256,0.50,0.001,0.0003,0.944009,0.736636,0.680967,35
...,...,...,...,...,...,...,...,...,...
0,8,256,0.50,0.100,0.0003,0.381525,0.394450,0.393623,5
0,16,256,0.50,0.001,0.0003,0.412665,0.394450,0.393623,0
0,8,256,0.50,0.010,0.0003,0.409881,0.394450,0.393623,4
0,8,128,0.00,0.100,0.0003,0.396385,0.394378,0.393623,2


In [None]:
hyper_params = {
    "lstm_layers": 2,
    "hidden_size": 256,
    'dropout': 0.5,
    "learning_rate": 0.001, 
    "weight_decay": 3e-4,
    "n_epochs": 23,
    "patience": 0
}

# Feature Engineering