# 1. Import everything!

In [1]:
# Define hyper parameters

CV = 5 # cv for gridserach

# Test mode parameters
TEST_MODE = False
N = 100 # Number of instances per class

## 1.1 Liberaries

In [2]:
import os

import pandas as pd
pd.options.display.max_columns = 100

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

import sys
sys.path.append('../Data/cmi-detect-behavior-with-sensor-data')

import kaggle_evaluation.cmi_inference_server

In [3]:
!pip install /kaggle/input/pip-install-dependencies-cmi/scikit_base-0.12.3-py3-none-any.whl
!pip install /kaggle/input/pip-install-dependencies-cmi/scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install /kaggle/input/pip-install-dependencies-cmi/sktime-0.38.1-py3-none-any.whl

Processing /kaggle/input/pip-install-dependencies-cmi/scikit_base-0.12.3-py3-none-any.whl
Installing collected packages: scikit-base
Successfully installed scikit-base-0.12.3
Processing /kaggle/input/pip-install-dependencies-cmi/scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.7.0 which is incompatible.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-le

## 1.2 Custom transformers

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sktime.transformations.panel.catch22 import Catch22
from sktime.transformations.series.summarize import SummaryTransformer

from utils_custom_transformers_cmi_vsn2 import *

Processing /kaggle/input/pip-install-dependencies-cmi/scikit_base-0.12.3-py3-none-any.whl
scikit-base is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/pip-install-dependencies-cmi/scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
scikit-learn is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/pip-install-dependencies-cmi/sktime-0.38.1-py3-none-any.whl
sktime is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [5]:
from sklearn.preprocessing import StandardScaler

class GeneralPreprocessor_2(BaseEstimator, TransformerMixin):
    def __init__(self, impute_ts=False, scaler=StandardScaler(), rows_to_keep=None):
        self.num_cols = None  # store numeric col names
        self.cat_cols = None  # store categorical col names
        self.ts_cols = None
        self.impute_ts = impute_ts
        self.scaler = scaler
        self.rows_to_keep = rows_to_keep
        
    def fit(self, X, y=None):
        X_copy = X.copy()

        # Identify timeserie columns
        self.ts_cols = [col for col in X_copy.columns if X_copy[col].apply(lambda x: isinstance(x, pd.Series)).all()]
        
        # Catgecorigal and numeric columns (excluding time series)
        self.num_cols = X_copy.select_dtypes(include=["number"]).columns.difference(self.ts_cols).tolist()
        self.cat_cols = X_copy.select_dtypes(include=["object", "category"]).columns.difference(self.ts_cols).tolist()

        # Fit global scaler on all time series values (flattened)
        if self.scaler and self.ts_cols:
            all_values = []
            
            for col in self.ts_cols:  # Extract all values
                for series in X_copy[col]:
                    if isinstance(series, pd.Series):
                        all_values.extend(series.values)

            all_values = np.array(all_values).reshape(-1, 1)
            self.scaler = self.scaler.fit(all_values)

        return self

    def transform(self, X):
        X_copy = X.copy()

        # Set dtype categorical
        X_copy[self.cat_cols] = X_copy[self.cat_cols].astype("category")

        # Impute, scale and reshape time series
        if self.ts_cols:
            for col in self.ts_cols:
                
                def process_series(s):
                    if not isinstance(s, pd.Series):
                        return s

                    # Impute if needed
                    if self.impute_ts:
                        if s.isna().all():
                            s = pd.Series([0] * len(s), index=s.index)
                        else:
                            s = s.ffill().bfill()

                    # Scale using global scaler
                    if self.scaler:
                        scaled_vals = self.scaler.transform(s.values.reshape(-1, 1)).flatten()
                        s = pd.Series(scaled_vals, index=s.index)
                    
                    # Resize series to rows_to_keep
                    if self.rows_to_keep is not None:
                        length = len(s)
                        target = self.rows_to_keep

                        if length > target:
                            # Truncate
                            s = s.iloc[:target]
                        elif length < target:
                            # Pad with zeros at the end
                            pad_size = target - length
                            pad = pd.Series([0] * pad_size, index=range(length, target))
                            s = pd.concat([s, pad])

                    return s

                X_copy[col] = X_copy[col].map(process_series)

        return X_copy

## 1.3 Data

In [6]:
df_train = pd.read_csv('/kaggle/input/cmi-detect-behavior-with-sensor-data/train.csv')
df_demo = pd.read_csv('/kaggle/input/cmi-detect-behavior-with-sensor-data/train_demographics.csv')

In [7]:
# Create a small training sample if in testing mode

if TEST_MODE:
    
    def balanced_sample(df, target_col, total_samples, random_state=42):
        classes = df[target_col].unique()
        n_classes = len(classes)
        samples_per_class = total_samples // n_classes
        
        balanced_parts = []
        for cls in classes:
            subset = df[df[target_col] == cls]
            
            if len(subset) < samples_per_class:
                raise ValueError(f"Not enough samples in class '{cls}' to take {samples_per_class}")
            
            balanced_sample = subset.sample(n=samples_per_class, random_state=random_state)
            balanced_parts.append(balanced_sample)
        
        return pd.concat(balanced_parts).sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Select sequence ids (balance gesture class)
    df_seq_id = df_train.groupby("sequence_id").first().reset_index()
    df_seq_id_balanced = balanced_sample(df_seq_id, target_col="gesture", total_samples=N*18)
    seq_ids_to_keep = df_seq_id_balanced["sequence_id"]
    subjects_to_keep = df_seq_id_balanced["subject"]
    
    df_train = df_train[df_train.sequence_id.isin(seq_ids_to_keep)].copy().reset_index(drop=True)
    df_demo = df_demo[df_demo.subject.isin(subjects_to_keep)].copy().reset_index(drop=True)

# 2. Data preprocessing

## 2.1 Set-up processing X data

In [8]:
from sklearn.pipeline import Pipeline

# Define TOF and THM columns
tof_cols = [col for col in df_train.columns if 'tof_' in col]
thm_cols = [col for col in df_train.columns if 'thm_' in col]

# Define time serie columns
ts_cols = ['acc_x', 'acc_y', 'acc_z', 'rot_x', 'rot_y', 'rot_z', 'rot_w', 'mean_tof', 'mean_thm']

# Define features to drop
features_to_remove = ['row_id', 'sequence_id', 'subject', 'behavior', 'phase', 'orientation']

# Define the number of rows to keep
rows_to_keep = round(np.median(df_train[df_train.behavior == 'Performs gesture'].groupby(['sequence_id'])["sequence_counter"].count().values))

In [9]:
preprocessor = Pipeline([
    ('data_cleaning', DataCleaner()),
    ('avg_cols_tof', AverageColumnsTransformer(columns_to_average=tof_cols, new_column_name='mean_tof')),
    ('avg_cols_thm', AverageColumnsTransformer(columns_to_average=thm_cols, new_column_name='mean_thm')),
    ('to_nested', ToNestedTransformer(ts_cols=ts_cols, group_col='sequence_id', rows_to_keep=rows_to_keep)),
    ('feature_selection', FeatureSelector(features_to_remove=features_to_remove)),
    ('general_preprocessing', GeneralPreprocessor_2(impute_ts=True, rows_to_keep=rows_to_keep))
])

## 2.2 Set-up encoding y data

In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode target columns
encoder_sequence_type = LabelEncoder()
encoder_gesture = LabelEncoder()

## 2.3 Split X and y data

In [11]:
# df_X raw data
df_X = df_train.drop(['sequence_type', 'gesture','orientation'], axis=1).copy()

# df_y short format
df_y = df_train[['sequence_id', 'sequence_type', 'gesture']].groupby('sequence_id').agg({
    'sequence_type': 'first',
    'gesture': 'first'
}).reset_index()

## 2.4 Apply preprocessing

In [12]:
df_X_transformed = preprocessor.fit_transform(df_X)
df_X_transformed.reset_index(drop=True, inplace=True)
df_X_transformed.head()

Unnamed: 0,acc_x,acc_y,acc_z,rot_x,rot_y,rot_z,rot_w,mean_tof,mean_thm
0,0 -0.215994 1 -0.214912 2 -0.228487 3...,0 -0.249834 1 -0.253671 2 -0.245014 3...,0 -0.274624 1 -0.250523 2 -0.280428 3...,0 -0.398984 1 -0.398949 2 -0.399639 3...,0 -0.402521 1 -0.402821 2 -0.403316 3...,0 -0.412591 1 -0.412566 2 -0.412223 3...,0 -0.385412 1 -0.385801 2 -0.386428 3...,0 1.498588 1 1.541346 2 1.341093 3...,0 0.344770 1 0.346242 2 0.343822 3...
1,0 -0.323122 1 -0.348207 2 -0.278756 3...,0 -0.173595 1 -0.229569 2 -0.202615 3...,0 -0.348502 1 -0.246293 2 -0.273247 3...,0 -0.394982 1 -0.393505 2 -0.394771 3...,0 -0.404363 1 -0.402859 2 -0.403920 3...,0 -0.412964 1 -0.413554 2 -0.413101 3...,0 -0.386929 1 -0.385440 2 -0.386326 3...,0 2.458279 1 2.658218 2 2.583272 3...,0 0.385200 1 0.384031 2 0.382366 3...
2,0 -0.577416 1 -0.550560 2 -0.612338 3...,0 -0.303841 1 -0.273935 2 -0.324991 3...,0 -0.537280 1 -0.514162 2 -0.549871 3...,0 -0.385796 1 -0.384872 2 -0.385117 3...,0 -0.370725 1 -0.370922 2 -0.370525 3...,0 -0.390220 1 -0.390440 2 -0.390759 3...,0 -0.379775 1 -0.379903 2 -0.380443 3...,0 5.090297 1 5.154008 2 5.113989 3...,0 0.224444 1 0.224258 2 0.224431 3...
3,0 -0.269607 1 -0.259966 2 -0.257015 3...,0 -0.515638 1 -0.513670 2 -0.509834 3...,0 -0.575153 1 -0.562463 2 -0.562562 3...,0 -0.413477 1 -0.413477 2 -0.413490 3...,0 -0.400627 1 -0.400618 2 -0.400434 3...,0 -0.395733 1 -0.395887 2 -0.396213 3...,0 -0.383125 1 -0.383185 2 -0.383189 3...,0 -0.160336 1 -0.157194 2 -0.153563 3...,0 0.474856 1 0.479016 2 0.480584 3...
4,0 -0.229471 1 -0.232422 2 -0.230553 3...,0 -0.233307 1 -0.235275 2 -0.237144 3...,0 -0.281608 1 -0.284560 2 -0.283576 3...,0 -0.384745 1 -0.384843 2 -0.384962 3...,0 -0.403191 1 -0.403177 2 -0.403131 3...,0 -0.396304 1 -0.396345 2 -0.396417 3...,0 -0.371006 1 -0.370976 2 -0.370926 3...,0 2.510606 1 2.538807 2 2.594196 3...,0 0.341484 1 0.344043 2 0.345857 3...


In [13]:
# Transform y data
df_y_encoded = df_y.copy()

df_y_encoded['sequence_type'] = encoder_sequence_type.fit_transform(df_y['sequence_type'])
df_y_encoded['gesture'] = encoder_gesture.fit_transform(df_y['gesture'])

df_y_encoded.head()

Unnamed: 0,sequence_id,sequence_type,gesture
0,SEQ_000007,1,1
1,SEQ_000008,1,6
2,SEQ_000013,1,1
3,SEQ_000016,0,17
4,SEQ_000018,1,6


# 3. sktime base models

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from metric import score

In [15]:
def pred_hard_and_score(model, df_X, df_y_encoded, df_y):

    y_pred = cross_val_predict(model,
                               X=df_X, y=df_y_encoded.gesture,
                               cv=CV)

    y_pred = encoder_gesture.inverse_transform(y_pred)
    df_submission = pd.DataFrame({'sequence_id': df_y.sequence_id,
                                  'gesture': y_pred})
       
    f1 = score(df_submission, df_y[['sequence_id', 'gesture']], 'sequence_id')

    return y_pred, f1

In [16]:
def pred_and_score(model, df_X, df_y_encoded, df_y):

    # Cross val predict
    y_proba = cross_val_predict(model,
                               X=df_X, y=df_y_encoded.gesture,
                               cv=CV, method="predict_proba")
    y_pred_indices = np.argmax(y_proba, axis=1)
    y_pred = encoder_gesture.inverse_transform(y_pred_indices)

    df_submission = pd.DataFrame({'sequence_id': df_y.sequence_id,
                                  'gesture': y_pred})
       
    f1 = score(df_submission, df_y[['sequence_id', 'gesture']], 'sequence_id')

    # Training final model
    model.fit(X=df_X, y=df_y_encoded.gesture)

    return model, y_proba, f1

In [17]:
def results_to_df(results, csv_name):
    # Number of classes (assuming all models predict same classes)
    n_classes = results[0]['y_pred'].shape[1]
    
    # Prepare dict to hold stacked prob arrays
    stacked_probs = {}
    
    for res in results:
        model_name = res["model"]
        proba = res["y_pred"]  # shape: (n_samples, n_classes)
        
        # Create column names like 'SummaryClassifier_class0', 'SummaryClassifier_class1', ...
        col_names = [f"{model_name}_class{i}" for i in range(n_classes)]
        
        # Store the probabilities with these columns
        stacked_probs.update(dict(zip(col_names, proba.T)))
    
    # Now build the DataFrame; transpose stacked_probs so each key is a column with length n_samples
    df_stack_features = pd.DataFrame({k: v for k, v in stacked_probs.items()})
    
    df_stack_features.to_csv(csv_name)

    return df_stack_features

## 3.2 Set-up testing

In [18]:
# import models
from sktime.classification.feature_based import SummaryClassifier
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.classification.dictionary_based import TemporalDictionaryEnsemble
from sktime.classification.deep_learning import InceptionTimeClassifier
from sktime.classification.kernel_based import RocketClassifier
from sktime.classification.interval_based import CanonicalIntervalForest

import time
import joblib

In [19]:
models = [
    SummaryClassifier(random_state=2025),
    KNeighborsTimeSeriesClassifier(n_neighbors=5, distance="dtw"),
    #TemporalDictionaryEnsemble(n_parameter_samples=10, max_ensemble_size=3, randomly_selected_params=5, random_state=2025),
    #InceptionTimeClassifier(n_epochs=10, batch_size=32, random_state=2025),
    RocketClassifier(num_kernels=5000, rocket_transform="minirocket", random_state=2025),
    CanonicalIntervalForest(n_estimators=20, min_interval=10, att_subsample_size=8, random_state=2025)
]

results = []

for model in models:
    
    model_name = model.__class__.__name__
    print(f"Training {model_name}...")
    
    start_time = time.time()

    trained_model, y_pred, f1 = pred_and_score(model, df_X_transformed, df_y_encoded, df_y)

    duration = time.time() - start_time

    results.append({
        "model": model_name,
        "f1_score": f1,
        "y_pred": y_pred,
        "time_sec": duration
    })

    # Save model
    joblib.dump(trained_model, model_name + '_all_data.pkl')
    
    print(f"{model_name} finished in {duration:.2f} seconds with F1 score = {f1:.4f}")
    print()

# Transform results to a Dataframe
df_X_meta_all = results_to_df(results, 'base_models_proba.csv')

Training SummaryClassifier...
SummaryClassifier finished in 1056.92 seconds with F1 score = 0.7857

Training KNeighborsTimeSeriesClassifier...
KNeighborsTimeSeriesClassifier finished in 770.00 seconds with F1 score = 0.6290

Training RocketClassifier...
RocketClassifier finished in 662.10 seconds with F1 score = 0.7326

Training CanonicalIntervalForest...
CanonicalIntervalForest finished in 13364.98 seconds with F1 score = 0.7253



## 3.3 IMU data only

In [20]:
df_X_IMU = df_X_transformed.drop(['mean_tof', 'mean_thm'], axis=1)

models = [
    SummaryClassifier(random_state=2025),
    KNeighborsTimeSeriesClassifier(n_neighbors=5, distance="dtw"),
    #TemporalDictionaryEnsemble(n_parameter_samples=10, max_ensemble_size=3, randomly_selected_params=5, random_state=2025),
    #InceptionTimeClassifier(n_epochs=10, batch_size=32, random_state=2025),
    RocketClassifier(num_kernels=5000, rocket_transform="minirocket", random_state=2025),
    CanonicalIntervalForest(n_estimators=20, min_interval=10, att_subsample_size=8, random_state=2025)
]

results = []

for model in models:
    
    model_name = model.__class__.__name__
    print(f"Training {model_name}...")
    
    start_time = time.time()

    trained_model, y_pred, f1 = pred_and_score(model, df_X_IMU, df_y_encoded, df_y)

    duration = time.time() - start_time

    results.append({
        "model": model_name,
        "f1_score": f1,
        "y_pred": y_pred,
        "time_sec": duration
    })

    # Save model
    joblib.dump(trained_model, model_name + '_imu_data.pkl')
    
    print(f"{model_name} finished in {duration:.2f} seconds with F1 score = {f1:.4f}")
    print()

# Transform results to a Dataframe
df_X_meta_IMU = results_to_df(results, 'base_models_proba_IMU_only.csv')

Training SummaryClassifier...
SummaryClassifier finished in 941.58 seconds with F1 score = 0.7277

Training KNeighborsTimeSeriesClassifier...
KNeighborsTimeSeriesClassifier finished in 670.59 seconds with F1 score = 0.6214

Training RocketClassifier...
RocketClassifier finished in 581.01 seconds with F1 score = 0.6935

Training CanonicalIntervalForest...
CanonicalIntervalForest finished in 11451.42 seconds with F1 score = 0.6733



# 4. Meta stacking model

In [21]:
import xgboost as xgb

param_grid = {
    'max_depth': [5],
    'learning_rate': [0.1],
    'n_estimators': [1000],
    'enable_categorical': [True],
    'num_class': [18]
}

xgb_clf = xgb.XGBClassifier(*param_grid)

_, y_pred, f1 = pred_and_score(xgb_clf, df_X_meta_all, df_y_encoded, df_y)
print(f'F1 score on all data: {f1}')

_, y_pred, f1 = pred_and_score(xgb_clf, df_X_meta_IMU, df_y_encoded, df_y)
print(f'F1 score on IMU data: {f1}')

F1 score on all data: 0.7879693039077917
F1 score on IMU data: 0.7387923136996528


In [22]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(C=1)

_, y_pred, f1 = pred_and_score(lr_clf, df_X_meta_all, df_y_encoded, df_y)
print(f'F1 score on all data: {f1}')

_, y_pred, f1 = pred_and_score(lr_clf, df_X_meta_IMU, df_y_encoded, df_y)
print(f'F1 score on IMU data: {f1}')

F1 score on all data: 0.7959059009938888
F1 score on IMU data: 0.7494822939645888


In [23]:
from sklearn.svm import SVC

svc_clf = SVC()

y_pred, f1 = pred_hard_and_score(svc_clf, df_X_meta_all, df_y_encoded, df_y)
print(f'F1 score on all data: {f1}')

y_pred, f1 = pred_hard_and_score(svc_clf, df_X_meta_IMU, df_y_encoded, df_y)
print(f'F1 score on IMU data: {f1}')

F1 score on all data: 0.7900787383451658
F1 score on IMU data: 0.7416685152692319


# 5. Save y data

In [24]:
joblib.dump(encoder_gesture, 'encoder_gesture.pkl')

df_y.to_csv('df_y.csv')
df_y_encoded.to_csv('df_y_encoded.csv')

joblib.dump(preprocessor, 'preprocessor.pkl')

['preprocessor.pkl']