# 1. Import everything!

In [1]:
# Hyperparameters

# Number of cross-validation folds (passed as `cv` to cross_val_predict)
CV = 5

## 1.1 Libraries

In [2]:
import os

import pandas as pd
# Show up to 100 columns when a DataFrame is displayed
pd.options.display.max_columns = 100

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl

import warnings
# Silence FutureWarning and RuntimeWarning messages for the rest of the session
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

import sys
# Extend the module search path with the competition data directory.
# NOTE(review): presumably this is what makes `kaggle_evaluation` importable when
# running outside Kaggle; the relative path differs from the /kaggle/input paths
# used elsewhere in this notebook -- confirm it is still needed.
sys.path.append('../Data/cmi-detect-behavior-with-sensor-data')

import kaggle_evaluation.cmi_inference_server

In [3]:
# Install the pinned wheels from the attached dataset (no network access needed).
# `%pip` (instead of `!pip`) installs into the environment of the running kernel.
%pip install /kaggle/input/pip-install-dependencies-cmi/scikit_base-0.12.3-py3-none-any.whl
%pip install /kaggle/input/pip-install-dependencies-cmi/scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
%pip install /kaggle/input/pip-install-dependencies-cmi/sktime-0.38.1-py3-none-any.whl

Processing /kaggle/input/pip-install-dependencies-cmi/scikit_base-0.12.3-py3-none-any.whl
Installing collected packages: scikit-base
Successfully installed scikit-base-0.12.3
Processing /kaggle/input/pip-install-dependencies-cmi/scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.7.0 which is incompatible.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-le

## 1.2 Custom transformers

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

from utils_custom_transformers_cmi_vsn2 import *

Processing /kaggle/input/pip-install-dependencies-cmi/scikit_base-0.12.3-py3-none-any.whl
scikit-base is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/pip-install-dependencies-cmi/scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
scikit-learn is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/pip-install-dependencies-cmi/sktime-0.38.1-py3-none-any.whl
sktime is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


## 1.3 Load model and preprocessor

In [5]:
from sklearn.preprocessing import StandardScaler

class GeneralPreprocessor_2(BaseEstimator, TransformerMixin):
    """Preprocess a DataFrame whose cells may contain whole time series.

    A column is treated as a time-series column when every cell in it is a
    ``pd.Series``. For those columns, ``transform`` optionally imputes missing
    values, scales the values with one scaler fitted on all time-series values
    pooled together, and optionally truncates / zero-pads each series to a
    fixed length. Object and category columns are cast to ``category`` dtype.

    Parameters
    ----------
    impute_ts : bool, default=False
        If True, fill missing values inside each series (forward fill, then
        backward fill; an all-NaN series becomes all zeros).
    scaler : estimator or falsy value, default=StandardScaler()
        Scaler fitted on the flattened time-series values. A falsy value
        disables scaling.
    rows_to_keep : int or None, default=None
        Target length of every series. Longer series are truncated, shorter
        ones are padded with zeros at the end. ``None`` keeps the length as is.
    """

    def __init__(self, impute_ts=False, scaler=StandardScaler(), rows_to_keep=None):
        self.num_cols = None  # numeric column names (set in fit)
        self.cat_cols = None  # categorical column names (set in fit)
        self.ts_cols = None   # time-series column names (set in fit)
        self.impute_ts = impute_ts
        # The default scaler object is created once, when the class is defined,
        # and is therefore shared by all instances; fit() never fits it in place.
        self.scaler = scaler
        self.rows_to_keep = rows_to_keep

    def fit(self, X, y=None):
        """Detect the column groups and fit the global time-series scaler.

        Parameters
        ----------
        X : pd.DataFrame
            Training frame; time-series cells are ``pd.Series`` objects.
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If scaling is enabled and time-series columns are detected but they
            contain no values to fit on.
        """
        from sklearn.base import clone

        # Identify time-series columns: every cell must be a pd.Series
        self.ts_cols = [
            col for col in X.columns
            if X[col].apply(lambda cell: isinstance(cell, pd.Series)).all()
        ]

        # Categorical and numeric columns (excluding time series)
        self.num_cols = X.select_dtypes(include=["number"]).columns.difference(self.ts_cols).tolist()
        self.cat_cols = X.select_dtypes(include=["object", "category"]).columns.difference(self.ts_cols).tolist()

        # Fit one global scaler on all time-series values (flattened)
        if self.scaler and self.ts_cols:
            chunks = [
                series.values
                for col in self.ts_cols
                for series in X[col]
                if isinstance(series, pd.Series)
            ]
            if not chunks:
                raise ValueError("Cannot fit the scaler: no time-series values found in X.")

            all_values = np.concatenate(chunks).reshape(-1, 1)
            # Fit a clone so that neither the shared default scaler nor a
            # scaler object owned by the caller is mutated by this instance.
            self.scaler = clone(self.scaler).fit(all_values)

        return self

    def _process_series(self, s):
        """Impute, scale and resize one time-series cell; pass other values through."""
        if not isinstance(s, pd.Series):
            return s

        # Impute if needed
        if self.impute_ts:
            if s.isna().all():
                s = pd.Series([0] * len(s), index=s.index)
            else:
                s = s.ffill().bfill()

        # Scale using the global scaler
        if self.scaler:
            scaled_vals = self.scaler.transform(s.values.reshape(-1, 1)).flatten()
            s = pd.Series(scaled_vals, index=s.index)

        # Resize series to rows_to_keep
        if self.rows_to_keep is not None:
            length = len(s)
            target = self.rows_to_keep

            if length > target:
                # Truncate
                s = s.iloc[:target]
            elif length < target:
                # Pad with zeros at the end; pad labels continue at `length`
                pad_size = target - length
                pad = pd.Series([0] * pad_size, index=range(length, target))
                s = pd.concat([s, pad])

        return s

    def transform(self, X):
        """Return a transformed copy of ``X``; the input frame is not reassigned.

        Parameters
        ----------
        X : pd.DataFrame
            Frame with the columns seen during ``fit``.

        Returns
        -------
        pd.DataFrame
            Copy of ``X`` with categorical columns cast to ``category`` and
            every time-series cell processed by ``_process_series``.
        """
        X_copy = X.copy()

        # Set dtype categorical
        X_copy[self.cat_cols] = X_copy[self.cat_cols].astype("category")

        # Impute, scale and reshape time series
        if self.ts_cols:
            for col in self.ts_cols:
                X_copy[col] = X_copy[col].map(self._process_series)

        return X_copy

In [6]:
import joblib

# Directory that holds the trained base models, their stored predictions,
# the label files, the gesture encoder and the fitted preprocessor.
ARTIFACT_DIR = '/kaggle/input/pred-broba-base-sklearn'


def _artifact_path(filename):
    """Return the full path of `filename` inside ARTIFACT_DIR."""
    return os.path.join(ARTIFACT_DIR, filename)


# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# artifacts that come from a trusted source.

# Load trained base models (all data / IMU-only data)
model_ST_all = joblib.load(_artifact_path('SummaryClassifier_all_data.pkl'))
model_ST_imu = joblib.load(_artifact_path('SummaryClassifier_imu_data.pkl'))

model_CIF_all = joblib.load(_artifact_path('CanonicalIntervalForest_all_data.pkl'))
model_CIF_imu = joblib.load(_artifact_path('CanonicalIntervalForest_imu_data.pkl'))

model_KN_all = joblib.load(_artifact_path('KNeighborsTimeSeriesClassifier_all_data.pkl'))
model_KN_imu = joblib.load(_artifact_path('KNeighborsTimeSeriesClassifier_imu_data.pkl'))

model_RC_all = joblib.load(_artifact_path('RocketClassifier_all_data.pkl'))
model_RC_imu = joblib.load(_artifact_path('RocketClassifier_imu_data.pkl'))

# Load the stored base-model predictions; the 'Unnamed: 0' column is dropped
# so that only the prediction columns remain.
pred_proba_all = pd.read_csv(_artifact_path('base_models_proba.csv'))
pred_proba_all = pred_proba_all.drop('Unnamed: 0', axis=1)
pred_proba_imu = pd.read_csv(_artifact_path('base_models_proba_IMU_only.csv'))
pred_proba_imu = pred_proba_imu.drop('Unnamed: 0', axis=1)

# Load y data
df_y = pd.read_csv(_artifact_path('df_y.csv'))
df_y_encoded = pd.read_csv(_artifact_path('df_y_encoded.csv'))

encoder_gesture = joblib.load(_artifact_path('encoder_gesture.pkl'))

# NOTE(review): presumably an instance of GeneralPreprocessor_2, which is why
# that class is defined above before unpickling -- confirm.
preprocessor = joblib.load(_artifact_path('preprocessor.pkl'))

# 2. Train ensemble model

In [7]:
from metric import score
from sklearn.model_selection import cross_val_predict

def pred_and_score(model, df_X, df_y_encoded, df_y):
    """Cross-validate `model`, score its predictions, then fit a final model.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing `fit` and `predict_proba`. It is used as a
        template only: the object passed in is not fitted by this function.
    df_X : pd.DataFrame
        Feature frame (here: stacked base-model probabilities).
        Assumes its rows are in the same order as `df_y` -- TODO confirm.
    df_y_encoded : pd.DataFrame
        Frame whose `gesture` column holds the encoded target.
    df_y : pd.DataFrame
        Frame with the `sequence_id` and (decoded) `gesture` columns.

    Returns
    -------
    tuple
        `(final_model, y_proba, f1)`: a new estimator fitted on all rows, the
        cross-validated class probabilities, and the value returned by
        `metric.score` for the cross-validated predictions.
    """
    from sklearn.base import clone

    y_encoded = df_y_encoded.gesture

    # Cross val predict
    y_proba = cross_val_predict(model,
                                X=df_X, y=y_encoded,
                                cv=CV, method="predict_proba")
    y_pred_indices = np.argmax(y_proba, axis=1)
    y_pred = encoder_gesture.inverse_transform(y_pred_indices)

    df_submission = pd.DataFrame({'sequence_id': df_y.sequence_id,
                                  'gesture': y_pred})

    f1 = score(df_submission, df_y[['sequence_id', 'gesture']], 'sequence_id')

    # Train the final model on a clone: fitting `model` in place would make
    # every call that shares one estimator return the very same object, so a
    # later call would silently overwrite an earlier "final" model.
    final_model = clone(model).fit(X=df_X, y=y_encoded)

    return final_model, y_proba, f1

In [8]:
from sklearn.linear_model import LogisticRegression

from sklearn.base import clone

# Unfitted template for the stacking meta-learner
lr_clf = LogisticRegression(C=1)

# Pass a fresh copy to each call so that the two meta-models can never end up
# being the same estimator object (which would leave both fitted on whichever
# feature set was trained last).
meta_model_all, y_pred, f1 = pred_and_score(clone(lr_clf), pred_proba_all, df_y_encoded, df_y)
print(f'F1 score on all data: {f1}')

meta_model_imu, y_pred, f1 = pred_and_score(clone(lr_clf), pred_proba_imu, df_y_encoded, df_y)
print(f'F1 score on IMU data: {f1}')

F1 score on all data: 0.7959059009938888
F1 score on IMU data: 0.7494822939645888


# 3. Make predictions on test set

In [9]:
def predict_with_model(sequence, model,
                       IMU_only=True):
    """Return `model.predict_proba` for `sequence`.

    When `IMU_only` is true, every column whose name contains 'thm' or 'tof'
    is removed from a copy of `sequence` before predicting; the frame passed
    in by the caller is left untouched.
    """
    features = sequence

    if IMU_only:
        # Thermopile / time-of-flight columns are identified by name
        non_imu_cols = [
            name for name in sequence.columns
            if 'thm' in name or 'tof' in name
        ]
        features = sequence.drop(labels=non_imu_cols, axis=1)

    return model.predict_proba(features)
    

In [10]:
def results_to_df(results, csv_name, save=False):
    """Stack per-model class probabilities into one feature DataFrame.

    Parameters
    ----------
    results : list of dict
        One dict per model with the keys "model" (name used as column prefix)
        and "y_pred" (2-D array of shape (n_samples, n_classes)).
    csv_name : str
        Path the frame is written to when `save` is true.
    save : bool, default=False
        If True, write the resulting frame to `csv_name`.

    Returns
    -------
    pd.DataFrame
        One column per (model, class) pair, named like
        'SummaryClassifier_class0', 'SummaryClassifier_class1', ...
        An empty `results` list gives an empty DataFrame.
    """
    stacked_probs = {}

    for res in results:
        model_name = res["model"]
        proba = np.asarray(res["y_pred"])  # shape: (n_samples, n_classes)

        # Take the class count from each model's own output so that no
        # probability column is silently dropped if the counts ever differ.
        n_classes = proba.shape[1]

        for i in range(n_classes):
            stacked_probs[f"{model_name}_class{i}"] = proba[:, i]

    # Each dict key becomes a column of length n_samples
    df_stack_features = pd.DataFrame(stacked_probs)

    if save:
        df_stack_features.to_csv(csv_name)

    return df_stack_features

In [11]:
from statistics import mode

def predict(sequence: pl.DataFrame, demographics: pl.DataFrame) -> str:
    """Predict the gesture label for one sequence with the stacked ensemble.

    The four base models matching the available sensors produce class
    probabilities, these are stacked into one feature frame, and the matching
    meta-model predicts a label per row of that frame. The most frequent
    label is decoded with `encoder_gesture` and returned.

    Parameters
    ----------
    sequence : pl.DataFrame
        Sensor readings of one sequence.
    demographics : pl.DataFrame
        Part of the inference-server callback signature; not used here.

    Returns
    -------
    str
        Decoded gesture label.
    """
    # Convert to pandas
    sequence = sequence.to_pandas()

    # Thermopile / time-of-flight columns, identified by name
    non_imu_cols = [col for col in sequence.columns if 'thm' in col or 'tof' in col]

    # Replace all -1 with NaN
    sequence = sequence.replace(-1, np.nan)

    # Data preprocessing
    sequence_transformed = preprocessor.transform(sequence).reset_index(drop=True)

    # The sequence counts as IMU-only when every thm/tof value is missing
    imu_only = bool(sequence[non_imu_cols].isna().all().all())

    if imu_only:
        base_models = [model_ST_imu, model_KN_imu, model_RC_imu, model_CIF_imu]
        meta_model = meta_model_imu
    else:
        base_models = [model_ST_all, model_KN_all, model_RC_all, model_CIF_all]
        meta_model = meta_model_all

    # Base-model probabilities, keyed by the model's class name
    results = [
        {
            "model": base_model.__class__.__name__,
            "y_pred": predict_with_model(sequence_transformed, base_model, IMU_only=imu_only),
        }
        for base_model in base_models
    ]

    df_meta_X = results_to_df(results, 'x')

    # Make prediction: majority vote over the meta-model's per-row labels
    predictions = meta_model.predict(df_meta_X)
    final_pred = encoder_gesture.inverse_transform([mode(predictions)])

    print(final_pred[0])
    return final_pred[0]

In [12]:
# Wrap the `predict` callback in the competition's inference server
inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)

is_competition_rerun = os.getenv('KAGGLE_IS_COMPETITION_RERUN')

if not is_competition_rerun:
    # Variable not set: run the local gateway against the test files
    local_data_paths = (
        '/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv',
        '/kaggle/input/cmi-detect-behavior-with-sensor-data/test_demographics.csv',
    )
    inference_server.run_local_gateway(data_paths=local_data_paths)
else:
    # Variable set: hand control to the server
    inference_server.serve()

Cheek - pinch skin
Eyelash - pull hair
