In [1]:
import numpy as np
import polars as pl
import pandas as pd
import seaborn as sea
import matplotlib.pyplot as plt
import joblib 
from scipy.stats import spearmanr

from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

import lightgbm
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier

import optuna
import os
import warnings
warnings.filterwarnings('ignore')

import kaggle_evaluation.cmi_inference_server

In [2]:
# Global seed reused by the CV splitter defaults and the model.
SEED = 3126

# Pre-computed sequence-level feature table plus raw demographics (pandas),
# and the competition test files (polars).
train = pd.read_csv("/kaggle/input/sequence-level-broad-summary-features/train_features.csv")
train_demo = pd.read_csv("/kaggle/input/cmi-detect-behavior-with-sensor-data/train_demographics.csv")
test  = pl.read_csv("/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv")
test_demo  = pl.read_csv("/kaggle/input/cmi-detect-behavior-with-sensor-data/test_demographics.csv")

# Encoder used to map integer targets back to gesture names.
label_encoder = joblib.load("/kaggle/input/sequence-level-broad-summary-features/label_encoder.joblib")

# Every column except identifiers and labels is a model input (original order kept).
feature_cols = train.columns[
    ~train.columns.isin(['sequence_id', 'target', 'gesture', 'subject'])
].tolist()

In [3]:
def _write_summary_stats(out, prefix, values):
    """Store mean/std/min/max/range/median/q25/q75/iqr of ``values`` in ``out``.

    Keys are ``f"{prefix}_<stat>"`` and are inserted in exactly that order.
    """
    lowest, highest = np.min(values), np.max(values)
    q25, q75 = np.percentile(values, 25), np.percentile(values, 75)
    out[f'{prefix}_mean'] = np.mean(values)
    out[f'{prefix}_std']  = np.std(values)
    out[f'{prefix}_min']  = lowest
    out[f'{prefix}_max']  = highest
    out[f'{prefix}_range']  = highest - lowest
    out[f'{prefix}_median'] = np.median(values)
    out[f'{prefix}_q25'] = q25
    out[f'{prefix}_q75'] = q75
    out[f'{prefix}_iqr'] = q75 - q25


def make_sequence_summary_features(df, demographics_df=None):
    """
    Create one row of summary features per ``sequence_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-per-timestep sensor data. Must contain ``sequence_id`` and
        ``subject``. The ``behavior``, ``acc_*``, ``rot_*``, ``thm_*``,
        ``tof_*``, ``encoded_gesture`` and ``gesture`` columns are optional;
        features that depend on an absent column family are skipped.
    demographics_df : pandas.DataFrame, optional
        Per-subject demographics keyed by ``subject``. When it is ``None``,
        empty, or has no row for a subject, every demographic field is set
        to ``-1``.

    Returns
    -------
    pandas.DataFrame
        One row per sequence, in ``groupby('sequence_id')`` order. ``target``
        (and ``gesture``, when that column exists) are included only if
        ``encoded_gesture`` is present in ``df``.
    """
    demo_fields = ('adult_child', 'age', 'sex', 'handedness', 'height_cm',
                   'shoulder_to_wrist_cm', 'elbow_to_wrist_cm')
    behaviors = ('Transition', 'Pause', 'Gesture')
    sensor_groups = {
        'acc': ['acc_x', 'acc_y', 'acc_z'],
        'rot': ['rot_w', 'rot_x', 'rot_y', 'rot_z'],
        'thm': ["thm_1", "thm_2", "thm_3", "thm_4", "thm_5"],
        'tof': [f"tof_{i}_v{j}" for i in range(1,6) for j in range(0,64)]
    }

    # Loop-invariant setup: every group shares the parent frame's columns.
    columns = set(df.columns)
    has_demographics = (demographics_df is not None) and (not demographics_df.empty)
    has_acc = all(col in columns for col in sensor_groups['acc'])
    has_rot = all(col in columns for col in sensor_groups['rot'])

    features = []

    # Group by sequence_id to create sequence-level features
    for seq_id, group in df.groupby('sequence_id'):
        seq_features = {'sequence_id': seq_id}

        # Basic sequence info
        seq_features['sequence_length'] = len(group)
        seq_features['subject'] = group['subject'].iloc[0]

        # Demographics: first matching row for the subject, else -1 sentinels
        subject_demo = None
        if has_demographics:
            subject_demo = demographics_df[demographics_df['subject'] == seq_features['subject']]
        if subject_demo is not None and not subject_demo.empty:
            for field in demo_fields:
                seq_features[field] = subject_demo[field].iloc[0]
        else:
            for field in demo_fields:
                seq_features[field] = -1

        # Behavior phase encoding (zeros when the column is not available)
        if 'behavior' in columns:
            behavior_counts = group['behavior'].value_counts()
            for behavior in behaviors:
                count = behavior_counts.get(behavior, 0)
                seq_features[f'{behavior.lower()}_count'] = count
                seq_features[f'{behavior.lower()}_ratio'] = count / len(group)
        else:
            for behavior in behaviors:
                seq_features[f'{behavior.lower()}_count'] = 0
                seq_features[f'{behavior.lower()}_ratio'] = 0

        # Statistical features for each sensor type
        for sensor_type, cols in sensor_groups.items():
            available_cols = [col for col in cols if col in columns]
            if not available_cols:
                continue

            # Pooled statistics over all available channels of this sensor
            sensor_data = group[available_cols].values
            _write_summary_stats(seq_features, sensor_type, sensor_data)

            # Signal characteristics
            squared = sensor_data**2
            seq_features[f'{sensor_type}_energy'] = np.sum(squared)
            seq_features[f'{sensor_type}_rms'] = np.sqrt(np.mean(squared))

            # Per-channel statistics (skipped for the 320 ToF pixels)
            if sensor_type != "tof":
                for col in available_cols:
                    _write_summary_stats(seq_features, col, group[col].values)

        # Specific features for IMU data (acceleration)
        if has_acc:
            acc_data = group[['acc_x', 'acc_y', 'acc_z']].values
            acc_magnitude = np.sqrt(np.sum(acc_data**2, axis=1))
            # NaN diffs are replaced by the -666 sentinel used throughout
            jerk = np.nan_to_num(np.diff(acc_magnitude), nan=-666)
            seq_features['jerk_mean'] = np.mean(jerk)
            seq_features['jerk_std'] = np.std(jerk)
            seq_features['acc_magnitude_mean'] = np.mean(acc_magnitude)
            seq_features['acc_magnitude_std'] = np.std(acc_magnitude)
            seq_features['acc_magnitude_max'] = np.max(acc_magnitude)
            # max(..., 1) keeps the -1 "missing" sentinel from being a divisor
            seq_features['acc_height_norm'] = seq_features['acc_magnitude_mean'] / max(seq_features['height_cm'], 1)
            seq_features['acc_shoulder_norm'] = seq_features['acc_magnitude_mean'] / max(seq_features['shoulder_to_wrist_cm'], 1)
            seq_features['acc_elbow_norm'] = seq_features['acc_magnitude_mean'] / max(seq_features['elbow_to_wrist_cm'], 1)
            seq_features['acc_xy_corr'] = spearmanr(group['acc_x'], group['acc_y'], nan_policy='omit').statistic
            seq_features['acc_yz_corr'] = spearmanr(group['acc_y'], group['acc_z'], nan_policy='omit').statistic
            seq_features['acc_xz_corr'] = spearmanr(group['acc_x'], group['acc_z'], nan_policy='omit').statistic
            seq_features["acc_x_cumsum"] = np.sum(group["acc_x"])
            seq_features["acc_y_cumsum"] = np.sum(group["acc_y"])
            seq_features["acc_z_cumsum"] = np.sum(group["acc_z"])

        # Rotational features; guarded like the acceleration block so a frame
        # without quaternion columns no longer raises KeyError.
        if has_rot:
            rot_angle = 2*np.arccos(np.clip(group["rot_w"].values, -1.0, 1.0))
            angular_velocity = np.nan_to_num(np.diff(rot_angle), nan=-666)
            angular_acceleration = np.nan_to_num(np.diff(angular_velocity), nan=-666)
            for first, second in (('w', 'x'), ('w', 'y'), ('w', 'z'),
                                  ('x', 'y'), ('x', 'z'), ('y', 'z')):
                corr = spearmanr(group[f'rot_{first}'], group[f'rot_{second}'], nan_policy='omit').statistic
                seq_features[f'rot_{first}{second}_corr'] = np.nan_to_num(corr, nan=-666)
            seq_features['angular_velocity_mean'] = np.mean(angular_velocity)
            seq_features['angular_velocity_std'] = np.std(angular_velocity)
            seq_features['angular_accel_mean'] = np.mean(angular_acceleration)
            seq_features['angular_accel_std'] = np.std(angular_acceleration)

            # Written inline (not via the helper) to keep the original key order
            angle_min, angle_max = np.min(rot_angle), np.max(rot_angle)
            angle_q25, angle_q75 = np.percentile(rot_angle, 25), np.percentile(rot_angle, 75)
            angle_squared = rot_angle**2
            seq_features["rot_angle_cumsum"] = np.sum(rot_angle)
            seq_features["rot_angle_mean"] = np.mean(rot_angle)
            seq_features["rot_angle_median"] = np.median(rot_angle)
            seq_features["rot_angle_std"]  = np.std(rot_angle)
            seq_features["rot_angle_min"]  = angle_min
            seq_features["rot_angle_max"]  = angle_max
            seq_features["rot_angle_range"]  = angle_max - angle_min
            seq_features["rot_angle_q25"] = angle_q25
            seq_features["rot_angle_q75"] = angle_q75
            seq_features["rot_angle_iqr"] = angle_q75 - angle_q25
            seq_features['rot_angle_energy'] = np.sum(angle_squared)
            seq_features['rot_angle_rms'] = np.sqrt(np.mean(angle_squared))

        # Add target if available
        if 'encoded_gesture' in columns:
            seq_features['target'] = group['encoded_gesture'].iloc[0]
            if 'gesture' in columns:
                seq_features['gesture'] = group['gesture'].iloc[0]

        features.append(seq_features)

    return pd.DataFrame(features)

In [4]:
from sklearn.metrics import f1_score

class ParticipantVisibleError(Exception):
    """Exception whose message is meant to be shown directly to the competitor."""


class CompetitionMetric:
    """Hierarchical macro F1 for the CMI 2025 challenge.

    Combines a binary target-vs-non-target F1 with a macro F1 in which every
    non-target gesture is collapsed into a single ``'non_target'`` class.
    """
    def __init__(self):
        self.target_gestures = [
            'Above ear - pull hair',
            'Cheek - pinch skin',
            'Eyebrow - pull hair',
            'Eyelash - pull hair',
            'Forehead - pull hairline',
            'Forehead - scratch',
            'Neck - pinch skin',
            'Neck - scratch',
        ]
        self.non_target_gestures = [
            'Write name on leg',
            'Wave hello',
            'Glasses on/off',
            'Text on phone',
            'Write name in air',
            'Feel around in tray and pull out an object',
            'Scratch knee/leg skin',
            'Pull air toward your face',
            'Drink from bottle/cup',
            'Pinch knee/leg skin'
        ]
        self.all_classes = self.target_gestures + self.non_target_gestures

    def calculate_hierarchical_f1(
        self,
        sol: pd.DataFrame,
        sub: pd.DataFrame
    ) -> tuple:
        """Score a submission against the solution.

        Parameters
        ----------
        sol : pd.DataFrame
            Ground truth with a ``gesture`` column of gesture names.
        sub : pd.DataFrame
            Predictions with a ``gesture`` column, row-aligned with ``sol``.

        Returns
        -------
        tuple
            ``(f1_binary, f1_macro, (f1_binary + f1_macro) / 2)``. This is a
            3-tuple, not a single float as the previous annotation claimed.

        Raises
        ------
        ParticipantVisibleError
            If ``sub['gesture']`` contains a value outside ``all_classes``.
        """
        # Validate gestures
        invalid_types = {i for i in sub['gesture'].unique() if i not in self.all_classes}
        if invalid_types:
            raise ParticipantVisibleError(
                f"Invalid gesture values in submission: {invalid_types}"
            )

        # Compute binary F1 (Target vs Non-Target)
        y_true_bin = sol['gesture'].isin(self.target_gestures).values
        y_pred_bin = sub['gesture'].isin(self.target_gestures).values

        f1_binary = f1_score(y_true_bin, y_pred_bin, pos_label=True, zero_division=0, average='binary')

        # Build multi-class labels: target gestures keep their own name,
        # everything else is collapsed into 'non_target'.
        def collapse_non_target(gesture):
            return gesture if gesture in self.target_gestures else 'non_target'

        y_true_mc = sol['gesture'].apply(collapse_non_target)
        y_pred_mc = sub['gesture'].apply(collapse_non_target)

        f1_macro = f1_score(y_true_mc, y_pred_mc, average='macro', zero_division=0)

        return f1_binary, f1_macro, (f1_binary+f1_macro)/2.0

In [5]:
def F1_score(y_val, y_pred, lbl_encoder, choice="weighted_score"):
    """Compute the competition F1 scores from integer-encoded labels.

    ``y_val`` and ``y_pred`` are decoded to gesture names with
    ``lbl_encoder.inverse_transform`` and scored by ``CompetitionMetric``.
    ``choice`` selects the return value: ``"binary"``, ``"macro"`` or
    ``"weighted_score"`` return that single score; any other value returns
    the full ``(binary, macro, weighted_score)`` tuple.
    """
    scorer = CompetitionMetric()

    ## Wrap both label vectors in the frame layout the metric expects
    solution = pd.DataFrame({'id': range(len(y_val)), 'gesture': y_val})
    submission = pd.DataFrame({'id': range(len(y_pred)), 'gesture': y_pred})

    ## Decode numeric labels back to the original gesture descriptions
    solution["gesture"] = lbl_encoder.inverse_transform(solution["gesture"])
    submission["gesture"] = lbl_encoder.inverse_transform(submission["gesture"])

    ## Score
    binary, macro, weighted_score = scorer.calculate_hierarchical_f1(solution, submission)

    ## Pick what to hand back
    if choice == "binary":
        return binary
    if choice == "macro":
        return macro
    if choice == "weighted_score":
        return weighted_score
    return (binary, macro, weighted_score)

In [6]:
# Design matrix and integer-encoded target taken from the feature table.
X = train.loc[:, feature_cols]
y = train.loc[:, "target"]

In [7]:
def cv_evaluate(model, model_kind, X, y, lbl_encoder, n_splits=5, 
                random_state=SEED, stopping_rounds=100, min_delta=.0005):
    """Run stratified K-fold CV and collect competition F1 scores.

    Parameters
    ----------
    model : estimator
        Unfitted estimator; a fresh ``clone`` is trained in every fold.
    model_kind : str
        ``"lgbm"`` trains with a validation ``eval_set`` and LightGBM early
        stopping; any other value calls a plain ``fit``.
    X : pandas.DataFrame
        Feature matrix (rows are selected positionally with ``.iloc``).
    y : array-like
        Integer-encoded targets, row-aligned with ``X``.
    lbl_encoder : fitted label encoder
        Passed to ``F1_score`` to decode predictions.
    n_splits, random_state : int
        ``StratifiedKFold`` settings.
    stopping_rounds, min_delta
        Forwarded to ``lightgbm.early_stopping`` when ``model_kind == "lgbm"``.

    Returns
    -------
    dict
        ``oof_preds``, per-fold ``binary_scores`` / ``macro_scores`` /
        ``weighted_scores`` and the ``full_*`` scores computed on all
        out-of-fold predictions.
    """
    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Fold indices from the splitter are positions, so targets are indexed
    # positionally too. `y[train_fold]` on a Series is label-based and is
    # wrong (or raises KeyError) whenever the index is not a default
    # RangeIndex, e.g. after filtering or shuffling the frame.
    y_values = np.asarray(y)

    oof_preds = np.zeros_like(y_values)
    binary_scores   = []
    macro_scores    = []
    weighted_scores = []
    history = {}
    
    for fold_num,(train_fold, val_fold) in enumerate(skfold.split(X, y_values)):
        print(f"\nFold {fold_num + 1}/{n_splits}")
        X_train, y_train = X.iloc[train_fold], y_values[train_fold]
        X_val, y_val     = X.iloc[val_fold], y_values[val_fold]

        cloned_model = clone(model)

        if model_kind=="lgbm":
            # NOTE: the fold being scored is also the early-stopping eval set,
            # so the per-fold scores are optimistic estimates.
            cloned_model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                callbacks=[lightgbm.early_stopping(stopping_rounds=stopping_rounds, min_delta=min_delta)]
            )
        else:
            cloned_model.fit(X_train, y_train)
        ## Stores out-of-fold predictions
        y_pred = cloned_model.predict(X_val)
        oof_preds[val_fold] = y_pred
        
        ## Store cv scores
        binary, macro, weighted_score = F1_score(y_val, y_pred, lbl_encoder, choice=None)
        binary_scores.append(binary)
        macro_scores.append(macro)
        weighted_scores.append(weighted_score)
    
    ## Store cv results inside dict
    history["oof_preds"] = oof_preds
    history["binary_scores"] = binary_scores
    history["macro_scores"]  = macro_scores
    history["weighted_scores"] = weighted_scores

    ## Store oof prediction scores inside dict
    binary, macro, weighted_score = F1_score(y_values, oof_preds, lbl_encoder, choice=None)
    history["full_binary_score"] = binary
    history["full_macro_score"] = macro
    history["full_weighted_score"] = weighted_score
    return history

In [8]:
# Tuned LightGBM hyperparameters.
params = dict(
    n_estimators=1601,
    learning_rate=0.012502527035230948,
    max_depth=10,
    num_leaves=56,
    min_child_samples=70,
    subsample=0.819484245856843,
    colsample_bytree=0.8043769543397135,
    reg_lambda=0.0031523588243255293,
    reg_alpha=3.094663101246672e-07,
)
# Fixed settings added on top of the tuned values.
params.update(
    class_weight="balanced",
    objective="multiclass",
    n_jobs=-1,
    verbose=-1,
    random_state=SEED,
)

# Early-stopping settings; in this cell they are only referenced by the
# disabled cv_evaluate call below.
stopping_rounds, min_delta = 100, 0.0006180506283718214

tuned_lgbm = LGBMClassifier(**params)

# Cross-validated evaluation (currently disabled):
# tuned_lgbm_history =  cv_evaluate(tuned_lgbm, "lgbm", X, y, label_encoder, 
#                                   n_splits=5, random_state=SEED, stopping_rounds=stopping_rounds, min_delta=min_delta)

In [9]:
# Final fit on the full training table: no eval set or early-stopping callback
# is passed, so all configured boosting rounds are trained.
tuned_lgbm.fit(X, y)

In [10]:
### Persist the label encoder, the fitted LightGBM model and its parameter dict
### as joblib files (relative paths, i.e. the current working directory).
joblib.dump(label_encoder, 'label_encoder.joblib')
joblib.dump(tuned_lgbm, 'tuned_lgbm.joblib')
joblib.dump(params, "tuned_lgbm_params.joblib")

['tuned_lgbm_params.joblib']