# Imports

In [1]:
# Install pinned wheels from local files under /kaggle/input (scikit-base, scikit-learn, sktime).
# NOTE: the recorded output below shows scikit-learn 1.7.0 replacing the preinstalled 1.2.2 and
# pip reporting dependency conflicts with other preinstalled packages (e.g. category-encoders,
# sklearn-compat), so those packages may not work in this environment.
!pip install /kaggle/input/pip-install-dependencies-cmi/scikit_base-0.12.3-py3-none-any.whl
!pip install /kaggle/input/pip-install-dependencies-cmi/scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install /kaggle/input/pip-install-dependencies-cmi/sktime-0.38.1-py3-none-any.whl

Processing /kaggle/input/pip-install-dependencies-cmi/scikit_base-0.12.3-py3-none-any.whl
Installing collected packages: scikit-base
Successfully installed scikit-base-0.12.3
Processing /kaggle/input/pip-install-dependencies-cmi/scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.7.0 which is incompatible.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-le

In [2]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sktime.transformations.panel.catch22 import Catch22
from sktime.transformations.series.summarize import SummaryTransformer

# Define custom transformers

In [3]:
class DataCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces the value ``-1`` with ``NaN``.

    Nothing is learned in ``fit``; ``transform`` works on a copy, so the
    object passed in is left unmodified.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (present for pipeline compatibility)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which every ``-1`` has been replaced by ``np.nan``."""
        cleaned = X.copy().replace(to_replace=-1, value=np.nan)
        return cleaned

In [4]:
class AverageColumnsTransformer(BaseEstimator, TransformerMixin):
    """Collapse several columns into a single column holding their row-wise mean.

    Parameters
    ----------
    columns_to_average : list
        Column labels whose row-wise mean is computed; these columns are
        removed from the output.
    new_column_name : str, default='avg_value'
        Label of the column that receives the mean.
    """

    def __init__(self, columns_to_average, new_column_name='avg_value'):
        self.columns_to_average = columns_to_average
        self.new_column_name = new_column_name

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a new frame with the source columns replaced by their mean.

        The input ``X`` is not modified.
        """
        row_means = X[self.columns_to_average].mean(axis=1)
        # Drop the source columns *before* writing the result: if
        # ``new_column_name`` is also one of ``columns_to_average``, dropping
        # afterwards would silently discard the freshly computed mean.
        result = X.drop(columns=self.columns_to_average)
        result[self.new_column_name] = row_means
        return result

In [5]:
class ToNestedTransformer(BaseEstimator, TransformerMixin):
    """Convert a long-format frame into a nested frame with one row per group.

    Each cell of the output holds a ``pd.Series`` with the values of one
    time-series column for one group.

    Parameters
    ----------
    ts_cols : list
        Columns to nest.
    rows_to_keep : int or None, default=None
        If given, only the last ``rows_to_keep`` rows of every group are used;
        ``None`` keeps all rows.
    group_col : str, default='sequence_id'
        Column identifying which group a row belongs to.
    """

    def __init__(self, ts_cols, rows_to_keep=None, group_col='sequence_id'):
        self.ts_cols = ts_cols
        self.group_col = group_col
        self.rows_to_keep = rows_to_keep # if None keep all rows

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a frame indexed by ``group_col`` with one nested Series per ``ts_cols`` entry.

        The input ``X`` is not modified.
        """
        frame = X

        if self.rows_to_keep is not None:
            # Position of every row counted backwards from the end of its group
            # (0 = last row). A boolean mask is used instead of a temporary
            # column so an existing 'row_idx' column in X is never overwritten.
            rows_from_end = X.groupby(self.group_col).cumcount(ascending=False)
            frame = X.loc[rows_from_end < self.rows_to_keep]

        def _pack(group):
            # One outer Series per group; each entry is that column's values
            # re-wrapped in a Series with a fresh default index.
            return pd.Series({col: pd.Series(group[col].values) for col in self.ts_cols})

        # NOTE(review): the reset_index/set_index round trip looks redundant;
        # it is kept so the output matches the previous implementation exactly.
        nested = (
            frame.groupby(self.group_col)[self.ts_cols]
            .apply(_pack)
            .reset_index()
            .set_index(self.group_col)
        )

        return nested

In [6]:
class SafeCatch22(Catch22):
    """``Catch22`` variant that yields NaN features instead of raising on a bad series."""

    def _transform_case(self, X, f_idx):
        """Delegate to the parent implementation, falling back to NaNs on any ``Exception``.

        The fallback is an array of shape ``(1, len(f_idx))`` filled with NaN.
        """
        # NOTE(review): presumably this shape matches what the parent returns
        # for a single case -- confirm against the installed sktime version.
        try:
            features = super()._transform_case(X, f_idx)
        except Exception:
            features = np.full((1, len(f_idx)), np.nan)
        return features

In [7]:
class TimeFeatureTransformer(BaseEstimator, TransformerMixin):
    """Thin scikit-learn wrapper that delegates ``fit``/``transform`` to an extractor.

    Parameters
    ----------
    extractor : object or None, default=None
        Object exposing ``fit(X, y)`` and ``transform(X)``. ``None`` means a
        fresh ``SummaryTransformer()`` is created for this instance.
    """

    def __init__(self, extractor=None):
        # A default written as ``extractor=SummaryTransformer()`` is evaluated
        # once at definition time, so every default-constructed wrapper would
        # share (and fit in place) the same extractor object. Building it here
        # gives each instance its own, while ``self.extractor`` is still always
        # a usable object, exactly as before.
        self.extractor = SummaryTransformer() if extractor is None else extractor

    def fit(self, X, y=None):
        """Fit the wrapped extractor in place and return ``self``.

        Returning ``self`` (rather than the extractor) follows the
        scikit-learn estimator contract and keeps ``fit(...).transform(...)``
        chains working on the wrapper.
        """
        self.extractor.fit(X, y)
        return self

    def transform(self, X):
        """Return whatever the wrapped extractor's ``transform`` produces for ``X``."""
        return self.extractor.transform(X)

In [8]:
class AddStaticAfterFeatures(BaseEstimator, TransformerMixin):
    """Left-join a static table onto the incoming feature frame.

    Parameters
    ----------
    static_df : DataFrame, default=None
        Table merged onto the features; it is passed to ``merge`` as-is.
    on : str, default='sequence_id'
        Key used for the merge.
    """

    def __init__(self, static_df=None, on='sequence_id'):
        self.on = on
        self.static_df = static_df

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Move ``X``'s index into columns, then left-merge ``static_df`` on ``on``."""
        flattened = X.reset_index()
        merged = flattened.merge(right=self.static_df, how='left', on=self.on)
        return merged

In [9]:
class MergeDemographics(BaseEstimator, TransformerMixin):
    """Left-join a demographics table onto the incoming frame.

    Parameters
    ----------
    demo_df : DataFrame, default=None
        Table to merge; may also be supplied (or replaced) through ``fit``.
    on : str, default='subject'
        Key used for the merge.
    """

    def __init__(self, demo_df=None, on='subject'):
        self.on = on
        self.demo_df = demo_df

    def fit(self, X, y=None, demo_df=None):
        """Optionally replace the stored table with ``demo_df``; return ``self``."""
        if demo_df is None:
            # Nothing supplied at fit time: keep the table given at construction.
            return self
        self.demo_df = demo_df
        return self

    def transform(self, X):
        """Return ``X`` left-merged with the stored demographics table on ``on``."""
        merged = X.merge(right=self.demo_df, how='left', on=self.on)
        return merged

In [10]:
class GeneralPreprocessor(BaseEstimator, TransformerMixin):
    """Record column types at fit time and cast categorical columns at transform time.

    Attributes
    ----------
    num_cols : list or None
        Labels of numeric columns seen in ``fit`` (``None`` before fitting).
    cat_cols : list or None
        Labels of ``object``/``category`` columns seen in ``fit``
        (``None`` before fitting).
    """

    def __init__(self):
        self.num_cols = None # store numeric col names
        self.cat_cols = None # store categorical col names

    def fit(self, X, y=None):
        """Remember which columns of ``X`` are numeric and which are categorical.

        ``select_dtypes`` only reads ``X``, so no defensive copy is needed.
        """
        self.num_cols = X.select_dtypes(include=["number"]).columns.tolist()
        self.cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the recorded categorical columns cast to ``category``.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.cat_cols is None:
            # Fail with a clear message instead of an obscure indexing error.
            raise ValueError("GeneralPreprocessor must be fitted before calling transform")

        # ``astype`` with a per-column mapping returns a new frame and leaves
        # every column not listed untouched.
        return X.astype({col: "category" for col in self.cat_cols})

In [11]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Drop a configured set of columns, keeping all others in their original order.

    Parameters
    ----------
    features_to_remove : collection
        Column labels to exclude; labels absent from ``X`` are ignored.
    """

    def __init__(self, features_to_remove):
        self.features_to_remove = features_to_remove

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a new frame containing only the columns not listed for removal."""
        excluded = self.features_to_remove
        kept = []
        for name in X.columns:
            if name not in excluded:
                kept.append(name)
        # Copy so the result is independent of the caller's frame.
        return X[kept].copy()

In [12]:
class TimeSeriesDataSelector(BaseEstimator, TransformerMixin):
    """Keep only the last ``rows_to_keep`` rows of every group.

    Parameters
    ----------
    rows_to_keep : int or None
        Number of trailing rows retained per group; ``None`` keeps all rows.
    group_col : str, default='sequence_id'
        Column identifying which group a row belongs to.
    """

    def __init__(self, rows_to_keep, group_col='sequence_id'):
        self.rows_to_keep = rows_to_keep
        self.group_col = group_col

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a new frame holding the trailing rows of each group.

        Row order and columns are preserved; the input ``X`` is not modified.
        """
        if self.rows_to_keep is None:
            return X.copy()

        # Position of every row counted backwards from the end of its group
        # (0 = last row). Using a mask instead of a temporary 'row_idx' column
        # means a real column with that name in X is neither overwritten nor
        # dropped.
        rows_from_end = X.groupby(self.group_col).cumcount(ascending=False)

        # Select only the last rows of each time series (contains gesture)
        return X.loc[rows_from_end < self.rows_to_keep].copy()