# Dataset preprocessing

The goal of this notebook is to create a preprocessed kaggle dataset out of the competition dataset.  
For now, the preprocessing will be based on [this notebook](https://www.kaggle.com/code/vonmainstein/imu-tof).  
It consists of the following steps:
-   Set the appropriate dtypes (helps with RAM usage).
-   Impute missing feature values with forward, backward and then 0 filling.
-   Split the dataset into multiple cross validation folds.
-   Standardize feature values.
-   Pad/Truncate the sequences to the same length.  

> Note:  
> - Demographics data set will be ignored for now.  

## Setup

### Imports

In [1]:
import re
import os
import json
import shutil
import warnings
from os.path import join
from typing import Literal
from functools import partial
from tqdm.notebook import tqdm
from itertools import repeat, starmap, product

import numpy as np
import pandas as pd
from numpy import ndarray
import plotly.express as px
from numpy.linalg import norm
from pandas import DataFrame as DF
from scipy.spatial.transform import Rotation
from sklearn.model_selection import StratifiedGroupKFold

from kagglehub import whoami, competition_download, dataset_upload

### Supress performance warngings

In [2]:
warnings.filterwarnings(
    "ignore",
    message=(
        "DataFrame is highly fragmented.  This is usually the result of "
        "calling `frame.insert` many times.*"
    ),
    category=pd.errors.PerformanceWarning,
)

### Config

In [3]:
QUATERNION_COLS = ['rot_w', 'rot_x', 'rot_y', 'rot_z']
GRAVITY_WORLD = np.array([0, 0, 9.81], "float32")
RAW_ACCELRATION_COLS = ["acc_x", "acc_y", "acc_z"]
LINEAR_ACC_COLS = ["linear_" + col for col in RAW_ACCELRATION_COLS] # Acceleration without gravity
COMPETITION_HANDLE = "cmi-detect-behavior-with-sensor-data"
CATEGORY_COLUMNS = [
    'row_id',
    'sequence_type',
    'sequence_id',
    'subject',
    'orientation',
    'behavior',
    'phase',
    'gesture',
]
META_DATA_COLUMNS = [
    'row_id',
    'sequence_type',
    'sequence_id',
    'sequence_counter',
    'subject',
    'orientation',
    'behavior',
    'phase',
    'gesture',
]
DATASET_DF_DTYPES = {
    "acc_x": "float32", "acc_y": "float32", "acc_z": "float32",
    "thm_1":"float32", "thm_2":"float32", "thm_3":"float32", "thm_4":"float32", "thm_5":"float32",
    "sequence_counter": "int32",
    **{col: "category" for col in CATEGORY_COLUMNS},
    **{f"tof_{i_1}_v{i_2}": "float32" for i_1, i_2 in product(range(1, 5), range(64))},
}
PREPROCESSED_DATASET_HANDLE = "mauroabidalcarrer/prepocessed-cmi-2025"
# The quantile of the sequences len used to pad/truncate during preprocessing
SEQUENCE_NORMED_LEN_QUANTILE = 0.95
# SAMPLING_FREQUENCY = 10 #Hz
N_FOLDS = 5
VALIDATION_FRACTION = 0.2
TARGET_NAMES = sorted([
    "Above ear - pull hair",
    "Cheek - pinch skin",
    "Eyebrow - pull hair",
    "Eyelash - pull hair",
    "Feel around in tray and pull out an object",
    "Forehead - pull hairline",
    "Forehead - scratch",
    "Neck - pinch skin",
    "Neck - scratch",
    "Text on phone",
    "Wave hello",
    "Write name in air",
    "Write name on leg",
    "Drink from bottle/cup",
    "Pinch knee/leg skin",
    "Pull air toward your face",
    "Scratch knee/leg skin",
    "Glasses on/off"
])
EPSILON=1e-8
DELTA_ROTATION_ANGULAR_VELOCITY_COLS = ["angular_vel_x", "angular_vel_y", "angular_vel_z"]
DELTA_ROTATION_AXES_COLS = ["rotation_axis_x", "rotation_axis_y", "rotation_axis_z"]
EULER_ANGLES_COLS = ["euler_x", "euler_y", "euler_z"]
pad_trunc_mode_type = Literal["pre", "center", "post"]
SEQ_PAD_TRUNC_MODE: pad_trunc_mode_type = "pre"
DEFAULT_VERSION_NOTES = "Preprocessed Child Mind Institue 2025 competition dataset."
NB_COLS_PER_TOF_SENSOR = 64
TOF_PATCH_SIZE = 2
assert ((NB_COLS_PER_TOF_SENSOR // 2) % TOF_PATCH_SIZE) == 0, "tof side len should be dividable by TOF_PATCH_SIZE!"

### Define function to get the feature columns
Feature columns change over time so it's better to have a function to get them than manually update a variable every time we add/remove features.

In [4]:
def get_feature_cols(df:DF) -> list[str]:
    return sorted(list(set(df.columns) - set(META_DATA_COLUMNS) - set(TARGET_NAMES)))

### Load dataset
Requires to be logged in if this notebook is not running on kaggle, go to [your settings](https://www.kaggle.com/settings) to create an access token and put it in `~/.kaggle/`.

In [5]:
competition_dataset_path = competition_download(COMPETITION_HANDLE)
df = pd.read_csv(join(competition_dataset_path, "train.csv"), dtype=DATASET_DF_DTYPES)

In [6]:
64 // 16

4

## Data preprocessing

### Impute missing data
Perform forward, backward and then filling of all NaN sequences.

In [7]:
# Missing ToF values are already imputed by -1 which is inconvinient since we want all missing values to be NaN.    
# So we replace them by NaN and then perform imputing.
def get_fillna_val_per_feature_col(df:DF) -> dict:
    return {col: 1.0 if col == 'rot_w' else 0 for col in get_feature_cols(df)}

def imputed_features(df:DF) -> DF:
    # Missing ToF values are already imputed by -1 which is inconvinient since we want all missing values to be NaN.    
    # So we replace them by NaN and then perform imputing.  
    tof_vals_to_nan = {col: -1.0 for col in df.columns if col.startswith("tof")}
    # fillna_val_per_col = {col: 1.0 if col == 'rot_w' else 0 for col in df.columns}

    df[get_feature_cols(df)] = (
        df
        .loc[:, get_feature_cols(df)]
        # df.replace with np.nan sets dtype to floar64 so we set it back to float32
        .replace(tof_vals_to_nan, value=np.nan)
        .astype("float32")
        .groupby(df["sequence_id"], observed=True, as_index=False)
        .ffill()
        .groupby(df["sequence_id"], observed=True, as_index=False)
        .bfill()
        # In case there are only nan in the column in the sequence
        .fillna(get_fillna_val_per_feature_col(df))
    )
    return df

df = imputed_features(df)

### Standardize ToF sensors columns names
ToF sensors have `tof_{sensor_idx}_v{sensor_captor_idx}` nomenclature where `sensor_captor_idx` can be 1 digit if it's less than 10 or two if it's more.   
When sorting the features in alphabetical order the tof_x_v10 will end up before tof_x_v2.  
This in turn can cause issues if we try to reshape the input data to form a 3d input.  
To fix this we simply insert a 0 before the digit in the `sensor_captor_idx` lower than 10.  

In [8]:
def standardize_tof_cols_names(df: DF) -> DF:
    renamed_cols = {}
    pattern = re.compile(r"^(tof_\d_v)(\d)$")  # match 'tof_X_vY' where Y is a single digit

    for col in df.columns:
        match = pattern.match(col)
        if match:
            prefix, version = match.groups()
            new_col = f"{prefix}0{version}"
            renamed_cols[col] = new_col

    return df.rename(columns=renamed_cols)

df = standardize_tof_cols_names(df)

### Norm quaternions
This allows us to parse quaternions with `scipy.spatial.transform.Rotation`.

In [9]:
def norm_quat_rotations(df:DF) -> DF:
    df[QUATERNION_COLS] /= np.linalg.norm(df[QUATERNION_COLS], axis=1, keepdims=True)
    return df

### Linear acceleration
Remove gravity from the acceleration features.

In [10]:
# Removes gravity from the acceleration features
def add_linear_acc_cols(df:DF) -> DF:
    # Vectorized version of https://www.kaggle.com/code/wasupandceacar/lb-0-82-5fold-single-bert-model#Dataset `remove_gravity_from_acc`
    rotations:Rotation = Rotation.from_quat(df[QUATERNION_COLS])
    gravity_sensor_frame = rotations.apply(GRAVITY_WORLD, inverse=True).astype("float32")
    df[LINEAR_ACC_COLS] = df[RAW_ACCELRATION_COLS] - gravity_sensor_frame
    return df

df = add_linear_acc_cols(df)

### Accelerations magnitudes
Add magnitue (norm) of both  raw acceleration and linear accelration.

In [11]:
def add_acc_magnitude(df:DF, acc_cols:list[str], acc_mag_col_name:str) -> DF:
    return df.assign(**{acc_mag_col_name: np.linalg.norm(df.loc[:, acc_cols], axis=1)})

df = add_acc_magnitude(df, RAW_ACCELRATION_COLS, "acc_mag")
df = add_acc_magnitude(df, LINEAR_ACC_COLS, "linear_acc_mag")

### Quaternion angle magnitude
Refered to as "rotation angle" in top notebooks.

In [12]:
def add_quat_angle_mag(df:DF) -> DF:
    return df.assign(quat_rot_mag=np.arccos(df["rot_w"]) * 2)

df = add_quat_angle_mag(df)

### Angular velocity
Compute the axis of the rotation difference and its angular magnutide.  
Top notebooks scale the vector by the angular magnitude.  
I will also include the magnitude and unit axis vector in case they turn out to be more informative.   

In [13]:
def add_angular_velocity_features(df:DF) -> DF:
    rotations = Rotation.from_quat(df[QUATERNION_COLS])
    delta_rotations = rotations[1:] * rotations[:-1].inv()
    delta_rot_velocity = delta_rotations.as_rotvec()
    # Add extra line to avoid shape mismatch
    delta_rot_velocity = np.vstack((np.zeros((1, 3)), delta_rot_velocity))
    delta_rot_magnitude = norm(delta_rot_velocity, axis=1, keepdims=True)
    delta_rot_axes = delta_rot_velocity / (delta_rot_magnitude + EPSILON)
    df[DELTA_ROTATION_ANGULAR_VELOCITY_COLS] = delta_rot_velocity
    df[DELTA_ROTATION_AXES_COLS] = delta_rot_axes
    df["delta_rot_mag"] = delta_rot_magnitude.squeeze()

    return df

df = add_angular_velocity_features(df)

### Euler angles from quaternions

In [14]:
def rot_euler_angles(df:DF) -> ndarray:
    df[EULER_ANGLES_COLS] = (
        Rotation
        .from_quat(df[QUATERNION_COLS])
        .as_euler("xyz")
        .squeeze()
    )
    return df

df = rot_euler_angles(df)

### One hot encode target values.

In [15]:
one_hot_target = pd.get_dummies(df["gesture"])
df[TARGET_NAMES] = one_hot_target[TARGET_NAMES]
df

Unnamed: 0,row_id,sequence_type,sequence_id,sequence_counter,subject,orientation,behavior,phase,gesture,acc_x,...,Glasses on/off,Neck - pinch skin,Neck - scratch,Pinch knee/leg skin,Pull air toward your face,Scratch knee/leg skin,Text on phone,Wave hello,Write name in air,Write name on leg
0,SEQ_000007_000000,Target,SEQ_000007,0,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.683594,...,False,False,False,False,False,False,False,False,False,False
1,SEQ_000007_000001,Target,SEQ_000007,1,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.949219,...,False,False,False,False,False,False,False,False,False,False
2,SEQ_000007_000002,Target,SEQ_000007,2,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,5.722656,...,False,False,False,False,False,False,False,False,False,False
3,SEQ_000007_000003,Target,SEQ_000007,3,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.601562,...,False,False,False,False,False,False,False,False,False,False
4,SEQ_000007_000004,Target,SEQ_000007,4,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,5.566406,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574940,SEQ_065531_000048,Non-Target,SEQ_065531,48,SUBJ_039498,Seated Lean Non Dom - FACE DOWN,Performs gesture,Gesture,Write name on leg,3.503906,...,False,False,False,False,False,False,False,False,False,True
574941,SEQ_065531_000049,Non-Target,SEQ_065531,49,SUBJ_039498,Seated Lean Non Dom - FACE DOWN,Performs gesture,Gesture,Write name on leg,3.773438,...,False,False,False,False,False,False,False,False,False,True
574942,SEQ_065531_000050,Non-Target,SEQ_065531,50,SUBJ_039498,Seated Lean Non Dom - FACE DOWN,Performs gesture,Gesture,Write name on leg,3.082031,...,False,False,False,False,False,False,False,False,False,True
574943,SEQ_065531_000051,Non-Target,SEQ_065531,51,SUBJ_039498,Seated Lean Non Dom - FACE DOWN,Performs gesture,Gesture,Write name on leg,3.964844,...,False,False,False,False,False,False,False,False,False,True


### ToF data aggregation.
Time of Flight columns take most of the data, let's reduce their size by aggregating by mean for each Time of Flight sensor.

In [16]:
def agg_tof_cols_per_sensor(df:DF) -> DF:
    for tof_idx in tqdm(range(1, 6)):
        tof_name = f"tof_{tof_idx}"
        all_tof_cols = [f"{tof_name}_v{v_idx:02d}" for v_idx in range(64)]
        if any(map(lambda col: col not in df.columns, all_tof_cols)):
            print(f"Some (or) all ToF {tof_idx} columns are not in the df. Maybe you already ran this cell?")
            continue
        df = pd.concat(
            (
                # df.drop(columns=tof_cols),
                df,
                # For some reasons, it's faster to call all the aggregation functions seperatly than agg(list of functions)
                df[all_tof_cols].mean(axis="columns").to_frame(tof_name + "_mean"),
                df[all_tof_cols].std(axis="columns").to_frame(tof_name + "_std"),
                df[all_tof_cols].median(axis="columns").to_frame(tof_name + "_median"),
                df[all_tof_cols].min(axis="columns").to_frame(tof_name + "_min"),
                df[all_tof_cols].max(axis="columns").to_frame(tof_name + "_max"),
            ),
            axis="columns",
        )
    return df

df = agg_tof_cols_per_sensor(df)
df

  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,row_id,sequence_type,sequence_id,sequence_counter,subject,orientation,behavior,phase,gesture,acc_x,...,tof_4_mean,tof_4_std,tof_4_median,tof_4_min,tof_4_max,tof_5_mean,tof_5_std,tof_5_median,tof_5_min,tof_5_max
0,SEQ_000007_000000,Target,SEQ_000007,0,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.683594,...,123.359375,41.862347,134.0,51.0,206.0,135.343750,32.397930,128.5,88.0,226.0
1,SEQ_000007_000001,Target,SEQ_000007,1,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.949219,...,124.406250,40.487785,135.5,60.0,206.0,137.000000,32.489803,129.5,88.0,226.0
2,SEQ_000007_000002,Target,SEQ_000007,2,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,5.722656,...,125.687500,36.980423,131.5,71.0,206.0,140.234375,34.100105,136.0,88.0,226.0
3,SEQ_000007_000003,Target,SEQ_000007,3,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,6.601562,...,149.078125,29.778711,147.0,100.0,210.0,142.609375,35.809734,137.5,88.0,226.0
4,SEQ_000007_000004,Target,SEQ_000007,4,SUBJ_059520,Seated Lean Non Dom - FACE DOWN,Relaxes and moves hand to target location,Transition,Cheek - pinch skin,5.566406,...,163.765625,29.487551,157.5,116.0,229.0,151.265625,38.821220,142.5,88.0,226.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574940,SEQ_065531_000048,Non-Target,SEQ_065531,48,SUBJ_039498,Seated Lean Non Dom - FACE DOWN,Performs gesture,Gesture,Write name on leg,3.503906,...,74.062500,28.283079,66.5,39.0,162.0,52.843750,9.838730,52.0,27.0,73.0
574941,SEQ_065531_000049,Non-Target,SEQ_065531,49,SUBJ_039498,Seated Lean Non Dom - FACE DOWN,Performs gesture,Gesture,Write name on leg,3.773438,...,70.406250,29.312611,61.5,38.0,162.0,54.531250,10.579958,54.5,29.0,73.0
574942,SEQ_065531_000050,Non-Target,SEQ_065531,50,SUBJ_039498,Seated Lean Non Dom - FACE DOWN,Performs gesture,Gesture,Write name on leg,3.082031,...,70.109375,29.322031,63.0,41.0,162.0,57.468750,11.371042,57.5,26.0,80.0
574943,SEQ_065531_000051,Non-Target,SEQ_065531,51,SUBJ_039498,Seated Lean Non Dom - FACE DOWN,Performs gesture,Gesture,Write name on leg,3.964844,...,75.609375,27.347044,68.0,48.0,162.0,54.937500,10.818671,54.5,25.0,77.0


In [17]:
df.dtypes.to_dict()

{'row_id': CategoricalDtype(categories=['SEQ_000007_000000', 'SEQ_000007_000001',
                   'SEQ_000007_000002', 'SEQ_000007_000003',
                   'SEQ_000007_000004', 'SEQ_000007_000005',
                   'SEQ_000007_000006', 'SEQ_000007_000007',
                   'SEQ_000007_000008', 'SEQ_000007_000009',
                   ...
                   'SEQ_065531_000043', 'SEQ_065531_000044',
                   'SEQ_065531_000045', 'SEQ_065531_000046',
                   'SEQ_065531_000047', 'SEQ_065531_000048',
                   'SEQ_065531_000049', 'SEQ_065531_000050',
                   'SEQ_065531_000051', 'SEQ_065531_000052'],
 , ordered=False, categories_dtype=object),
 'sequence_type': CategoricalDtype(categories=['Non-Target', 'Target'], ordered=False, categories_dtype=object),
 'sequence_id': CategoricalDtype(categories=['SEQ_000007', 'SEQ_000008', 'SEQ_000013', 'SEQ_000016',
                   'SEQ_000018', 'SEQ_000022', 'SEQ_000033', 'SEQ_000034',
            

In [18]:
get_fillna_val_per_feature_col(df)

{'acc_mag': 0,
 'acc_x': 0,
 'acc_y': 0,
 'acc_z': 0,
 'angular_vel_x': 0,
 'angular_vel_y': 0,
 'angular_vel_z': 0,
 'delta_rot_mag': 0,
 'euler_x': 0,
 'euler_y': 0,
 'euler_z': 0,
 'linear_acc_mag': 0,
 'linear_acc_x': 0,
 'linear_acc_y': 0,
 'linear_acc_z': 0,
 'quat_rot_mag': 0,
 'rot_w': 1.0,
 'rot_x': 0,
 'rot_y': 0,
 'rot_z': 0,
 'rotation_axis_x': 0,
 'rotation_axis_y': 0,
 'rotation_axis_z': 0,
 'thm_1': 0,
 'thm_2': 0,
 'thm_3': 0,
 'thm_4': 0,
 'thm_5': 0,
 'tof_1_max': 0,
 'tof_1_mean': 0,
 'tof_1_median': 0,
 'tof_1_min': 0,
 'tof_1_std': 0,
 'tof_1_v00': 0,
 'tof_1_v01': 0,
 'tof_1_v02': 0,
 'tof_1_v03': 0,
 'tof_1_v04': 0,
 'tof_1_v05': 0,
 'tof_1_v06': 0,
 'tof_1_v07': 0,
 'tof_1_v08': 0,
 'tof_1_v09': 0,
 'tof_1_v10': 0,
 'tof_1_v11': 0,
 'tof_1_v12': 0,
 'tof_1_v13': 0,
 'tof_1_v14': 0,
 'tof_1_v15': 0,
 'tof_1_v16': 0,
 'tof_1_v17': 0,
 'tof_1_v18': 0,
 'tof_1_v19': 0,
 'tof_1_v20': 0,
 'tof_1_v21': 0,
 'tof_1_v22': 0,
 'tof_1_v23': 0,
 'tof_1_v24': 0,
 'tof_1_v25':

### Add derivatives w.r.t time features

In [19]:
def add_diff_features(df:DF) -> DF:
    return pd.concat(
        (
            df,
            (
                df
                .groupby("sequence_id", as_index=False, observed=True)
                [get_feature_cols(df)]
                .diff()
                .fillna(get_fillna_val_per_feature_col(df))
                .add_suffix("_diff")
            )
        ),
        axis="columns",
    )

df = add_diff_features(df)

In [20]:
len(get_feature_cols(df))

746

### Split into folds

In [21]:
df["gesture"].unique().tolist()

['Cheek - pinch skin',
 'Forehead - pull hairline',
 'Write name on leg',
 'Feel around in tray and pull out an object',
 'Neck - scratch',
 'Neck - pinch skin',
 'Eyelash - pull hair',
 'Eyebrow - pull hair',
 'Forehead - scratch',
 'Above ear - pull hair',
 'Wave hello',
 'Write name in air',
 'Text on phone',
 'Pull air toward your face',
 'Pinch knee/leg skin',
 'Scratch knee/leg skin',
 'Drink from bottle/cup',
 'Glasses on/off']

In [22]:
def split_dataset_stratified_groupkfold(df: pd.DataFrame, target_col: str, group_col: str) -> list[tuple[pd.DataFrame, pd.DataFrame]]:
    sgkf = StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True)

    folds = []
    X = df.drop(columns=[target_col])
    y = df[target_col]
    groups = df[group_col]

    for train_idx, valid_idx in sgkf.split(X, y, groups):
        train_df = df.iloc[train_idx].copy()
        valid_df = df.iloc[valid_idx].copy()
        folds.append((train_df, valid_df))

    return folds

# Example usage:
folds = split_dataset_stratified_groupkfold(df, target_col="gesture", group_col="subject")

### Std norm
Standard scale the feature cols (should probably do something different for IMU cols).  
<!-- *Deprecated, std norm is now performed at dataset creation to avoid target leakage.*   -->

In [23]:
def std_norm_dataset(train:DF, val:DF) -> tuple[DF, DF]:
    means = train[get_feature_cols(df)].mean().astype("float32")
    stds = train[get_feature_cols(df)].std().astype("float32")
    test = train[get_feature_cols(df)] - means
    train.loc[:, get_feature_cols(df)] = (train[get_feature_cols(df)] - means) / stds
    val.loc[:, get_feature_cols(df)] = (val[get_feature_cols(df)] - means) / stds
    return train, val

# folds = list(starmap(std_norm_dataset, folds))

Normalize full dataset.

In [24]:
# Retain full dataset meta data for inference
full_dataset_meta_data = {
    "mean": df[get_feature_cols(df)].mean().astype("float32").to_dict(),
    "std": df[get_feature_cols(df)].std().astype("float32").to_dict(),
}
# df.loc[:, get_feature_cols(df)] = (df[get_feature_cols(df)] - full_dataset_meta_data["mean"]) / full_dataset_meta_data['std']

Verify the mean and std of the full dataset.

In [25]:
df[get_feature_cols(df)].agg(["mean", "std"])

Unnamed: 0,acc_mag,acc_mag_diff,acc_x,acc_x_diff,acc_y,acc_y_diff,acc_z,acc_z_diff,angular_vel_x,angular_vel_x_diff,...,tof_5_v59,tof_5_v59_diff,tof_5_v60,tof_5_v60_diff,tof_5_v61,tof_5_v61_diff,tof_5_v62,tof_5_v62_diff,tof_5_v63,tof_5_v63_diff
mean,10.013321,0.000845,1.63998,-0.033817,1.790704,0.042222,-0.459811,-0.075174,0.004202,0.002148,...,109.432045,0.053868,105.047684,0.054457,98.860481,0.023232,95.268021,0.042291,93.515732,0.030533
std,1.212614,1.730977,5.781259,2.150647,5.003945,1.541683,6.09649,2.045596,0.197478,0.191036,...,81.556953,14.378976,80.806358,13.759714,79.785675,13.369132,77.836243,12.671128,76.951424,12.434152


Let's compare the train to validation mean/std skews.

In [26]:
pd.concat({
    "train": folds[0][0][get_feature_cols(df)].agg(["mean", "std"]),
    "validation": folds[0][1][get_feature_cols(df)].agg(["mean", "std"]),
})

Unnamed: 0,Unnamed: 1,acc_mag,acc_mag_diff,acc_x,acc_x_diff,acc_y,acc_y_diff,acc_z,acc_z_diff,angular_vel_x,angular_vel_x_diff,...,tof_5_v59,tof_5_v59_diff,tof_5_v60,tof_5_v60_diff,tof_5_v61,tof_5_v61_diff,tof_5_v62,tof_5_v62_diff,tof_5_v63,tof_5_v63_diff
train,mean,10.004488,0.000862,1.611876,-0.033106,1.764006,0.040635,-0.427362,-0.074988,0.004108,0.002136,...,110.212059,0.050321,105.685349,0.058049,98.920914,0.019799,95.03289,0.034731,93.488266,0.022082
train,std,1.193863,1.713022,5.79473,2.112629,4.977146,1.541242,6.105071,2.038839,0.195563,0.189077,...,81.767296,14.453572,80.929665,13.838927,79.857689,13.439622,78.086006,12.726898,76.991798,12.46045
validation,mean,10.050064,0.000774,1.756865,-0.036775,1.901739,0.048824,-0.594768,-0.075944,0.004593,0.002196,...,106.187935,0.068617,102.395561,0.039518,98.609146,0.037508,96.245979,0.073732,93.629982,0.065683
validation,std,1.287031,1.80374,5.723436,2.302049,5.112428,1.543508,6.058836,2.073469,0.205252,0.198975,...,80.595642,14.064533,80.237495,13.425301,79.485359,13.071937,76.781349,12.4365,76.783539,12.324168


### Normalize sequences lengths.  
And turn the Dataframes into ndarrays.

#### Visualize histogram of sequences lengths.

Entire dataset sequences lengths.

In [27]:
px.histogram(
    (
        df
        .groupby("sequence_id", observed=True)
        .size()
    ),
    title="Sequence length frequency",
)

Second(to avoid always look at the first one) Train/validation split sequences lengths comparaison.

In [28]:
def get_set_sequences_lengths(set:DF, name:str) -> DF:
    return (
        set
        .groupby("sequence_id", observed=True)
        .size()
        .reset_index(name="length")
        .assign(set=name)
    )

full_se_lengths = pd.concat((
    get_set_sequences_lengths(folds[2][0], "Train"),
    get_set_sequences_lengths(folds[2][1], "Validation"),
))

fig = px.histogram(
    full_se_lengths,
    x="length",
    color="set",
    barmode="overlay",  # or 'group' if you want side-by-side bars
    nbins=50,           # adjust bin size if needed
    title="Sequence Length Distribution: Train vs Validation"
)

fig.update_traces(opacity=0.8)  # better visibility with overlay
fig.show()


In [29]:
for train, val in folds:
    print("train normed sequence len:", int(train.groupby("sequence_id", observed=True).size().quantile(SEQUENCE_NORMED_LEN_QUANTILE)))
    print("validation normed sequence len:", int(val.groupby("sequence_id", observed=True).size().quantile(SEQUENCE_NORMED_LEN_QUANTILE)))
    print()

train normed sequence len: 129
validation normed sequence len: 121

train normed sequence len: 127
validation normed sequence len: 124

train normed sequence len: 122
validation normed sequence len: 144

train normed sequence len: 125
validation normed sequence len: 135

train normed sequence len: 131
validation normed sequence len: 114



#### Sequence length norm implementation

In [30]:
def length_normed_sequence_feat_arr(
        sequence: DF,
        normed_sequence_len: int,
        pad_trunc_mode:Literal["pre", "center", "post"]
    ) -> ndarray:
    features = (
        sequence
        .loc[:, get_feature_cols(df)]
        .values
    )
    len_diff = abs(normed_sequence_len - len(features))
    len_diff_h = len_diff // 2 # half len diff
    len_diff_r = len_diff % 2 # len diff remainder
    if len(features) < normed_sequence_len:
        padding_dict = {
            "pre": (len_diff, 0),
            "center": (len_diff_h + len_diff_r, len_diff_h),
            "post": (0, len_diff),
        }
        padded_features = np.pad(
            features,
            (padding_dict[pad_trunc_mode], (0, 0)),
        )
        return padded_features
    elif len(features) > normed_sequence_len:
        truncating_dict = {
            "pre": slice(len_diff),
            "center": slice(len_diff_h, -len_diff_h),
            "post": slice(0, -len_diff),
        }
        return features[len_diff // 2:-len_diff // 2]
    else:
        return features

def df_to_ndarrays(df:DF, normed_sequence_len:int, seq_pad_trunc_mode:str) -> tuple[np.ndarray, np.ndarray]:
    sequence_it = df.groupby("sequence_id", observed=True, as_index=False)
    x = np.empty(
        shape=(len(sequence_it), normed_sequence_len, len(get_feature_cols(df))),
        dtype="float32"
    )
    y = np.empty(
        shape=(len(sequence_it), len(TARGET_NAMES)),
        dtype="float32"
    )
    for sequence_idx, (_, sequence) in tqdm(enumerate(sequence_it), total=len(sequence_it)):
        normed_seq_feat_arr = length_normed_sequence_feat_arr(sequence, normed_sequence_len, seq_pad_trunc_mode)
        x[sequence_idx] = normed_seq_feat_arr
        # Take the first value as they are(or at least should be) all the same in a single sequence
        y[sequence_idx] = sequence[TARGET_NAMES].iloc[0].values

    return x, y

def get_normed_seq_len(dataset:DF) -> int:
    return int(
        dataset
        .groupby("sequence_id", observed=True)
        .size()
        .quantile(SEQUENCE_NORMED_LEN_QUANTILE)
    )

def fold_dfs_to_ndarrays(train:DF, validation:DF, seq_pad_trunc_mode:str) -> tuple[ndarray, ndarray, ndarray, ndarray]:
    """
    Returns:
        (train X, train Y, validation X, validation Y)
    """
    full_dataset_normed_seq_len = get_normed_seq_len(df)
    return (
        *df_to_ndarrays(train, full_dataset_normed_seq_len, seq_pad_trunc_mode),
        *df_to_ndarrays(validation, full_dataset_normed_seq_len, seq_pad_trunc_mode),
    )

## Create dataset

In [None]:
# Clean dataset directory if it already exists
! rm -rf preprocessed_dataset
# Create dataset direcory
! mkdir preprocessed_dataset
# Save folds
for pad_trunc_mode in ("center", ):
    folds_it = enumerate(starmap(partial(fold_dfs_to_ndarrays, seq_pad_trunc_mode=pad_trunc_mode), folds))
    for fold_i, (train_x, train_y, val_x, val_y) in folds_it:
        fold_dir_path = join("preprocessed_dataset", pad_trunc_mode, f"fold_{fold_i}")
        os.makedirs(fold_dir_path)
        # save features (X)
        np.save(join(fold_dir_path, "train_X.npy"), train_x, allow_pickle=False)
        np.save(join(fold_dir_path, "validation_X.npy"), val_x, allow_pickle=False)
        # Save targets (Y)
        np.save(join(fold_dir_path, "train_Y.npy"), train_y, allow_pickle=False)
        np.save(join(fold_dir_path, "validation_Y.npy"), val_y, allow_pickle=False)
    # Save full dataset
    full_dataset_dir_path = join("preprocessed_dataset/full_dataset", pad_trunc_mode)
    full_dataset_sequence_length_norm = get_normed_seq_len(df)
    full_x, full_y = df_to_ndarrays(df, full_dataset_sequence_length_norm, pad_trunc_mode)
    os.makedirs(full_dataset_dir_path)
    np.save(join(full_dataset_dir_path, "X.npy"), full_x, allow_pickle=False)
    np.save(join(full_dataset_dir_path, "Y.npy"), full_y, allow_pickle=False)
# Save dataset meta data
# full_dataset_meta_data["target_names"] = TARGET_NAMES
full_dataset_meta_data["pad_seq_len"] = full_dataset_sequence_length_norm
full_dataset_meta_data["feature_cols"] = get_feature_cols(df)

with open("preprocessed_dataset/full_dataset_meta_data.json", "w") as fp:
    json.dump(full_dataset_meta_data, fp, indent=4)

shutil.make_archive("preprocessed_dataset", 'zip', "preprocessed_dataset")

  0%|          | 0/6519 [00:00<?, ?it/s]

  0%|          | 0/1632 [00:00<?, ?it/s]

  0%|          | 0/6423 [00:00<?, ?it/s]

  0%|          | 0/1728 [00:00<?, ?it/s]

  0%|          | 0/6570 [00:00<?, ?it/s]

  0%|          | 0/1581 [00:00<?, ?it/s]

  0%|          | 0/6573 [00:00<?, ?it/s]

  0%|          | 0/1578 [00:00<?, ?it/s]

  0%|          | 0/6519 [00:00<?, ?it/s]

  0%|          | 0/1632 [00:00<?, ?it/s]

  0%|          | 0/8151 [00:00<?, ?it/s]

In [None]:
# ! zip -r preprocessed_dataset.zip preprocessed_dataset/

In [None]:
df[get_feature_cols(df)]

Unnamed: 0,acc_mag,acc_mag_diff,acc_x,acc_x_diff,acc_y,acc_y_diff,acc_z,acc_z_diff,angular_vel_x,angular_vel_x_diff,...,tof_5_v59,tof_5_v59_diff,tof_5_v60,tof_5_v60_diff,tof_5_v61,tof_5_v61_diff,tof_5_v62,tof_5_v62_diff,tof_5_v63,tof_5_v63_diff
0,9.723882,0.000000,6.683594,0.000000,6.214844,0.000000,3.355469,0.000000,0.000000,0.000000,...,88.0,0.0,91.0,0.0,89.0,0.0,226.0,0.0,88.0,0.0
1,9.832678,0.108796,6.949219,0.265625,6.214844,0.000000,3.125000,-0.230469,-0.010703,-0.010703,...,88.0,0.0,91.0,0.0,89.0,0.0,226.0,0.0,88.0,0.0
2,9.561135,-0.271543,5.722656,-1.226562,5.410156,-0.804688,5.421875,2.296875,-0.105290,-0.094587,...,88.0,0.0,91.0,0.0,89.0,0.0,226.0,0.0,88.0,0.0
3,9.886537,0.325401,6.601562,0.878906,3.531250,-1.878906,6.457031,1.035156,-0.185590,-0.080300,...,88.0,0.0,91.0,0.0,89.0,0.0,226.0,0.0,88.0,0.0
4,11.128921,1.242384,5.566406,-1.035156,0.277344,-3.253906,9.632812,3.175781,-0.117877,0.067713,...,88.0,0.0,91.0,0.0,89.0,0.0,226.0,0.0,88.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574940,9.150011,-0.753489,3.503906,-0.078125,-0.433594,0.308594,-8.441406,0.761719,-0.041609,-0.053515,...,69.0,0.0,71.0,0.0,68.0,0.0,73.0,0.0,71.0,1.0
574941,9.972424,0.822412,3.773438,0.269531,-0.664062,-0.230469,-9.207031,-0.765625,-0.019417,0.022191,...,69.0,0.0,71.0,0.0,68.0,0.0,73.0,0.0,71.0,0.0
574942,8.021313,-1.951111,3.082031,-0.691406,0.218750,0.882812,-7.402344,1.804688,0.002137,0.021554,...,69.0,0.0,71.0,0.0,68.0,0.0,73.0,0.0,71.0,0.0
574943,9.919848,1.898536,3.964844,0.882812,-0.359375,-0.578125,-9.085938,-1.683594,-0.019210,-0.021347,...,69.0,0.0,71.0,0.0,68.0,0.0,73.0,0.0,71.0,0.0


## Dataset upload
Optionally upload the dataset to kaggle.

In [None]:
# upload_input = input("Do you want to upload the  dataset to kaggle?[yes/no]").lower()
# if upload_input == "yes":
#     version_notes = input("Provide a version notes:")
# version_notes = version_notes if version_notes else DEFAULT_VERSION_NOTES
version_notes = "Added-back-tof-stats"
# Updaload the dataset
dataset_upload(
    join(whoami()["username"], "prepocessed-cmi-2025"),
    "preprocessed_dataset.zip",
    version_notes=version_notes,
)
# elif upload_input == "no":
#     print("Dataset has not been uploaded.")
# else:
#     print("Did not understand user input, dataset has not been uploaded.")

Kaggle credentials successfully validated.
Uploading Dataset https://www.kaggle.com/datasets/mauroabidalcarrer/prepocessed-cmi-2025 ...
Starting upload for file preprocessed_dataset.zip


Uploading: 100%|██████████| 2.21G/2.21G [03:15<00:00, 11.3MB/s]

Upload successful: preprocessed_dataset.zip (2GB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/mauroabidalcarrer/prepocessed-cmi-2025
