In [1]:
import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.nn.functional as F
import joblib
from sklearn.preprocessing import StandardScaler

In [2]:
## Directory the raw competition CSV files are read from
RAW_DIR = "/kaggle/input/cmi-detect-behavior-with-sensor-data"
## Encoder used below to turn the `gesture` column into `gesture_int`
## (presumably already fitted, since only `.transform` is called on it).
## NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
label_encoder = joblib.load("/kaggle/input/cmi-label-encoder/label_encoder.joblib")

## Percentile of the per-sequence lengths that is used as the common padded length
PAD_PERCENTILE = 95

In [3]:
## Sensor table; later cells group it by `sequence_id`
train = pl.read_csv(f"{RAW_DIR}/train.csv")
## Demographics table, keyed by `subject`
train_demo = pl.read_csv(f"{RAW_DIR}/train_demographics.csv")
## Left join on `subject`, so every row of `train` is kept.
## NOTE(review): train_merged is not referenced by any later cell visible here - confirm it is still needed.
train_merged = train.join(train_demo, on="subject", how="left")

In [4]:
## Stage 1 adds per-row magnitude/angle features; stage 2 adds first-order
## differences computed inside each sequence; the result is handed to pandas.
train = (
    train
    .with_columns(
        ## acc_mag: square root of the sum of squares of every column matching ^acc_.*$
        pl.fold(
            acc=pl.lit(0),
            function=lambda acc, x: acc + x**2,
            exprs="^acc_.*$",
        ).sqrt().alias("acc_mag"),

        ## rot_angle: 2*arccos(rot_w); rot_w is clipped to [-1, 1] before arccos is applied
        (2 * np.arccos(pl.col("rot_w").clip(-1, 1))).alias("rot_angle"),
    )
    .with_columns(
        [
            ## Difference to the previous row of the same sequence; the null produced
            ## on the first row of each sequence (and any other null) becomes 0.
            pl.col(source_col).diff().over("sequence_id").fill_null(0).alias(f"{source_col}_diff")
            for source_col in (
                "acc_mag", "rot_angle",
                ## Differential features for temperature
                "thm_1", "thm_2", "thm_3", "thm_4", "thm_5",
                ## Differential features for acceleration features
                "acc_x", "acc_y", "acc_z",
            )
        ]
    )
    .to_pandas()
)

## Encode gestures as integer class ids
train["gesture_int"] = label_encoder.transform(train["gesture"])

In [5]:
## Non-feature (label / bookkeeping) column names
meta_cols = {
    'gesture',
    'gesture_int',
    'sequence_type',
    'behavior',
    'orientation',
    'row_id',
    'subject',
    'phase',
    'sequence_id',
    'sequence_counter',
}

## imu features: raw acceleration, its differences, rotation quaternion, derived magnitudes
imu_cols = (
    ["acc_x", "acc_y", "acc_z"]
    + ["acc_x_diff", "acc_y_diff", "acc_z_diff"]
    + ["rot_w", "rot_x", "rot_y", "rot_z"]
    + ["acc_mag", "rot_angle"]
    + ["acc_mag_diff", "rot_angle_diff"]
)

## thm features: raw readings followed by their differences
all_thm_cols = (
    ["thm_1", "thm_2", "thm_3", "thm_4", "thm_5"]
    + ["thm_1_diff", "thm_2_diff", "thm_3_diff", "thm_4_diff", "thm_5_diff"]
)

## raw tof pixels: sensors 1..5, pixels v0..v63
tof_cols = [f"tof_{i}_v{j}" for i in range(1,6) for j in range(0,64)]

## Names of the summary statistics (mean/std/min/max) computed per tof sensor
tof_aggregated_cols_template = [
    stat_name
    for i in range(1, 6)
    for stat_name in (f'tof_{i}_mean', f'tof_{i}_std', f'tof_{i}_min', f'tof_{i}_max')
]

thm_tof_cols = all_thm_cols + tof_aggregated_cols_template + tof_cols

## Full feature list: imu block first, then the thm/tof block
final_feature_cols = imu_cols + thm_tof_cols

## Dimensions of imu features and aggregated thm-tof features
imu_dim_final = len(imu_cols)
tof_thm_aggregated_dim_final = len(all_thm_cols) + len(tof_aggregated_cols_template)

In [6]:
## Sanity check: number of columns in final_feature_cols (imu_cols + thm_tof_cols)
print(f"Total Features Count: {len(final_feature_cols)}")

Total Features Count: 364


In [7]:
## One group per sequence; every group becomes one (timesteps, features) matrix
seq_groups = train.groupby('sequence_id')

## The scaler list and X_list receive the very same array objects
all_steps_for_scaler_list = []
X_list, y_int, lens = [], [], []

## Build and store only the relevant features, the gesture label and the sequence length
for seq_id, seq_df_orig in seq_groups:
    ## Row-wise summary statistics per tof sensor; -1 readings are treated as missing
    tof_stats = {}
    for i in range(1, 6):
        sensor_pixels = seq_df_orig[[f"tof_{i}_v{p}" for p in range(64)]].replace(-1, np.nan)
        tof_stats[f'tof_{i}_mean'] = sensor_pixels.mean(axis=1)
        tof_stats[f'tof_{i}_std'] = sensor_pixels.std(axis=1)
        tof_stats[f'tof_{i}_min'] = sensor_pixels.min(axis=1)
        tof_stats[f'tof_{i}_max'] = sensor_pixels.max(axis=1)

    ## assign() returns a new frame, so the group taken from `train` is left untouched
    seq_df = seq_df_orig.assign(**tof_stats)

    ## Fill gaps forward, then backward, then with 0 for columns that are entirely missing
    gap_filled = seq_df[final_feature_cols].ffill().bfill().fillna(0)
    mat_unscaled = gap_filled.values.astype('float32')

    all_steps_for_scaler_list.append(mat_unscaled)
    X_list.append(mat_unscaled)
    ## The label is read from the first row of the sequence
    y_int.append(seq_df['gesture_int'].iloc[0])
    lens.append(len(mat_unscaled))

In [8]:
## Fit the scaler on every time step of every sequence stacked along axis 0
scaler = StandardScaler()
scaler.fit(np.concatenate(all_steps_for_scaler_list, axis=0))

## The stacked copy was never bound to a name; drop the list reference as well
del all_steps_for_scaler_list

In [9]:
def preprocess_sequence(df_seq: pd.DataFrame, feature_cols: list, scaler: StandardScaler):
    """Gap-fill and standardise the feature columns of one sequence.

    Args:
        df_seq: frame holding one sequence; must contain every name in ``feature_cols``.
        feature_cols: columns to keep, in the order the scaler expects.
        scaler: scaler whose ``transform`` is applied to the filled values.

    Returns:
        float32 array with one row per row of ``df_seq`` and one column per feature.
    """
    selected = df_seq[feature_cols]
    # Forward fill, then backward fill, then 0 for columns with no value at all
    gap_filled = selected.ffill().bfill().fillna(0)
    scaled = scaler.transform(gap_filled.values)
    return scaled.astype('float32')

In [10]:
def pad_sequences(sequences, maxlen, padding='post', truncating='post', dtype=torch.float32):
    """
    Pad or truncate variable-length sequences to ``maxlen`` timesteps and stack them.

    Args:
        sequences: list of ``(timesteps, features)`` sequences. Each item may be a
            ``torch.Tensor`` or anything ``torch.tensor`` accepts (NumPy array,
            nested list). All items must share the same feature dimension.
        maxlen: number of timesteps every returned sequence has.
        padding: ``'pre'`` puts the zero padding before the data; any other value
            puts it after the data.
        truncating: ``'pre'`` keeps the last ``maxlen`` timesteps of a sequence that
            is too long; any other value keeps the first ``maxlen`` timesteps.
        dtype: dtype used when converting non-tensor items and for the result.

    Returns:
        Tensor of shape ``(len(sequences), maxlen, features)``. An empty
        ``sequences`` list yields an empty tensor of shape ``(0, maxlen, 0)``.
    """
    if not sequences:
        # Handle empty list of sequences
        return torch.empty(0, maxlen, 0, dtype=dtype)

    # Convert first, then inspect shapes: plain lists have no ``.shape`` attribute.
    tensors = [
        seq if isinstance(seq, torch.Tensor) else torch.tensor(seq, dtype=dtype)
        for seq in sequences
    ]

    padded_sequences = []
    for seq in tensors:
        current_len = seq.shape[0]

        # Truncate if necessary
        if current_len > maxlen:
            if truncating == 'pre':
                # Explicit start index: ``seq[-maxlen:]`` would return the whole
                # sequence when maxlen == 0.
                seq = seq[current_len - maxlen:]
            else:  # 'post'
                seq = seq[:maxlen]

        padding_needed = maxlen - seq.shape[0]

        # Pad if necessary
        if padding_needed > 0:
            # F.pad lists the last dimension first: (feat_left, feat_right, time_before, time_after)
            if padding == 'pre':
                pad_spec = (0, 0, padding_needed, 0)
            else:  # 'post'
                pad_spec = (0, 0, 0, padding_needed)
            seq = F.pad(seq, pad_spec, 'constant', 0)

        padded_sequences.append(seq)

    return torch.stack(padded_sequences).to(dtype)

In [11]:
## Common sequence length: the PAD_PERCENTILE-th percentile of the observed lengths
pad_len = int(np.percentile(lens, PAD_PERCENTILE))

## Scale every sequence; the name is rebound so the unscaled matrices can be freed
X_list = [
    preprocess_sequence(
        pd.DataFrame(unscaled_mat, columns=final_feature_cols),
        final_feature_cols,
        scaler,
    )
    for unscaled_mat in X_list
]

## Pad/truncate at the end of each sequence and stack everything into one tensor
X = pad_sequences(
    X_list,
    maxlen=pad_len,
    padding='post',
    truncating='post',
)

del X_list

In [12]:
## Integer class ids as an array
y_int = np.array(y_int)
## One-hot float targets over 18 classes for MixupDataset (which uses soft targets)
y_one_hot = F.one_hot(
    torch.tensor(y_int, dtype=torch.long),
    num_classes=18,
).to(torch.float32)

In [13]:
## Tensors and labels
torch.save(X, "X.pt")
torch.save(y_one_hot, "y_ohe.pt")
np.save("y_int.npy", y_int)

## Fitted scaler and the column-name lists
joblib.dump(scaler, "StandardScaler.joblib")
for column_list, out_path in (
    (imu_cols, "imu_cols.joblib"),
    (thm_tof_cols, "thm_tof_cols.joblib"),
    (tof_cols, "tof_cols.joblib"),
):
    joblib.dump(column_list, out_path)
## Kept as the last bare expression so the cell still displays the written file name
joblib.dump(final_feature_cols, "final_feature_cols.joblib")

['final_feature_cols.joblib']

In [14]:
X.shape
## X shape: (number of sequences, padded sequence length in time steps, number of features per time step)

torch.Size([8151, 127, 364])