# Data Splitting

This notebook takes the output dataset from `scripts/dataset` and splits it into train/val/test. Along the way it standardizes the features using a separate mean and std for every user (computed from the training split only) and ordinally encodes the static features (for embedding lookups later on).

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pyarrow as pa

In [2]:
# Read the input parquet file through pyarrow, keeping Arrow-backed dtypes.
dataset_path = "./endomondoHR_proper_interpolated.parquet"
df = pd.read_parquet(dataset_path, dtype_backend="pyarrow", engine="pyarrow")

In [3]:
# Record the first timestamp of every row; the split cell sorts on this value.
def _first_element(sequence):
    """Return the first item of a per-row sequence."""
    return sequence[0]


df["first_timestamp"] = df["timestamp"].apply(_first_element)

In [4]:
# Split into train/val/test (80/10/10) with a proportional temporal split per
# user: each user's earliest rows go to train, the next slice to val and the
# most recent slice to test.

# 1. Sort by the first timestamp of each row (ties broken by userId). The sort
#    is global, but it leaves every user's rows in chronological order, which
#    is what the per-user row position below relies on (CRITICAL).
df = df.sort_values(by=["first_timestamp", "userId"])

# 2. Position of each row within its user group, plus the group size.
user_id_groupby = df.groupby("userId")

# cumcount is 0-based, which is exactly what the boundaries below expect: with
# 10 rows the boundaries are 8 and 9, so rows 0-7 -> train, row 8 -> val and
# row 9 -> test. A 1-based position would move one row per user from train
# into test (7/1/2 instead of 8/1/1).
df["user_row"] = user_id_groupby.cumcount()
df["user_rows_total"] = user_id_groupby["userId"].transform("count")

# First row position (0-based, inclusive) of the val and test slices.
df["user_rows_val_start"] = (df["user_rows_total"] * 0.8).astype(int)
df["user_rows_test_start"] = (df["user_rows_total"] * 0.9).astype(int)

# NOTE(review): a user with a single row gets both boundaries at 0 and lands
# entirely in test, so no training statistics exist for that user; the later
# standardization cell looks users up in stats built from train only.
is_train = df["user_row"] < df["user_rows_val_start"]
is_val = (df["user_row"] >= df["user_rows_val_start"]) & (
    df["user_row"] < df["user_rows_test_start"]
)
is_test = df["user_row"] >= df["user_rows_test_start"]

# Bookkeeping columns that must not leak into the exported splits.
split_helper_columns = [
    "first_timestamp",
    "user_row",
    "user_rows_total",
    "user_rows_val_start",
    "user_rows_test_start",
]

# `drop` returns a new frame, so each split is independent of `df`.
train_df = df.loc[is_train].drop(columns=split_helper_columns)
val_df = df.loc[is_val].drop(columns=split_helper_columns)
test_df = df.loc[is_test].drop(columns=split_helper_columns)

In [5]:
# Per-user standardization statistics, learned from the training split only;
# the same statistics are applied to val and test afterwards.

columns_to_standardize = [
    "time_elapsed",
    "heart_rate",
    "altitude",
    "derived_speed",
    "derived_distance",
]

user_stats = {}
for user_id, group in tqdm(train_df.groupby("userId")):
    per_column = {}

    for col in columns_to_standardize:
        # Flatten every per-row vector of this user into one long 1D array.
        flat_values = np.concatenate(group[col].values)

        # Stored as [mean, std]; the epsilon keeps the std strictly positive
        # so it is safe to divide by.
        per_column[col] = [np.mean(flat_values), np.std(flat_values) + 1e-6]

    user_stats[user_id] = per_column

100%|██████████| 887/887 [00:45<00:00, 19.38it/s]


In [6]:
def _standardize_row(row, col):
    """Z-score one row's `col` value with the mean/std stored for that row's user."""
    mean, std = user_stats[row["userId"]][col]
    return (row[col] - mean) / std


for col in tqdm(columns_to_standardize, desc="Standardizing columns..."):
    new_col = f"{col}_standardized"

    # Same transformation for every split, always using the train-derived stats.
    for split_df in (train_df, val_df, test_df):
        split_df[new_col] = split_df.apply(_standardize_row, axis=1, args=(col,))

Standardizing columns...: 100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


In [7]:
# Ordinal encodings for the static features, built from the training split.
def _ordinal_mapping(values):
    """Map each distinct value to its position in order of first appearance."""
    distinct = values.unique()
    return dict(zip(distinct, range(len(distinct))))


user_mapping = _ordinal_mapping(train_df["userId"])

sport_mapping = _ordinal_mapping(train_df["sport"])

gender_mapping = _ordinal_mapping(train_df["gender"])

In [8]:
# (source column, index column, lookup table) for every static feature.
static_encodings = (
    ("userId", "user_idx", user_mapping),
    ("sport", "sport_idx", sport_mapping),
    ("gender", "gender_idx", gender_mapping),
)

# NOTE(review): the lookup tables come from the training split only, so a value
# that first appears in val/test has no entry — confirm how that should be handled.
for split_df in (train_df, val_df, test_df):
    for source_col, index_col, mapping in static_encodings:
        split_df[index_col] = split_df[source_col].map(mapping)

In [12]:
# Explicit Arrow schema for the exported files: scalar identifiers and static
# features, the per-row list columns, their standardized counterparts, and the
# ordinal indices.
int_list = pa.list_(pa.int64())
float_list = pa.list_(pa.float64())

schema = pa.schema(
    [
        ("id", pa.int64()),
        ("userId", pa.int64()),
        ("sport", pa.string()),
        ("gender", pa.string()),
        ("timestamp", int_list),
        ("derived_speed", float_list),
        ("heart_rate", float_list),
        ("longitude", float_list),
        ("latitude", float_list),
        ("derived_distance", float_list),
        ("time_elapsed", int_list),
        ("altitude", float_list),
        ("time_elapsed_standardized", float_list),
        ("heart_rate_standardized", float_list),
        ("altitude_standardized", float_list),
        ("derived_speed_standardized", float_list),
        ("derived_distance_standardized", float_list),
        ("user_idx", pa.int64()),
        ("sport_idx", pa.int64()),
        ("gender_idx", pa.int64()),
    ]
)

# Write every split with the same schema, in train -> val -> test order.
split_outputs = (
    ("train.parquet", train_df),
    ("val.parquet", val_df),
    ("test.parquet", test_df),
)
for output_path, split_df in split_outputs:
    split_df.to_parquet(output_path, schema=schema, engine="pyarrow")