In [None]:
from os.path import isdir, join, exists
from os import mkdir

import torch as th
from tqdm import tqdm
import pandas as pd
import json
from typing import Literal

import random

tqdm.pandas()

In [None]:
# Root folder of the competition data; it must already contain train.csv and test.csv.
data_folder = "/home/samuel/Téléchargements/cmi-detect-behavior-with-sensor-data"

assert isdir(data_folder), f"Data folder not found: {data_folder}"

# Every tensor written by this notebook goes under <data_folder>/extracted_tensors.
output_folder = join(data_folder, "extracted_tensors")
if not exists(output_folder):
    mkdir(output_folder)
elif not isdir(output_folder):
    # The path exists but is a regular file (or similar), so it cannot be used as a folder.
    raise ValueError(f"Output path exists but is not a directory: {output_folder}")


In [None]:
# Columns that identify a row: the sequence it belongs to and its position in it.
id_columns = ["sequence_id", "sequence_counter"]

# Time-of-flight columns, ordered sensor by sensor (1..5) then pixel by pixel (0..63).
tof_columns = [
    f"tof_{sensor_idx}_v{pixel_idx}"
    for sensor_idx in range(1, 6)
    for pixel_idx in range(64)
]

# Accelerometer axes.
acc_columns = ["acc_x", "acc_y", "acc_z"]

# Rotation components.
rot_columns = ["rot_w", "rot_x", "rot_y", "rot_z"]

# Thermal sensor columns.
thm_columns = ["thm_1", "thm_2", "thm_3", "thm_4", "thm_5"]

# Every non-ToF feature, in the order used when stacking feature tensors.
other_columns = [*acc_columns, *rot_columns, *thm_columns]

# Full feature list: ToF pixels first, then the remaining sensors.
feature_columns = [*tof_columns, *other_columns]

In [None]:
def process_dataframe(df: pd.DataFrame, is_test_df: bool = False) -> pd.DataFrame:
    """Collapse per-timestep sensor rows into one row per sequence.

    The input frame is left untouched, so the call can be repeated on the
    same frame (e.g. when the notebook cell is re-run).

    Args:
        df: Raw frame with one row per timestep. It must contain
            ``sequence_id``, ``sequence_counter``, every column listed in
            ``tof_columns`` and ``other_columns`` and, unless ``is_test_df``
            is true, ``gesture``.
        is_test_df: When true, no ``gesture`` column is read and none is
            present in the result.

    Returns:
        A frame with one row per ``sequence_id`` (as ``str``) holding:

        * one column per entry of ``other_columns``, each cell being the list
          of float readings ordered by ``sequence_counter``;
        * ``grids``: a list ordered by ``sequence_counter`` whose items are
          nested integer lists indexed ``[sensor][y][x]``, where pixel
          ``y * 8 + x`` of sensor ``s`` comes from column ``tof_{s}_v{y*8+x}``;
        * ``gesture`` (only when ``is_test_df`` is false): the first gesture
          value found for the sequence, as ``str``.

        Missing sensor values are replaced by 0.
    """
    print("nb features", len(feature_columns))

    # Row keys, built without modifying `df`. The counter stays numeric:
    # sorting it as text would place "10" before "2" and scramble timesteps.
    keys_df = pd.DataFrame(
        {
            "sequence_id": df["sequence_id"].astype(str),
            "sequence_counter": pd.to_numeric(df["sequence_counter"]),
        }
    )
    sort_keys = ["sequence_id", "sequence_counter"]

    # Time-of-flight pixels: `tof_columns` is ordered sensor-major then pixel,
    # so a C-order reshape yields [row][sensor][y][x] with pixel = y * 8 + x.
    grid_side = 8
    pixels_per_sensor = grid_side * grid_side
    nb_sensors = len(tof_columns) // pixels_per_sensor
    tof_values = df[tof_columns].fillna(0).astype(int).to_numpy()
    grids = tof_values.reshape(len(df), nb_sensors, grid_side, grid_side).tolist()
    tof_features_df = keys_df.assign(grids=grids)

    # Remaining sensors are real-valued readings: keep them as floats
    # (an integer cast would truncate them).
    other_features_df = pd.concat(
        [keys_df, df[other_columns].fillna(0).astype(float)],
        axis=1,
    )

    aggregated_other_features_df = (
        other_features_df
        .sort_values(sort_keys)
        .groupby("sequence_id")
        .agg({col: list for col in other_columns})
        .reset_index()
    )

    aggregated_tof_features_df = (
        tof_features_df
        .sort_values(sort_keys)
        .groupby("sequence_id")
        .agg({"grids": list})
        .reset_index()
    )

    processed_df = aggregated_other_features_df.merge(
        aggregated_tof_features_df, on="sequence_id"
    )

    if not is_test_df:
        # One label per sequence: the first gesture value encountered.
        targets_df = keys_df[["sequence_id"]].assign(
            gesture=df["gesture"].astype(str).to_numpy()
        )
        aggregated_target_df = (
            targets_df
            .groupby("sequence_id")
            .agg({"gesture": "first"})
            .reset_index()
        )
        processed_df = processed_df.merge(aggregated_target_df, on="sequence_id")

    return processed_df

In [None]:
# Seed Python's global `random` generator so that later uses of it are repeatable.
random.seed(314159)

In [None]:
data_df = pd.read_csv(join(data_folder, "train.csv"), header=0)
sequence_ids = list(data_df["sequence_id"].dropna().unique())

# Shuffle with a dedicated, locally seeded generator: the split then no longer
# depends on the state of the global `random` module, so re-running this cell
# always assigns each sequence to the same side of the split.
random.Random(314159).shuffle(sequence_ids)

# Split by whole sequences so that a sequence never straddles train and valid.
train_ratio = 0.8
split_index = int(len(sequence_ids) * train_ratio)
train_sequence_ids = sequence_ids[:split_index]
valid_sequence_ids = sequence_ids[split_index:]

train_df = data_df[data_df["sequence_id"].isin(train_sequence_ids)].copy()
valid_df = data_df[data_df["sequence_id"].isin(valid_sequence_ids)].copy()

In [None]:
# Load the test rows; the first line of the CSV is used as the header.
test_df = pd.read_csv(join(data_folder, "test.csv"), header=0)

In [None]:
# Aggregate the labelled splits into one row per sequence (gesture kept as target).
processed_train_df = process_dataframe(train_df)
processed_valid_df = process_dataframe(valid_df)

In [None]:
# Aggregate the test rows the same way, without reading a `gesture` column.
processed_test_df = process_dataframe(test_df, is_test_df=True)

In [None]:
def save_to_tensors(df: pd.DataFrame, curr_class_to_idx: dict[str, int], folder: Literal["train", "valid", "test"]) -> None:
    """Write one set of tensor files per row of ``df`` under ``output_folder/folder``.

    For each row, three files named after its ``sequence_id`` are saved:

    * ``<id>_grids.pth``: the ``grids`` cell as a ``uint8`` tensor, with
      negative values clamped to 0 beforehand;
    * ``<id>_features.pth``: the ``other_columns`` cells stacked into a float
      tensor and transposed, so the ``other_columns`` axis comes last;
    * ``<id>_target.pth``: a one-element ``long`` tensor holding the class
      index of ``gesture`` for "train"/"valid", or 0 for any other folder.

    The mapping ``curr_class_to_idx`` is also dumped to ``class_to_idx.json``
    in the same folder.

    Args:
        df: Frame with ``sequence_id``, ``grids``, every column of
            ``other_columns`` and, for "train"/"valid", ``gesture``.
        curr_class_to_idx: Gesture name to class index.
        folder: Sub-folder of ``output_folder`` to write into.

    Raises:
        NotADirectoryError: If the destination path exists but is not a directory.
        KeyError: If a gesture of a "train"/"valid" row is absent from
            ``curr_class_to_idx``.
    """
    destination = join(output_folder, folder)
    if not exists(destination):
        mkdir(destination)
    elif not isdir(destination):
        raise NotADirectoryError(
            f"Output path exists but is not a directory: {destination}"
        )

    with open(join(destination, "class_to_idx.json"), "w", encoding="utf-8") as f:
        json.dump(curr_class_to_idx, f)

    # Only the labelled splits carry a gesture; others get a placeholder target.
    has_labels = folder in {"train", "valid"}

    # Iterate lazily; `total` gives tqdm the length without materialising every row.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        sequence_id = row["sequence_id"]

        # Clamp negatives to 0 before the uint8 cast, which cannot represent them.
        grid = th.clamp_min(th.tensor(row["grids"]), 0).to(th.uint8)
        features = th.tensor([row[c] for c in other_columns], dtype=th.float).T

        class_idx = curr_class_to_idx[row["gesture"]] if has_labels else 0
        target = th.tensor([class_idx], dtype=th.long)

        th.save(grid, join(destination, f"{sequence_id}_grids.pth"))
        th.save(features, join(destination, f"{sequence_id}_features.pth"))
        th.save(target, join(destination, f"{sequence_id}_target.pth"))

In [None]:
# Gesture labels present in either labelled split.
train_gestures = set(processed_train_df["gesture"].unique())
valid_gestures = set(processed_valid_df["gesture"].unique())
unique_classes = list(train_gestures | valid_gestures)

# Indices follow the sorted label order, so the mapping does not depend on set ordering.
sorted_classes = sorted(unique_classes)
class_to_idx = dict(zip(sorted_classes, range(len(sorted_classes))))

# Write both labelled splits with the shared mapping, train first.
for split_name, split_df in (("train", processed_train_df), ("valid", processed_valid_df)):
    save_to_tensors(split_df, class_to_idx, folder=split_name)

In [None]:
# Reuse the mapping written next to the training tensors when exporting the test split.
train_mapping_path = join(output_folder, "train", "class_to_idx.json")
with open(train_mapping_path, "r", encoding="utf-8") as mapping_file:
    saved_class_to_idx = json.load(mapping_file)

save_to_tensors(processed_test_df, saved_class_to_idx, folder="test")

In [None]:
# Show the aggregated test frame as the cell output for a visual check.
processed_test_df