In [None]:
import pandas as pd
from os.path import join, isdir, exists
from os import mkdir
import torch as th
from torch.nn import functional as F
from tqdm import tqdm
from random import shuffle, seed
from torchaudio import functional as th_audio_f

In [None]:
# Root folder of the HMS harmful-brain-activity-classification dataset; the
# later cells read "train.csv" and the "train_eegs" parquet folder from it.
dataset_path = "/home/samuel/Téléchargements/hms-harmful-brain-activity-classification"
# Destination folders for the preprocessed tensors, derived from the dataset
# root so the three paths cannot drift apart.
train_output_folder = join(dataset_path, "train_eeg_data_2")
valid_output_folder = join(dataset_path, "valid_eeg_data_2")

# Create each output folder on first run; refuse to continue if the path is
# already taken by something that is not a directory.
for _output_folder in (train_output_folder, valid_output_folder):
    if not exists(_output_folder):
        mkdir(_output_folder)
    elif not isdir(_output_folder):
        raise NotADirectoryError(
            "Output path '{}' exists but is not a directory!".format(_output_folder)
        )

In [None]:
# Fraction of the distinct EEG ids that goes to the training split.
train_ratio = 0.8

# Fixed seed so the shuffle below yields the same split on every run.
seed(314159)

all_data_df = pd.read_csv(join(dataset_path, "train.csv"), sep=",")

# Split on eeg_id rather than on rows, so every row of a given recording
# lands in the same split.
unique_egg_ids = list(all_data_df["eeg_id"].unique())
shuffle(unique_egg_ids)

split_index = int(len(unique_egg_ids) * train_ratio)
train_eeg_ids, valid_eeg_ids = unique_egg_ids[:split_index], unique_egg_ids[split_index:]

is_train_row = all_data_df["eeg_id"].isin(train_eeg_ids)
is_valid_row = all_data_df["eeg_id"].isin(valid_eeg_ids)
train_df = all_data_df[is_train_row]
valid_df = all_data_df[is_valid_row]

In [None]:
# Vote-count columns, one per class; their order fixes the layout of the
# label vector built from them.
target_cols = [
    "seizure_vote", "lpd_vote", "gpd_vote",
    "lrda_vote", "grda_vote", "other_vote",
]

# EEG window: samples (rows) per second and number of seconds kept; their
# product is the number of rows cropped from each recording.
sample_rate_eegs, nb_seconds_eegs = 200, 50
# Spectrogram settings, not referenced by the cells visible here; presumably
# samples per second and a 10-minute duration in seconds (TODO confirm).
sample_rate_spec, nb_seconds_spectrogram = 0.5, 10 * 60

In [None]:
def extract_eeg(input_df: pd.DataFrame, output_folder: str) -> None:
    """Save one centred EEG window and one normalised vote vector per ``eeg_id``.

    The rows of ``input_df`` are grouped by ``eeg_id`` and the columns listed
    in ``target_cols`` are summed per group. For every group, the parquet file
    ``<dataset_path>/train_eegs/<eeg_id>.parquet`` is read and cropped to the
    ``nb_seconds_eegs * sample_rate_eegs`` rows in the middle of the recording.
    The summed votes are divided by their total so they add up to 1.
    NaNs in either result are replaced by zeros, which also makes the vote
    vector all zeros when a group has no votes at all.

    Two files are written per group, where ``i`` is the group's row position
    in the aggregated frame:

    * ``<output_folder>/{i}_eeg.pt``: float tensor of the cropped recording.
    * ``<output_folder>/{i}_classes.pt``: float tensor of the normalised votes.

    Args:
        input_df: Frame containing an ``eeg_id`` column and every column
            listed in ``target_cols``.
        output_folder: Existing directory that receives the ``.pt`` files.

    Raises:
        ValueError: If a recording has fewer rows than the requested window.
    """
    # Number of rows kept from each recording; constant across the loop.
    window_size = nb_seconds_eegs * sample_rate_eegs

    per_eeg_labels_df = (
        input_df.groupby("eeg_id")
        .agg({col: "sum" for col in target_cols})
        .reset_index()
    )

    # Stream the rows and hand tqdm the length explicitly instead of copying
    # every row into a list just to get a progress total.
    for i, row in tqdm(per_eeg_labels_df.iterrows(), total=len(per_eeg_labels_df)):
        eeg_id = int(row["eeg_id"])
        eeg = pd.read_parquet(
            join(dataset_path, "train_eegs", f"{eeg_id}.parquet"),
            engine="pyarrow",
        )

        # With a recording shorter than the window the offset below would be
        # negative, and iloc would silently return a short, off-centre slice.
        if len(eeg) < window_size:
            raise ValueError(
                f"EEG {eeg_id} has {len(eeg)} rows, "
                f"fewer than the {window_size} rows required"
            )

        # Centre crop: drop the same number of rows on both sides.
        offset = (len(eeg) - window_size) // 2
        eeg = eeg.iloc[offset : offset + window_size]

        votes = row[target_cols].astype(float).fillna(0.0)
        # 0 / 0 yields NaN here; nan_to_num below turns it into 0.
        classes = votes / votes.sum()

        eeg_t = th.nan_to_num(th.tensor(eeg.to_numpy(), dtype=th.float))
        classes_t = th.nan_to_num(th.tensor(classes.to_numpy(), dtype=th.float))

        th.save(eeg_t, join(output_folder, f"{i}_eeg.pt"))
        th.save(classes_t, join(output_folder, f"{i}_classes.pt"))

In [None]:
# Export the training split into its output folder.
extract_eeg(input_df=train_df, output_folder=train_output_folder)

In [None]:
# Export the validation split into its output folder.
extract_eeg(input_df=valid_df, output_folder=valid_output_folder)