In [None]:
import pandas as pd
from os.path import join, isdir, exists
from os import mkdir
import torch as th
from torch.nn import functional as F
from tqdm import tqdm
from random import shuffle, seed
from torchaudio import functional as th_audio_f

In [None]:
# Root folder of the downloaded dataset; the output folders live directly inside it.
dataset_path = "/home/samuel/Téléchargements/hms-harmful-brain-activity-classification"
train_output_folder = join(dataset_path, "train_eeg_data")
valid_output_folder = join(dataset_path, "valid_eeg_data")

# Create each output folder when it is missing, and refuse to continue when the
# path is already taken by something that is not a directory.
for _folder in (train_output_folder, valid_output_folder):
    if not exists(_folder):
        mkdir(_folder)
    elif not isdir(_folder):
        # This branch is reached only when the path exists, so say so.
        raise NotADirectoryError("Output path '{}' exists but is not a directory!".format(_folder))

In [None]:
# Fraction of the distinct EEG ids assigned to the training split.
train_ratio = 0.8

# Seed Python's RNG so the shuffle below produces the same split on every run.
seed(314159)

all_data_df = pd.read_csv(join(dataset_path, "train.csv"), sep=",")

# Shuffle at the eeg_id level: every row sharing an id ends up in the same split.
unique_egg_ids = list(all_data_df["eeg_id"].unique())
shuffle(unique_egg_ids)

split_index = int(len(unique_egg_ids) * train_ratio)
train_eeg_ids, valid_eeg_ids = unique_egg_ids[:split_index], unique_egg_ids[split_index:]

is_train_row = all_data_df["eeg_id"].isin(train_eeg_ids)
is_valid_row = all_data_df["eeg_id"].isin(valid_eeg_ids)

train_df = all_data_df[is_train_row]
valid_df = all_data_df[is_valid_row]

In [None]:
# Names of the per-class vote columns, in the order used for the saved label vector.
target_cols = [
    f"{label}_vote"
    for label in ("seizure", "lpd", "gpd", "lrda", "grda", "other")
]

# Raw EEG: samples per second and length (in seconds) of the window cut per row.
sample_rate_eegs = 200
nb_seconds_eegs = 50

# Spectrogram counterparts of the two values above (10 minutes at 0.5 samples/s).
sample_rate_spec = 0.5
nb_seconds_spectrogram = 10 * 60

In [None]:
def stft_eeg(eeg: th.Tensor, n_fft: int, fft_stride: int) -> th.Tensor:
    """Compute a complex STFT for every row of ``eeg`` and concatenate the results.

    Each row ``eeg[i, :]`` is standardized on its own (mean removed, then
    divided by its standard deviation plus ``1e-8`` so a constant signal does
    not divide by zero) and passed to ``torchaudio.functional.spectrogram``
    with a Hann window of ``n_fft`` samples, ``power=None`` (complex output)
    and ``normalized=True``.

    Args:
        eeg: tensor indexed as ``(signal, sample)``.
        n_fft: FFT size, also used as the window length.
        fft_stride: hop length between two successive frames.

    Returns:
        The per-signal spectrograms concatenated along dimension 0. They are
        concatenated, not stacked, so no separate signal dimension is added.
    """
    nb_signals = eeg.size(0)

    # The window only depends on n_fft: build it once instead of once per
    # signal, and on the input's device so non-CPU inputs do not hit a device
    # mismatch inside the STFT.
    window = th.hann_window(n_fft, device=eeg.device)

    stft_tensors = []

    for i in range(nb_signals):
        signal = eeg[i, :]
        # Per-signal standardization; the epsilon guards against a zero std.
        signal = (signal - signal.mean()) / (signal.std() + 1e-8)

        stft_t = th_audio_f.spectrogram(
            signal,
            pad=0,
            window=window,
            n_fft=n_fft,
            hop_length=fft_stride,
            win_length=n_fft,
            power=None,
            normalized=True,
        )

        stft_tensors.append(stft_t)

    return th.cat(stft_tensors, dim=0)



def extract_eeg(input_df: pd.DataFrame, output_folder: str) -> None:
    """Export one fixed-length EEG window and its label distribution per row.

    For every row of ``input_df`` the recording
    ``<dataset_path>/train_eegs/<eeg_id>.parquet`` is read, the
    ``nb_seconds_eegs``-second window starting at ``eeg_label_offset_seconds``
    is cut out, and two files are written into ``output_folder``:

    * ``<index>_eeg.pt``: float tensor indexed as ``(sample, column)``, NaNs
      replaced by 0 and the sample axis brought to
      ``nb_seconds_eegs * sample_rate_eegs`` entries;
    * ``<index>_classes.pt``: the ``target_cols`` votes of the row divided by
      their sum (all zeros when the row has no vote).

    ``<index>`` is the row's index label in ``input_df``.

    Args:
        input_df: rows providing ``eeg_id``, ``eeg_label_offset_seconds`` and
            the ``target_cols`` columns.
        output_folder: directory receiving the ``.pt`` files.
    """
    expected_nb_samples = int(nb_seconds_eegs * sample_rate_eegs)

    last_eeg_id = None
    last_eeg = None

    # Stream the rows instead of copying them all into a list first; `total`
    # keeps the progress bar able to show completion.
    for i, row in tqdm(input_df.iterrows(), total=len(input_df)):
        eeg_id = row["eeg_id"]

        # Re-read the parquet only when the id differs from the previous row's.
        if eeg_id != last_eeg_id:
            last_eeg = pd.read_parquet(
                join(dataset_path, "train_eegs", f"{eeg_id}.parquet"),
                engine="pyarrow",
            )
            last_eeg_id = eeg_id

        offset_seconds = row["eeg_label_offset_seconds"]
        first_sample = int(sample_rate_eegs * offset_seconds)
        last_sample = int(sample_rate_eegs * (offset_seconds + nb_seconds_eegs))
        sub_eeg = last_eeg.iloc[first_sample:last_sample]

        votes = row[target_cols].astype(float).fillna(0.0)
        classes = votes / votes.sum()

        eeg_t = th.nan_to_num(th.tensor(sub_eeg.to_numpy().T, dtype=th.float))
        # A row whose votes sum to zero yields 0/0 = NaN above; nan_to_num
        # turns those entries into zeros.
        classes_t = th.nan_to_num(th.tensor(classes.to_numpy(), dtype=th.float))

        # Left-pad the sample axis with zeros up to the expected length. The
        # amount has to be (expected - actual): the reverse is negative for a
        # short window, and F.pad treats negative padding as cropping.
        missing_samples = expected_nb_samples - eeg_t.size(1)
        eeg_t = F.pad(eeg_t, (missing_samples, 0)).transpose(0, 1)

        th.save(eeg_t, join(output_folder, f"{i}_eeg.pt"))
        th.save(classes_t, join(output_folder, f"{i}_classes.pt"))

In [None]:
# Write the EEG windows and label tensors of the training split.
extract_eeg(input_df=train_df, output_folder=train_output_folder)

In [None]:
# Write the EEG windows and label tensors of the validation split.
extract_eeg(input_df=valid_df, output_folder=valid_output_folder)