In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## DATA LOAD

In [None]:
# Remember the kernel's working directory. The data files below are opened with
# relative paths, so they resolve against this directory.
# NOTE(review): cur_dir is not referenced in the cells visible here — confirm it is used later.
cur_dir = os.getcwd()

In [None]:
# Load the three competition tables in one pass: training labels, test
# metadata and the sample submission template (read in that order).
train_df, test_df, sub_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## EDA

In [None]:
# Row counts are gathered up front; the previews below are shown in the same
# order as before (train head/count/describe, then test, then submission).
n_train, n_test, n_sub = (len(frame) for frame in (train_df, test_df, sub_df))

display(train_df.head())
print(f'Train rows: {n_train}')
display(train_df.describe())

display(test_df.head())
print(f'Test rows: {n_test}')

display(sub_df.head())
print(f'Sub rows: {n_sub}')

In [None]:
# Inspect one raw EEG recording next to the label rows that reference it.
sample_train_eeg_id = '568657'
sample_train_eeg = pd.read_parquet(f'train_eegs/{sample_train_eeg_id}.parquet')

display(sample_train_eeg.head())
print(sample_train_eeg.shape[0])  # number of rows in the recording

# Cell output: every train_df row whose eeg_id matches the sample recording.
is_sample_eeg = train_df['eeg_id'].eq(int(sample_train_eeg_id))
train_df.loc[is_sample_eeg]

In [None]:
# Count the directory entries available for each raw data source.
train_eeg_dir, train_spec_dir = 'train_eegs', 'train_spectrograms'

n_eeg_items = len(os.listdir(train_eeg_dir))
print(f'Items in train_eegs: {n_eeg_items}')

n_spec_items = len(os.listdir(train_spec_dir))
print(f'Items in train_spectrograms: {n_spec_items}')

In [None]:
# For each eeg_id, count the distinct patients, spectrograms and consensus labels.
per_eeg_counts = (
    train_df
    .groupby('eeg_id')[['patient_id', 'spectrogram_id', 'expert_consensus']]
    .nunique()
)

# Cell output: the distinct values taken by "number of consensus labels per eeg_id".
per_eeg_counts['expert_consensus'].unique()

In [None]:
# For each spectrogram_id, count the distinct EEGs, patients and consensus labels.
train_df.groupby('spectrogram_id').agg(
    eeg_id=('eeg_id', 'nunique'),
    patient_id=('patient_id', 'nunique'),
    expert_consensus=('expert_consensus', 'nunique'),
)

In [None]:
# Number of distinct eeg_id values (dropna=False keeps parity with len(unique())).
print(train_df['eeg_id'].nunique(dropna=False))

In [None]:
# Channels to plot: every column of one reference recording.
# NOTE(review): assumes all recordings share this recording's columns — confirm.
# An earlier hand-picked subset was assigned here and immediately overwritten;
# to plot only a subset, assign a list of column names instead,
# e.g. ['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3'].
features = pd.read_parquet('train_eegs/1628180742.parquet').columns

def parquet_to_numpy(eeg_row_id, features, sample_rate=200, window_seconds=50):
    """Load the EEG window referenced by one row of ``train_df`` as an array.

    The row's ``eeg_label_offset_seconds`` gives the start of the window inside
    the recording ``train_eegs/<eeg_id>.parquet``.

    Parameters
    ----------
    eeg_row_id : int
        Positional (``iloc``) index of the row in the module-level ``train_df``.
    features : sequence of str
        Column labels to keep, in the order they should appear in the output.
    sample_rate : int, default 200
        Samples per second, used to convert seconds to row positions.
    window_seconds : int or float, default 50
        Length of the window to extract, in seconds.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, len(features))``. ``n_samples`` is smaller
        than ``window_seconds * sample_rate`` when the recording ends before
        the window does, because positional slicing truncates silently.
    """
    # Look the label row up once instead of once per field.
    label_row = train_df.iloc[eeg_row_id]
    offset_seconds = float(label_row['eeg_label_offset_seconds'])
    eeg_id = int(label_row['eeg_id'])

    # Convert to samples before truncating so a fractional offset is not
    # floored to a whole second first.
    start = int(round(offset_seconds * sample_rate))
    stop = start + int(round(window_seconds * sample_rate))

    eeg_df = pd.read_parquet(f'train_eegs/{eeg_id}.parquet')
    return eeg_df.iloc[start:stop][features].to_numpy()

def plot_sample(row, ax):
    """Draw the EEG window of one ``train_df`` row as vertically stacked traces.

    Each channel in the module-level ``features`` is plotted on ``ax``, shifted
    down by a multiple of a common spacing so the traces do not overlap. The
    y-ticks are labelled with the channel names, and the row's vote counts are
    written to the right of the axes.

    Parameters
    ----------
    row : int
        Positional (``iloc``) index of the row in ``train_df``.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    """
    eeg_data = parquet_to_numpy(row, features)

    # Measure the amplitude span on finite samples only. A single NaN would
    # otherwise turn min()/max(), and therefore the spacing, every trace and
    # every tick, into NaN, leaving the plot blank.
    finite_values = eeg_data[np.isfinite(eeg_data)]
    dy = (finite_values.min() - finite_values.max()) * 0.7 if finite_values.size else 0.0
    if dy == 0:
        # Flat or empty window: fall back to unit spacing so traces stay apart.
        dy = -1.0

    # dy is negative, so channel i sits i steps below channel 0.
    tics = [dy * i for i in range(len(features))]
    ax.set_yticks(tics, labels=features)

    sample_axis = np.arange(eeg_data.shape[0])
    for i in range(eeg_data.shape[1]):
        ax.plot(sample_axis, eeg_data[:, i] + i * dy)

    annotation_text = f"{train_df.iloc[row][['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']]}"
    ax.annotate(annotation_text, xy=(1.05, 0.5), xycoords='axes fraction', fontsize=10, ha='left', va='center')


fig, axs = plt.subplots(3, 2, figsize=(20,25))

axs = axs.flatten()

# Draw one distinct train_df row per panel. The fixed seed makes the figure
# reproducible under Restart & Run All (change it to browse other samples),
# and replace=False prevents the same row from appearing in two panels.
sample_rng = np.random.default_rng(42)
sample_rows = sample_rng.choice(len(train_df), size=axs.size, replace=False)

for i, ax in zip(sample_rows, axs):
    plot_sample(i, ax)

plt.tight_layout()
plt.show()