In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
from eeg_list import *
from spectrograms_list import *

In [3]:
def find_outliers_IQR(df, lower_q=0.2, upper_q=0.8, whisker=1.5):
    """Return the entries of ``df`` that fall outside quantile-based fences.

    The fences are ``q_low - whisker * spread`` and ``q_high + whisker * spread``
    where ``spread = q_high - q_low``. Despite the function name, the default
    quantiles are 0.2 and 0.8 rather than the classic quartiles (0.25 / 0.75);
    the defaults are kept so existing callers get unchanged results. Pass
    ``lower_q=0.25, upper_q=0.75`` for a textbook IQR rule.

    Parameters
    ----------
    df : pandas.Series
        Values to screen. NaN entries compare False against both fences and
        are therefore never reported as outliers.
        NOTE(review): with a DataFrame, boolean-frame indexing masks instead of
        filtering, so ``len(result)`` would not be an outlier count.
    lower_q, upper_q : float, optional
        Quantiles (between 0 and 1) that define the reference spread.
    whisker : float, optional
        Multiple of the spread added beyond each quantile to place the fences.

    Returns
    -------
    Same type as ``df``
        Only the entries strictly below the low fence or strictly above the
        high fence; for a Series, ``len()`` of the result is the outlier count.
    """
    q_low = df.quantile(lower_q)
    q_high = df.quantile(upper_q)
    spread = q_high - q_low
    low_fence = q_low - whisker * spread
    high_fence = q_high + whisker * spread
    return df[(df < low_fence) | (df > high_fence)]

In [4]:
# Load the training labels, sorted by eeg_id and then by eeg_sub_id.
train_labels_path = 'train.csv'
df_train = pd.read_csv(train_labels_path).sort_values(by=['eeg_id', 'eeg_sub_id'])

In [5]:
# Per (eeg_id, spectrogram_id) pair, take the maximum of each vote column
# across the pair's rows, then express each maximum as a share of their sum.
vote_cols = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

df_train_max = df_train.groupby(['eeg_id', 'spectrogram_id'])[vote_cols].max()
# Summed before any *_prob column exists, so only the six vote columns count.
df_train_max['total_votes'] = df_train_max.sum(axis=1)

# Row-wise division of every vote column by the total; a zero total yields NaN.
vote_shares = df_train_max[vote_cols].div(df_train_max['total_votes'], axis=0).add_suffix('_prob')
df_train_max = df_train_max.join(vote_shares)

# Keep only the identifiers and the normalized vote shares.
df_train_max_prob = df_train_max.drop(columns=vote_cols + ['total_votes']).reset_index()

In [6]:
def get_eegs_features(filepath: str, eeg_dir: str = '/Users/jeremybellucci/hms-harmful-brain-activity-classification/train_eegs'):
    """Build a one-row DataFrame of summary features for a single EEG file.

    The file ``{eeg_dir}/{filepath}.parquet`` is read, smoothed with a centered
    200-row rolling mean, and down-sampled by keeping every 200th row. The first
    kept row is discarded (its centered window is incomplete, so it holds NaN).

    For each of the 20 channels three features are produced:

    * ``<ch>_outliers``    - number of values flagged by ``find_outliers_IQR``
    * ``<ch>_std``         - standard deviation of the smoothed signal
    * ``<ch>_max_to_mean`` - ``(max - median) / median``. NOTE: despite the
      key name this is relative to the *median*, and it is not guarded against
      a zero or near-zero median, which produces inf/NaN or very large values.

    Parameters
    ----------
    filepath : str
        File stem of the parquet file; it must be convertible with ``int()``
        because it is also stored as the ``eeg_id`` column.
    eeg_dir : str, optional
        Directory containing the EEG parquet files. Defaults to the location
        previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        A single row (index label 0) with ``eeg_id`` followed by all
        ``*_outliers`` columns, then all ``*_std`` columns, then all
        ``*_max_to_mean`` columns, each group in channel order.
    """
    channels = (
        'Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz',
        'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG',
    )

    # Load the raw recording.
    raw_eeg = pq.read_table(source=f'{eeg_dir}/{filepath}.parquet').to_pandas()

    # Smooth, then keep one row per 200 and drop the leading incomplete window.
    eegs_rolling = (
        raw_eeg.rolling(200, center=True).mean()
        .iloc[::200, :]
        .iloc[1:]
        .reset_index(drop=True)
    )

    # Collect each feature family separately so the final column order stays
    # grouped by family (outliers, std, max_to_mean) rather than by channel.
    outlier_counts = {}
    std_devs = {}
    max_ratios = {}
    for channel in channels:
        signal = eegs_rolling[channel]
        median = signal.median()
        outlier_counts[f'{channel}_outliers'] = len(find_outliers_IQR(signal))
        std_devs[f'{channel}_std'] = signal.std()
        max_ratios[f'{channel}_max_to_mean'] = (signal.max() - median) / median

    data = {'eeg_id': int(f'{filepath}'), **outlier_counts, **std_devs, **max_ratios}
    aggregate_eeg_df = pd.DataFrame(data, index=[0])

    return aggregate_eeg_df

In [7]:
def get_spectrograms_features(filepath: str, spectrogram_dir: str = '/Users/jeremybellucci/hms-harmful-brain-activity-classification/train_spectrograms'):
    """Build a one-row DataFrame of summary features for a single spectrogram file.

    The file ``{spectrogram_dir}/{filepath}.parquet`` is read and indexed by its
    ``time`` column. The remaining columns are split positionally into four
    consecutive groups of 100 columns, labelled ``LL``, ``RL``, ``LP`` and ``RP``
    (assumes the file stores the regions in that column order - TODO confirm).
    Each group is averaged across its columns per row and then smoothed with a
    5-row rolling mean (the first four rows of the smoothed series are NaN).

    For each region three features are produced:

    * ``<region>_outliers``    - number of values flagged by ``find_outliers_IQR``
    * ``<region>_stdev``       - standard deviation of the smoothed series
    * ``<region>_max_to_mean`` - ``(max - median) / median``. NOTE: despite the
      key name this is relative to the *median*, and it is not guarded against
      a zero median.

    Parameters
    ----------
    filepath : str
        File stem of the parquet file; it must be convertible with ``int()``
        because it is also stored as the ``spectrogram_id`` column.
    spectrogram_dir : str, optional
        Directory containing the spectrogram parquet files. Defaults to the
        location previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        A single row (index label 0) with ``spectrogram_id`` followed by the
        ``*_outliers``, ``*_stdev`` and ``*_max_to_mean`` columns, each group
        in region order.
    """
    regions = ('LL', 'RL', 'LP', 'RP')
    columns_per_region = 100

    spectrogram = (
        pq.read_table(source=f'{spectrogram_dir}/{filepath}.parquet')
        .to_pandas()
        .set_index('time')
    )

    # Collect each feature family separately so the final column order stays
    # grouped by family rather than by region.
    outlier_counts = {}
    std_devs = {}
    max_ratios = {}
    for position, region in enumerate(regions):
        first_col = position * columns_per_region
        region_mean = spectrogram.iloc[:, first_col:first_col + columns_per_region].mean(axis=1)
        smoothed = region_mean.rolling(window=5).mean()
        median = smoothed.median()
        outlier_counts[f'{region}_outliers'] = len(find_outliers_IQR(smoothed))
        std_devs[f'{region}_stdev'] = smoothed.std()
        max_ratios[f'{region}_max_to_mean'] = (smoothed.max() - median) / median

    data = {'spectrogram_id': int(f'{filepath}'), **outlier_counts, **std_devs, **max_ratios}
    aggregate_spectrograms_df = pd.DataFrame(data, index=[0])
    return aggregate_spectrograms_df

    

In [8]:
# Compute one single-row feature frame per EEG id and stack them row-wise.
# Each row keeps the index label 0 it was created with.
eeg_features = pd.concat(
    [get_eegs_features(eeg_id) for eeg_id in eegs_list],
    axis=0,
)

  EKG_max_to_mean = (eegs_rolling['EKG'].max()-eegs_rolling['EKG'].median())/eegs_rolling['EKG'].median()
  EKG_max_to_mean = (eegs_rolling['EKG'].max()-eegs_rolling['EKG'].median())/eegs_rolling['EKG'].median()
  EKG_max_to_mean = (eegs_rolling['EKG'].max()-eegs_rolling['EKG'].median())/eegs_rolling['EKG'].median()
  EKG_max_to_mean = (eegs_rolling['EKG'].max()-eegs_rolling['EKG'].median())/eegs_rolling['EKG'].median()
  EKG_max_to_mean = (eegs_rolling['EKG'].max()-eegs_rolling['EKG'].median())/eegs_rolling['EKG'].median()
  EKG_max_to_mean = (eegs_rolling['EKG'].max()-eegs_rolling['EKG'].median())/eegs_rolling['EKG'].median()
  EKG_max_to_mean = (eegs_rolling['EKG'].max()-eegs_rolling['EKG'].median())/eegs_rolling['EKG'].median()
  EKG_max_to_mean = (eegs_rolling['EKG'].max()-eegs_rolling['EKG'].median())/eegs_rolling['EKG'].median()
  EKG_max_to_mean = (eegs_rolling['EKG'].max()-eegs_rolling['EKG'].median())/eegs_rolling['EKG'].median()
  EKG_max_to_mean = (eegs_rolling['EKG'].max()

In [9]:
# Show the stacked per-EEG feature table as this cell's output.
eeg_features

Unnamed: 0,eeg_id,Fp1_outliers,F3_outliers,C3_outliers,P3_outliers,F7_outliers,T3_outliers,T5_outliers,O1_outliers,Fz_outliers,...,Pz_max_to_mean,Fp2_max_to_mean,F4_max_to_mean,C4_max_to_mean,P4_max_to_mean,F8_max_to_mean,T4_max_to_mean,T6_max_to_mean,O2_max_to_mean,EKG_max_to_mean
0,1000913311,0,0,0,0,1,0,0,0,0,...,-1.869223,-2.580416,-1523.453446,-1.226949,-1.518767,3.483476,5.435390,-1.598933,-27.362815,5.235462
0,1001369401,9,0,2,7,2,3,2,1,2,...,-2.434021,-3.222587,-2.408051,-1.900738,-2.385460,-2.519266,-3.222739,-2.226997,-2.727960,-9.161048
0,1001487592,0,1,1,0,0,0,0,0,0,...,-2.027988,-2.684916,-0.839318,-0.626944,-1.848068,-1.646958,-2.310089,-1.280538,-2.043150,-1.091277
0,1001717358,24,27,27,27,27,28,28,27,22,...,324.473978,103.936166,674.911811,5100.270514,225.675327,668.537042,-880.647740,-1752.475769,3666.406575,-226.657927
0,1002136740,11,9,13,12,3,7,11,10,8,...,-26.190819,21.464008,-64.387269,-7.744177,-5.817223,3.782720,5.234918,-3.637843,0.839453,-4.925885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,998940214,0,0,0,0,0,0,0,0,0,...,0.755964,0.161797,-0.184185,-0.380675,-0.678613,-0.229830,-0.312230,0.359022,-7.866595,0.118251
0,999199851,0,2,2,1,5,8,4,1,0,...,-25.231063,-3.955125,-2.665632,1.096275,-83.595968,-6.250710,27.934426,-0.906992,-1.040936,-2.596324
0,999294255,36,36,24,20,35,34,29,28,28,...,-1.341687,-14.339193,-1.460394,-1.839169,4.558578,-3.427861,-3.513380,2.732586,-10.991434,-0.903085
0,999550997,3,2,2,5,1,1,1,5,1,...,-3.401555,-6.559460,-4.407385,-0.484864,-0.996008,-32.057153,-40.256524,-2.006368,5.593517,-0.227116


In [10]:
# Compute one single-row feature frame per spectrogram id and stack them row-wise.
# Each row keeps the index label 0 it was created with.
spectrogram_features = pd.concat(
    [get_spectrograms_features(spectrogram_id) for spectrogram_id in spectograms_list],
    axis=0,
)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [11]:
# Show the stacked per-spectrogram feature table as this cell's output.
spectrogram_features

Unnamed: 0,spectrogram_id,LL_outliers,RL_outliers,LP_outliers,RP_outliers,LL_stdev,RL_stdev,LP_stdev,RP_stdev,LL_max_to_mean,RL_max_to_mean,LP_max_to_mean,RP_max_to_mean
0,1000086677,5,5,5,5,133.324420,385.927014,656.199525,261.694599,368.504444,2006.666919,2282.504013,1110.822476
0,1000189855,12,13,12,17,1.387222,4.919758,1.377605,4.929861,9.288273,76.305519,8.130364,91.330677
0,1000317312,6,6,0,4,0.991267,0.812625,0.421770,0.483055,3.408186,1.394807,1.016603,2.115040
0,1000381196,12,10,13,15,51.619232,51.676502,40.974662,44.482686,4.759969,5.740787,8.741669,11.757766
0,1000493950,12,0,12,0,6.182703,0.958663,5.037753,1.270831,16.376974,0.877692,22.026620,1.186319
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,999011233,0,0,0,21,48609.072135,40922.584992,59132.179401,51150.592247,0.130094,0.809837,0.220071,0.202981
0,999095199,25,0,23,3,23.879774,1.090551,19.374999,0.966809,21.108691,0.768754,20.348262,0.931007
0,999320161,32,11,19,0,6.123638,1.316450,6.847023,1.496620,18.639822,4.345064,17.121171,2.641272
0,999431,42,31,39,11,7.222492,5.157118,6.690839,6.101984,6.882167,0.435868,9.695876,0.339724


In [12]:
# Attach the EEG features to the vote shares (left join keyed on eeg_id), then
# attach the spectrogram features (left join keyed on spectrogram_id). Rows
# without a matching feature row are kept and get NaN for those columns.
eeg_lookup = eeg_features.set_index('eeg_id')
spectrogram_lookup = spectrogram_features.set_index('spectrogram_id')

votes_with_eeg = (
    df_train_max_prob.set_index('eeg_id')
    .join(eeg_lookup, how='left', on='eeg_id')
    .reset_index()
)
compiled_df = (
    votes_with_eeg.set_index('spectrogram_id')
    .join(spectrogram_lookup, how='left', on='spectrogram_id')
    .reset_index()
)

In [14]:
# Write the combined vote shares and features to disk without the row index.
features_csv_path = 'features_and_votes.csv'
compiled_df.to_csv(features_csv_path, index=False)