In [1]:
import os
import pandas as pd, numpy as np
from glob import glob
import matplotlib.pyplot as plt
from scipy.stats import kurtosis
VER = 1

In [2]:
# check the reading of one parquet for understanding

BASE_PATH = '/kaggle/input/hms-harmful-brain-activity-classification/'

df = pd.DataFrame({'path': glob(BASE_PATH + '**/*.parquet')})
df['test_type'] = df['path'].str.split('/').str.get(-2).str.split('_').str.get(-1)
df['id'] = df['path'].str.split('/').str.get(-1).str.split('.').str.get(0)

df_eeg = pd.read_parquet(BASE_PATH + 'train_eegs/1000913311.parquet')
df_eeg.head()

Unnamed: 0,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,Cz,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
0,-105.849998,-89.230003,-79.459999,-49.23,-99.730003,-87.769997,-53.330002,-50.740002,-32.25,-42.099998,-43.27,-88.730003,-74.410004,-92.459999,-58.93,-75.739998,-59.470001,8.21,66.489998,1404.930054
1,-85.470001,-75.07,-60.259998,-38.919998,-73.080002,-87.510002,-39.68,-35.630001,-76.839996,-62.740002,-43.040001,-68.629997,-61.689999,-69.32,-35.790001,-58.900002,-41.66,196.190002,230.669998,3402.669922
2,8.84,34.849998,56.43,67.970001,48.099998,25.35,80.25,48.060001,6.72,37.880001,61.0,16.58,55.060001,45.02,70.529999,47.82,72.029999,-67.18,-171.309998,-3565.800049
3,-56.32,-37.279999,-28.1,-2.82,-43.43,-35.049999,3.91,-12.66,8.65,3.83,4.18,-51.900002,-21.889999,-41.330002,-11.58,-27.040001,-11.73,-91.0,-81.190002,-1280.930054
4,-110.139999,-104.519997,-96.879997,-70.25,-111.660004,-114.43,-71.830002,-61.919998,-76.150002,-79.779999,-67.480003,-99.029999,-93.610001,-104.410004,-70.07,-89.25,-77.260002,155.729996,264.850006,4325.370117


In [3]:
# Determine the number of channels
# Assuming each row is a time point and each column is a channel
n_channels = df_eeg.shape[1]
n_channels

20

# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>5 |</span></b> <b>LOAD TRAIN DATA</b></div>

In [4]:
df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
TARGETS = df.columns[-6:]
print('Train shape:', df.shape )
print('Targets', list(TARGETS))
df.head()

Train shape: (106800, 15)
Targets ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


In [5]:
# Creating a Unique EEG Segment per eeg_id:
# The code groups (groupby) the EEG data (df) by eeg_id. Each eeg_id represents a different EEG recording.
# It then picks the first spectrogram_id and the earliest (min) spectrogram_label_offset_seconds for each eeg_id. This helps in identifying the starting point of each EEG segment.
# The resulting DataFrame train has columns spec_id (first spectrogram_id) and min (earliest spectrogram_label_offset_seconds).
train = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg(
    {'spectrogram_id':'first','spectrogram_label_offset_seconds':'min'})
train.columns = ['spec_id','min']


# Finding the Latest Point in Each EEG Segment:
# The code again groups the data by eeg_id and finds the latest (max) spectrogram_label_offset_seconds for each segment.
# This max value is added to the train DataFrame, representing the end point of each EEG segment.
tmp = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg(
    {'spectrogram_label_offset_seconds':'max'})
train['max'] = tmp


tmp = df.groupby('eeg_id')[['patient_id']].agg('first') # The code adds the patient_id for each eeg_id to the train DataFrame. This links each EEG segment to a specific patient.
train['patient_id'] = tmp


tmp = df.groupby('eeg_id')[TARGETS].agg('sum') # The code sums up the target variable counts (like votes for seizure, LPD, etc.) for each eeg_id.
for t in TARGETS:
    train[t] = tmp[t].values
    
y_data = train[TARGETS].values # It then normalizes these counts so that they sum up to 1. This step converts the counts into probabilities, which is a common practice in classification tasks.
y_data = y_data / y_data.sum(axis=1,keepdims=True)
train[TARGETS] = y_data

tmp = df.groupby('eeg_id')[['expert_consensus']].agg('first') # For each eeg_id, the code includes the expert_consensus on the EEG segment's classification.
train['target'] = tmp

train = train.reset_index() # This makes eeg_id a regular column, making the DataFrame easier to work with.
print('Train non-overlapp eeg_id shape:', train.shape )
train.head()

Train non-overlapp eeg_id shape: (17089, 12)


Unnamed: 0,eeg_id,spec_id,min,max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,568657,789577333,0.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,0.583333,Other
1,582999,1552638400,0.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429,LPD
2,642382,14960202,1008.0,1032.0,5955,0.0,0.0,0.0,0.0,0.0,1.0,Other
3,751790,618728447,908.0,908.0,38549,0.0,0.0,1.0,0.0,0.0,0.0,GPD
4,778705,52296320,0.0,0.0,40955,0.0,0.0,0.0,0.0,0.0,1.0,Other


# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>7 |</span></b> <b>FEATURE ENGINEERING</b></div>



In [6]:
READ_SPEC_FILES = False # If READ_SPEC_FILES is False, the code reads the combined file instead of individual files.
FEATURE_ENGINEER = True
READ_EEG_SPEC_FILES = False

In [7]:
%%time
# READ ALL SPECTROGRAMS
PATH = '/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/'
files = os.listdir(PATH)
print(f'There are {len(files)} spectrogram parquets')

if READ_SPEC_FILES:    
    spectrograms = {}
    for i,f in enumerate(files):
        if i%100==0: print(i,', ',end='')
        tmp = pd.read_parquet(f'{PATH}{f}')
        name = int(f.split('.')[0])
        spectrograms[name] = tmp.iloc[:,1:].values
else:
    spectrograms = np.load('/kaggle/input/brain-spectrograms/specs.npy',allow_pickle=True).item()

There are 11138 spectrogram parquets
CPU times: user 5.58 s, sys: 12.7 s, total: 18.3 s
Wall time: 1min 9s


In [8]:
# %%time
# # READ ALL EEG SPECTROGRAMS
# if READ_EEG_SPEC_FILES:
#     all_eegs = {}
#     for i,e in enumerate(train.eeg_id.values):
#         if i%100==0: print(i,', ',end='')
#         x = np.load(f'/kaggle/input/brain-eeg-spectrograms/EEG_Spectrograms/{e}.npy')
#         all_eegs[e] = x
# else:
#     all_eegs = np.load('/kaggle/input/brain-eeg-spectrograms/eeg_specs.npy',allow_pickle=True).item()

In [9]:
%time
# ENGINEER FEATURES
import warnings
# 20 (~21) sec windows optimal - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9739775/
# ~90 sec windows good - https://onelab-eg.com/wp-content/uploads/2019/05/MOCAST_2019_paper_37.pdf
warnings.filterwarnings('ignore')

SPEC_COLS = pd.read_parquet(f'{PATH}1000086677.parquet').columns[1:]

FEATURES = [f'{c}_mean_10m' for c in SPEC_COLS]
FEATURES += [f'{c}_mean_20s' for c in SPEC_COLS]

print(f'We are creating {len(FEATURES)} features for {len(train)} rows... ',end='')

# A data matrix data is initialized to store the new features for each eeg_id in the train DataFrame.
# For each row in train, the code calculates the mean and minimum values within the specified 10-minute and 20-second windows.
# These calculated values are then stored in the data matrix.
# Finally, the matrix is added to the train DataFrame as new columns.

if FEATURE_ENGINEER:
    data = np.zeros((len(train),len(FEATURES)))
    for k in range(len(train)):
        if k%100==0: print(k,', ',end='')
        row = train.iloc[k]
        r = int( (row['min'] + row['max'])//4 ) 
        
        # each offset is the start of (potentially overlapping) 10min subsample of a
        # spectogram / eeg pair with unique label row. 
        # e.g. row['min'] -> row['min'] + 10 min is a subsample of EEG that has some annotated portion with labels in the row, same with row['max'] + 10min
        # since test set has no overlapping subsamples, we just get 1 set of features
        # per unique EEG_id
        # we take the midpoint of the largest and smallest offset for that EEG_ID
        # thereby centering the 10min window within all the subsamples of the EEG
        # since time in the spectogram rows is t=2,4,6,... (multiples of two)
        # we divide the midpoint by 2 to get the right start row 'r'.
        # then logically r+300 is end of 300*2 = 600 sec. and 20sec window is centered too.
        
        # 10 MINUTE WINDOW FEATURES (MEANS and MINS) - Just 1 window in middle of spectogram
        x = np.nanmean(spectrograms[row.spec_id][r:r+300,:],axis=0)
        data[k,:400] = x
        
        # 20 SECOND WINDOW FEATURES (MEANS and MINS) - Just 1 window in middle of spectogram
        x = np.nanmean(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
        data[k,400:800] = x
 
    train[FEATURES] = data
else:
    train = pd.read_parquet('/kaggle/input/brain-spectrograms/train.pqt')
print()
print('New train shape:',train.shape)

CPU times: user 1 µs, sys: 3 µs, total: 4 µs
Wall time: 9.3 µs
We are creating 800 features for 17089 rows... 0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 2200 , 2300 , 2400 , 2500 , 2600 , 2700 , 2800 , 2900 , 3000 , 3100 , 3200 , 3300 , 3400 , 3500 , 3600 , 3700 , 3800 , 3900 , 4000 , 4100 , 4200 , 4300 , 4400 , 4500 , 4600 , 4700 , 4800 , 4900 , 5000 , 5100 , 5200 , 5300 , 5400 , 5500 , 5600 , 5700 , 5800 , 5900 , 6000 , 6100 , 6200 , 6300 , 6400 , 6500 , 6600 , 6700 , 6800 , 6900 , 7000 , 7100 , 7200 , 7300 , 7400 , 7500 , 7600 , 7700 , 7800 , 7900 , 8000 , 8100 , 8200 , 8300 , 8400 , 8500 , 8600 , 8700 , 8800 , 8900 , 9000 , 9100 , 9200 , 9300 , 9400 , 9500 , 9600 , 9700 , 9800 , 9900 , 10000 , 10100 , 10200 , 10300 , 10400 , 10500 , 10600 , 10700 , 10800 , 10900 , 11000 , 11100 , 11200 , 11300 , 11400 , 11500 , 11600 , 11700 , 11800 , 11900 , 12000 , 12100 , 12200 , 12300 , 12400 , 12

In [10]:
from scipy import signal
from scipy.stats import skew, kurtosis

In [11]:
def extract_frequency_band_features(segment):
    # Define EEG frequency bands
    cols = pd.read_parquet(f'{PATH}1000086677.parquet').columns[1:] # like LR_14.32
    channel_groups = ['LL', 'RL', 'LP', 'RP']
    
    eeg_bands = {'Delta': (0.5, 4), 'Theta': (4, 8), 'Alpha': (8, 12), 'Beta': (12, 30), 'Gamma': (30, 45)}
    band_features = []
    
    for channel_group in channel_groups:
        for band in eeg_bands:
            low, high = eeg_bands[band]
            # Filter signal for the specific band
            idxs = []
            for idx, col in enumerate(cols):
                if channel_group in col and float(col.split("_")[1]) <= high and float(col.split("_")[1]) >= low:
                    idxs.append(idx)
                        
            filtered = segment[:, idxs].flatten()

            mean_val = np.nanmean(filtered) if len(filtered) > 0 else 0
            variance_val = np.nanvar(filtered) if len(filtered) > 0 else 0
            skewness_val = skew(filtered, nan_policy='omit') if len(filtered) > 0 else 0
            kurtosis_val = kurtosis(filtered, nan_policy='omit') if len(filtered) > 0 else 0
            # Extend the band_features list with the computed statistics
            band_features.extend([mean_val, variance_val, skewness_val, kurtosis_val])
        
    return band_features

In [12]:
import time
from sklearn.impute import SimpleImputer

# Initialize an array for original features
num_rows = len(train)
num_activity_bands = 5
NEW_FEATURES = np.array([[f"mean_{a}_{c}", f"variance_{a}_{c}", f"skewness_{a}_{c}", f"kurtosis_{a}_{c}"] for a in range(num_activity_bands) for c in ['LL', 'RL', 'LP', 'RP']]).flatten().tolist()
num_features = len(NEW_FEATURES)

data_original = np.zeros((num_rows, num_features))
print(data_original.shape)
num_activity_bands = 5

print("Starting feature extraction...")
start_time = time.time()
print(f"Processing {num_rows} rows. Done: ", end = ' ')
for k in range(num_rows):
    if k % 100 == 0:
        print(f"{k}...", end=' ')

    row = train.iloc[k]
    r = int((row['min'] + row['max']) // 4)
    eeg_segment = spectrograms[row.spec_id][r:r+300, :]
    all_channel_features = extract_frequency_band_features(eeg_segment)
    data_original[k, :] = all_channel_features
    
train[NEW_FEATURES] = data_original

FEATURES += NEW_FEATURES

# Measure total processing time
total_time = time.time() - start_time
print(f"Total processing time: {total_time:.2f} seconds.")

(17089, 80)
Starting feature extraction...
Processing 17089 rows. Done:  0... 100... 200... 300... 400... 500... 600... 700... 800... 900... 1000... 1100... 1200... 1300... 1400... 1500... 1600... 1700... 1800... 1900... 2000... 2100... 2200... 2300... 2400... 2500... 2600... 2700... 2800... 2900... 3000... 3100... 3200... 3300... 3400... 3500... 3600... 3700... 3800... 3900... 4000... 4100... 4200... 4300... 4400... 4500... 4600... 4700... 4800... 4900... 5000... 5100... 5200... 5300... 5400... 5500... 5600... 5700... 5800... 5900... 6000... 6100... 6200... 6300... 6400... 6500... 6600... 6700... 6800... 6900... 7000... 7100... 7200... 7300... 7400... 7500... 7600... 7700... 7800... 7900... 8000... 8100... 8200... 8300... 8400... 8500... 8600... 8700... 8800... 8900... 9000... 9100... 9200... 9300... 9400... 9500... 9600... 9700... 9800... 9900... 10000... 10100... 10200... 10300... 10400... 10500... 10600... 10700... 10800... 10900... 11000... 11100... 11200... 11300... 11400... 1150

In [13]:
train.head()

Unnamed: 0,eeg_id,spec_id,min,max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,...,skewness_4_RL,kurtosis_4_RL,mean_4_LP,variance_4_LP,skewness_4_LP,kurtosis_4_LP,mean_4_RP,variance_4_RP,skewness_4_RP,kurtosis_4_RP
0,568657,789577333,0.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,...,13.148849,180.780941,9.91516,6098.298,13.597351,197.133545,0.0,0.0,0.0,0.0
1,582999,1552638400,0.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,...,2.504685,8.136285,0.104415,0.007555665,5.473937,61.608337,0.0,0.0,0.0,0.0
2,642382,14960202,1008.0,1032.0,5955,0.0,0.0,0.0,0.0,0.0,...,9.847957,124.031875,1.216096,38.87291,11.857481,197.179486,0.0,0.0,0.0,0.0
3,751790,618728447,908.0,908.0,38549,0.0,0.0,1.0,0.0,0.0,...,1.812118,4.167706,0.22203,0.04527618,2.316767,7.397173,0.0,0.0,0.0,0.0
4,778705,52296320,0.0,0.0,40955,0.0,0.0,0.0,0.0,0.0,...,19.939571,443.986533,246.690369,13091870.0,17.660602,346.660055,0.0,0.0,0.0,0.0


In [14]:
from sklearn.preprocessing import StandardScaler

# Columns to be excluded from scaling
excluded_columns = ['eeg_id', 'spec_id', 'min', 'max', 'patient_id', 'seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote','target']

# Save the columns to be excluded
excluded_data = train[excluded_columns]

# DataFrame with only the columns to be scaled
features = train.drop(columns=excluded_columns)

# Initialize the StandardScaler
train_scaler = StandardScaler()

# Fit the scaler to the features and transform them
features_scaled = train_scaler.fit_transform(features)

# Create a DataFrame from the scaled features
features_scaled_df = pd.DataFrame(features_scaled, columns=features.columns)

# Concatenate the scaled features with the excluded columns
train_scaled_df = pd.concat([excluded_data.reset_index(drop=True),features_scaled_df,], axis=1)
# train_scaled_df.to_csv("/kaggle/working/")
train_scaled_df 


Unnamed: 0,eeg_id,spec_id,min,max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,...,skewness_4_RL,kurtosis_4_RL,mean_4_LP,variance_4_LP,skewness_4_LP,kurtosis_4_LP,mean_4_RP,variance_4_RP,skewness_4_RP,kurtosis_4_RP
0,568657,789577333,0.0,16.0,20654,0.0,0.000000,0.25,0.000000,0.166667,...,1.891628,1.320005,-0.068564,-0.033882,1.724362,1.052575,0.0,0.0,0.0,0.0
1,582999,1552638400,0.0,38.0,20230,0.0,0.857143,0.00,0.071429,0.000000,...,-0.494266,-0.464257,-0.068904,-0.033882,0.018945,-0.045464,0.0,0.0,0.0,0.0
2,642382,14960202,1008.0,1032.0,5955,0.0,0.000000,0.00,0.000000,0.000000,...,1.151732,0.733510,-0.068866,-0.033882,1.359096,1.052947,0.0,0.0,0.0,0.0
3,751790,618728447,908.0,908.0,38549,0.0,0.000000,1.00,0.000000,0.000000,...,-0.649505,-0.505272,-0.068900,-0.033882,-0.643866,-0.484689,0.0,0.0,0.0,0.0
4,778705,52296320,0.0,0.0,40955,0.0,0.000000,0.00,0.000000,0.000000,...,3.413772,4.040204,-0.060340,-0.033817,2.577395,2.264055,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17084,4293354003,1188113564,0.0,0.0,16610,0.0,0.000000,0.00,0.000000,0.500000,...,-0.496613,-0.439186,-0.068907,-0.033882,-0.782069,-0.506829,0.0,0.0,0.0,0.0
17085,4293843368,1549502620,0.0,0.0,15065,0.0,0.000000,0.00,0.000000,0.500000,...,-0.776277,-0.521626,-0.068882,-0.033882,-0.762586,-0.509376,0.0,0.0,0.0,0.0
17086,4294455489,2105480289,0.0,0.0,56,0.0,0.000000,0.00,0.000000,0.000000,...,-0.426913,-0.482858,-0.062701,-0.033880,-0.552097,-0.497519,0.0,0.0,0.0,0.0
17087,4294858825,657299228,0.0,12.0,4312,0.0,0.000000,0.00,0.000000,0.066667,...,2.670076,2.510738,-0.057313,-0.033774,2.400818,2.082669,0.0,0.0,0.0,0.0


In [15]:
import xgboost as xgb
from sklearn.svm import SVC
import gc
from sklearn.model_selection import KFold, GroupKFold
import pickle
from sklearn.multioutput import MultiOutputRegressor
from scipy.special import rel_entr

In [16]:
len(FEATURES)

880

In [17]:
all_oof = []
all_true = []
TARS = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}
n_splits = 5
gkf = GroupKFold(n_splits=5)
for i, (train_index, valid_index) in enumerate(gkf.split(train_scaled_df, train_scaled_df.target, train_scaled_df.patient_id)):   
    if i >= n_splits:
        continue
    print('#'*25)
    print(f'### Fold {i+1}')
    print(f'### train size {len(train_index)}, valid size {len(valid_index)}')
    print('#'*25)
    
    # Instantiate the XGBRegressor model
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', learning_rate = 0.1) # uses MSE to predict probabilities

    model = MultiOutputRegressor(xgb_model) # since we have multiple outputs
    
#     model = SVC(probability=True)    
    LABEL_NAMES = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
    # Prepare training and validation data
    X_train = train_scaled_df.loc[train_index, FEATURES]
    y_train = train_scaled_df.loc[train_index, LABEL_NAMES]
    X_valid = train_scaled_df.loc[valid_index, FEATURES]
    y_valid = train_scaled_df.loc[valid_index, LABEL_NAMES]
    model.fit(X_train, y_train, verbose=True,) 

    with open(f'XGBoost_f{i}.pkl', 'wb') as f:
        pickle.dump(model, f)

    y_pred = model.predict(X_valid)
    y_pred[y_pred < 0] = 0
    oof = y_pred / np.sum(y_pred, axis=1).reshape(-1,1) # ensure they sum to 1
    true = y_valid.values
    kl_divergence = np.mean(np.sum(true * (np.log(true + 1e-10) - np.log(oof + 1e-10)), axis=1))
    print(f"Kale Divergence: {kl_divergence}")
    
    all_oof.append(oof)
    all_true.append(true)
    
    del X_train, y_train, X_valid, y_valid, oof
    gc.collect()
    
all_oof = np.concatenate(all_oof)
all_true = np.concatenate(all_true)

#########################
### Fold 1
### train size 13671, valid size 3418
#########################
Kale Divergence: 0.9283122893993105
#########################
### Fold 2
### train size 13671, valid size 3418
#########################
Kale Divergence: 1.0583618993237998
#########################
### Fold 3
### train size 13671, valid size 3418
#########################
Kale Divergence: 0.9619105391463775
#########################
### Fold 4
### train size 13671, valid size 3418
#########################
Kale Divergence: 1.0230853294821822
#########################
### Fold 5
### train size 13672, valid size 3417
#########################
Kale Divergence: 1.0027364704623336


# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>11 |</span></b> <b>INFER TEST</b></div>

In [18]:
test = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/test.csv')
print('Test shape',test.shape)
test.head()

Test shape (1, 3)


Unnamed: 0,spectrogram_id,eeg_id,patient_id
0,853520,3911565283,6885


In [19]:
%%time
# READ ALL TEST SPECTROGRAMS
PATH2 = '/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/'
files = os.listdir(PATH2)
print(f'There are {len(files)} spectrogram parquets')

spectrograms_test = {}
for i,f in enumerate(files):
    if i%100==0: print(i,', ',end='')
    tmp = pd.read_parquet(f'{PATH2}{f}')
    name = int(f.split('.')[0])
    spectrograms_test[name] = tmp.iloc[:,1:].values


There are 1 spectrogram parquets
0 , CPU times: user 70.1 ms, sys: 6.05 ms, total: 76.1 ms
Wall time: 76.1 ms


In [20]:
%time
# ENGINEER FEATURES
import warnings
warnings.filterwarnings('ignore')

# The code generates features from the spectrogram data for use in a model 
# The features are derived by calculating the mean and minimum values over time for each of the 400 spectrogram frequencies.
# Two types of windows are used for these calculations:
# A 10-minute window (_mean_10m, _min_10m).
# A 20-second window (_mean_20s, _min_20s).
# This process results in 1600 features (400 features × 4 calculations) for each EEG ID.

SPEC_COLS = pd.read_parquet(f'{PATH}1000086677.parquet').columns[1:]
TEST_FEATURES = [f'{c}_mean_10m' for c in SPEC_COLS]
TEST_FEATURES += [f'{c}_mean_20s' for c in SPEC_COLS]
print(f'We are creating {len(TEST_FEATURES)} features for {len(test)} rows... ',end='')


# A data matrix data is initialized to store the new features for each eeg_id in the train DataFrame.
# For each row in train, the code calculates the mean and minimum values within the specified 10-minute and 20-second windows.
# These calculated values are then stored in the data matrix.
# Finally, the matrix is added to the train DataFrame as new columns.

data = np.zeros((len(test),len(TEST_FEATURES)))
for k in range(len(test)):
    if k%100==0: print(k,', ',end='')
    row = test.iloc[k]  
    
    s = int( row.spectrogram_id )
    spec = pd.read_parquet(f'{PATH2}{s}.parquet')
        
    # 10 MINUTE WINDOW FEATURES (MEANS and MINS) - Just 1 window in middle of spectogram
    x = np.nanmean(spec.iloc[:,1:].values, axis=0)
    data[k,:400] = x

    # 20 SECOND WINDOW FEATURES (MEANS and MINS) - Just 1 window in middle of spectogram
    x = np.nanmean(spec.iloc[145:155,1:].values,axis=0)
    data[k,400:800] = x

    test[TEST_FEATURES] = data

    
print('New test shape:',test.shape)

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 13.1 µs
We are creating 800 features for 1 rows... 0 , New test shape: (1, 803)


In [21]:


# Initialize an array for original features
num_rows = len(test)
num_activity_bands = 5
NEW_FEATURES = np.array([[f"mean_{a}_{c}", f"variance_{a}_{c}", f"skewness_{a}_{c}", f"kurtosis_{a}_{c}"] for a in range(num_activity_bands) for c in ['LL', 'RL', 'LP', 'RP']]).flatten().tolist()

num_features = len(NEW_FEATURES)
data_original = np.zeros((num_rows, num_features))

print("Starting feature extraction and PCA processing...")
start_time = time.time()

for k in range(num_rows):
    if k % 1000 == 0:
        print(f"Processing row {k} of {num_rows}...")

    row = test.iloc[k]
#    r = int((row['min'] + row['max']) // 4)
#     eeg_segment = spectrograms_test[853520][r:r+300, :]
    s = int( row.spectrogram_id )
    spec = pd.read_parquet(f'{PATH2}{s}.parquet')
    eeg_segment = spec.iloc[:,1:].values
    
    # Apply the feature extraction function to each EEG channel
    all_channel_features = extract_frequency_band_features(eeg_segment)
    data_original[k, :] = all_channel_features

test[NEW_FEATURES] = data_original

# Measure total processing time
total_time = time.time() - start_time
print(f"Total processing time: {total_time:.2f} seconds.")
TEST_FEATURES += NEW_FEATURES
test.head()

Starting feature extraction and PCA processing...
Processing row 0 of 1...
Total processing time: 0.19 seconds.


Unnamed: 0,spectrogram_id,eeg_id,patient_id,LL_0.59_mean_10m,LL_0.78_mean_10m,LL_0.98_mean_10m,LL_1.17_mean_10m,LL_1.37_mean_10m,LL_1.56_mean_10m,LL_1.76_mean_10m,...,skewness_4_RL,kurtosis_4_RL,mean_4_LP,variance_4_LP,skewness_4_LP,kurtosis_4_LP,mean_4_RP,variance_4_RP,skewness_4_RP,kurtosis_4_RP
0,853520,3911565283,6885,16.864132,19.120565,18.342468,13.408634,8.0575,4.890133,3.460633,...,4.457234,30.945171,0.104872,0.00951,4.537331,29.30775,0.0,0.0,0.0,0.0


In [22]:

# Columns to be excluded from scaling
excluded_columns = ['eeg_id', 'spectrogram_id', 'patient_id']

# Save the columns to be excluded
excluded_data = test[excluded_columns]

# DataFrame with only the columns to be scaled
features = test.drop(columns=excluded_columns)

# Fit the scaler to the features and transform them
features_scaled = train_scaler.transform(features)

# Create a DataFrame from the scaled features
features_scaled_df = pd.DataFrame(features_scaled, columns=features.columns)

# Concatenate the scaled features with the excluded columns
test_scaled_df = pd.concat([excluded_data.reset_index(drop=True),features_scaled_df,], axis=1)
test_scaled_df 


Unnamed: 0,eeg_id,spectrogram_id,patient_id,LL_0.59_mean_10m,LL_0.78_mean_10m,LL_0.98_mean_10m,LL_1.17_mean_10m,LL_1.37_mean_10m,LL_1.56_mean_10m,LL_1.76_mean_10m,...,skewness_4_RL,kurtosis_4_RL,mean_4_LP,variance_4_LP,skewness_4_LP,kurtosis_4_LP,mean_4_RP,variance_4_RP,skewness_4_RP,kurtosis_4_RP
0,3911565283,853520,6885,-0.119342,-0.123135,-0.120159,-0.124161,-0.120451,-0.117668,-0.116764,...,-0.056601,-0.22853,-0.068904,-0.033882,-0.177685,-0.307167,0.0,0.0,0.0,0.0


In [23]:
# INFER XGBOOST ON TEST
preds = []

for i in range(n_splits):
    print(i, ', ', end='')
    
    # Load the XGBoost model
    with open(f'XGBoost_f{i}.pkl', 'rb') as f:
        model = pickle.load(f)
    
    # Make predictions
    test_data_scaled = test_scaled_df[TEST_FEATURES]
    
    # data_imputed = imputer.fit_transform(test_data_scaled)
    
    pred = model.predict(test_data_scaled)
    pred[pred < 0] = 0
    pred = pred / np.sum(pred, axis=1).reshape(-1,1)
    preds.append(pred) 

# Average the predictions from each fold
pred = np.mean(preds, axis=0)
print('Test preds shape', pred.shape)

0 , 1 , 2 , 3 , 4 , Test preds shape (1, 6)


# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>12 |</span></b> <b>SUBMISSION</b></div>

In [24]:
sub = pd.DataFrame({'eeg_id':test.eeg_id.values})
sub[LABEL_NAMES] = pred
sub.to_csv('submission.csv',index=False)
print('Submission shape',sub.shape)
sub.head()

Submission shape (1, 7)


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.052125,0.04074,0.002776,0.280364,0.016862,0.607133


In [25]:
# SANITY CHECK TO CONFIRM PREDICTIONS SUM TO ONE
sub.iloc[:,-6:].sum(axis=1)

0    1.0
dtype: float32