In [1]:
import numpy as np
import pandas as pd

In [2]:
PATH = './hms-train/'
target_col = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
train_df = pd.read_csv(f'{PATH}train.csv')
spec = np.load(f'{PATH}specs.npy', allow_pickle=True).item()

In [3]:
train = train_df.groupby('eeg_id')[['spectrogram_id', 'patient_id']].agg('first')
min = train_df.groupby('eeg_id')[['spectrogram_label_offset_seconds']].agg('min')
train['min'] = min
max = train_df.groupby('eeg_id')[['spectrogram_label_offset_seconds']].agg('max')
train['max'] = max
tmp = train_df.groupby('eeg_id')[target_col].agg('sum')
tmp = tmp.div(tmp.sum(axis=1), axis=0) 
train = pd.concat([train, tmp], axis=1)
tmp = train_df.groupby('eeg_id')['expert_consensus'].agg('first')
train['label'] = tmp
train = train.reset_index()

In [4]:
data = np.zeros((train.shape[0], 400*3))
for i, row in train.iterrows():
    s= spec[row['spectrogram_id']]
    r = int((row['max'] - row["min"]) // 4)
    mean = np.nanmean(s[r:r+300, :], axis=0)
    min_val = np.nanmin(s[r:r+300, :], axis=0)
    max_val = np.nanmax(s[r:r+300, :], axis=0)
    data[i, :400] = mean
    data[i, 400:800] = min_val
    data[i, 800:1200] = max_val

In [5]:
data.shape

(17089, 1200)

In [7]:
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, log_loss
from scipy.stats import entropy

In [8]:
le_fit = LabelEncoder().fit(train.label)
label_classes = le_fit.classes_
tar = le_fit.transform(train.label)

In [9]:
label_classes

array(['GPD', 'GRDA', 'LPD', 'LRDA', 'Other', 'Seizure'], dtype=object)

In [10]:
gkf = GroupKFold(n_splits=5)

params = {
    'objective': 'multiclass',
    'num_class': 6,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    # 'num_leaves': 31,
    # 'learning_rate': 0.05,
    # 'feature_fraction': 0.9,
    # 'bagging_fraction': 0.8,
    # 'bagging_freq': 5,
    'verbose': 0
}

for i, (train_index, valid_index) in enumerate(gkf.split(train, train.label, train.patient_id)):
    print('-'*10)
    print(f'Fold {i+1}:')
    print('-'*10)

    X_train, X_val = data[train_index], data[valid_index]
    y_train, y_val = tar[train_index], tar[valid_index]
    y_true_prob = train.loc[valid_index, target_col].values

    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    model = lgb.train(params, train_set, valid_sets=[val_set])

    y_pred_prob = model.predict(X_val)
    
    acc = accuracy_score(y_val, np.argmax(y_pred_prob, axis=1))
    print(f'Validation Accuracy Score: {acc:.6f}')

    # ll = log_loss(y_val, y_pred_prob)
    # print(f'Validation Log Loss: {ll:.6f}')

    kl_div = entropy(y_true_prob, y_pred_prob)
    print(f'Validation KL Divergence Score: {np.mean(kl_div):.6f}')

----------
Fold 1:
----------


Validation Accuracy Score: 0.510825
Validation KL Divergence Score: 2.353173
----------
Fold 2:
----------
Validation Accuracy Score: 0.518139
Validation KL Divergence Score: 2.471000
----------
Fold 3:
----------
Validation Accuracy Score: 0.555881
Validation KL Divergence Score: 2.544173
----------
Fold 4:
----------
Validation Accuracy Score: 0.505559
Validation KL Divergence Score: 2.552462
----------
Fold 5:
----------
Validation Accuracy Score: 0.532924
Validation KL Divergence Score: 2.284292
