In [1]:
# Change the working directory so that the relative paths and project-local
# imports used below resolve (presumably this is the repository root - confirm).
# NOTE(review): the absolute path is specific to one machine; the relative
# alternative is kept commented out for use elsewhere.
# %cd ..
%cd /home/janneke/Documents/Master/Machine_Learning_in_Practice/HMS/MLiP_group_10_task1_HMS/

/home/janneke/Documents/Master/Machine_Learning_in_Practice/HMS/MLiP_group_10_task1_HMS


In [2]:
from copy import deepcopy

import pandas as pd
import numpy as np
from logging import getLogger, basicConfig, INFO
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
import pickle

from datasets.raw_data_loader import CustomRawDataset
from datasets.data_loader_configs import BaseDataConfig
from utils.evaluation_utils import score_kl_divergence
from utils.feature_extraction_utils import get_hfda

In [3]:
# Emit INFO-level (and above) log records, then create the data-loading
# configuration and the logger used by this notebook.
basicConfig(level=INFO)
config = BaseDataConfig()
logger = getLogger('main')

In [4]:
# Standardize the continuous features before fitting when True.
normalize = True
# Feature families requested from the dataset loader; the sorted names are also
# embedded in the checkpoint file name further down.
feature_list = [
	"desc",
	"hfda",
	"psd",
]

In [5]:
# Number of samples requested from the loader; the value is also embedded in
# the checkpoint file name further down.
subset_sample_count = 1000

# Build the training dataset (served from the on-disk cache when available)
# with the selected feature families, then print its summary.
dataset = CustomRawDataset(
	config,
	mode="train",
	cache=True,
	subset_sample_count=subset_sample_count,
	feature_list=feature_list,
)
dataset.print_summary()

2024-02-23 16:24:16,547 - data_loader.log - INFO - Loading dataset from cache: ./data/cache/CustomRawDataset_1000_train_feats:desc_hfda_psd.npz
INFO:data_loader.log:Loading dataset from cache: ./data/cache/CustomRawDataset_1000_train_feats:desc_hfda_psd.npz


Dataset Summary:
Mode: train
Total Samples: 1000
Unique Patients: 1000
Unique EEGs: 1000
Unique Spectrograms: 1000
Label Distribution:
seizure_vote    1246
lpd_vote         479
gpd_vote         610
lrda_vote        438
grda_vote       1053
other_vote      2733
dtype: int64

Vote Statistics:
        seizure_vote  lpd_vote  gpd_vote  lrda_vote  grda_vote  other_vote
mean         1.24600  0.479000  0.610000   0.438000   1.053000    2.733000
median       0.00000  0.000000  0.000000   0.000000   0.000000    0.000000
var          2.70619  4.059619  5.531431   1.585742   3.459651   26.193905
Probabilities Loaded: 1000
Features Loaded: 1000

Configuration Summary:
+---------------------+--------------------------------------------------------------------------------------------------------------------------------------+
| Configuration       | Value                                                                                                                                |
+----------------

In [6]:
def standardize_per_channel(features, num_channels, one_hot_len):
	"""Z-score the continuous features of every channel group.

	Each row of ``features`` is read as ``num_channels`` consecutive groups of
	equal width. The last ``one_hot_len`` entries of every group are copied
	through unchanged; all other entries are standardized column-wise.

	Args:
		features: 2-D array of shape (n_samples, num_channels * group_width).
		num_channels: number of equally sized groups per row.
		one_hot_len: number of trailing pass-through columns in every group.

	Returns:
		Tuple ``(normalized, means, stds)``: the standardized float array with
		the same shape as ``features``, and two 1-D arrays of length
		``features.shape[1]`` holding the statistics that were applied.
		Pass-through slots hold 0 in both arrays; columns without any spread
		store a std of 1.

	Raises:
		ValueError: if the row width is not a multiple of ``num_channels`` or
			a group is narrower than ``one_hot_len``.
	"""
	features = np.asarray(features, dtype=float)
	num_samples, total_width = features.shape
	if total_width % num_channels != 0:
		raise ValueError(f"Feature width {total_width} is not a multiple of {num_channels} channels")
	group_width = total_width // num_channels
	num_continuous = group_width - one_hot_len
	if num_continuous < 0:
		raise ValueError(f"Each channel has {group_width} features, fewer than one_hot_len={one_hot_len}")

	grouped = features.reshape(num_samples, num_channels, group_width)
	continuous = grouped[:, :, :num_continuous]

	mean = continuous.mean(axis=0)
	std = continuous.std(axis=0)
	# A constant column has std == 0; dividing by it would fill the column with
	# NaN/inf. Dividing by 1 instead maps such a column to all zeros.
	safe_std = np.where(std == 0, 1.0, std)

	normalized = grouped.copy()
	normalized[:, :, :num_continuous] = (continuous - mean) / safe_std

	means = np.zeros((num_channels, group_width))
	stds = np.zeros((num_channels, group_width))
	means[:, :num_continuous] = mean
	stds[:, :num_continuous] = safe_std

	return normalized.reshape(num_samples, total_width), means.ravel(), stds.ravel()


if normalize:
	# Number of channel groups each feature row is assumed to consist of.
	num_channels = 19
	# Trailing columns of every group that are passed through unnormalized.
	one_hot_len = len(dataset.config.NAMES)
	num_features = dataset.features_per_sample.shape[1] // num_channels

	# NOTE(review): the statistics are computed on the full dataset, i.e. before
	# the train/test split below, so the test rows influence the normalization.
	# NOTE(review): this overwrites the raw features; reload the dataset before
	# re-running this cell, otherwise `means`/`stds` describe normalized data.
	normalized_features, means, stds = standardize_per_channel(
		dataset.features_per_sample, num_channels, one_hot_len
	)
	dataset.features_per_sample = normalized_features
else:
	means = None
	stds = None

In [7]:
# Hold out a third of the samples for evaluation. With a single sample there is
# nothing to split, so the same data serves as both train and test set.
if subset_sample_count != 1:
	x_train, x_test, y_train, y_test = train_test_split(
		dataset.features_per_sample,
		dataset.lbl_probabilities,
		test_size=0.33,
		random_state=42,
	)
else:
	x_train = dataset.features_per_sample
	x_test = x_train
	y_train = dataset.lbl_probabilities
	y_test = y_train

# Transpose the training targets into one tuple of values per label column.
y0, y1, y2, y3, y4, y5 = zip(*y_train)

Train a separate regression model for each label (one model per vote column).

In [8]:
def normalize_rows(values):
	"""Scale every row of a non-negative 2-D float array so that it sums to one.

	Rows whose sum is zero cannot be scaled; they become a uniform distribution
	instead of the NaNs that a plain division would produce.

	Args:
		values: 2-D float array of non-negative entries.

	Returns:
		A new array of the same shape whose rows each sum to one.
	"""
	row_sums = values.sum(axis=1, keepdims=True)
	# `out` supplies the fallback for rows that `where` excludes from the division.
	uniform = np.full_like(values, 1.0 / values.shape[1])
	return np.divide(values, row_sums, out=uniform, where=row_sums > 0)


y_pred = np.zeros(y_test.shape)
y_pred_train = np.zeros(y_train.shape)

# The normalization statistics are stored next to the fitted models so that
# they end up in the same checkpoint.
models = {"means": means, "stds": stds}
for i, label in enumerate(dataset.label_cols):
	print(label)
	clf = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=0, verbose=1)
	# Column i of the target matrix holds the values of this label.
	clf.fit(x_train, y_train[:, i])

	y_pred_train[:, i] = clf.predict(x_train)
	y_pred[:, i] = clf.predict(x_test)

	# Key each model by its label name. Keying by the tuple of training targets
	# stores the targets in the checkpoint and lets labels with identical
	# target columns overwrite each other.
	models[label] = clf

# The regressors are unconstrained, so clip negative outputs to zero before
# turning every row into a probability distribution.
y_pred_train[y_pred_train < 0] = 0
y_pred_train_probabilities = normalize_rows(y_pred_train)

y_pred[y_pred < 0] = 0
y_pred_probabilities = normalize_rows(y_pred)

seizure_vote
      Iter       Train Loss   Remaining Time 
         1           0.2019            2.33s
         2           0.1914            2.27s
         3           0.1824            2.20s
         4           0.1744            2.12s
         5           0.1674            2.05s
         6           0.1608            1.99s
         7           0.1553            1.93s
         8           0.1497            1.89s
         9           0.1449            1.85s
        10           0.1409            1.81s
        20           0.1093            1.34s
        30           0.0937            0.90s
        40           0.0813            0.45s
        50           0.0701            0.00s
lpd_vote
      Iter       Train Loss   Remaining Time 
         1           0.0211            2.20s
         2           0.0206            2.21s
         3           0.0192            2.11s
         4           0.0186            2.05s
         5           0.0174            2.00s
         6           0.0171    

In [9]:
# Persist the `models` dict; the file name encodes the sample count, whether
# normalization was applied and the sorted feature families.
norm_tag = 'norm_' if normalize else ''
feature_tag = '_'.join(sorted(feature_list))
checkpoint_path = f"checkpoints/other_models/ensemble_one_model_per_target_{subset_sample_count}_{norm_tag}feats:{feature_tag}.pickle"

with open(checkpoint_path, "wb") as pickle_file:
	pickle.dump(models, pickle_file)

In [10]:
# Score the predictions on the training split against the training targets.
solution = pd.DataFrame(y_train, columns=dataset.label_cols)
submission = pd.DataFrame(y_pred_train_probabilities, columns=dataset.label_cols)

# The scoring function is told to use an "id" column, so give both frames a
# running row id as their first column.
for frame in (submission, solution):
	frame.insert(0, "id", range(len(frame)))

# Pass copies so the frames kept in the notebook are not modified by the scorer.
score = score_kl_divergence(
	solution=deepcopy(solution),
	submission=deepcopy(submission),
	row_id_column_name="id",
)
print(f"Train score: {score}")

Train score: 0.47615149962474307


In [11]:
# Score the predictions on the held-out split against the held-out targets.
solution = pd.DataFrame(y_test, columns=dataset.label_cols)
submission = pd.DataFrame(y_pred_probabilities, columns=dataset.label_cols)

# The scoring function is told to use an "id" column, so give both frames a
# running row id as their first column.
for frame in (submission, solution):
	frame.insert(0, "id", range(len(frame)))

# Pass copies so the frames kept in the notebook are not modified by the scorer.
score = score_kl_divergence(
	solution=deepcopy(solution),
	submission=deepcopy(submission),
	row_id_column_name="id",
)
print(f"Test score: {score}")

Test score: 1.2474992408719408


In [12]:
# Random
random_pred = np.random.rand(33, 6)
random_pred[random_pred < 0] = 0
random_pred_probabilities = random_pred / np.sum(random_pred, axis=1)[:,None]
random_submission = pd.DataFrame(random_pred_probabilities, columns=dataset.label_cols)

random_submission.insert(0, "id", range(len(random_submission)))

score_kl_divergence(solution=deepcopy(solution), submission=deepcopy(random_submission), row_id_column_name="id")

0.205705548549977