In [1]:
# Switch the kernel's working directory with the IPython `%cd` magic.
# NOTE(review): the path below is absolute and specific to one machine; anyone
# else running this notebook has to edit it. The commented-out relative form
# (`%cd ..`) is the portable alternative if the notebook sits one level below
# the project root -- TODO confirm and switch.
# Presumably required so that the project-local `datasets` / `utils` imports
# and the relative `./data` and `checkpoints/` paths used later resolve.
# %cd ..
%cd /home/janneke/Documents/Master/Machine_Learning_in_Practice/HMS/MLiP_group_10_task1_HMS/

/home/janneke/Documents/Master/Machine_Learning_in_Practice/HMS/MLiP_group_10_task1_HMS


In [2]:
from copy import deepcopy

import pandas as pd
import numpy as np
from logging import getLogger, basicConfig, INFO
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
import pickle

from datasets.raw_data_loader import CustomRawDataset
from datasets.data_loader_configs import BaseDataConfig
from utils.evaluation_utils import score_kl_divergence
from utils.feature_extraction_utils import get_hfda

In [3]:
# Configure the root logger first so that INFO-level records emitted by any
# later call are shown in the notebook output.
basicConfig(level=INFO)
logger = getLogger("main")

# Data configuration object handed to the dataset loader further down.
config = BaseDataConfig()

In [4]:
# Run settings read by the cells below.
# `normalize` switches the per-channel z-scoring cell on or off.
normalize = True
# Feature names forwarded to the dataset loader and embedded in the
# checkpoint file name.
feature_list = ["psd"]

In [5]:
import os

# Number of logical CPUs visible to Python. `os.cpu_count()` is portable
# (no dependency on the Linux-only `lscpu` binary or on parsing its
# locale-dependent text output) and may return None when the count cannot be
# determined, in which case we fall back to a single core.
num_cores = os.cpu_count() or 1

# Use roughly three quarters of the cores for data loading, but never fewer
# than one worker (int(1 * 0.75) would otherwise give 0 on a single-core box).
num_threads = max(1, int(num_cores * 0.75))

In [6]:
# Sample-count setting forwarded to the loader; it is also compared against 1
# in the split cell and embedded in the checkpoint file name.
subset_sample_count = 5000

# Build the training dataset with caching enabled, restricted to the features
# in `feature_list`, using `num_threads` workers.
dataset = CustomRawDataset(
	config,
	mode="train",
	cache=True,
	subset_sample_count=subset_sample_count,
	feature_list=feature_list,
	num_threads=num_threads,
)
dataset.print_summary()

2024-02-24 15:00:43,577 - data_loader.log - INFO - Loading dataset from cache: ./data/cache/CustomRawDataset_5000_train_feats(psd).npz
INFO:data_loader.log:Loading dataset from cache: ./data/cache/CustomRawDataset_5000_train_feats(psd).npz


Dataset Summary:
Mode: train
Total Samples: 1950
Unique Patients: 1950
Unique EEGs: 1950
Unique Spectrograms: 1950
Label Distribution:
seizure_vote    2404
lpd_vote         897
gpd_vote        1086
lrda_vote        906
grda_vote       2006
other_vote      5207
dtype: int64

Vote Statistics:
        seizure_vote  lpd_vote  gpd_vote  lrda_vote  grda_vote  other_vote
mean        1.232821   0.46000  0.556923   0.464615   1.028718    2.670256
median      0.000000   0.00000  0.000000   0.000000   0.000000    0.000000
var         2.792355   3.59178  4.609123   1.901005   3.309590   25.045138
Probabilities Loaded: 1950
Features Loaded: 1950

Configuration Summary:
+---------------------+--------------------------------------------------------------------------------------------------------------------------------------+
| Configuration       | Value                                                                                                                                |
+----------------

In [7]:
# Per-channel z-score normalisation of the feature matrix.
#
# The feature vector of each sample is treated as `num_channels` consecutive
# blocks of `num_features` columns. Within each block the last `one_hot_len`
# columns are copied through unchanged (the original code calls them a
# one-hot part; presumably a label/channel encoding -- TODO confirm), and the
# remaining columns are standardised column-wise.
#
# `means` / `stds` hold the statistics that were applied (zeros at the
# copied-through positions) and are stored alongside the models later on.
#
# NOTE(review): the statistics are computed on the full dataset, i.e. before
# the train/test split below, so the held-out rows influence the scaling.
if normalize:
	num_channels = 19
	num_features = dataset.features_per_sample.shape[1] // num_channels
	one_hot_len = len(dataset.config.NAMES)

	raw_features = dataset.features_per_sample
	normalized_features = np.zeros(raw_features.shape)
	means = np.zeros(raw_features.shape[1])
	stds = np.zeros(raw_features.shape[1])

	for i in range(num_channels):
		# Column boundaries of this channel's block.
		block_start = i * num_features
		block_stop = block_start + num_features
		one_hot_start = block_stop - one_hot_len

		features = raw_features[:, block_start:one_hot_start]
		mean = np.mean(features, axis=0)
		std = np.std(features, axis=0)
		# A constant column has std == 0; dividing by it would yield NaN/inf
		# in the training matrix. Use 1 instead so such columns become 0 and
		# the stored `stds` stay safe to divide by at inference time.
		std = np.where(std == 0, 1.0, std)

		means[block_start:one_hot_start] = mean
		stds[block_start:one_hot_start] = std
		normalized_features[:, block_start:one_hot_start] = (features - mean) / std

		# Copy the trailing columns of the block through untouched.
		normalized_features[:, one_hot_start:block_stop] = raw_features[:, one_hot_start:block_stop]

	dataset.features_per_sample = normalized_features
else:
	# No scaling applied; nothing to store with the models.
	means = None
	stds = None

In [8]:
# Hold out a third of the samples for evaluation. With a single-sample subset
# there is nothing to split, so the same data serves as both train and test.
if subset_sample_count != 1:
	x_train, x_test, y_train, y_test = train_test_split(
		dataset.features_per_sample,
		dataset.lbl_probabilities,
		test_size=0.33,
		random_state=42,
	)
else:
	x_test = x_train = dataset.features_per_sample
	y_test = y_train = dataset.lbl_probabilities

# Transpose the training targets into one tuple of values per label column.
y0, y1, y2, y3, y4, y5 = zip(*y_train)

Train a separate regression model for each label

In [21]:
def _normalize_rows(non_negative_pred):
	"""Scale each row of a non-negative 2-D array so that it sums to 1.

	Rows whose entries are all zero cannot be scaled (division by zero would
	give NaN); they are replaced by a uniform distribution over the columns.
	"""
	row_sums = np.sum(non_negative_pred, axis=1)[:, None]
	has_mass = row_sums > 0
	scaled = non_negative_pred / np.where(has_mass, row_sums, 1.0)
	uniform = np.full_like(scaled, 1.0 / non_negative_pred.shape[1])
	return np.where(has_mass, scaled, uniform)


# Raw per-label regression outputs for the held-out and the training rows.
y_pred = np.zeros(y_test.shape)
y_pred_train = np.zeros(y_train.shape)

# Everything needed at inference time: the normalisation statistics plus one
# fitted regressor per label, keyed by the label's column name.
models = {"means": means, "stds": stds}
for i, label_name in enumerate(dataset.label_cols):
	print(label_name)
	# Targets of this label only: column i of the training label matrix.
	y_train_group = y_train[:, i]
	clf = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=0, verbose=1)
	clf.fit(x_train, y_train_group)

	y_pred_train[:, i] = clf.predict(x_train)
	y_pred[:, i] = clf.predict(x_test)

	# Key by label name; the previous code used the tuple of target values
	# itself as the dictionary key.
	models[label_name] = clf

# The regressors are unconstrained, so clip negative outputs to 0 and
# renormalise every row into a probability distribution over the labels.
y_pred_train[y_pred_train < 0] = 0
y_pred_train_probabilities = _normalize_rows(y_pred_train)

y_pred[y_pred < 0] = 0
y_pred_probabilities = _normalize_rows(y_pred)

seizure_vote
      Iter       Train Loss   Remaining Time 
         1           0.2094            0.96s
         2           0.2041            0.95s
         3           0.1992            0.93s
         4           0.1958            0.91s
         5           0.1923            0.89s
         6           0.1894            0.87s
         7           0.1864            0.85s
         8           0.1832            0.82s
         9           0.1811            0.80s
        10           0.1785            0.78s
        20           0.1611            0.60s
        30           0.1500            0.40s
        40           0.1402            0.20s
        50           0.1331            0.00s
lpd_vote
      Iter       Train Loss   Remaining Time 
         1           0.0225            0.99s
         2           0.0222            0.97s
         3           0.0219            0.96s
         4           0.0217            0.95s
         5           0.0215            0.93s
         6           0.0212    

In [22]:
# Assemble the checkpoint file name from the run settings, then pickle the
# `models` dictionary to it.
norm_tag = "norm_" if normalize else ""
feature_tag = "_".join(sorted(feature_list))
checkpoint_path = f"checkpoints/other_models/ensemble_one_model_per_target_{subset_sample_count}_{norm_tag}feats({feature_tag}).pickle"

with open(checkpoint_path, "wb") as pickle_file:
	pickle.dump(models, pickle_file)

In [23]:
# Score the predictions on the rows the models were fitted on.
submission = pd.DataFrame(y_pred_train_probabilities, columns=dataset.label_cols)
solution = pd.DataFrame(y_train, columns=dataset.label_cols)

# The scorer is given an "id" column name; per the original author's note the
# column must be present but is dropped again inside the scoring function.
for frame in (submission, solution):
	frame.insert(0, "id", range(len(frame)))

# Pass copies so the frames kept in the notebook are not modified by the scorer.
score = score_kl_divergence(
	solution=deepcopy(solution),
	submission=deepcopy(submission),
	row_id_column_name="id",
)
print(f"Train score: {score}")

Train score: 0.8047813917658148


In [24]:
# Score the predictions on the held-out rows.
submission = pd.DataFrame(y_pred_probabilities, columns=dataset.label_cols)
solution = pd.DataFrame(y_test, columns=dataset.label_cols)

# The scorer is given an "id" column name; per the original author's note the
# column must be present but is dropped again inside the scoring function.
for frame in (submission, solution):
	frame.insert(0, "id", range(len(frame)))

# Pass copies so the frames kept in the notebook are not modified by the scorer.
score = score_kl_divergence(
	solution=deepcopy(solution),
	submission=deepcopy(submission),
	row_id_column_name="id",
)
print(f"Test score: {score}")

Test score: 1.2997095566273134


In [25]:
# Random (compared to test df size)
random_pred = np.random.rand(submission.shape[0], submission.shape[1]-1)
random_pred[random_pred < 0] = 0
random_pred_probabilities = random_pred / np.sum(random_pred, axis=1)[:,None]
random_submission = pd.DataFrame(random_pred_probabilities, columns=dataset.label_cols)

random_submission.insert(0, "id", range(len(random_submission)))

score_kl_divergence(solution=deepcopy(solution), submission=deepcopy(random_submission), row_id_column_name="id")

1.8237684591365666