In [113]:
# %cd ..
%cd /home/janneke/Documents/Master/Machine_Learning_in_Practice/HMS/MLiP_group_10_task1_HMS/

/home/janneke/Documents/Master/Machine_Learning_in_Practice/HMS/MLiP_group_10_task1_HMS


In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from tqdm import tqdm
from logging import getLogger, basicConfig, INFO

from datasets.data_loader import CustomDataset
from generics.configs import DataConfig, Paths, EEGConfig
from utils.data_preprocessing_utils import create_non_overlapping_eeg_crops
from utils.visualisation_utils import plot_spectrogram, plot_eeg_combined_graph
from utils.eeg_processing_utils import generate_spectrogram_from_eeg

In [115]:
basicConfig(level=INFO)
logger = getLogger('main')
config = DataConfig()

In [116]:
main_df = pd.read_csv(Paths.TRAIN_CSV)
main_df.head()
print(len(main_df))

106800


In [117]:
eeg_ids = set(main_df["eeg_id"])

main_df = main_df.sample(frac = 1)  #TODO remove (added to collect some random samples instead of the first few)

test_main_df = main_df.head(1000)
print(f"test_eeg_rows: {test_main_df}")

paths_eegs = glob(Paths.TRAIN_EEGS + "*.parquet")

eeg_ids = np.asarray(test_main_df["eeg_id"])
print(f"eeg_ids: {eeg_ids}")
eeg_label_offset_seconds = test_main_df["eeg_label_offset_seconds"]
print(f"eeg_label_offset_seconds: {list(eeg_label_offset_seconds)}")
eeg_files = [f"{Paths.TRAIN_EEGS}{eeg_id}.parquet" for eeg_id in eeg_ids]

# eeg = pd.read_parquet(eeg_files[0])
# print(eeg.head())

# plot_eeg_combined_graph(eeg)

test_eeg_rows:            eeg_id  eeg_sub_id  eeg_label_offset_seconds  spectrogram_id  \
9131   1164359526           3                      26.0       172731638   
1686   2542010643           5                      74.0        19384736   
23327  2770282032           1                       8.0       453548281   
21413  3151122694           1                       2.0       417136217   
10146  1713287648           0                       0.0       187627412   
...           ...         ...                       ...             ...   
23642  3144743661           3                      34.0       460195522   
15499   927494158           1                       4.0       305734932   
38797  2491654631           3                       6.0       759532808   
51733   767228869           9                      28.0      1017379890   
89699  3328682174          16                      58.0      1835497958   

       spectrogram_sub_id  spectrogram_label_offset_seconds    label_id  \
9131     

In [118]:
paths_eegs = glob(f"{Paths.TRAIN_EEGS}*.parquet")

eegs = {}

for parquet_path in paths_eegs:
	path_eeg_id = int(parquet_path.split("/")[-1].split(".")[0])
	if path_eeg_id in eeg_ids:
		eegs[path_eeg_id] = pd.read_parquet(parquet_path)

print(len(eegs))
print(eegs.keys())
print(list(eegs.values())[0])

871
dict_keys([1478916564, 3330141099, 3686091757, 2864918309, 2468812843, 2994756759, 1180832915, 3346737270, 2384261325, 1018200965, 1924598611, 463915196, 610369206, 579856397, 83838432, 243307256, 3930074338, 661843354, 1654580421, 180689244, 4265424053, 1386986988, 2065419710, 4149504955, 2853804909, 3328682174, 753481674, 3572672408, 2045303407, 39304625, 1191856007, 1669498389, 4058771663, 2777904796, 3002953640, 2322701247, 208592751, 3403533082, 3781876690, 2378992561, 2522822842, 1164359526, 186045431, 927494158, 1514010247, 82952885, 2681518409, 3424738020, 3611884678, 3422737868, 2770282032, 1439901768, 2892489522, 915968846, 4108263866, 1588665723, 1737965414, 2337750295, 3747939884, 717566795, 1309740482, 4104566695, 3241737256, 2304781376, 977828620, 3321981682, 2991142133, 1523718089, 3327346587, 1742308583, 2141762937, 2343534193, 3250156093, 2534075521, 1294245951, 2428433259, 276206784, 2438911278, 1789944806, 2857828278, 290851085, 2825224988, 2643547222, 178673981,

Show spectogram EEG of the first EEG

In [119]:
some_eeg_df = list(eegs.values())[0]
some_eeg_id = list(eegs.keys())[0]
some_eeg_file = f"{Paths.TRAIN_EEGS}{some_eeg_id}.parquet"
generate_spectrogram_from_eeg(some_eeg_file, display=True, display_eeg_id=some_eeg_id)

Starting shape: (13200, 20)
Final shape: (187,)


Extract simple features (mean, std, etc) and combine with one-hot encoding for the 4 groups.

In [120]:
feats = ["EKG"]
for i in DataConfig.FEATS:
	feats.extend(i)
feats = list(set(feats))

one_hot = {feat: [] for feat in feats}
one_hot["EKG"] = []
for group in DataConfig.FEATS:
	for feat in feats:
		one_hot[feat].append(int(feat in group))

one_hot_df = pd.DataFrame(one_hot, index=DataConfig.NAMES)

def extract_features(eeg_subsample: pd.DataFrame) -> np.ndarray:
	"""Extract features from eeg subsample (mean, std, etc) 
		and combine with one-hot encoding per channel

	Args:
		eeg_subsample (pd.DataFrame): subsample from an eeg (dataframe with 2000 rows)

	Returns:
		np.ndarray: 1D numpy array with features 
	"""
	desc = eeg_subsample.describe()[feats].iloc[1:]
	feature_df = pd.concat([desc, one_hot_df])
	feature_array = np.asarray(feature_df).flatten("F")

	return feature_array

In [121]:
test_main_df.insert(len(main_df.columns), "eeg_subsample_start_index", test_main_df["eeg_label_offset_seconds"] * 200)
test_main_df.insert(len(main_df.columns), "eeg_subsample_end_index", test_main_df["eeg_label_offset_seconds"] * 200 + (10 * 200))
test_main_df = test_main_df.astype({"eeg_subsample_start_index": int, "eeg_subsample_end_index": int})
test_main_df.dtypes

eeg_id                                int64
eeg_sub_id                            int64
eeg_label_offset_seconds            float64
spectrogram_id                        int64
spectrogram_sub_id                    int64
spectrogram_label_offset_seconds    float64
label_id                              int64
patient_id                            int64
expert_consensus                     object
seizure_vote                          int64
lpd_vote                              int64
gpd_vote                              int64
lrda_vote                             int64
grda_vote                             int64
other_vote                            int64
eeg_subsample_end_index               int64
eeg_subsample_start_index             int64
dtype: object

In [122]:
eeg_subsamples = np.zeros((len(test_main_df), 2000, 20))
expert_lbls = np.zeros((len(test_main_df), 6))
consensus_lbls = np.zeros(len(test_main_df))

sample_eeg = list(eegs.values())[0].iloc[:5]
features_per_sample = np.zeros(((len(test_main_df), len(extract_features(sample_eeg)))))

consensus_lbl_dict = {
	"Seizure": 0,
	"LPD": 1,
	"GPD": 2,
	"LRDA": 3,
	"GRDA": 4,
	"Other": 5,
}

print(eegs.keys())
print(test_main_df["eeg_id"])

for eeg_id, eeg in eegs.items():
	if eeg_id in list(test_main_df["eeg_id"]):
		for i, (df_i, subsample) in enumerate(test_main_df[test_main_df["eeg_id"] == eeg_id].iterrows()):
			subsample_eeg_id = subsample["eeg_sub_id"]
			subsample_consensus_lbl = consensus_lbl_dict[subsample["expert_consensus"]]
			subsample_expert_lbls = subsample[["seizure_vote", "lpd_vote", "gpd_vote", "lrda_vote", "grda_vote", "other_vote"]]
			subsample_eeg = eeg.iloc[
				int(subsample["eeg_label_offset_seconds"] * 200) :
				int(subsample["eeg_label_offset_seconds"] * 200 + (10 * 200))
			]
			eeg_subsamples[i] = np.asarray(subsample_eeg)
			expert_lbls[i] = np.asarray(subsample_expert_lbls)
			consensus_lbls[i] = subsample_consensus_lbl
			features_per_sample[i] = extract_features(subsample_eeg)

print(eeg_subsamples[0])
print(expert_lbls.shape)
print(consensus_lbls)

dict_keys([1478916564, 3330141099, 3686091757, 2864918309, 2468812843, 2994756759, 1180832915, 3346737270, 2384261325, 1018200965, 1924598611, 463915196, 610369206, 579856397, 83838432, 243307256, 3930074338, 661843354, 1654580421, 180689244, 4265424053, 1386986988, 2065419710, 4149504955, 2853804909, 3328682174, 753481674, 3572672408, 2045303407, 39304625, 1191856007, 1669498389, 4058771663, 2777904796, 3002953640, 2322701247, 208592751, 3403533082, 3781876690, 2378992561, 2522822842, 1164359526, 186045431, 927494158, 1514010247, 82952885, 2681518409, 3424738020, 3611884678, 3422737868, 2770282032, 1439901768, 2892489522, 915968846, 4108263866, 1588665723, 1737965414, 2337750295, 3747939884, 717566795, 1309740482, 4104566695, 3241737256, 2304781376, 977828620, 3321981682, 2991142133, 1523718089, 3327346587, 1742308583, 2141762937, 2343534193, 3250156093, 2534075521, 1294245951, 2428433259, 276206784, 2438911278, 1789944806, 2857828278, 290851085, 2825224988, 2643547222, 178673981, 564

In [123]:
x = features_per_sample
y = consensus_lbls

# max 1d array per sample
# x = np.asarray([
# 	[[1,1], [2,2]],
# 	[[1,1], [2,2]],
# 	[[1,1], [2,2]],
# ])

x.shape

# y = np.asarray([0, 1, 0])

y.shape

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [124]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
n_neighbors = int(np.sqrt(len(y_train)))
model = LinearRegression()
model.fit(x_train, y_train)

In [125]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, r2_score
y_pred = model.predict(x_test)
r2_score = r2_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
# print(classification_report(y_test, y_pred))

print(f"r2_score: {r2_score}")

r2_score: 0.7602000986487467


In [126]:
spectrogram_ids = set(main_df["eeg_id"])

paths_spectrograms = glob(Paths.TRAIN_SPECTROGRAMS + "*.parquet")[:3]

print(paths_spectrograms)

spectrogram_id = int(paths_spectrograms[0].split("/")[-1].split(".")[0])
spectrogram = pd.read_parquet(paths_spectrograms[0])

spectrogram.head()
main_df[main_df["spectrogram_id"] == spectrogram_id]

['./data/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/1057834128.parquet', './data/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/1626013378.parquet', './data/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/1541743732.parquet']


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
53394,4013518384,1,4.0,1057834128,1,4.0,4007171645,2524,GRDA,0,0,0,1,2,0
53396,4242698315,0,0.0,1057834128,3,330.0,356553988,2524,Other,0,0,1,2,1,10
53393,4013518384,0,0.0,1057834128,0,0.0,3828957267,2524,GRDA,0,0,0,1,2,0
53395,4013518384,2,8.0,1057834128,2,8.0,3779398001,2524,GRDA,0,0,0,1,2,0


In [127]:
# dataset = CustomDataset(config=config, subset_sample_count=100, mode="train")
# dataset.print_summary()
# len(dataset.eeg_spectrograms)