In [24]:
import pandas as pd
import mne as mne
import os 
import time
import numpy as np
import matplotlib.pyplot as plt
import joblib
import constants
from IPython.utils import io
import time
import sys
import yasa
from scipy.signal import welch

#Import my modules
import format_eeg_data
import constants
import eeg_stat_ts

from mne_connectivity import spectral_connectivity_epochs
from mne.datasets import sample
import seaborn as sns

# importing random forest classifier from ensemble module
from sklearn.ensemble import RandomForestClassifier

# Import models
from sklearn.tree import DecisionTreeClassifier

#metrics are used to find accuracy or error
from sklearn import metrics  

#Additional imports for ROC AUC if necessary
from sklearn.metrics import roc_auc_score

# from sklearn model selection import Grid Search CV
from sklearn.model_selection import GridSearchCV

import Fusion_Forest_Functions

In [25]:
# Optional: re-import Fusion_Forest_Functions so that edits made to the module
# after the initial import are picked up without restarting the kernel.
# The reloaded module object is the last expression, so it is shown as the cell output.
import importlib
importlib.reload(Fusion_Forest_Functions)

<module 'Fusion_Forest_Functions' from '/user/work/ko20929/RBD_using_custom_package/Execute New Experiments/Deep Learning/EEG2Rep/EEG2Rep/Dataset/KnowEEGRepo_Updated/Final_Model/Fusion_Forest_Functions.py'>

In [26]:
# 1. Load all necessary data for training and testing ___________________________________________________________
# Folder holding the train/test indices, the filtered TSFresh features and the
# name of the selected connectivity metric.
# NOTE: joblib.load unpickles its input, so only point it at files you trust.
folder = '../Toy_Data/Feature_Generation/Filtered_Features/'

train_test_indices_dict = joblib.load(folder + 'train_test_indices.pkl')
train_idx = train_test_indices_dict['train_idx']
test_idx = train_test_indices_dict['test_idx']

# reset_index returns a new frame; assigning it back avoids in-place mutation
# and keeps the cell free of a meaningless tuple-of-None output.
X_train_tsfresh = joblib.load(folder + 'X_train_tsfresh_features_filtered.pkl').reset_index(drop=True)
X_test_tsfresh = joblib.load(folder + 'X_test_tsfresh_features_filtered.pkl').reset_index(drop=True)

# Read the file name of the selected connectivity metric. Surrounding whitespace
# (e.g. a trailing newline) is stripped because the value is used to build a path.
with open(folder + 'selected_connectivity_metric.txt', "r") as f:
    selected_metric = f.read().strip()

# Load the selected connectivity features from the folder containing all features,
# then take the train/test rows by position.
connectivity_metrics_folder = '../Toy_Data/Feature_Generation/All_Features/'
X_connectivity = joblib.load(connectivity_metrics_folder + selected_metric)

X_train_connectivity = X_connectivity.iloc[train_idx].reset_index(drop=True)
X_test_connectivity = X_connectivity.iloc[test_idx].reset_index(drop=True)

# Load the labels and build y_train / y_test.
# Despite the variable name, the 'y' array of this archive is indexed with both
# the train and the test positions below.
test_eeg_data = np.load('../Toy_Data/test_data.npz')
y_labels = test_eeg_data['y']

y_train = y_labels[train_idx]
y_test = y_labels[test_idx]

# Available from here on (all four frames carry a fresh 0..n-1 index):
# X_train_tsfresh, X_test_tsfresh, X_train_connectivity, X_test_connectivity, y_train, y_test

## Train and Test the Model (Fusion Forest)

In [27]:
# Place the TSFresh and connectivity features side by side (column-wise) so that
# each split has one combined feature frame.
X_train_full = pd.concat(
    (X_train_tsfresh, X_train_connectivity),
    axis="columns",
)
X_test_full = pd.concat(
    (X_test_tsfresh, X_test_connectivity),
    axis="columns",
)

In [28]:
# Fit the Fusion Forest on the training split, then predict on the test split.
trained_trees_list, selected_feats_per_tree = Fusion_Forest_Functions.fit_fusion_forest(
    X_train_tsfresh, X_train_connectivity, y_train
)
y_preds_all = Fusion_Forest_Functions.predict_fusion_forest(
    X_test_tsfresh, X_test_connectivity, trained_trees_list, selected_feats_per_tree
)

# Average along axis 0 to get one score per test sample
# (presumably y_preds_all holds one row of predictions per tree - TODO confirm).
y_probs = y_preds_all.mean(axis=0)
# np.round rounds halves to the nearest even value, so a score of exactly 0.5 becomes class 0.
y_preds_aggregated = np.round(y_probs)

auc = roc_auc_score(y_test, y_probs)
print(auc)

# scikit-learn metrics take (y_true, y_pred) in that order.
acc = metrics.accuracy_score(y_test, y_preds_aggregated)
print(acc)

0.6
0.4166666666666667


### Hyperparameter Tuned Fusion Forest

#### Train-Validation split

In [29]:
# Hold out part of the training data as a validation set for hyperparameter selection.
val_size = 0.2  # fraction of the training samples placed in the validation set

n_samples = len(y_train)
n_val = int(n_samples * val_size)

# Shuffle the row positions with a fixed seed so the split is reproducible.
rng = np.random.default_rng(seed=5)
indices = np.arange(n_samples)
rng.shuffle(indices)

# The first n_val shuffled positions form the validation set; the rest form the sub-training set.
val_idx, train_sub_idx = indices[:n_val], indices[n_val:]

In [30]:
# Apply train_sub_idx / val_idx to the training data to obtain a sub-training set
# and a validation set for hyperparameter selection.

# Sub-training set
X_train_sub_tsfresh = X_train_tsfresh.iloc[train_sub_idx]
X_train_sub_connectivity = X_train_connectivity.iloc[train_sub_idx]
y_train_sub = y_train[train_sub_idx]

# Validation set
X_val_tsfresh = X_train_tsfresh.iloc[val_idx]
X_val_connectivity = X_train_connectivity.iloc[val_idx]
y_val = y_train[val_idx]

#### Hyperparameter Tuning

In [31]:
# Candidate values for num_trees: a separate Fusion Forest is run for each one,
# and the results are combined afterwards.
num_trees_list = [50, 100, 200, 500, 800, 1000]

# Results container created outside the loop; it collects one entry per candidate
# so the best-scoring value can be selected.
results_dict = {'num_trees': [], 'accuracy': []}

In [32]:
# Start from empty result lists so that re-running this cell does not append
# duplicate rows to the results of a previous run.
results_dict = {'num_trees': [], 'accuracy': []}

for num_trees in num_trees_list:
    # NOTE(review): num_trees is not passed to fit_fusion_forest, so every iteration
    # fits with identical arguments and any difference in accuracy cannot be caused by
    # num_trees. Forward it through the function's tree-count parameter.
    # TODO: confirm that parameter's name in Fusion_Forest_Functions.
    trained_trees_list, selected_feats_per_tree = Fusion_Forest_Functions.fit_fusion_forest(
        X_train_sub_tsfresh, X_train_sub_connectivity, y_train_sub
    )
    y_preds_all = Fusion_Forest_Functions.predict_fusion_forest(
        X_val_tsfresh, X_val_connectivity, trained_trees_list, selected_feats_per_tree
    )

    # Average along axis 0 to get one score per validation sample, then round to a label
    # (np.round rounds halves to even, so exactly 0.5 becomes class 0).
    y_probs = y_preds_all.mean(axis=0)
    y_preds_aggregated = np.round(y_probs)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc = metrics.accuracy_score(y_val, y_preds_aggregated)

    results_dict['num_trees'].append(num_trees)
    results_dict['accuracy'].append(acc)

In [33]:
# Rank the candidates by validation accuracy, best first.
# A stable sort keeps tied candidates in their original order, so the tie-break
# is deterministic (the candidate listed first in results_dict wins).
results_df = pd.DataFrame(results_dict).sort_values(
    by='accuracy', ascending=False, kind='stable'
)

# Select by column name rather than column position, so the result does not
# depend on the order of the keys in results_dict.
selected_num_trees = results_df['num_trees'].iloc[0]
selected_num_trees

800