In [8]:
import pandas as pd
import mne as mne
import os 
import time
import numpy as np
import matplotlib.pyplot as plt
import joblib
import constants
from IPython.utils import io
import time
import sys
import yasa
from scipy.signal import welch

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupShuffleSplit


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold

#Import my modules
import format_eeg_data
import constants
import eeg_stat_ts
import run_expts

# Cap how much of a DataFrame is rendered: at most 50 rows and 160 columns
pd.options.display.max_rows = 50
pd.options.display.max_columns = 160

In [26]:
# Hyper-parameter grids for GridSearchCV.
# The number of candidates is the product of the list lengths in a grid;
# the total number of fits is that count multiplied by the number of CV folds.

# Quick Random Forest grid: 6 * 2 * 5 * 1 = 60 candidates
quick_param_grid = { 
    'n_estimators': [1,2, 10, 40, 100, 250],
    'max_features': [None, 'sqrt'],
    'max_depth' : [2,3,5,8, None],
    'criterion' :['gini'] }


# Extended Random Forest grid (adds 'entropy' and min_samples_split): 6 * 2 * 5 * 2 * 4 = 480 candidates
long_param_grid = { 
    'n_estimators': [1,2, 10, 40, 100, 250],
    'max_features': [None, 'sqrt'],
    'max_depth' : [2,3,5,8, None],
    'criterion' :['gini',  'entropy'],
    'min_samples_split' : [2,3,4,5]
}

# Alternative Random Forest grid with larger forests: 5 * 3 * 4 * 2 = 120 candidates
param_grid = { 
    'n_estimators': [10, 30, 100, 200, 500],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth' : [3,5,8, None],
    'criterion' :['gini', 'entropy'] }

# Decision Tree grid: 5 * 5 * 2 = 50 candidates
DT_params =  {
    'min_samples_leaf': [1, 2, 3 , 5 ,10],
    'max_depth': [1, 2, 3, 5, None],
    'criterion': ["gini", "entropy"]
}

# AdaBoost grid: 8 * 6 = 48 candidates
Ada_grid =  { 
    'n_estimators': [2, 3, 5, 10, 20, 40, 50, 100],
    'learning_rate': [0.01, 0.1, 0.4, 1.0 , 2.0, 10.0],
    
    }

### Load the static features (Wake data testing)

In [10]:
# Directory that holds the pre-generated feature files.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on a machine where it exists.
core_path = '/user/home/ko20929/work/RBD_using_custom_package/Execute New Experiments/Baseline_Extensions/Gen_New_Features/generated_feats/'
data_types = ['N2', 'N3','REM', 'Wake', 'N1']  # listed options; only `data_type` below is used in this cell
data_type = 'Wake' # Select data_type to load

# File names are built by plain string concatenation, so this value acts as a file-name prefix.
load_path = core_path + data_type

# Feature table. With data_type = 'Wake' the file name is 'Wakesix_second_max_freq_stats_df.h5'
# (no separator after the data type, unlike the '_y' / '_groups' files).
# NOTE(review): confirm this matches the name the file was saved under.
X = pd.read_hdf(load_path + 'six_second_max_freq_stats_df.h5', key='df', mode='r')
# Presumably the class labels and the per-sample group ids used for grouped CV - TODO confirm.
y = pd.read_hdf(load_path + '_y.h5', key='df', mode='r') 
groups = pd.read_hdf(load_path + '_groups.h5', key='df', mode='r')  

# Replace time series data with the mean of the data ______________________________________________________________________________________________________

# Defining a function to replace time series of values with their mean
def function(x):
    """Collapse one time series into a single number.

    Parameters
    ----------
    x : object exposing ``.values`` (e.g. a pandas Series)
        The time series stored in one cell of the feature table.

    Returns
    -------
    The arithmetic mean of ``x.values``.
    """
    samples = x.values
    return samples.mean()
    
#Construct the static features
# DataFrame.apply hands each column to the vectorised helper, which in turn calls
# `function` on every cell of that column, so each cell is replaced by the mean of its values.
static_features_df = X.apply(np.vectorize(function))
# From here on X refers to the static (one number per cell) table, not the loaded time-series table.
X = static_features_df.copy()
# Separate copy of the full static table; this is what the dictionary generation below receives.
X_full = X.copy()

### Dictionaries

In [11]:
#Function to generate the dictionaries

def gen_expt_dictionaries(X):
    """Build the column-subset dictionaries used to run experiments per region.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; only its column names are inspected.

    Returns
    -------
    regional_features_dict : dict
        Region name -> list of columns whose name contains ``'_' + region``.
        Regions come from ``constants.region_to_channel_dict``; regions with
        no matching column are left out.
    combined_regions_features_dict : dict
        One entry per unordered pair of regions present in
        ``regional_features_dict``. The key is ``'<region_1>_<region_2>'`` and
        the value is the first region's columns followed by the second's.
    all_data_dict : dict
        Two keys, ``'All_regions'`` and ``'All_regions_2'``, each mapped to the
        list of every column in ``X``.
    """
    column_names = list(X.columns)

    # 1. Columns belonging to each individual region
    regional_features_dict = {}
    for region in constants.region_to_channel_dict.keys():
        marker = '_' + region
        matching = [name for name in column_names if marker in name]
        if matching:
            regional_features_dict[region] = matching

    # 2. Every pair of regions that actually has features (each pair once, in order)
    present_regions = list(regional_features_dict)
    combined_regions_features_dict = {
        first + '_' + second: regional_features_dict[first] + regional_features_dict[second]
        for position, first in enumerate(present_regions)
        for second in present_regions[position + 1:]
    }

    # 3. All features together (two independent lists under two keys)
    all_data_dict = {
        'All_regions': list(column_names),
        'All_regions_2': list(column_names),
    }

    return regional_features_dict, combined_regions_features_dict, all_data_dict

# Build the three experiment dictionaries from the full static feature table
regional_features_dict, combined_regions_features_dict, all_data_dict = gen_expt_dictionaries(X_full)

# Test dictionaries
# Single-entry dictionary holding only the Prefrontal + Left Temporal pairing
prefrontal_left_temp_dict = {
    'Prefrontal_Left Temporal': combined_regions_features_dict['Prefrontal_Left Temporal']
}

# Every column whose name contains 'Prefrontal'
Prefrontal_only_dict = {'Prefrontal': [name for name in X.columns if 'Prefrontal' in name]}

# Check how many combined regions there are (one list length per combined-region entry)
lens = [len(feature_cols) for feature_cols in combined_regions_features_dict.values()]
len(lens)

78

### Test fitting

### Expt 1

In [23]:
# Experiment 1: grid-searched Random Forest on the Prefrontal + Left Temporal subset
t1 = time.time()

X_expt, y_expt, groups_expt, expt_info = run_expts.generate_expt_x_y_groups(
    X, y, groups, 1
)

# NOTE(review): RandomForestClassifier() is created without a random_state; unless
# run_mv_tsc sets one, repeated runs may select different best parameters.
results_df = run_expts.run_mv_tsc(
    X_expt,
    y_expt,
    groups_expt,
    {
        'RF': GridSearchCV(
            RandomForestClassifier(),
            long_param_grid,
            refit=True,
            verbose=1,
            cv=GroupKFold(n_splits=4),
        )
    },
    return_df=True,
    subset_names_and_cols=prefrontal_left_temp_dict,
    random_states=[1, 2],
    groups_for_fit=True,
    best_params=True,
)

# Leave out every column whose name contains 'params' before summarising
res_df = run_expts.generate_subset_acc_std(
    results_df.drop(columns=[name for name in results_df.columns if 'params' in name]),
    return_df=True,
)
display(pd.DataFrame(res_df))

t2 = time.time()

# Wall-clock seconds taken by the cell (shown as the cell output)
t2 - t1

Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits
Fitting 4 folds for each of 50 candidates, totalling 200 fits


Unnamed: 0,random_state,Prefrontal_Left Temporal_RF_acc,Prefrontal_Left Temporal_RF_std
0,1,0.7,0.066667
1,2,0.566667,0.169967


141.73037934303284

#### Expt 2

In [29]:
# Join the entries stored at column position 3 of the first two result rows and print them one per line.
# NOTE(review): presumably column 3 is the best-params column and rows 0/1 are the two random states -
# the positional lookup depends on the layout returned by run_mv_tsc; confirm.
for params in results_df.iloc[0,3] + results_df.iloc[1,3]:
    print(params)

{'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'n_estimators': 2}
{'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 2}
{'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'n_estimators': 100}
{'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 10}
{'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'n_estimators': 40}
{'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 2}
{'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'n_estimators': 10}
{'criterion': 'gini', 'max_depth': None, 'max_features': None, 'n_estimators': 40}
{'criterion': 'gini', 'max_depth': None, 'max_features': None, 'n_estimators': 2}
{'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'n_estimators': 100}


In [50]:
# Experiment 2: quick-grid Random Forest on the Prefrontal + Left Temporal subset
t1 = time.time()

X_expt, y_expt, groups_expt, expt_info = run_expts.generate_expt_x_y_groups(
    X, y, groups, 2
)

# NOTE(review): RandomForestClassifier() is created without a random_state; unless
# run_mv_tsc sets one, repeated runs may select different best parameters.
results_df = run_expts.run_mv_tsc(
    X_expt,
    y_expt,
    groups_expt,
    {
        'RF': GridSearchCV(
            RandomForestClassifier(),
            quick_param_grid,
            refit=True,
            verbose=1,
            cv=GroupKFold(n_splits=3),
        )
    },
    return_df=True,
    subset_names_and_cols=prefrontal_left_temp_dict,
    random_states=[1, 2],
    groups_for_fit=True,
    best_params=True,
)

# Leave out every column whose name contains 'params' before summarising
res_df = run_expts.generate_subset_acc_std(
    results_df.drop(columns=[name for name in results_df.columns if 'params' in name]),
    return_df=True,
)
display(pd.DataFrame(res_df))

t2 = time.time()

# Wall-clock seconds taken by the cell (shown as the cell output)
t2 - t1

Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 60 candidates, totalling 180 fits


Unnamed: 0,random_state,Prefrontal_Left Temporal_RF_acc,Prefrontal_Left Temporal_RF_std
0,1,0.814286,0.068325
1,2,0.789286,0.064286


105.48987793922424

In [51]:
# Join the entries stored at column position 3 of the first two result rows and print them one per line.
# NOTE(review): presumably column 3 is the best-params column and rows 0/1 are the two random states -
# the positional lookup depends on the layout returned by run_mv_tsc; confirm.
for params in results_df.iloc[0,3] + results_df.iloc[1,3]:
    print(params)

{'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'n_estimators': 40}
{'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 10}
{'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'n_estimators': 10}
{'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'n_estimators': 1}
{'criterion': 'gini', 'max_depth': None, 'max_features': None, 'n_estimators': 10}
{'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'n_estimators': 1}
{'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 1}
{'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 2}
{'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'n_estimators': 10}
{'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 1}


### Decision Tree Expt 2 

In [44]:
# Experiment 2 with a single, un-tuned Decision Tree (fixed random_state) on the same subset
t1 = time.time()

X_expt, y_expt, groups_expt, expt_info = run_expts.generate_expt_x_y_groups(
    X, y, groups, 2
)

results_df = run_expts.run_mv_tsc(
    X_expt,
    y_expt,
    groups_expt,
    {'DT': DecisionTreeClassifier(random_state=99)},
    return_df=True,
    subset_names_and_cols=prefrontal_left_temp_dict,
    random_states=[1, 2],
)

# Leave out every column whose name contains 'params' before summarising
res_df = run_expts.generate_subset_acc_std(
    results_df.drop(columns=[name for name in results_df.columns if 'params' in name]),
    return_df=True,
)
display(pd.DataFrame(res_df))

# Mean of the accuracy column across the rows of the summary
print(res_df.mean()['Prefrontal_Left Temporal_DT_acc'])

t2 = time.time()

# Wall-clock seconds taken by the cell (shown as the cell output)
t2 - t1

Unnamed: 0,random_state,Prefrontal_Left Temporal_DT_acc,Prefrontal_Left Temporal_DT_std
0,1,0.946429,0.065854
1,2,0.925,0.1


0.9357142857142857


0.09488415718078613

#### Fitted Decision Tree, AdaBoost Expt 2

In [30]:
# Experiment 2 with a grid-searched AdaBoost classifier on the Prefrontal + Left Temporal subset.
# The two commented-out calls below are alternative runs (grid-searched Decision Tree,
# default AdaBoost); exactly one `results_df = ...` line should be active at a time.
t1 = time.time()

X_expt, y_expt, groups_expt, expt_info = run_expts.generate_expt_x_y_groups(
    X, y, groups, 2
)

# results_df = run_expts.run_mv_tsc(X_expt,y_expt,groups_expt, {'DT' : GridSearchCV( DecisionTreeClassifier(), DT_params , refit = True, verbose = 1, cv = GroupKFold(n_splits = 4) ) }, return_df = True , subset_names_and_cols = prefrontal_left_temp_dict, random_states = [1,2], groups_for_fit = True, best_params = True)

# results_df = run_expts.run_mv_tsc(X_expt,y_expt,groups_expt, {'Ada' : AdaBoostClassifier() }, return_df = True , subset_names_and_cols = prefrontal_left_temp_dict, random_states = [1,2])

# NOTE(review): AdaBoostClassifier() is created without a random_state; unless
# run_mv_tsc sets one, repeated runs may select different best parameters.
results_df = run_expts.run_mv_tsc(
    X_expt,
    y_expt,
    groups_expt,
    {
        'Ada': GridSearchCV(
            AdaBoostClassifier(),
            Ada_grid,
            refit=True,
            verbose=1,
            cv=GroupKFold(n_splits=4),
        )
    },
    return_df=True,
    subset_names_and_cols=prefrontal_left_temp_dict,
    random_states=[1, 2],
    groups_for_fit=True,
    best_params=True,
)

# (A bare `Ada_grid` expression used to sit here. Only the last expression of a cell is
# displayed, so it had no effect and was removed.)

# Leave out every column whose name contains 'params' before summarising
res_df = run_expts.generate_subset_acc_std(
    results_df.drop(columns=[name for name in results_df.columns if 'params' in name]),
    return_df=True,
)
display(pd.DataFrame(res_df))

# Mean of the accuracy column across the rows of the summary
print(res_df.mean()['Prefrontal_Left Temporal_Ada_acc'])

t2 = time.time()

# Wall-clock seconds taken by the cell (shown as the cell output)
t2 - t1

Fitting 4 folds for each of 48 candidates, totalling 192 fits
Fitting 4 folds for each of 48 candidates, totalling 192 fits
Fitting 4 folds for each of 48 candidates, totalling 192 fits
Fitting 4 folds for each of 48 candidates, totalling 192 fits
Fitting 4 folds for each of 48 candidates, totalling 192 fits
Fitting 4 folds for each of 48 candidates, totalling 192 fits
Fitting 4 folds for each of 48 candidates, totalling 192 fits
Fitting 4 folds for each of 48 candidates, totalling 192 fits
Fitting 4 folds for each of 48 candidates, totalling 192 fits
Fitting 4 folds for each of 48 candidates, totalling 192 fits


Unnamed: 0,random_state,Prefrontal_Left Temporal_Ada_acc,Prefrontal_Left Temporal_Ada_std
0,1,0.682143,0.132865
1,2,0.792857,0.094895


0.7375


74.26678895950317

In [21]:
# Join the entries stored at column position 3 of the first two result rows; the combined list is the cell output.
# NOTE(review): this reads whatever `results_df` was produced last, so what it shows depends on which
# experiment cell ran most recently - re-run after the intended experiment to refresh it.
results_df.iloc[0,3] + results_df.iloc[1,3] 

[{'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 3},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 10},
 {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 3},
 {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2},
 {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1},
 {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2},
 {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1},
 {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1},
 {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1},
 {'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 10}]