In [16]:
import pandas as pd
import mne as mne
import os 
import time
import numpy as np
import matplotlib.pyplot as plt
import joblib
import constants
from IPython.utils import io
import time
import sys
import yasa
from scipy.signal import welch

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupShuffleSplit


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold

#Import my modules
import format_eeg_data
import constants
import eeg_stat_ts
import run_expts

# Set display options to show all rows and columns
pd.set_option('display.max_rows', 50)  # Show rows
pd.set_option('display.max_columns', 160)  # Show columns

In [3]:
# Quicker version of param grid - 160 fits 
quick_param_grid = { 
    'n_estimators': [10, 40, 100, 250],
    'max_features': [None, 'sqrt'],
    'max_depth' : [3,5,8, None],
    'criterion' :['gini'] }

# Slower Parameter grid for Random Forest - 600 fits
param_grid = { 
    'n_estimators': [10, 30, 100, 200, 500],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth' : [3,5,8, None],
    'criterion' :['gini', 'entropy'] }

clf =  GridSearchCV( RandomForestClassifier(), param_grid, refit = True, verbose = 1)

### Dictionaries

In [48]:
def gen_expt_dictionaries(X):
    # 1. #Generate region to features dictionary to enable experiments to be run regionally
    regional_features_dict = {}
    region_channel_dict = constants.region_to_channel_dict
    regions = list(region_channel_dict.keys())
    for region in regions:
        region_features = [col for col in X.columns if '_' + region in col]
        if len(region_features) > 0 : 
            regional_features_dict[region] = region_features

    # 2. #Create the combined regions dictionary
    regions = list(regional_features_dict.keys())
    combined_regions_features_dict = {}
    for i, region_1 in enumerate(regions):
        for region_2 in regions[i+1:]:
            new_key = region_1 + '_' + region_2
            combined_regions_features_dict[new_key] = regional_features_dict[region_1] + regional_features_dict[region_2]
    
    #3. Use all of the features
    all_data_dict = {'All_regions' : list(X.columns) , 'All_regions_2' : list(X.columns)}
    
    return regional_features_dict , combined_regions_features_dict, all_data_dict

In [56]:
regional_features_dict , combined_regions_features_dict, all_data_dict = gen_expt_dictionaries(X_full)

In [59]:
lens = []
for key in combined_regions_features_dict.keys():
    lens.append(len(combined_regions_features_dict[key]))
len(lens)

78

 ### Load the static features (Wake data testing)

In [55]:
core_path = '/user/home/ko20929/work/RBD_using_custom_package/Execute New Experiments/Baseline_Extensions/Gen_New_Features/generated_feats/'
data_types = ['N2', 'N3','REM', 'Wake', 'N1']
data_type = 'Wake' # Select data_type to load

load_path = core_path + data_type

X = pd.read_hdf(load_path + 'six_second_max_freq_stats_df.h5', key='df', mode='r')
y = pd.read_hdf(load_path + '_y.h5', key='df', mode='r') 
groups = pd.read_hdf(load_path + '_groups.h5', key='df', mode='r')  

# Replace time sereis data with mean of the data ______________________________________________________________________________________________________

# Defining a function to replace time series of values with their mean
def function(x):
    return x.values.mean()
    
#Construct the static features
static_features_df = X.apply(np.vectorize(function))
X = static_features_df.copy()
X_full = X.copy()
X = X[[col for col in X.columns if 'Prefrontal' in col]]

### Test fitting

In [66]:
gkf = GroupKFold(n_splits = 5) 
clf =  GridSearchCV( RandomForestClassifier(), quick_param_grid , refit = True, verbose = 1, cv = GroupKFold(n_splits = 4) )
# clf =  GridSearchCV( RandomForestClassifier(), quick_param_grid , refit = True, verbose = 1, cv = None )

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
groups_train = groups.loc[y_train.index]

In [68]:
t1 = time.time()
clf.fit(X_train, y_train, groups = groups_train)
t2 = time.time()
t2-t1

Fitting 4 folds for each of 32 candidates, totalling 128 fits


KeyboardInterrupt: 

In [63]:
# 11.5 seconds for one region 
# 1 region x 13 regions x 2 (CV twice) x 11 seconds x 5 fold cross validation
# 25 minutes estimated run time 
print( (1*13*2*11*5)/60 )

print( (1*78*2*11*5)/(60*60)  )

23.833333333333332
2.3833333333333333


### Best Parameters Using Groups

In [43]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'max_features': None,
 'n_estimators': 250}

In [40]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'sqrt',
 'n_estimators': 250}

In [37]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'sqrt',
 'n_estimators': 10}

### Best Parameters not using Groups

In [33]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'sqrt',
 'n_estimators': 10}

In [30]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'sqrt',
 'n_estimators': 250}

In [27]:
clf.best_params_

{'criterion': 'gini', 'max_depth': 8, 'max_features': None, 'n_estimators': 10}