# Machine learning Random Forest models

## Load the dataset

In [58]:
import pandas as pd
import seaborn
import numpy as np
from pylab import *

## Read the csv with the channels ordered by significance

#### Select the patient and the directories and the number of significant channels to train the model

In [59]:
nchannels = 1
patient = '\chb01'
datasetdir = r'C:\Users\Mario\Documents\Master\TFM\Codigo\Jupiter\EEG_ML\DataSetCreation\DatasetsWelch'
fdir = datasetdir + patient + 'features.h5'

#### Read the csv file and stract the ordered channels

In [60]:
channelsdf = pd.read_csv(datasetdir + patient + '_channel_order.csv', delimiter=',')
list_of_rows = [list(row) for row in channelsdf.values]

In [61]:
channel_score_dict = {}
for i, elem in enumerate(list_of_rows[0]):
    channel_score_dict[elem] = i
    
for row, rowlist in enumerate(list_of_rows[1:]):
    for i, elem in enumerate(rowlist):
        channel_score_dict[elem] = channel_score_dict[elem] + i
        
sorted_channels = sorted(channel_score_dict, key=channel_score_dict.get)
significant_channels = sorted_channels[:nchannels]

In [62]:
significant_channels

['FT9-FT10']

### Read Dataset

In [63]:
df = pd.read_hdf(datasetdir + patient + 'features.h5', key = 'fullpatient')

### Extract the desired channels

In [64]:
df = df[(df['channel'] == significant_channels[0])]
df = df.drop(['channel'], axis=1)

In [65]:
df[df['seizure'] == True]

Unnamed: 0,mean,variance,skewness,kurtosis,std,zero_crossings,peak2peak,total_energy,delta,theta,alpha,beta,gamma,seizure
361,0.163932,3154.293104,0.132924,-0.368701,56.163094,62.0,294.850406,5181.014869,2212.371990,2262.965153,884.035069,164.850940,97.218538,True
362,-3.363003,12616.597818,0.257843,1.596332,112.323630,50.0,734.158928,25164.604639,21904.924142,3090.354751,582.918383,202.580573,156.068159,True
363,-2.393524,33270.889695,-0.220567,-0.275883,182.403097,26.0,936.005650,71700.884842,64830.573651,6569.391804,561.658893,482.070473,420.365466,True
364,4.462458,35728.106631,-0.498677,-0.470689,189.018800,23.0,936.005650,72582.043554,67424.964543,4927.892459,624.006343,496.495812,480.370363,True
365,5.919438,29569.069241,-0.502099,-0.464093,171.956591,30.0,823.965651,64397.121875,61265.210019,2762.673411,605.275561,364.707896,249.885274,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4481,-3.487511,24526.576713,0.412570,-0.148388,156.609632,27.0,803.120688,47656.487313,43701.764134,3298.074941,1229.692259,844.388084,127.387650,True
4482,5.956805,35631.452117,-0.163537,-0.522764,188.762952,31.0,937.415190,69748.730243,56984.558758,11650.915016,3174.240999,1612.860894,158.779045,True
4483,8.543069,30012.773601,-0.197092,0.055967,173.241951,37.0,953.473278,54926.729283,37745.405525,16623.534404,2917.281057,1837.849381,180.535644,True
4484,-9.525658,17129.983943,0.212385,0.092414,130.881565,40.0,735.692634,34387.122868,26977.769588,7388.436007,1560.852934,1541.413992,132.844465,True


## Splitting 80-20

In [106]:
def split_train_test(data, test_ratio):
    test_set_size = int(len(data) * test_ratio)
    test_set = data.iloc[:test_set_size]
    train_set = data.iloc[test_set_size:]
    return train_set, test_set

# It is important that there is the moreless the same percentage of seizures in training and test sets
# Separate in 2 subsets
df = df.sample(frac=1)
df_seizures = df[df['seizure'] == True]
df_normal = df[df['seizure'] == False]

train_seizures, test_seizures = split_train_test(df_seizures, 0.2)
train_normal, test_normal = split_train_test(df_normal, 0.2)

df_train = pd.concat([train_normal, train_seizures], axis=0)
df_train = df_train.sample(frac=1)
df_train = df_train.reset_index(drop=True)
x_train, y_train = df_train[df_train.columns.difference(['seizure'])], df_train["seizure"]
df_test = pd.concat([test_normal, test_seizures], axis=0)
df_test = df_test.sample(frac=1)
df_test = df_test.reset_index(drop=True)
x_test, y_test = df_test[df_test.columns.difference(['seizure'])], df_test["seizure"]



In [114]:
df_train[df_train['seizure'] == True]

Unnamed: 0,mean,variance,skewness,kurtosis,std,zero_crossings,peak2peak,total_energy,delta,theta,alpha,beta,gamma,seizure
0,-5.239876,31127.011284,0.158359,-0.480914,176.428488,44.0,892.567711,54552.230912,39002.153636,11205.041910,4379.709077,2861.126808,203.747449,True
46,-3.142085,41671.463782,-0.147355,-0.134918,204.135895,42.0,1032.138598,91915.125984,86965.066864,4787.961786,715.829908,744.868993,437.221084,True
109,0.294945,18429.067300,-0.050355,-0.620892,135.753701,58.0,733.520156,40321.558853,34259.869182,3834.000424,747.681061,919.271647,1541.061083,True
112,2.572435,25505.569219,-0.160523,-0.403538,159.704631,27.0,876.340925,47370.796074,41946.135072,2898.720186,1773.962975,1381.214037,200.179469,True
134,-2.409932,35648.820129,-0.194070,-0.397248,188.808951,28.0,955.932683,78493.638166,66672.199986,14153.772030,1024.966463,1246.349485,269.923700,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5800,1.416713,12898.015050,0.624357,0.550196,113.569428,40.0,634.695019,19445.982552,15320.249120,3504.166206,1051.276049,200.594520,88.474091,True
5840,0.312391,8473.707455,0.109424,-0.347915,92.052743,59.0,444.946465,16020.359704,7326.881110,7780.713705,936.663065,456.675860,171.193359,True
5865,1.229628,22967.018614,-0.107179,0.104206,151.548733,38.0,824.453504,44801.110739,38193.177083,7653.798715,2352.186109,1186.308182,164.681444,True
5896,-0.797663,12741.506029,0.191717,0.702227,112.878280,57.0,659.790265,23725.661736,18793.259646,3893.514406,1210.511046,342.502093,217.933190,True


## Random Forest Classifiers

In [108]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, make_scorer, average_precision_score

In [109]:
rnd_clf = Pipeline((
    ("scaler", StandardScaler()),
    ("rnd", RandomForestClassifier(n_estimators = 50, max_leaf_nodes = 64, n_jobs = -1))
))

In [115]:
rnd_clf.fit(x_train, y_train)

y_train_pred = cross_val_predict(rnd_clf, x_train, y_train, cv=3, n_jobs = 12)
#y_train_pred = rnd_clf.predict(x_train)

In [119]:
x_train

Unnamed: 0,alpha,beta,delta,gamma,kurtosis,mean,peak2peak,skewness,std,theta,total_energy,variance,zero_crossings
0,4379.709077,2861.126808,39002.153636,203.747449,-0.480914,-5.239876,892.567711,0.158359,176.428488,11205.041910,54552.230912,31127.011284,44.0
1,74.941938,91.014198,4144.192312,16.980640,0.678558,-0.008598,262.931957,-0.763534,45.868414,254.120227,4541.244330,2103.911368,27.0
2,72.034448,60.868045,464.353509,8.631288,0.343631,-0.043721,111.900206,-0.211776,18.916937,121.204566,674.295351,357.850498,76.0
3,81.834655,321.591348,952.536951,650.880577,4.661112,0.977609,391.013844,0.336955,39.045686,117.136211,2071.463634,1524.565631,163.0
4,77.922267,42.964372,1760.500438,9.226961,3.490687,-0.402381,245.696762,0.542656,33.500273,198.485546,2050.295548,1122.268275,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5925,330.907842,88.972203,4463.749599,12.163422,0.233953,-0.006436,297.150589,0.424938,51.359827,1008.028331,5774.629487,2637.831831,42.0
5926,51.407340,110.810720,2345.445673,74.389308,2.027974,1.273526,259.906907,-0.974643,37.218884,312.114631,2798.657322,1385.245300,73.0
5927,29.676508,57.737707,284.904607,15.117265,0.210652,0.728111,96.942174,0.007776,16.346414,148.228570,508.563573,267.205261,78.0
5928,175.276302,50.356136,2272.395587,12.265063,0.708983,-0.895960,251.874821,0.162726,40.805300,564.209704,3017.162604,1665.072522,39.0


In [116]:
print('Precision score: ' + str(precision_score(y_train, y_train_pred)))
print('Recall score: ' + str(recall_score(y_train, y_train_pred)))
print('F1 score: ' + str(f1_score(y_train, y_train_pred)))

Precision score: 1.0
Recall score: 1.0
F1 score: 1.0


In [117]:
y_test_pred = rnd_clf.predict(x_test)

print('Precision score: ' + str(precision_score(y_test, y_test_pred)))
print('Recall score: ' + str(recall_score(y_test, y_test_pred)))
print('F1 score: ' + str(f1_score(y_test, y_test_pred)))

Precision score: 0.926829268292683
Recall score: 0.8837209302325582
F1 score: 0.9047619047619047


In [113]:
import pickle
# Save model
filename = 'rnf_chb01.sav'
pickle.dump(gs, open(filename, 'wb'))

## Grid search

### Linear and rbf kernel

In [52]:
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import classification_report

In [55]:
pipe_rnf = Pipeline((
        ("scl", StandardScaler()),
        ("clf", RandomForestClassifier())
    ))
# Establish params
param_grid = [
                {'clf__n_estimators': [50, 100, 200, 300, 400, 500], 'clf__max_leaf_nodes': [4,8,16,32,64]}]

gs = GridSearchCV(estimator=pipe_rnf,param_grid=param_grid, cv = 5, scoring='roc_auc', return_train_score=True, n_jobs = -1)
gs.fit(x_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=(('scl',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('clf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                 

In [56]:
gs.best_estimator_

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=64, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=500, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [57]:
y_true, y_pred = y_test, gs.predict(x_test)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00      1439
        True       0.90      0.84      0.87        43

    accuracy                           0.99      1482
   macro avg       0.95      0.92      0.93      1482
weighted avg       0.99      0.99      0.99      1482

