In [1]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

In [2]:
## Getting data ##
learned_data_path = '../clean_learned_EEG/combined_learned.csv'
not_learned_data_path = '../clean_not_learned_EEG/combined_not_learned.csv'

learned_data = pd.read_csv(learned_data_path, header=None)
not_learned_data = pd.read_csv(not_learned_data_path, header=None)

# checking how much data for each category
print(f"original learned shape: {learned_data.shape}")
print(f"original not_learned shape: {not_learned_data.shape}")

# make them equal to remove bias
learned_data = learned_data.sample(frac=1, random_state=42).reset_index(drop=True)
not_learned_data = not_learned_data.sample(frac=1, random_state=42).reset_index(drop=True)

# get length 
learned_length = learned_data.shape[0]
not_learned_length = not_learned_data.shape[0]

new_learned_data = learned_data.drop(index=range(not_learned_length, learned_length))
new_not_learned_data = not_learned_data.drop(index=range(learned_length, not_learned_length))

# print results
print(f"new learned shape: {new_learned_data.shape}")
print(f"new not_learned shape: {not_learned_data.shape}")

original learned shape: (746, 2456)
original not_learned shape: (141, 2456)
new learned shape: (141, 2456)
new not_learned shape: (141, 2456)


In [3]:
# convert pd to a np array for the feature matrix X
X_learned = new_learned_data.values  
X_not_learned = not_learned_data.values

# make the label vector y
'''
to indicate which epochs go to which condition
y vector label should be as long as x matrix is long

zeros = learned
ones = not learned
'''
y_learned = np.zeros(X_learned.shape[0])
y_not_learned = np.ones(X_not_learned.shape[0])

# combine the data and labels
# learned + not learned 
# learned labels + not learned labels
X = np.concatenate([X_learned, X_not_learned], axis=0)
y = np.concatenate([y_learned, y_not_learned], axis=0)

**Default Model**

In [4]:
## MODEL ##

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training
rf = RandomForestClassifier(random_state=41)
rf.fit(X_train, y_train)

# test it
y_pred = rf.predict(X_test)

# get the accuracy
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Confusion Matrix: 
 [[22  4]
 [ 7 24]]
Accuracy: 0.8070175438596491


In [5]:
### Put the model in a pickle
import pickle

with open('my_model.pkl', 'wb') as file:
    pickle.dump(rf, file)

**Optimized Model**

In [13]:
print(rf.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [15]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 150, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


123 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
92 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\AarPi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\AarPi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Users\AarPi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\L

In [23]:
rf_random.best_params_


{'n_estimators': 1600,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [31]:
new_rf = RandomForestClassifier()

new_rf.set_params(**rf_random.best_params_)

# test it
new_rf.fit(X_train, y_train)
y_pred = new_rf.predict(X_test)

# get the accuracy
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Confusion Matrix: 
 [[23  8]
 [ 8 18]]
Accuracy: 0.7192982456140351
