In [1]:
# data processing
import numpy as np
import pandas as pd

# model stuff
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval
#%pip install hyperopt

Load in data

In [2]:
# learned_data_path = '../clean_learned_EEG/clean_learned_Irby.csv'
# not_learned_data_path = '../clean_not_learned_EEG/clean_notLearned_Irby.csv'

learned_data_path = '../clean_learned_EEG/combined_learned.csv'
not_learned_data_path = '../clean_not_learned_EEG/combined_not_learned.csv'

learned_data = pd.read_csv(learned_data_path, header=None)
not_learned_data = pd.read_csv(not_learned_data_path, header=None)

# fill na with the mean value of the data
# learned_data.fillna(learned_data.mean())
# not_learned_data.fillna(not_learned_data.mean())

# checking how much data for each category
print(f"original learned shape: {learned_data.shape}")
print(f"original not_learned shape: {not_learned_data.shape}")

# make them equal to remove bias
learned_data = learned_data.sample(frac=1, random_state=42).reset_index(drop=True)
not_learned_data = not_learned_data.sample(frac=1, random_state=42).reset_index(drop=True)

# get length 
learned_length = learned_data.shape[0]
not_learned_length = not_learned_data.shape[0]

new_learned_data = learned_data.drop(index=range(not_learned_length, learned_length))
new_not_learned_data = not_learned_data.drop(index=range(learned_length, not_learned_length))

# print results
print(f"new learned shape: {new_learned_data.shape}")
print(f"new not_learned shape: {not_learned_data.shape}")



original learned shape: (746, 2456)
original not_learned shape: (141, 2456)
new learned shape: (141, 2456)
new not_learned shape: (141, 2456)


**Normal Model**

In [4]:
# Train test split
# can add .values to these 
X_learned = new_learned_data.values
X_not_learned = new_not_learned_data.values

# make the label vector y
'''
to indicate which epochs go to which condition
y vector label should be as long as x matrix is long

zeros = learned
ones = not learned
'''
y_learned = np.zeros(X_learned.shape[0])
y_not_learned = np.ones(X_not_learned.shape[0])

# combine the data and labels
# learned + not learned 
# learned labels + not learned labels
X = np.concatenate([X_learned, X_not_learned], axis=0)
y = np.concatenate([y_learned, y_not_learned], axis=0)

# converting X into a dataframe
X = pd.DataFrame(X)

# get size of X and Y

# split into training and test data
test_size_n = 0.2   # sets this percentage aside for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_n, random_state=0)

model = SVC(kernel='rbf')
#from SVM scikit-learn library, uses radial basis function
# use balanced to account for the bias in data, class_weight='balanced'
# without this it counts EVERYTHING as positive

# kernel options: linear, rbf, poly, 

# train the data using fit
model.fit(X_train, y_train)

# make predictions based on the testing data from before
y_pred = model.predict(X_test)

# evauluate the performance of the SVM model by caclulating confusino matrix
# and the accuracy score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

#model.predict()     # new data goes in here


Confusion Matrix: 
 [[14 13]
 [ 7 23]]
Accuracy:  0.6491228070175439


In [8]:
### Put the model in a pickle
import pickle

with open('my_model.pkl', 'wb') as file:
    pickle.dump(model, file)

Optimizing stuff

In [None]:
# Standardization
sc = StandardScaler()

# standardize the training dataset and test set, use fillna(0) instead of dropna for now, but this should probably change later
X_train_transformed = pd.DataFrame(sc.fit_transform(X_train),index=X_train.index, columns=X_train.columns)
X_test_transformed = pd.DataFrame(sc.fit_transform(X_test),index=X_test.index, columns=X_test.columns)

# summarize features
X_train_transformed.head()
#X_train_transformed.describe().T

# Default SVM
svc = SVC()
params = svc.get_params()
params_df = pd.DataFrame(params, index=[0])
params_df.T

# run model on default SVM parameters
svc.fit(X_train_transformed, y_train)

# evauluate the performance of the SVM model by caclulating confusinon matrix
# and the accuracy score
# train the data using fit

print(f'accuracy: {svc.score(X_test_transformed, y_test):.4f}')

# note: already seeing improved accuracy, likely because of the scalar from before

accuracy: 0.5439


**bayesian**

In [3]:
# Train test split
# can add .values to these 
X_learned = new_learned_data
X_not_learned = new_not_learned_data

# make the label vector y
'''
to indicate which epochs go to which condition
y vector label should be as long as x matrix is long

zeros = learned
ones = not learned
'''
y_learned = np.zeros(X_learned.shape[0])
y_not_learned = np.ones(X_not_learned.shape[0])

# combine the data and labels
# learned + not learned 
# learned labels + not learned labels
X = np.concatenate([X_learned, X_not_learned], axis=0)
y = np.concatenate([y_learned, y_not_learned], axis=0)

# converting X into a dataframe
X = pd.DataFrame(X)

# get size of X and Y

# split into training and test data
test_size_n = 0.2   # sets this percentage aside for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_n, random_state=0)

# Standardization
sc = StandardScaler()

# standardize the training dataset and test set, use fillna(0) instead of dropna for now, but this should probably change later
X_train_transformed = pd.DataFrame(sc.fit_transform(X_train),index=X_train.index, columns=X_train.columns)
X_test_transformed = pd.DataFrame(sc.fit_transform(X_test),index=X_test.index, columns=X_test.columns)

# summarize features
X_train_transformed.head()
#X_train_transformed.describe().T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2446,2447,2448,2449,2450,2451,2452,2453,2454,2455
176,-0.206464,-0.190938,-0.178066,-0.170313,-0.168215,-0.170613,-0.175794,-0.182612,-0.190492,-0.198731,...,-0.021922,-0.017298,-0.014247,-0.014481,-0.018974,-0.02704,-0.03652,-0.04523,-0.051843,-0.055526
200,1.707029,1.68893,1.66725,1.647228,1.632279,1.621858,1.611472,1.595388,1.570561,1.539134,...,-0.039092,-0.035335,-0.029926,-0.020969,-0.007996,0.007347,0.021189,0.029502,0.030786,0.026491
157,0.079256,0.073492,0.061102,0.048427,0.042049,0.046039,0.060071,0.079262,0.096127,0.104153,...,-0.063284,-0.058246,-0.057236,-0.059959,-0.064894,-0.069785,-0.072527,-0.071987,-0.068399,-0.063212
27,-0.31606,-0.308396,-0.299679,-0.289187,-0.276862,-0.263954,-0.253153,-0.247845,-0.25057,-0.261403,...,-0.076672,-0.073722,-0.069672,-0.065444,-0.061739,-0.058738,-0.056139,-0.053471,-0.050451,-0.047203
173,-0.280807,-0.270437,-0.265142,-0.264004,-0.263988,-0.261272,-0.252921,-0.238292,-0.219414,-0.199988,...,-0.053309,-0.05629,-0.057173,-0.056943,-0.057059,-0.058535,-0.061383,-0.064666,-0.067059,-0.067685


In [5]:
"""
use hyper opt for this bayesian modeling. 
define an objective function that takes in the parameters and returns the loss. we want to maximize the accuracy so we set
max(scores) as the best_score, and the set loss to be -best_score. 

use fmin to optimize the obejctive function. Hyperopt currently has three algorithms: random search,
Tree of Parzen Estimators, and adaptive.TPE. we use TPE as the search algorithm
"""
# increae the amount of C values and gamma values
C_range = np.logspace(-10, 10, 21)
print(f'C values: {C_range}')

# list of gamma values
gamma_range = np.logspace(-10, 10, 21)
print(f'gamma values: {gamma_range}')


# space
space = {
    'C': hp.choice('C', C_range),
    'gamma': hp.choice('gamma', gamma_range.tolist()+['scale', 'auto']),
    'kernel': hp.choice('kernel', ['rbf', 'poly'])
}

# set up k-fold
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# objective function
def objective(params):

    svc = SVC(**params)
    scores = cross_val_score(svc, X_train_transformed, y_train, cv=kfold, scoring='accuracy', n_jobs=-1)

    best_score = max(scores)

    loss = -best_score

    return {'loss':loss, 'params': params, 'status': STATUS_OK}

# track progress
bayes_trials = Trials()

# optimize
best = fmin(fn = objective, space = space, algo=tpe.suggest, max_evals=15, trials=bayes_trials)

# note that it can't run too many trials, 15 may be the max

C values: [1.e-10 1.e-09 1.e-08 1.e-07 1.e-06 1.e-05 1.e-04 1.e-03 1.e-02 1.e-01
 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04 1.e+05 1.e+06 1.e+07 1.e+08 1.e+09
 1.e+10]
gamma values: [1.e-10 1.e-09 1.e-08 1.e-07 1.e-06 1.e-05 1.e-04 1.e-03 1.e-02 1.e-01
 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04 1.e+05 1.e+06 1.e+07 1.e+08 1.e+09
 1.e+10]
 80%|████████  | 12/15 [11:55<02:58, 59.64s/trial, best loss: -0.5333333333333333]


KeyboardInterrupt: 

In [None]:
print(best)

print(space_eval(space, best))

{'C': np.int64(18), 'gamma': np.int64(8), 'kernel': np.int64(0)}
{'C': np.float64(100000000.0), 'gamma': 0.01, 'kernel': 'rbf'}


In [None]:
# now make predictions
svc_bo = SVC(C=space_eval(space,best)['C'], gamma=space_eval(space,best)['gamma'],kernel=space_eval(space,best)['kernel'])

svc_bo.fit(X_test_transformed, y_test)

print(f'accuracy: {svc_bo.score(X_test_transformed, y_test)}')


accuracy: 1.0
Confusion Matrix: 
 [[14 13]
 [ 7 23]]


**grid search**

In [None]:
## Hyperparameter Tuning with Grid Search ##

# look at the parameters defined above

# list of C values
C_range = np.logspace(-1,1,3)
print(f'C values are {C_range}')

# list of gamma values
gamma_range = np.logspace(-1,1,3)
print(f'gamma values are {gamma_range}')

"""
scoring is the metric to evaluate cross validation results for each model. set scoring = 'accuracy'
the scoring option can take more than one metric in the list

StratifieldKFold is used for cross validaiton. keeps the class ration in the folds the same as training dataset. n_splits = 3
means we are doing 3 fold cross validation. shuffle=True shuffles the data before splitting. random_state = 0 does as it sounds
"""

# define the search space
param_grid = {
    # regularization parameter
    "C": C_range,
    #Kernel type
    "kernel": ['rbf', 'poly'],
    # gamma
    "gamma": gamma_range.tolist()+['scale', 'atuo']
}

# set up score
scoring = ['accuracy']

# k fold cross validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# define grid search
grid_search = GridSearchCV(estimator=svc, #since we're using svc
                           param_grid=param_grid, # which we defined above
                           scoring=scoring, #defined as accuracy
                           refit='accuracy', # enables refitting the model with the best parameters
                           n_jobs=1, # parallel processing using all the processors
                           cv=kfold, #takes StratifieldKFold that we defined
                           verbose=0, # controls how much info is returned, =0 makes it silent 
                           )

# fit grid search
grid_result = grid_search.fit(X_train_transformed, y_train) 
#similar to running the model before, but now it's like going through several models

grid_result


## get the best values

# best accuracy score
print(f'best accuracy score: {grid_result.best_score_}')

# get hyperparameters of the best score
print(f'best hyperparameters: {grid_result.best_params_}')

# best accuracy score for the testing dataset
print(f'accuracy score for testing dataset: {grid_search.score(X_test_transformed, y_test)}')



**random search**

In [None]:
### Hyperparameter Optimizing with Random Search ###

# increae the amount of C values and gamma values
C_range = np.logspace(-10, 10, 21)
print(f'C values: {C_range}')

# list of gamma values
gamma_range = np.logspace(-10, 10, 21)
print(f'gamma values: {gamma_range}')

## Same code as with grid search, but one change in random_search

# define the search space
param_grid = {
    # regularization parameter
    "C": C_range,
    #Kernel type
    "kernel": ['rbf', 'poly'],
    # gamma
    "gamma": gamma_range.tolist()+['scale', 'atuo']
}

# set up score
scoring = ['accuracy']

# k fold cross validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# define grid search
random_search = RandomizedSearchCV(estimator=svc, #since we're using svc
                           param_distributions=param_grid, # which we defined above
                           n_iter=100,   #this is the change, testing 100 combdinations
                           scoring=scoring, #defined as accuracy
                           refit='accuracy', # enables refitting the model with the best parameters
                           n_jobs=1, # parallel processing using all the processors
                           cv=kfold, #takes StratifieldKFold that we defined
                           verbose=0, # controls how much info is returned, =0 makes it silent 
                           )

# fit grid search
grid_result = grid_search.fit(X_train_transformed, y_train) 
#similar to running the model before, but now it's like going through several models

grid_result


## get the best values

# best accuracy score
print(f'best accuracy score: {grid_result.best_score_}')

# get hyperparameters of the best score
print(f'best hyperparameters: {grid_result.best_params_}')

# best accuracy score for the testing dataset
print(f'accuracy score for testing dataset: {grid_search.score(X_test_transformed, y_test)}')