In [1]:
# data processing
import numpy as np
import pandas as pd

# model stuff
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval
#%pip install hyperopt

Load in data

In [None]:
# learned_data_path = '../clean_learned_EEG/clean_learned_Irby.csv'
# not_learned_data_path = '../clean_not_learned_EEG/clean_notLearned_Irby.csv'

learned_data_path = '../clean_learned_EEG/combined_learned.csv'
not_learned_data_path = '../clean_not_learned_EEG/combined_notLearned.csv'

learned_data = pd.read_csv(learned_data_path, header=None)
not_learned_data = pd.read_csv(not_learned_data_path, header=None)

# fill na with the mean value of the data
# learned_data.fillna(learned_data.mean())
# not_learned_data.fillna(not_learned_data.mean())

# checking how much data for each category
print(f"original learned shape: {learned_data.shape}")
print(f"original not_learned shape: {not_learned_data.shape}")

# make them equal to remove bias
learned_data = learned_data.sample(frac=1, random_state=42).reset_index(drop=True)
not_learned_data = not_learned_data.sample(frac=1, random_state=42).reset_index(drop=True)

# get length 
learned_length = learned_data.shape[0]
not_learned_length = not_learned_data.shape[0]

new_learned_data = learned_data.drop(index=range(not_learned_length, learned_length))
new_not_learned_data = not_learned_data.drop(index=range(learned_length, not_learned_length))

# print results
print(f"new learned shape: {new_learned_data.shape}")
print(f"new not_learned shape: {not_learned_data.shape}")



original learned shape: (2, 14736)
original not_learned shape: (2, 14736)
new learned shape: (2, 14736)
new not_learned shape: (2, 14736)


In [53]:
# check if the other files have nan
chenguyi = pd.read_csv('../clean_learned_EEG/clean_learned_Chengyi.csv', header=None)

def nan_finder(df):
    has_nan = df.isna().any().any()

    if has_nan:
        print("The DataFrame contains NaN values.")
    else:
        print("The DataFrame does not contain NaN values.")

nan_finder(chenguyi)

The DataFrame does not contain NaN values.


**Normal Model**

In [47]:
# Train test split
# can add .values to these 
X_learned = new_learned_data.values
X_not_learned = new_not_learned_data.values

# make the label vector y
'''
to indicate which epochs go to which condition
y vector label should be as long as x matrix is long

zeros = learned
ones = not learned
'''
y_learned = np.zeros(X_learned.shape[0])
y_not_learned = np.ones(X_not_learned.shape[0])

# combine the data and labels
# learned + not learned 
# learned labels + not learned labels
X = np.concatenate([X_learned, X_not_learned], axis=0)
y = np.concatenate([y_learned, y_not_learned], axis=0)

# get size of X and Y

# split into training and test data
test_size_n = 0.2   # sets this percentage aside for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_n, random_state=0)

model = SVC(kernel='rbf')
#from SVM scikit-learn library, uses radial basis function
# use balanced to account for the bias in data, class_weight='balanced'
# without this it counts EVERYTHING as positive

# kernel options: linear, rbf, poly, 

# train the data using fit
model.fit(X_train, y_train)

# make predictions based on the testing data from before
y_pred = model.predict(X_test)

# evauluate the performance of the SVM model by caclulating confusino matrix
# and the accuracy score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

#model.predict()     # new data goes in here


ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Standardization
sc = StandardScaler()

# standardize the training dataset and test set, use fillna(0) instead of dropna for now, but this should probably change later
X_train_transformed = pd.DataFrame(sc.fit_transform(X_train),index=X_train.index, columns=X_train.columns)
X_test_transformed = pd.DataFrame(sc.fit_transform(X_test),index=X_test.index, columns=X_test.columns)

# summarize features
X_train_transformed.head()
#X_train_transformed.describe().T

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [9]:
# Default SVM
svc = SVC()
params = svc.get_params()
params_df = pd.DataFrame(params, index=[0])
params_df.T

Unnamed: 0,0
C,1.0
break_ties,False
cache_size,200
class_weight,
coef0,0.0
decision_function_shape,ovr
degree,3
gamma,scale
kernel,rbf
max_iter,-1


In [10]:
# run model on default SVM parameters
svc.fit(X_train_transformed, y_train)

# evauluate the performance of the SVM model by caclulating confusinon matrix
# and the accuracy score
# train the data using fit

print(f'accuracy: {svc.score(X_test_transformed, y_test):.4f}')

# note: already seeing improved accuracy, likely because of the scalar from before

NameError: name 'X_train_transformed' is not defined

grid search

In [34]:
## Hyperparameter Tuning with Grid Search ##

# look at the parameters defined above

# list of C values
C_range = np.logspace(-1,1,3)
print(f'C values are {C_range}')

# list of gamma values
gamma_range = np.logspace(-1,1,3)
print(f'gamma values are {gamma_range}')


C values are [ 0.1  1.  10. ]
gamma values are [ 0.1  1.  10. ]


In [None]:
"""
scoring is the metric to evaluate cross validation results for each model. set scoring = 'accuracy'
the scoring option can take more than one metric in the list

StratifieldKFold is used for cross validaiton. keeps the class ration in the folds the same as training dataset. n_splits = 3
means we are doing 3 fold cross validation. shuffle=True shuffles the data before splitting. random_state = 0 does as it sounds
"""

# define the search space
param_grid = {
    # regularization parameter
    "C": C_range,
    #Kernel type
    "kernel": ['rbf', 'poly'],
    # gamma
    "gamma": gamma_range.tolist()+['scale', 'atuo']
}

# set up score
scoring = ['accuracy']

# k fold cross validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# define grid search
grid_search = GridSearchCV(estimator=svc, #since we're using svc
                           param_grid=param_grid, # which we defined above
                           scoring=scoring, #defined as accuracy
                           refit='accuracy', # enables refitting the model with the best parameters
                           n_jobs=1, # parallel processing using all the processors
                           cv=kfold, #takes StratifieldKFold that we defined
                           verbose=0, # controls how much info is returned, =0 makes it silent 
                           )

# fit grid search
grid_result = grid_search.fit(X_train_transformed, y_train) 
#similar to running the model before, but now it's like going through several models

grid_result

In [None]:
## get the best values

# best accuracy score
print(f'best accuracy score: {grid_result.best_score_}')

# get hyperparameters of the best score
print(f'best hyperparameters: {grid_result.best_params_}')

# best accuracy score for the testing dataset
print(f'accuracy score for testing dataset: {grid_search.score(X_test_transformed, y_test)}')

best accuracy score: 0.5696969696969697
best hyperparameters: {'C': np.float64(0.1), 'gamma': 0.1, 'kernel': 'rbf'}
accuracy score for testing dataset: 1.0


random search

In [11]:
### Hyperparameter Optimizing with Random Search ###

# increae the amount of C values and gamma values
C_range = np.logspace(-10, 10, 21)
print(f'C values: {C_range}')

# list of gamma values
gamma_range = np.logspace(-10, 10, 21)
print(f'gamma values: {gamma_range}')


C values: [1.e-10 1.e-09 1.e-08 1.e-07 1.e-06 1.e-05 1.e-04 1.e-03 1.e-02 1.e-01
 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04 1.e+05 1.e+06 1.e+07 1.e+08 1.e+09
 1.e+10]
gamma values: [1.e-10 1.e-09 1.e-08 1.e-07 1.e-06 1.e-05 1.e-04 1.e-03 1.e-02 1.e-01
 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04 1.e+05 1.e+06 1.e+07 1.e+08 1.e+09
 1.e+10]


In [12]:
## Same code as with grid search, but one change in random_search

# define the search space
param_grid = {
    # regularization parameter
    "C": C_range,
    #Kernel type
    "kernel": ['rbf', 'poly'],
    # gamma
    "gamma": gamma_range.tolist()+['scale', 'atuo']
}

# set up score
scoring = ['accuracy']

# k fold cross validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# define grid search
random_search = RandomizedSearchCV(estimator=svc, #since we're using svc
                           param_distributions=param_grid, # which we defined above
                           n_iter=100,   #this is the change, testing 100 combdinations
                           scoring=scoring, #defined as accuracy
                           refit='accuracy', # enables refitting the model with the best parameters
                           n_jobs=1, # parallel processing using all the processors
                           cv=kfold, #takes StratifieldKFold that we defined
                           verbose=0, # controls how much info is returned, =0 makes it silent 
                           )

# fit grid search
grid_result = grid_search.fit(X_train_transformed, y_train) 
#similar to running the model before, but now it's like going through several models

grid_result

18 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\AarPi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\AarPi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Users\AarPi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\Loc

In [13]:
## get the best values

# best accuracy score
print(f'best accuracy score: {grid_result.best_score_}')

# get hyperparameters of the best score
print(f'best hyperparameters: {grid_result.best_params_}')

# best accuracy score for the testing dataset
print(f'accuracy score for testing dataset: {grid_search.score(X_test_transformed, y_test)}')

best accuracy score: 0.5696969696969697
best hyperparameters: {'C': np.float64(0.1), 'gamma': 0.1, 'kernel': 'rbf'}
accuracy score for testing dataset: 1.0


bayesian

In [None]:
"""
use hyper opt for this bayesian modeling. 
define an objective function that takes in the parameters and returns the loss. we want to maximize the accuracy so we set
max(scores) as the best_score, and the set loss to be -best_score. 

use fmin to optimize the obejctive function. Hyperopt currently has three algorithms: random search,
Tree of Parzen Estimators, and adaptive.TPE. we use TPE as the search algorithm
"""

# space
space = {
    'C': hp.choice('C', C_range),
    'gamma': hp.choice('gamma', gamma_range.tolist()+['scale', 'auto']),
    'kernel': hp.choice('kernel', ['rbf', 'poly'])
}

# set up k-fold
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# objective function
def objective(params):

    svc = SVC(**params)
    scores = cross_val_score(svc, X_train_transformed, y_train, cv=kfold, scoring='accuracy', n_jobs=-1)

    best_score = max(scores)

    loss = -best_score

    return {'loss':loss, 'params': params, 'status': STATUS_OK}

# track progress
bayes_trials = Trials()

# optimize
best = fmin(fn = objective, space = space, algo=tpe.suggest, max_evals=15, trials=bayes_trials)



# note that it can't run too many trials, 15 may be the max

100%|██████████| 15/15 [00:03<00:00,  4.76trial/s, best loss: -0.5]


In [20]:
print(best)

print(space_eval(space, best))

{'C': np.int64(20), 'gamma': np.int64(11), 'kernel': np.int64(0)}
{'C': np.float64(10000000000.0), 'gamma': 10.0, 'kernel': 'rbf'}


In [None]:
# now make predictions
svc_bo = SVC(C=space_eval(space,best)['C'], gamma=space_eval(space,best)['gamma'],kernel=space_eval(space,best)['kernel'])

svc_bo.fit(X_test_transformed, y_test)

print(f'accuracy: {svc_bo.score(X_test_transformed, y_test)}')

accuracy: 1.0


**Original model here**

just use the train test split to trian this one

In [11]:
model = SVC(kernel='rbf')
#from SVM scikit-learn library, uses radial basis function
# use balanced to account for the bias in data, class_weight='balanced'
# without this it counts EVERYTHING as positive

# kernel options: linear, rbf, poly, 

# train the data using fit
model.fit(X_train, y_train)

# make predictions based on the testing data from before
y_pred = model.predict(X_test)

# evauluate the performance of the SVM model by caclulating confusino matrix
# and the accuracy score
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

#model.predict()     # new data goes in here

ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

**Optimizing**

https://www.youtube.com/watch?v=203zKEecHgg

https://www.youtube.com/watch?v=tG7262z_Rck

https://www.youtube.com/watch?v=bGCafQT5h1s

https://www.youtube.com/watch?v=FB5EdxAGxQg
