## Dependencies

In [63]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
import scipy.stats as stats
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
import sklearn as sk
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
#from ipynb.fs.full.driver_drowsiness_extraction import select_channel
import numpy as np
import pandas as pd
from scipy import stats

## Feature Selection

In [64]:
def channel_selection(features, labels, channel_list):
    '''
    Select the desired channels from the total feature dataset
    '''
    selected_channels = []
    for channel in channel_list:
        selected_channels.append(features.loc[features['channels'] == channel])
    # return the corresponding labels for the selected channels
    selected_labels = labels[0:2022*len(channel_list)].to_numpy()
    return ((pd.concat(selected_channels).drop('channels', axis=1)), selected_labels)

In [65]:
def feature_selection(selected_channels, feature_subset):
    ''' 
    Select the desired subset of features to prepare training data on.
    '''
    selected_features = pd.DataFrame()
    for feature in feature_subset:
        selected_features[feature] = selected_channels[feature]
    # temporary sanity check, will delete
    # print(selected_features.head())
    return selected_features

## Preparation and Training

In [66]:
def data_preparation(selected_channels, selected_labels, feature_subset, split_size = 0.2, seed = 1):

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    X = feature_selection(selected_channels=selected_channels, feature_subset = feature_subset) # select every feature
    y = selected_labels

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split_size, random_state = seed)

    # apply normalization after splitting to avoid leakage
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return [X_train, X_test, y_train, y_test]



In [118]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

def model_training(data, model_family, display_labels, verbose = True, stats=False, cm=False):

  X_train, X_test, y_train, y_test = data
  if model_family == 'K-NN':
    model = KNeighborsClassifier()
  elif model_family == 'DTC':
    model = DecisionTreeClassifier()
  elif model_family == 'RFC':
    model = RandomForestClassifier(n_estimators=100)
  elif model_family == 'Logistic Regression':
    model = LogisticRegression(max_iter=5000)
  elif model_family == 'SVM':
    model = SVC(C=1.0, kernel='rbf', degree=10, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=1)
  elif model_family == 'NN':
    model = MLPClassifier(activation='relu',solver='adam', alpha=1e-2, learning_rate='adaptive', max_iter=1000000, hidden_layer_sizes=(60,2), random_state=1)
  elif model_family == 'GBC':
    model = GradientBoostingClassifier(loss='log_loss',n_estimators=300, learning_rate=0.1, max_depth=10, random_state=1)

  model.fit(X_train, y_train)
  training_acc = model.score(X_train, y_train)
  test_acc = model.score(X_test, y_test)
  if verbose:
    print('Accuracy of {} classifier on training set: {:.8f}'
      .format(model_family, training_acc))
    print('Accuracy of {} classifier on test set: {:.8f}'
      .format(model_family, test_acc))

  if stats:
    print()
    print("==== Stats for the {} model ====".format(model_family))
    sensitivity = recall_score(y_test, model.predict(X_test))
    print("Sensitivity (Recall):", sensitivity)

    precision = precision_score(y_test, model.predict(X_test))
    print("Precision:", precision)

    accuracy = accuracy_score(y_test, model.predict(X_test))
    print("Accuracy (Recall):", accuracy)
        
    f1 = f1_score(y_test, model.predict(X_test))
    print("F1_score:", f1)

    fpr, tpr, thresholds = roc_curve(y_test, model.predict(X_test))
    auc = roc_auc_score(y_test, model.predict(X_test))
    print("AUC:", auc)

    logloss = log_loss(y_test, model.predict(X_test))
    print("Logloss:", logloss)
    print()

  if cm:
    model_cm = confusion_matrix(y_test, model.predict(X_test))
    model_disp = ConfusionMatrixDisplay(confusion_matrix=model_cm,display_labels=display_labels)
    model_disp.plot()
  
  return [training_acc, test_acc]

## I/O

In [68]:
# load the feature dataset as a dataframe
csv_file = 'eeg_features.csv'
df = pd.read_csv(csv_file,float_precision='round_trip')
df = df.drop('Unnamed: 0', axis=1)

In [69]:
# split the dataset to features and labels
features = df.drop('label', axis=1)
labels = df.iloc[:,-1:]
display_labels = ['drowsy' if label == 1 else 'alert' for label in labels['label'].unique()]

## Testing and Processing

In [155]:
import itertools

def feature_combination(feature_subset, min_n = 1, max_n = 5, training=False, pvalue=False):
    '''
    Go through a feature subset and calculate the combinations of different features on the subsets.
    '''
    filename = str(feature_subset) + '_' + str(min_n) + 'to' + str(max_n) + '.txt'
    file = open(filename, 'w')
    # selected channels and labels are global now, might fix it if we decide settling on this method.
    for i in range(min_n, max_n):
        for comb in list(itertools.combinations(feature_subset, i)):
            if training:
                data = data_preparation(selected_channels=selected_channels, selected_labels=selected_labels, feature_subset=comb)
                # parametrize the models
                for model in ['K-NN', 'SVM']:
                    train_acc, test_acc = model_training(data, model, display_labels, verbose=False, stats=False, cm=False)
                    # TODO: improve the readability of the file by either modifying the string or changing the filetype entirely
                    file.writelines(f"{comb}: {model} train acc: {train_acc:.2f} test acc: {test_acc:.2f}\n")
            # we can also add a thresholding section to see if the pvalue or variance value changes
            if pvalue:
                pass
    file.close()

In [152]:
# start with spectral features and their combinations
feature_combination(['spc_cnt', 'spc_roff','slope', 'mfcc_0', 'mfcc_1', 'mfcc_2', 'mfcc_3'], min_n = 2, max_n = 5, training=True, pvalue=False)

# if you wanna test your limits
'''
min_n = 2, max_n = 50 # did i ever tell you what the definition of insanity is?
feature_combination(selected_channels.columns, min_n = min_n, max_n = max_n, training=True, pvalue=False)
'''

# if you feel lucky
'''
import random
subset_size = 10 # go wild
random_subset = random.sample(selected_channels.columns, subset_size)
feature_combination(random_subset, min_n = 2, max_n = 5, training=True, pvalue=False)
'''

In [76]:
# select the channels to be processed
channel_list = ['F3', 'F4','C3','Cz','Oz']

# select the models to be trained
models = ['GBC', 'K-NN', 'SVM']

In [83]:
selected_channels, selected_labels = channel_selection(features=features, labels=labels, channel_list=channel_list)
data = data_preparation(selected_channels=selected_channels, selected_labels=selected_labels, feature_subset=['chr_11', 'chr_14', 'chr_12', 'mean_abs_sec_dif', 'delta_power'])

for model in models:
    model_training(data, model, display_labels, stats=True, cm=False)

Accuracy of GBC classifier on training set: 0.99950544
Accuracy of GBC classifier on test set: 0.64886251

==== Stats for the GBC model ====
Sensitivity (Recall): 0.6449704142011834
Precision: 0.651394422310757
Accuracy (Recall): 0.6488625123639961
F1_score: 0.6481665014866204
AUC: 0.6488740959894806
Logloss: 12.656277896277537

Accuracy of K-NN classifier on training set: 0.72143917
Accuracy of K-NN classifier on test set: 0.55736894

==== Stats for the K-NN model ====
Sensitivity (Recall): 0.5512820512820513
Precision: 0.5595595595595596
Accuracy (Recall): 0.5573689416419386
F1_score: 0.5553899652260309
AUC: 0.5573870573870574
Logloss: 15.954040446716048

Accuracy of SVM classifier on training set: 0.62685460
Accuracy of SVM classifier on test set: 0.63353116

==== Stats for the SVM model ====
Sensitivity (Recall): 0.6045364891518737
Precision: 0.6432318992654774
Accuracy (Recall): 0.6335311572700296
F1_score: 0.623284189120488
AUC: 0.6336174509251431
Logloss: 13.208875945269934



## Evaluation

## Archive

In [93]:
# P-Value Thresholding for Feature Selection
def p_value_thresholding(selected_features, selected_labels):

    p_values = []

    X_p = selected_features
    
    y_p = selected_labels.flatten()
    y_p = pd.Series(y_p)


    #y_p = pd.Series(y['0'])
    sorted_dict = {}
    for feature in X_p.columns:
        t_stat, p_value = stats.ttest_ind(X_p[feature][y_p == 0], X_p[feature][y_p == 1])
        p_values.append(p_value)
        sorted_dict[feature] = p_value


    alpha = 0.05

    # Select features with p-values below the significance level
    selected_features = [X_p.columns[i] for i, p in enumerate(p_values) if p < alpha]
    # Alternatively, you can rank features by p-value
    sorted_features = [x for _, x in sorted(zip(p_values, X_p.columns))]
    from collections import OrderedDict

    ordered = OrderedDict(sorted(sorted_dict.items(), key=lambda item:np.max(item[1])))
    for key, value in ordered.items():
        print(key, value)
        
    return sorted_features, sorted_dict

In [94]:
all = feature_selection(selected_channels=selected_channels, feature_subset=selected_channels.columns) # select every feature
p_all, p_dict = p_value_thresholding(selected_features=all, selected_labels=y)

#print(p_dict)

spc_roff 1.6147525601572645e-241
slope 2.1961591318448354e-218
mel_9 3.780620351407701e-197
mel_6 1.1018938972585085e-190
mel_8 7.18754692852844e-179
mel_7 4.889762489252905e-178
spc_cnt 2.887779599226387e-170
mel_5 5.517438027558447e-134
mfcc_2 3.901885014029416e-112
zc 8.621303483438252e-98
mel_1 7.514574188776398e-96
mel_2 4.0251470438104993e-85
mel_0 7.866398310131055e-85
mel_3 7.917307252001706e-84
dfa 5.4570769344308075e-80
mel_4 4.070881376943928e-78
gamma_beta 1.9635186392167987e-75
chr_9 1.0813929454262362e-62
chr_8 7.157621087120184e-59
gamma_alpha 1.8360025005449308e-58
mfcc_1 5.293632675036034e-58
mfcc_0 8.189398043909328e-58
mfcc_3 9.75984225924415e-57
alpha_delta 2.827046503995087e-54
alpha_theta 7.560080511524009e-42
chr_10 5.609345536289729e-37
peak_freq 1.2178319684208423e-34
beta_alpha 1.825251593945194e-32
mfcc_4 5.3463501550803495e-24
chr_7 3.0341498587256606e-21
gamma_delta 6.846410819985025e-20
gamma_theta 1.0310558268088689e-17
beta_theta 2.5202117308661734e-14
c

In [74]:
p_all

['spc_roff',
 'slope',
 'mel_9',
 'mel_6',
 'mel_8',
 'mel_7',
 'spc_cnt',
 'mel_5',
 'mfcc_2',
 'zc',
 'mel_1',
 'mel_2',
 'mel_0',
 'mel_3',
 'dfa',
 'mel_4',
 'gamma_beta',
 'chr_9',
 'chr_8',
 'gamma_alpha',
 'mfcc_1',
 'mfcc_0',
 'mfcc_3',
 'alpha_delta',
 'alpha_theta',
 'chr_10',
 'peak_freq',
 'beta_alpha',
 'mfcc_4',
 'chr_7',
 'gamma_delta',
 'gamma_theta',
 'beta_theta',
 'chr_1',
 'chr_2',
 'chr_0',
 'theta_delta',
 'chr_3',
 'beta_delta',
 'chr_4',
 'alpha_power',
 'max_freq',
 'gamma_power',
 'beta_power',
 'chr_5',
 'chr_6',
 'theta_power',
 'chr_13',
 'chr_12',
 'delta_power',
 'chr_14',
 'chr_11',
 'mean_abs_sec_dif']

In [None]:
['spc_roff',
 'mfcc_0',
 'spc_cnt',
 'mfcc_1',
 'mfcc_2',
 'zc',
 'dfa',
 'gamma_beta',
 'mfcc_3',
 'mel_4',
 'mel_7',
 'chr_3',
 'chr_2',
 'chr_7',
 'mel_5',
 'chr_4',
 'chr_5',
 'mel_6',
 'chr_1',
 'chr_6',
 'mel_3',
 'mel_0',
 'mel_1',
 'mel_2',
 'gamma_alpha',
 'chr_0',
 'chr_8',
 'mel_8',
 'alpha_delta',
 'mel_9',
 'alpha_theta',
 'chr_14',
 'beta_alpha',
 'chr_9',
 'mfcc_4',
 'gamma_delta',
 'gamma_theta',
 'chr_10',
 'beta_theta',
 'chr_11',
 'theta_delta',
 'beta_delta',
 'chr_13',
 'chr_12',
 'alpha_power',
 'gamma_power',
 'beta_power',
 'theta_power',
 'delta_power',
 'mean_abs_sec_dif']

In [None]:
'''
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler

#y = y.reset_index(drop=True)

pca = PCA(n_components = 0.999)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
#X = dataPCA
variance = pd.DataFrame(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_)
print(np.sum(pca.explained_variance_ratio_))
'''