# Dependencies

In [115]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
import scipy.stats as stats
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
import sklearn as sk
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
#from ipynb.fs.full.driver_drowsiness_extraction import select_channel
import numpy as np
import pandas as pd
from scipy import stats

In [104]:
np.random.seed(100)

# I/O

In [123]:
csv_file = 'eeg_features.csv'
df = pd.read_csv(csv_file,float_precision='round_trip')
df = df.drop('Unnamed: 0', axis=1)

In [124]:
features = df.drop('label', axis=1)
labels = df.iloc[:,-1:]
display_labels = ['drowsy' if label == 1 else 'alert' for label in labels['label'].unique()]

In [125]:
def channel_training(features, labels, channel_list):
    found_channels = []
    for channel in channel_list:
        found_channels.append(features.loc[df['channels'] == channel])
    return (pd.concat(found_channels).drop('channels', axis=1)).to_numpy()

In [126]:
channel_list = ['F3', 'F4','C3','Cz','Oz']
X = channel_training(features=features, labels=labels, channel_list=channel_list)
y = labels[0:2022*len(channel_list)].to_numpy()

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=1)

# apply normalization after splitting to avoid leakage
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [111]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

def model_training(model_family, display_labels, stats=False, cm=False):

  if model_family == 'K-NN':
    model = KNeighborsClassifier()
  elif model_family == 'DTC':
    model = DecisionTreeClassifier()
  elif model_family == 'RFC':
    model = RandomForestClassifier(n_estimators=100)
  elif model_family == 'Logistic Regression':
    model = LogisticRegression(max_iter=5000)
  elif model_family == 'SVM':
    model = SVC(C=1.0, kernel='rbf', degree=10, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=1)
  elif model_family == 'NN':
    model = MLPClassifier(activation='relu',solver='adam', alpha=1e-2, learning_rate='adaptive', max_iter=1000000, hidden_layer_sizes=(60,2), random_state=1)
  elif model_family == 'GBC':
    model = GradientBoostingClassifier(loss='log_loss',n_estimators=300, learning_rate=0.1, max_depth=10, random_state=1)

  model.fit(X_train, y_train)
  print('Accuracy of {} classifier on training set: {:.8f}'
     .format(model_family, model.score(X_train, y_train)))
  print('Accuracy of {} classifier on test set: {:.8f}'
     .format(model_family, model.score(X_test, y_test)))

  if stats:
    print()
    print("==== Stats for the {} model ====".format(model_family))
    sensitivity = recall_score(y_test, model.predict(X_test))
    print("Sensitivity (Recall):", sensitivity)

    precision = precision_score(y_test, model.predict(X_test))
    print("Precision:", precision)

    accuracy = accuracy_score(y_test, model.predict(X_test))
    print("Accuracy (Recall):", accuracy)
        
    f1 = f1_score(y_test, model.predict(X_test))
    print("F1_score:", f1)

    fpr, tpr, thresholds = roc_curve(y_test, model.predict(X_test))
    auc = roc_auc_score(y_test, model.predict(X_test))
    print("AUC:", auc)

    logloss = log_loss(y_test, model.predict(X_test))
    print("Logloss:", logloss)
    print()

  if cm:
    model_cm = confusion_matrix(y_test, model.predict(X_test))
    model_disp = ConfusionMatrixDisplay(confusion_matrix=model_cm,display_labels=display_labels)
    model_disp.plot()

In [112]:
models = ['GBC']
for model in models:
    model_training(model, display_labels, stats=True, cm=False)
    

Accuracy of GBC classifier on training set: 1.00000000
Accuracy of GBC classifier on test set: 0.81651830

==== Stats for the GBC model ====
Sensitivity (Recall): 0.7928994082840237
Precision: 0.8331606217616581
Accuracy (Recall): 0.8165182987141444
F1_score: 0.8125315816068721
AUC: 0.8165885930309008
Logloss: 6.337297120523306



In [113]:
'''
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler

#y = y.reset_index(drop=True)

pca = PCA(n_components = 0.999)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
#X = dataPCA
variance = pd.DataFrame(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_)
print(np.sum(pca.explained_variance_ratio_))
'''

'\nfrom sklearn.decomposition import PCA\nfrom sklearn.preprocessing import LabelEncoder, StandardScaler\n\n#y = y.reset_index(drop=True)\n\npca = PCA(n_components = 0.999)\nX_train = pca.fit_transform(X_train)\nX_test = pca.transform(X_test)\n#X = dataPCA\nvariance = pd.DataFrame(pca.explained_variance_ratio_)\nprint(pca.explained_variance_ratio_)\nprint(np.sum(pca.explained_variance_ratio_))\n'

In [142]:
p_values = []

X_p = df.drop('channels',axis = 1)
X_p = X_p.drop('label',axis = 1)

#y_p = pd.Series(y['0'])
for feature in X_p.columns:
    t_stat, p_value = stats.ttest_ind(X_p[feature][y_p == 0], X_p[feature][y_p == 1])
    p_values.append(p_value)

alpha = 0.05

# Select features with p-values below the significance level
selected_features = [X_p.columns[i] for i, p in enumerate(p_values) if p < alpha]
# Alternatively, you can rank features by p-value
sorted_features = [x for _, x in sorted(zip(p_values, X_p.columns))]

In [129]:
X_p.head()

Unnamed: 0,spc_cnt,spc_roff,zc,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,chr_0,chr_1,...,gamma_theta,gamma_delta,beta_alpha,beta_theta,beta_delta,alpha_theta,alpha_delta,theta_delta,mean_abs_sec_dif,dfa
0,3590.459896,6675.292969,0.328125,347.159565,57.05881,-24.870341,33.284216,-22.960826,0.527888,0.643746,...,0.998724,1.132221,2.200579,1.193993,1.353591,0.542581,0.615107,1.133667,5.518584,0.822449
1,3752.577012,6998.291016,0.302083,338.965993,44.552563,-15.60018,22.562177,-20.510662,0.742114,0.79642,...,0.629322,0.51046,2.931911,1.770789,1.436335,0.603971,0.489897,0.811127,5.150809,0.82531
2,3647.662002,7105.957031,0.25,370.701522,50.350531,-16.85566,16.781457,-21.681031,0.71682,1.0,...,2.276071,0.69674,2.227426,2.938295,0.899457,1.319144,0.40381,0.306115,6.430567,0.957758
3,3649.478683,6901.391602,0.283854,352.184349,43.431659,-21.810604,30.115513,-3.182163,0.758215,1.0,...,2.486844,0.570555,2.818923,5.760856,1.32171,2.043637,0.46887,0.229429,5.120279,0.801699
4,3782.088471,7116.723633,0.302083,351.714564,44.032455,-17.494524,22.250526,-16.676041,0.64975,0.483859,...,0.855392,0.937899,2.069664,1.394641,1.529161,0.673849,0.738845,1.096455,5.472301,0.87574


In [141]:
y = y.flatten()
y_p = pd.Series(y)
