In [6]:
import mne 
import numpy as np
import matplotlib.pyplot as plt
import scipy 

In [2]:
def read_data(path):
    raw=mne.io.read_raw_gdf(path,preload=True,
                          eog=['EOG-left', 'EOG-central', 'EOG-right'])
    raw.drop_channels(['EOG-left', 'EOG-central', 'EOG-right'])
    raw.set_eeg_reference()
    events=mne.events_from_annotations(raw)
    epochs = mne.Epochs(raw, events[0], event_id=[7,8,9,10],on_missing ='warn')
    labels=epochs.events[:,-1]
    features=epochs.get_data()
    return features,labels

In [3]:
%%capture
features,labels,groups=[],[],[]
for i in range(1,10):
  feature,label=read_data(f'data/A0{i}T.gdf')
  features.append(feature) # append the features of eeg data
  labels.append(label) # append the labels of the eeg data
  groups.append([i]*len(label)) # append the group number of the eeg data

In [4]:
features=np.concatenate(features)
labels=np.concatenate(labels)
groups=np.concatenate(groups)

features.shape,labels.shape,groups.shape

((2448, 22, 176), (2448,), (2448,))

In [5]:
np.mean(labels==7),np.mean(labels==8),np.mean(labels==9),np.mean(labels==10)

(0.2647058823529412,
 0.2647058823529412,
 0.23529411764705882,
 0.23529411764705882)

In [7]:
from scipy import stats

In [11]:
np.mean(features[labels==7],axis=-1).shape

(648, 22)

In [21]:
# Statistical Analysis
# Get the statistical values of the features
x_features = []
for feature in features:
    stats_list = [
        np.mean(feature, axis=-1),
        np.std(feature, axis=-1),
        np.max(feature, axis=-1),
        np.min(feature, axis=-1),
        np.median(feature, axis=-1),
        np.var(feature, axis=-1),
        np.ptp(feature, axis=-1),
        stats.skew(feature, axis=-1),
        stats.kurtosis(feature, axis=-1),
        stats.iqr(feature, axis=-1),
        stats.mode(feature, axis=-1)[0],
        stats.sem(feature, axis=-1),
        np.argmax(feature, axis=-1),
        np.argmin(feature, axis=-1),
        np.sum(np.abs(np.diff(feature, axis=-1)),axis=-1),
        np.sqrt(np.mean(feature**2, axis=-1))
    ]
    # Flatten each statistic to ensure they are 1D arrays
    x_features.append(np.concatenate(stats_list))
x_features = np.array(x_features)
print(x_features.shape)

(2448, 352)


In [22]:
352/22

16.0

In [30]:
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [28]:
# Create a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=500))
])

# Create a parameter grid
param_grid = {
    'clf__C': [0.1, 0.5,0.7, 1, 3,5,7, 10, 50, 100],
}

# Create a group k-fold object
gkf = GroupKFold(n_splits=5)

# Create a grid search object
grid_search = GridSearchCV(pipe, param_grid, cv=gkf, scoring='accuracy',n_jobs= 12)

# Fit the grid search object
grid_search.fit(x_features, labels, groups=groups)

# Get the best parameters
print(grid_search.best_params_)

# Get the best score
grid_search.best_score_

{'clf__C': 0.1}


0.3174768518518518

In [31]:
gkf = GroupKFold(n_splits=5)
models = {
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=500))
    ]),
    'SVC': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC())
    ]),
    'Random Forest': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier())
    ]),
    'Gradient Boosting': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', GradientBoostingClassifier())
    ])
}

In [32]:
param_grids = {
    'Logistic Regression': {
        'clf__C': [0.1, 0.5, 1, 5, 10]
    },
    'SVC': {
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf']
    },
    'Random Forest': {
        'clf__n_estimators': [50, 100],
        'clf__max_depth': [None, 10, 20]
    },
    'Gradient Boosting': {
        'clf__n_estimators': [50, 100],
        'clf__learning_rate': [0.01, 0.1, 1]
    }
}

In [None]:
best_scores = {}
for model_name, pipeline in models.items():
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=gkf, scoring='accuracy', n_jobs=12)
    grid_search.fit(x_features, labels, groups=groups)
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_
    print(f"Best for {model_name}: Score = {best_score}, Params = {best_params}")
    best_scores[model_name] = best_score

Best for Logistic Regression: Score = 0.3174768518518518, Params = {'clf__C': 0.1}
Best for SVC: Score = 0.3108796296296297, Params = {'clf__C': 0.1, 'clf__kernel': 'linear'}
Best for Random Forest: Score = 0.2929398148148148, Params = {'clf__max_depth': 10, 'clf__n_estimators': 100}
Best for Gradient Boosting: Score = 0.2982638888888889, Params = {'clf__learning_rate': 0.1, 'clf__n_estimators': 100}
