In [15]:
import mne
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score
import matplotlib.pyplot as plt
from Preprocessing.feature_extraction import load_eeg_data, compute_band_power, extract_features

In [29]:
# Load data
base_path = '/Users/imdohyeon/Library/CloudStorage/GoogleDrive-dhlim1598@gmail.com/공유 드라이브/4N_PKNU/BXAI/EEG-LLM/Dataset/'
train_dir = base_path + 'train.csv'
test_dir = base_path + 'test.csv'
val_dir = base_path + 'val.csv'

window_size = 1000
selected_columns = [0, 2, 3, 4]  # 0: FCz, 2: F3, 3: Fz, 4: F4

In [30]:
data_train, label_train = load_eeg_data(train_dir)
data_val, label_val = load_eeg_data(val_dir)
data_test, label_test = load_eeg_data(test_dir)

In [18]:
def extract_features_col(data, selected_columns, sfreq=250):
    eeg_data = data.iloc[:, selected_columns].values  # Bring only the selected columns
    ch_names = data.columns[selected_columns].tolist()  # Use the selected column names as channel names
    info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types='eeg')  # Create an Info object
    raw = mne.io.RawArray(eeg_data.T, info)  # Create a RawArray object

    # Frequency band definition
    delta_band = (0.5, 4)
    theta_band = (4, 8)
    alpha_band = (8, 12)

    # Compute power in the frequency bands for each electrode
    delta_power = compute_band_power(raw, delta_band)
    theta_power = compute_band_power(raw, theta_band)
    alpha_power = compute_band_power(raw, alpha_band)

    # Initialize lists for the feature values
    alpha_delta_ratios = []
    delta_theta_ratios = []
    theta_alpha_ratios = []

    # Calculate the ratios for each selected channel
    for i in range(len(selected_columns)):
        alpha_delta_ratios.append(alpha_power[i] / delta_power[i])
        delta_theta_ratios.append(delta_power[i] / theta_power[i])
        theta_alpha_ratios.append(theta_power[i] / alpha_power[i])

    # Combine all the features in the correct order
    features = []
    for i in range(len(selected_columns)):
        features.extend([alpha_delta_ratios[i], delta_theta_ratios[i], theta_alpha_ratios[i]])

    # Create the appropriate column names
    columns = []
    for col in selected_columns:
        columns.extend([f'{col}ad', f'{col}dt', f'{col}ta'])

    # Create a DataFrame with the features
    features_df = pd.DataFrame([features], columns=columns)

    return features_df

In [19]:
dataX_temp = data_train.iloc[:1000, :]
ft_temp = extract_features_col(dataX_temp, selected_columns)
ft_temp

Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)


Unnamed: 0,0ad,0dt,0ta,2ad,2dt,2ta,3ad,3dt,3ta,4ad,4dt,4ta
0,0.947422,1.557824,0.677545,2.336448,2.262241,0.189193,0.697915,3.807867,0.376284,3.355433,1.161248,0.256641


In [20]:
fttemp = extract_features(dataX_temp, selected_columns)
fttemp

Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)


Unnamed: 0,Alpha:Delta Power Ratio,Theta:Alpha Power Ratio,Delta:Theta Power Ratio
0,0.947422,0.677545,1.557824
2,2.336448,0.189193,2.262241
3,0.697915,0.376284,3.807867
4,3.355433,0.256641,1.161248


In [21]:
def preprocessing(df, labels, window_size, selected_columns):
    """
    Convert a DataFrame of EEG data into a JSON format suitable for GPT-3 davinci.
    =================================
    1. You should pick selected_columns before running this function.
    2. It contains the process of feature extraction.
    
    =================================
    :param df: Data converted to pandas DataFrame from the original csv file
    :param labels: Label for each window (provide a list, left, right, top, bottom)
    :param window_size: Window size to divide EEG data
    :param selected_columns: EEG channel to use (provide a list)
    :return: List of data in JSON format
    """
    features_array = []

    for start in range(0, len(df) - window_size + 1, window_size):
        window_data = df.iloc[start:start + window_size, selected_columns]  # Pick a single window based on selected_columns
        label = int(labels[start])  # Assuming labels are provided for each window

        # Feature extraction
        features = extract_features(window_data, list(range(len(selected_columns))))  # feature extraction

        # Convert DataFrame to a list and flatten
        features_flat = features.values.flatten().tolist()

        # Append the label to the end of the features
        features_flat.append(label)

        features_array.append(features_flat)

    # Create column names for the features + label
    feature_columns = []
    for col in selected_columns:
        feature_columns.extend([f'{col}ad', f'{col}ta', f'{col}dt'])
    feature_columns.append('label')  # Add the label column

    # Convert to a DataFrame
    features_df = pd.DataFrame(features_array, columns=feature_columns)

    return features_df

In [22]:
features_train = preprocessing(data_train, label_train, window_size, selected_columns)
features_val = preprocessing(data_val, label_val, window_size, selected_columns)
features_test = preprocessing(data_test, label_test, window_size, selected_columns)

Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =     

In [23]:
features_train

Unnamed: 0,0ad,0ta,0dt,2ad,2ta,2dt,3ad,3ta,3dt,4ad,4ta,4dt,label
0,0.947422,0.677545,1.557824,2.336448,0.189193,2.262241,0.697915,0.376284,3.807867,3.355433,0.256641,1.161248,4
1,0.535228,1.541721,1.211868,0.308838,0.974414,3.322963,0.097351,2.368132,4.337652,0.331435,0.757159,3.984870,1
2,1.506086,0.516705,1.285014,0.479735,0.823771,2.530415,3.560972,0.429358,0.654051,0.632763,0.425545,3.713760,1
3,3.478946,0.233367,1.231724,5.501238,0.189125,0.961151,0.895041,0.566759,1.971329,7.732246,0.102708,1.259182,3
4,0.399718,1.369085,1.827325,0.478570,0.636459,3.283096,0.180359,1.340574,4.135906,0.743984,0.378047,3.555415,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,0.173156,1.620312,3.564224,0.218944,1.075679,4.246039,0.162026,1.291414,4.779139,0.038025,0.954195,27.561188,1
212,1.902576,0.989949,0.530940,0.550355,0.830821,2.187003,0.348631,0.922282,3.110074,1.659391,0.245137,2.458346,2
213,0.744690,0.738834,1.817512,0.852291,0.361505,3.245616,1.563474,0.671387,0.952657,1.293362,0.263613,2.933008,2
214,0.313448,2.670761,1.194537,2.787427,0.282739,1.268853,0.362317,1.226731,2.249895,0.734358,1.218595,1.117463,3


In [24]:
train_X = features_train.iloc[:, :-1].values
train_y = features_train.iloc[:, -1].values
val_X = features_val.iloc[:, :-1].values
val_y = features_val.iloc[:, -1].values
test_X = features_test.iloc[:, :-1].values
test_y = features_test.iloc[:, -1].values

In [25]:
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
val_X = scaler.transform(val_X)
test_X = scaler.transform(test_X)

In [26]:
svm = SVC()
param_grid = {
    'C': [0.1, 1, 10, 100],  # Reduced range of the regularization parameter
    'gamma': ['scale', 0.01, 0.1, 1],  # Key gamma values with a focus on potential sweet spots
    'kernel': ['linear', 'rbf'],  # Focus on the most commonly effective kernels
    'class_weight': [None, 'balanced'],  # Option to handle imbalanced classes
}

grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(train_X, train_y)  # Fit the model on the training data

# Print the best parameters and the best score from the validation process
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best parameters found:  {'C': 100, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation accuracy: 49.10%


In [27]:
# Use the best model to make predictions on the validation set
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(val_X)

# Evaluate the model on the validation set
print("\nValidation Set Performance:")
print("Validation Accuracy: {:.2f}%".format(accuracy_score(val_y, val_predictions) * 100))
print("Validation Kappa Coefficient: {:.2f}".format(cohen_kappa_score(val_y, val_predictions)))
print("\nValidation Classification Report:")
print(classification_report(val_y, val_predictions))


Validation Set Performance:
Validation Accuracy: 44.44%
Validation Kappa Coefficient: 0.26

Validation Classification Report:
              precision    recall  f1-score   support

           1       0.46      0.65      0.54        17
           2       0.29      0.35      0.32        17
           3       0.64      0.33      0.44        21
           4       0.50      0.47      0.48        17

    accuracy                           0.44        72
   macro avg       0.47      0.45      0.44        72
weighted avg       0.48      0.44      0.44        72



In [28]:
# After validation, use the best model to predict on the test set
test_predictions = best_model.predict(test_X)

# Evaluate the model on the test set
print("\nTest Set Performance:")
print("Test Accuracy: {:.2f}%".format(accuracy_score(test_y, test_predictions) * 100))
print("Test Kappa Coefficient: {:.2f}".format(cohen_kappa_score(test_y, test_predictions)))
print("\nTest Classification Report:")
print(classification_report(test_y, test_predictions))


Test Set Performance:
Test Accuracy: 47.22%
Test Kappa Coefficient: 0.30

Test Classification Report:
              precision    recall  f1-score   support

           1       0.53      0.45      0.49        20
           2       0.43      0.60      0.50        15
           3       0.36      0.22      0.28        18
           4       0.52      0.63      0.57        19

    accuracy                           0.47        72
   macro avg       0.46      0.48      0.46        72
weighted avg       0.46      0.47      0.46        72

