In [1]:
import os

import matplotlib.pyplot as plt
import mne
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from Preprocessing.feature_extraction import load_eeg_data, compute_band_power, extract_features

In [2]:
# Dataset location and windowing configuration.
# The dataset folder can be overridden with the EEG_DATASET_DIR environment variable so the
# notebook also runs on other machines; when it is unset, the original location is used.
DEFAULT_DATASET_DIR = '/Users/imdohyeon/Library/CloudStorage/GoogleDrive-dhlim1598@gmail.com/공유 드라이브/4N_PKNU/BXAI/EEG-LLM/Dataset/'
# os.path.join(..., '') guarantees a trailing separator, so the plain concatenation below stays valid
base_path = os.path.join(os.environ.get('EEG_DATASET_DIR', DEFAULT_DATASET_DIR), '')
# NOTE: despite the "_dir" suffix, these three names hold CSV file paths
train_dir = base_path + 'train.csv'
test_dir = base_path + 'test.csv'
val_dir = base_path + 'val.csv'

window_size = 1000  # number of rows (samples) per window
selected_columns = [1, 3, 4, 5]  # 1: FCz, 3: F3, 4: Fz, 5: F4

In [3]:
# Read the three splits (train, then validation, then test); each call yields a (data, labels) pair
(data_train, label_train), (data_val, label_val), (data_test, label_test) = (
    load_eeg_data(csv_path) for csv_path in (train_dir, val_dir, test_dir)
)

In [4]:
def extract_features_col(data, selected_columns, sfreq=250):
    """
    Build a single-row DataFrame of band-power ratios for the selected columns.

    The selected columns of ``data`` are wrapped in an MNE RawArray, the power in the
    delta (0.5-4), theta (4-8) and alpha (8-12) bands is obtained from
    ``compute_band_power``, and three ratios are derived per column.

    :param data: pandas DataFrame; the columns at positions ``selected_columns`` are used as channels
    :param selected_columns: list of positional column indices to use
    :param sfreq: sampling frequency passed to ``mne.create_info`` (default 250)
    :return: DataFrame with one row and, for every selected column ``c``, the columns
             ``{c}ad`` (alpha/delta), ``{c}dt`` (delta/theta) and ``{c}ta`` (theta/alpha)
    """
    # Wrap the selected columns in an MNE RawArray (transposed so that channels come first)
    samples = data.iloc[:, selected_columns].values
    channel_names = data.columns[selected_columns].tolist()
    info = mne.create_info(ch_names=channel_names, sfreq=sfreq, ch_types='eeg')
    raw = mne.io.RawArray(samples.T, info)

    # Band power per channel; the results are indexed by channel position below
    # (assumes compute_band_power returns one value per channel, in channel order)
    delta_power = compute_band_power(raw, (0.5, 4))
    theta_power = compute_band_power(raw, (4, 8))
    alpha_power = compute_band_power(raw, (8, 12))

    # Collect names and values side by side, channel by channel
    names, values = [], []
    for position, col in enumerate(selected_columns):
        delta, theta, alpha = delta_power[position], theta_power[position], alpha_power[position]
        names += [f'{col}ad', f'{col}dt', f'{col}ta']
        values += [alpha / delta, delta / theta, theta / alpha]

    return pd.DataFrame([values], columns=names)

In [5]:
# Sanity check: extract features from the first window of the training data.
# The slice uses window_size instead of a literal 1000 so this preview always covers
# exactly the same rows as the first window built by preprocessing().
dataX_temp = data_train.iloc[:window_size, :]
ft_temp = extract_features_col(dataX_temp, selected_columns)
ft_temp

Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)


Unnamed: 0,1ad,1dt,1ta,3ad,3dt,3ta,4ad,4dt,4ta,5ad,5dt,5ta
0,0.410644,0.647858,3.75885,0.233171,4.839705,0.88615,0.384879,1.970707,1.318418,0.59581,3.621859,0.463405


In [6]:
# Cross-check: run the project's extract_features on the same window.
# In the output below it yields the same ratio values as extract_features_col,
# laid out as one row per channel instead of a single flat row.
fttemp = extract_features(dataX_temp, selected_columns)
fttemp

Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)


Unnamed: 0,Alpha:Delta Power Ratio,Theta:Alpha Power Ratio,Delta:Theta Power Ratio
1,0.410644,3.75885,0.647858
3,0.233171,0.88615,4.839705
4,0.384879,1.318418,1.970707
5,0.59581,0.463405,3.621859


In [7]:
def preprocessing(df, labels, window_size, selected_columns):
    """
    Split EEG data into non-overlapping windows and build one feature row per window.
    =================================
    1. You should pick selected_columns before running this function.
    2. It contains the process of feature extraction.

    =================================
    :param df: Data converted to pandas DataFrame from the original csv file
    :param labels: Per-row labels aligned with df (pandas Series, NumPy array or list);
                   the label at the first row of each window is used for the whole window
    :param window_size: Number of rows per window; trailing rows that do not fill a
                        complete window are dropped
    :param selected_columns: Positional indices of the EEG channel columns to use (provide a list)
    :return: pandas DataFrame with one row per window; for every selected column ``c`` it holds
             the columns ``{c}ad``, ``{c}ta`` and ``{c}dt``, followed by an integer ``label`` column
    """
    # labels[start] on a pandas Series is a label-based lookup, which raises or picks the
    # wrong entry when the index is not 0..n-1; use positional access whenever available.
    positional_labels = labels.iloc if hasattr(labels, 'iloc') else labels

    features_array = []
    for start in range(0, len(df) - window_size + 1, window_size):
        window_data = df.iloc[start:start + window_size, selected_columns]
        label = int(positional_labels[start])

        # window_data already holds only the selected channels, so address them as 0..k-1
        features = extract_features(window_data, list(range(len(selected_columns))))

        # Flatten the per-channel feature table row by row and append the label last
        features_array.append(features.values.flatten().tolist() + [label])

    # NOTE: the suffix order assumes extract_features returns one row per channel with the
    # ratios ordered alpha:delta, theta:alpha, delta:theta (as shown in this notebook's
    # extract_features output) - verify if that helper changes.
    feature_columns = [f'{col}{suffix}' for col in selected_columns for suffix in ('ad', 'ta', 'dt')]
    feature_columns.append('label')

    return pd.DataFrame(features_array, columns=feature_columns)

In [8]:
# Build the per-window feature tables for every split.
# MNE logs several INFO lines for each window; lower the log level for this batch so the
# cell output is not flooded with identical messages (warnings and errors still show).
with mne.use_log_level('WARNING'):
    features_train = preprocessing(data_train, label_train, window_size, selected_columns)
    features_val = preprocessing(data_val, label_val, window_size, selected_columns)
    features_test = preprocessing(data_test, label_test, window_size, selected_columns)

Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=4, n_times=1000
    Range : 0 ... 999 =     

In [9]:
# Inspect the training feature table: one row per window, label in the last column
features_train

Unnamed: 0,1ad,1ta,1dt,3ad,3ta,3dt,4ad,4ta,4dt,5ad,5ta,5dt,label
0,0.410644,3.758850,0.647858,0.233171,0.886150,4.839705,0.384879,1.318418,1.970707,0.595810,0.463405,3.621859,2
1,2.191649,0.632473,0.721418,0.313310,0.705128,4.526455,1.074693,0.349530,2.662142,1.884304,0.152109,3.488949,2
2,12.598128,0.098553,0.805424,2.353535,0.092776,4.579751,1.265791,0.133204,5.930912,5.695136,0.048178,3.644565,4
3,4.201428,0.146923,1.619994,2.268135,0.394123,1.118663,1.950281,0.679803,0.754257,8.791142,0.143238,0.794139,4
4,2.127290,0.150081,3.132182,4.795612,0.161892,1.288041,4.082949,0.465649,0.525978,2.384031,0.347815,1.205978,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,0.324689,2.380442,1.293823,0.743277,0.697661,1.928436,1.071797,0.562098,1.659876,2.683010,0.292598,1.273814,2
104,2.547287,0.397904,0.986606,8.497682,0.127157,0.925460,0.782445,0.583657,2.189719,9.125252,0.108471,1.010280,4
105,0.399718,1.369085,1.827325,0.478570,0.636459,3.283097,0.180359,1.340574,4.135906,0.743984,0.378047,3.555415,4
106,1.838338,0.709234,0.766982,1.714281,0.217644,2.680224,1.857585,1.021456,0.527026,2.968137,0.190218,1.771185,1


In [10]:
# Separate every feature table into its feature matrix (all columns but the last)
# and its label vector (the last column).
train_X, train_y = features_train.iloc[:, :-1].values, features_train.iloc[:, -1].values
val_X, val_y = features_val.iloc[:, :-1].values, features_val.iloc[:, -1].values
test_X, test_y = features_test.iloc[:, :-1].values, features_test.iloc[:, -1].values

In [11]:
# Standardise the features: statistics are learned from the training split only and
# then applied unchanged to all three splits.
scaler = StandardScaler().fit(train_X)
train_X, val_X, test_X = (scaler.transform(split) for split in (train_X, val_X, test_X))

In [12]:
# Hyper-parameter search space for the SVM.
# NOTE: gamma has no effect on the linear kernel, so every linear candidate is
# evaluated once per gamma value (redundant, but it does not change the outcome).
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 0.01, 0.1, 1],
    kernel=['linear', 'rbf'],
    class_weight=[None, 'balanced'],
)
svm = SVC()

# Exhaustive search with 5-fold cross-validation on the training split, using all CPU cores
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(train_X, train_y)

# Report the winning configuration and its mean cross-validated accuracy
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best parameters found:  {'C': 10, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}
Best cross-validation accuracy: 48.96%


In [13]:
# Take the refitted best estimator from the grid search and score it on the validation split
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(val_X)

print("\nValidation Set Performance:")
val_accuracy = accuracy_score(val_y, val_predictions)
print("Validation Accuracy: {:.2f}%".format(val_accuracy * 100))
val_kappa = cohen_kappa_score(val_y, val_predictions)
print("Validation Kappa Coefficient: {:.2f}".format(val_kappa))

print("\nValidation Classification Report:")
val_report = classification_report(val_y, val_predictions)
print(val_report)


Validation Set Performance:
Validation Accuracy: 33.33%
Validation Kappa Coefficient: 0.11

Validation Classification Report:
              precision    recall  f1-score   support

           1       0.11      0.14      0.12         7
           2       0.27      0.25      0.26        12
           3       0.50      0.33      0.40         9
           4       0.50      0.62      0.56         8

    accuracy                           0.33        36
   macro avg       0.35      0.34      0.34        36
weighted avg       0.35      0.33      0.33        36



In [14]:
# Final check: score the same best model on the held-out test split
test_predictions = best_model.predict(test_X)

print("\nTest Set Performance:")
test_accuracy = accuracy_score(test_y, test_predictions)
print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))
test_kappa = cohen_kappa_score(test_y, test_predictions)
print("Test Kappa Coefficient: {:.2f}".format(test_kappa))

print("\nTest Classification Report:")
test_report = classification_report(test_y, test_predictions)
print(test_report)


Test Set Performance:
Test Accuracy: 44.44%
Test Kappa Coefficient: 0.26

Test Classification Report:
              precision    recall  f1-score   support

           1       0.40      0.22      0.29         9
           2       0.36      0.57      0.44         7
           3       0.62      0.50      0.56        10
           4       0.42      0.50      0.45        10

    accuracy                           0.44        36
   macro avg       0.45      0.45      0.44        36
weighted avg       0.46      0.44      0.44        36

