In [1]:
import mne
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score
import matplotlib.pyplot as plt
from Preprocessing.feature_extraction import load_eeg_data, compute_band_power, extract_features

In [2]:
# Load data
base_path = '/Users/imdohyeon/Library/CloudStorage/GoogleDrive-dhlim1598@gmail.com/공유 드라이브/4N_PKNU/BXAI/EEG-LLM/Dataset/'
train_dir = base_path + 'train.csv'
test_dir = base_path + 'test.csv'
val_dir = base_path + 'val.csv'

window_size = 1000
# selected_columns = [0, 2, 3, 4]
# 0: FCz, 2: C3, 3: Cz, 4: C4

In [3]:
"""
selected_columns = [
    [0, [(12, 14)]],
    [2, [(20, 22)]],
    [3, [(18, 20)]],
    [4, [(20, 22)]]
]
"""
# Train acc: 54.63% / val acc: 36%, kappa: 0.15 / test acc: 45%, kappa: 0.27
# 피처가 더 많아야 함

'\nselected_columns = [\n    [0, [(12, 14)]],\n    [2, [(20, 22)]],\n    [3, [(18, 20)]],\n    [4, [(20, 22)]]\n]\n'

In [4]:
"""
selected_columns = [
    [0, [(10, 12), (12, 14)]],
    [2, [(20, 22), (22, 24)]],
    [3, [(8, 10), (18, 20)]],
    [4, [(20, 22), (22, 24)]]
]
"""
# Train acc: 59% / val acc: 47%, kappa: 0.30 / test acc: 54%, kappa: 0.38

'\nselected_columns = [\n    [0, [(10, 12), (12, 14)]],\n    [2, [(20, 22), (22, 24)]],\n    [3, [(8, 10), (18, 20)]],\n    [4, [(20, 22), (22, 24)]]\n]\n'

In [5]:
# 0: FCz / 1: FC4 / 2: C3 / 3: Cz / 4: C4 / 5: CP3 
# label 1: C4(20, 22), C4(22, 24), FC4(10, 12)
# label 2: C3(20, 22), C3(22, 24)
# label 3: Cz(18, 20), Cz(8, 10),  CP3(28, 30)
# label 4: FCz(10, 12), FCz(12, 14), FCz(28, 30) 
selected_columns = [
    [0, [(10, 12), (12, 14), (28, 30)]],  # FCz
    # [1, [(10, 12)]],  # FC4
    [2, [(20, 22), (22, 24)]],  # C3
    [3, [(8, 10), (18, 20)]],  # Cz
    [4, [(20, 22), (22, 24)]],  # C4
    # [5, [(28, 30)]]  # CP3
]

In [6]:
data_train, label_train = load_eeg_data(train_dir)
data_val, label_val = load_eeg_data(val_dir)
data_test, label_test = load_eeg_data(test_dir)

In [7]:
def extract_features_ml(data, selected_columns, sfreq=250, precision=6):
    feature_dict = {}  # 결과를 저장할 딕셔너리

    for item in selected_columns:
        channel_idx = item[0]  # 채널 인덱스
        bands = item[1]  # 해당 채널에서 추출할 주파수 대역 리스트

        # 주파수 대역이 하나만 주어졌을 때도 리스트로 처리
        if isinstance(bands, tuple):
            bands = [bands]

        # 채널의 데이터 추출
        eeg_data = data.iloc[:, channel_idx].values  # 특정 채널의 데이터를 가져옴
        ch_name = data.columns[channel_idx]  # 채널 이름

        # mne RawArray 객체 생성
        info = mne.create_info(ch_names=[ch_name], sfreq=sfreq, ch_types='eeg')
        raw = mne.io.RawArray(eeg_data[np.newaxis, :], info)  # 2D array 필요

        # 주파수 대역별로 PSD 계산
        for band in bands:
            band_power = compute_band_power(raw, band)

            # band_power가 numpy 배열인 경우 float로 변환
            if isinstance(band_power, np.ndarray):
                band_power = band_power.item()

            # 소수점 이하 precision 자리까지 반올림
            band_power = float(f"{band_power:.{precision}f}")

            # 열 이름 생성 (예: Channel_1_10-12Hz)
            column_name = f'{ch_name}_{band[0]}-{band[1]}Hz'
            feature_dict[column_name] = band_power

    # 최종 데이터프레임 생성
    features = pd.DataFrame([feature_dict])

    return features


In [8]:
def preprocessing(df, labels, window_size, selected_columns):
    """
    Convert a DataFrame of EEG data into a JSON format suitable for GPT-3 davinci.
    =================================
    1. You should pick selected_columns before running this function.
    2. It contains the process of feature extraction.
    
    =================================
    :param df: Data converted to pandas DataFrame from the original csv file
    :param labels: Label for each window (provide a list, left, right, top, bottom)
    :param window_size: Window size to divide EEG data
    :param selected_columns: EEG channel to use, along with their respective frequency bands (provide a list of lists)
    :return: List of data in JSON format
    """
    features_array = []

    for start in range(0, len(df) - window_size + 1, window_size):
        # Extract the window data
        window_data = df.iloc[start:start + window_size, :]
        
        # Extract features using the provided function
        features = extract_features_ml(window_data, selected_columns)
        
        # Convert features DataFrame to a list
        features_flat = features.values.flatten().tolist()

        # Add the label for the current window
        label = int(labels[start])
        features_flat.append(label)

        features_array.append(features_flat)

    # Extract column names directly from the features dataframe and append 'label' for the label column
    feature_columns = list(features.columns)
    feature_columns.append('label')

    # Convert the list of features to a DataFrame
    features_df = pd.DataFrame(features_array, columns=feature_columns)

    return features_df


In [9]:
features_train = preprocessing(data_train, label_train, window_size, selected_columns)
features_val = preprocessing(data_val, label_val, window_size, selected_columns)
features_test = preprocessing(data_test, label_test, window_size, selected_columns)

Creating RawArray with float64 data, n_channels=1, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=1, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=1, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=1, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=1, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
E

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=1, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=1, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=1, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=1, n_times=1000
    Range : 0 ... 999 =      0.000 ...     3.996 secs
Ready.
Effective window size : 1.024 (s)
Effective window size : 1.024 (s)
Creating RawArray with float64 data, n_channels=1, n_times=1000
 

In [10]:
"""
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features_train.iloc[:, :-1] = scaler.fit_transform(features_train.iloc[:, :-1])
features_val.iloc[:, :-1] = scaler.transform(features_val.iloc[:, :-1])
features_test.iloc[:, :-1] = scaler.transform(features_test.iloc[:, :-1])
"""

'\nfrom sklearn.preprocessing import MinMaxScaler\nscaler = MinMaxScaler()\nfeatures_train.iloc[:, :-1] = scaler.fit_transform(features_train.iloc[:, :-1])\nfeatures_val.iloc[:, :-1] = scaler.transform(features_val.iloc[:, :-1])\nfeatures_test.iloc[:, :-1] = scaler.transform(features_test.iloc[:, :-1])\n'

In [11]:
features_train

Unnamed: 0,FCz_10-12Hz,FCz_12-14Hz,FCz_28-30Hz,C3_20-22Hz,C3_22-24Hz,Cz_8-10Hz,Cz_18-20Hz,C4_20-22Hz,C4_22-24Hz,label
0,0.000115,0.000010,0.000004,0.000005,0.000003,0.000077,0.000008,0.000003,0.000008,4
1,0.000025,0.000005,0.000010,0.000001,0.000001,0.000006,0.000005,0.000002,0.000001,1
2,0.000045,0.000005,0.000008,0.000005,0.000004,0.000016,0.000029,0.000002,0.000002,1
3,0.000148,0.000010,0.000003,0.000012,0.000008,0.000012,0.000002,0.000004,0.000010,3
4,0.000011,0.000019,0.000006,0.000006,0.000015,0.000023,0.000026,0.000005,0.000008,4
...,...,...,...,...,...,...,...,...,...,...
211,0.000010,0.000005,0.000005,0.000001,0.000001,0.000014,0.000004,0.000001,0.000003,1
212,0.000046,0.000009,0.000003,0.000002,0.000003,0.000010,0.000004,0.000003,0.000006,2
213,0.000052,0.000003,0.000003,0.000002,0.000001,0.000023,0.000015,0.000003,0.000001,2
214,0.000009,0.000002,0.000005,0.000005,0.000001,0.000020,0.000008,0.000003,0.000010,3


In [12]:
train_X = features_train.iloc[:, :-1].values
train_y = features_train.iloc[:, -1].values
val_X = features_val.iloc[:, :-1].values
val_y = features_val.iloc[:, -1].values
test_X = features_test.iloc[:, :-1].values
test_y = features_test.iloc[:, -1].values

In [13]:
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
val_X = scaler.transform(val_X)
test_X = scaler.transform(test_X)

In [14]:
svm = SVC()
param_grid = {
    'C': [0.1, 1, 10, 50, 80, 100, 150],  # Reduced range of the regularization parameter
    'gamma': ['scale', 0.01, 0.1, 1],  # Key gamma values with a focus on potential sweet spots
    'kernel': ['linear', 'rbf'],  # Focus on the most commonly effective kernels
    'class_weight': [None, 'balanced'],  # Option to handle imbalanced classes
}

grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(train_X, train_y)  # Fit the model on the training data

# Print the best parameters and the best score from the validation process
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

Fitting 5 folds for each of 112 candidates, totalling 560 fits
Best parameters found:  {'C': 100, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}
Best cross-validation accuracy: 58.37%


In [15]:
# Use the best model to make predictions on the validation set
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(val_X)

# Evaluate the model on the validation set
print("\nValidation Set Performance:")
print("Validation Accuracy: {:.2f}%".format(accuracy_score(val_y, val_predictions) * 100))
print("Validation Kappa Coefficient: {:.2f}".format(cohen_kappa_score(val_y, val_predictions)))
print("\nValidation Classification Report:")
print(classification_report(val_y, val_predictions))


Validation Set Performance:
Validation Accuracy: 43.06%
Validation Kappa Coefficient: 0.25

Validation Classification Report:
              precision    recall  f1-score   support

           1       0.25      0.41      0.31        17
           2       0.38      0.35      0.36        17
           3       0.67      0.38      0.48        21
           4       0.62      0.59      0.61        17

    accuracy                           0.43        72
   macro avg       0.48      0.43      0.44        72
weighted avg       0.49      0.43      0.44        72



In [16]:
# After validation, use the best model to predict on the test set
test_predictions = best_model.predict(test_X)

# Evaluate the model on the test set
print("\nTest Set Performance:")
print("Test Accuracy: {:.2f}%".format(accuracy_score(test_y, test_predictions) * 100))
print("Test Kappa Coefficient: {:.2f}".format(cohen_kappa_score(test_y, test_predictions)))
print("\nTest Classification Report:")
print(classification_report(test_y, test_predictions))


Test Set Performance:
Test Accuracy: 51.39%
Test Kappa Coefficient: 0.34

Test Classification Report:
              precision    recall  f1-score   support

           1       0.44      0.75      0.56        20
           2       0.46      0.40      0.43        15
           3       0.67      0.33      0.44        18
           4       0.62      0.53      0.57        19

    accuracy                           0.51        72
   macro avg       0.55      0.50      0.50        72
weighted avg       0.55      0.51      0.51        72

