In [8]:
import mne
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from Preprocessing.feature_extraction import load_eeg_data, compute_band_power, extract_features
mne.set_log_level('error')

In [9]:
def pipeline(base_path):
    train_dir = base_path + 'train4ml.csv'
    test_dir = base_path + 'test4ml.csv'
    val_dir = base_path + 'val4ml.csv'
    data_train, label_train = load_eeg_data(train_dir)
    data_val, label_val = load_eeg_data(val_dir)   
    data_test, label_test = load_eeg_data(test_dir)
    
    train_X = data_train
    train_y = label_train
    val_X = data_val
    val_y = label_val
    test_X = data_test
    test_y = label_test
    
    # Scaling
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)
    
    # Scaling
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)
    
    # Train through GridSearchCV
    rf = RandomForestClassifier()
    param_grid = {
        'n_estimators': [100, 200, 300],  # 트리의 개수
        'max_depth': [10, 20, None],  # 트리의 최대 깊이
        'min_samples_split': [2, 5, 10],  # 분할을 위한 최소 샘플 수
        'min_samples_leaf': [1, 2, 4],  # 리프 노드의 최소 샘플 수
        'max_features': ['sqrt', 'log2', None]  # 고려할 최대 피처 수
    }

    grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
    grid_search.fit(train_X, train_y)  # Fit the model on the training data

    # Print the best parameters and the best score from the validation process
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation accuracy: {:.2f}%".format(grid_search.best_score_ * 100))
    
    # (Validation) Use the best model to make predictions on the validation set
    best_model = grid_search.best_estimator_
    val_predictions = best_model.predict(val_X)

    # Evaluate the model on the validation set
    print("\nValidation Set Performance:")
    print("Validation Accuracy: {:.2f}%".format(accuracy_score(val_y, val_predictions) * 100))
    print("Validation ROC-AUC Score: {:.2f}".format(roc_auc_score(val_y, val_predictions)))
    print("\nValidation Classification Report:")
    print(classification_report(val_y, val_predictions))
    
    # (Test) After validation, use the best model to predict on the test set
    test_predictions = best_model.predict(test_X)

    # Evaluate the model on the test set
    print("\nTest Set Performance:")
    print("Test Accuracy: {:.2f}%".format(accuracy_score(test_y, test_predictions) * 100))
    print("Test ROC-AUC Score: {:.2f}".format(roc_auc_score(test_y, test_predictions)))
    print("\nTest Classification Report:")
    print(classification_report(test_y, test_predictions))

In [10]:
# Load data
base_path_1 = '/Users/imdohyeon/Library/CloudStorage/GoogleDrive-dhlim1598@gmail.com/공유 드라이브/4N_PKNU/Project/EEG-LLM/Dataset/subject 1 data (k3b)/down sampling X ver/label15/4ml/'
base_path_2 = '/Users/imdohyeon/Library/CloudStorage/GoogleDrive-dhlim1598@gmail.com/공유 드라이브/4N_PKNU/Project/EEG-LLM/Dataset/subject 1 data (k3b)/down sampling X ver/label25/4ml/'
base_path_3 = '/Users/imdohyeon/Library/CloudStorage/GoogleDrive-dhlim1598@gmail.com/공유 드라이브/4N_PKNU/Project/EEG-LLM/Dataset/subject 1 data (k3b)/down sampling X ver/label35/4ml/'
base_path_4 = '/Users/imdohyeon/Library/CloudStorage/GoogleDrive-dhlim1598@gmail.com/공유 드라이브/4N_PKNU/Project/EEG-LLM/Dataset/subject 1 data (k3b)/down sampling X ver/label45/4ml/'
window_size = 1000

In [11]:
train_rf_1 = pipeline(base_path_1)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters found:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation accuracy: 88.05%

Validation Set Performance:
Validation Accuracy: 63.89%
Validation ROC-AUC Score: 0.63

Validation Classification Report:
              precision    recall  f1-score   support

           1       0.64      0.53      0.58        17
           5       0.64      0.74      0.68        19

    accuracy                           0.64        36
   macro avg       0.64      0.63      0.63        36
weighted avg       0.64      0.64      0.63        36


Test Set Performance:
Test Accuracy: 86.11%
Test ROC-AUC Score: 0.86

Test Classification Report:
              precision    recall  f1-score   support

           1       0.83      0.88      0.86        17
           5       0.89      0.84      0.86        19

    accuracy                           0.86       

In [12]:
train_rf_2 = pipeline(base_path_2)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters found:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation accuracy: 78.74%

Validation Set Performance:
Validation Accuracy: 83.33%
Validation ROC-AUC Score: 0.83

Validation Classification Report:
              precision    recall  f1-score   support

           2       0.81      0.89      0.85        19
           5       0.87      0.76      0.81        17

    accuracy                           0.83        36
   macro avg       0.84      0.83      0.83        36
weighted avg       0.84      0.83      0.83        36


Test Set Performance:
Test Accuracy: 83.33%
Test ROC-AUC Score: 0.83

Test Classification Report:
              precision    recall  f1-score   support

           2       0.83      0.83      0.83        18
           5       0.83      0.83      0.83        18

    accuracy                           0.83      

In [13]:
train_rf_3 = pipeline(base_path_3)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters found:  {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best cross-validation accuracy: 79.57%

Validation Set Performance:
Validation Accuracy: 75.00%
Validation ROC-AUC Score: 0.74

Validation Classification Report:
              precision    recall  f1-score   support

           3       0.67      0.71      0.69        14
           5       0.81      0.77      0.79        22

    accuracy                           0.75        36
   macro avg       0.74      0.74      0.74        36
weighted avg       0.75      0.75      0.75        36


Test Set Performance:
Test Accuracy: 61.11%
Test ROC-AUC Score: 0.61

Test Classification Report:
              precision    recall  f1-score   support

           3       0.58      0.83      0.68        18
           5       0.70      0.39      0.50        18

    accuracy                           0.61     

In [14]:
train_rf_4 = pipeline(base_path_4)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimato