In [None]:
pip install mlflow

In [1]:
import os
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupShuffleSplit, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    f1_score, log_loss, confusion_matrix, roc_curve, auc, make_scorer
)
import matplotlib.pyplot as plt

In [2]:
import os
import mne
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from scipy.signal import find_peaks
from mne.time_frequency import psd_array_multitaper
import joblib

In [3]:
data_path = "eeg_training_data_with_subjects.csv"
#data_path = "/Users/ishabharti/eeg_training_data_with_subjects.csv"

data = pd.read_csv(data_path) 
print(data.head(2))
data.shape

      delta     theta         alpha          beta         gamma      mean  \
0  0.000123  0.000067  1.039174e-05  4.101177e-07  1.740069e-07 -0.000003   
1  0.000208  0.000085  9.628139e-07  2.513081e-07  1.123781e-07  0.000125   

       variance  skewness  kurtosis  line_length  label subject_id  
0  2.089062e-08 -1.763948  2.470575     0.004479      0    sub-001  
1  4.599112e-08  0.497322 -1.170764     0.003333      0    sub-001  


(33034, 12)

In [4]:
X = data.drop(columns=["label", "subject_id"]) 
y = data["label"]
groups = data["subject_id"]

In [5]:
# Train-Test Split to Avoid Data Leakage (Group-Based)
group_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(group_split.split(X, y, groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [6]:
# Set up MLflow experiment
mlflow.set_experiment("Random Forest EEG Classification (Advanced)")

<Experiment: artifact_location='file:///Users/ishabharti/EEG%20Project/mlruns/621305009683197572', creation_time=1737744675460, experiment_id='621305009683197572', last_update_time=1737744675460, lifecycle_stage='active', name='Random Forest EEG Classification (Advanced)', tags={}>

In [7]:
# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Start an MLflow run
with mlflow.start_run():
    # Log experiment type
    mlflow.log_param("model_type", "RandomForestClassifier")
    # Define hyperparameters for tuning
    param_grid = {
        "n_estimators": [100],
        "max_depth": [None],
        "min_samples_split": [2],
        "min_samples_leaf": [1]
    }

    # Train the Random Forest Model
    rf = RandomForestClassifier(
        n_estimators=param_grid["n_estimators"][0],
        max_depth=param_grid["max_depth"][0],
        min_samples_split=param_grid["min_samples_split"][0],
        min_samples_leaf=param_grid["min_samples_leaf"][0],
        random_state=42
    )

    # Evaluate model using cross-validation
    f1_scorer = make_scorer(f1_score, average="binary")
    cv_f1_scores = cross_val_score(rf, X_train, y_train, scoring=f1_scorer, cv=cv, n_jobs=-1)
    mean_f1_cv = np.mean(cv_f1_scores)
    mlflow.log_metric("mean_f1_cv", mean_f1_cv)

    # Fit the model on the training set
    rf.fit(X_train, y_train)
    joblib.dump(rf, "random_forest_model.pkl")
    mlflow.log_artifact("random_forest_model.pkl")  # Log model in MLflow
    print("Model saved as 'random_forest_model.pkl'")

    # Predict on the test set
    y_pred = rf.predict(X_test)
    y_proba = rf.predict_proba(X_test)[:, 1]

    # Evaluate metrics
    test_f1 = f1_score(y_test, y_pred)
    test_log_loss = log_loss(y_test, y_proba)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("test_f1", test_f1)
    mlflow.log_metric("test_log_loss", test_log_loss)

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    mlflow.log_metric("roc_auc", roc_auc)

    # Plot ROC Curve
    plt.figure()
    plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Random Guess")
    plt.xlabel("False Positive Rate (FPR)")
    plt.ylabel("True Positive Rate (TPR)")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid()
    plt.savefig("roc_curve.png")
    plt.close()
    mlflow.log_artifact("roc_curve.png")

    # Log confusion matrix
    conf_matrix_df = pd.DataFrame(conf_matrix, index=["True Neg", "True Pos"], columns=["Pred Neg", "Pred Pos"])
    conf_matrix_df.to_csv("confusion_matrix.csv")
    mlflow.log_artifact("confusion_matrix.csv")

    # Print metrics
    print(f"Cross-Validation F1 Score: {mean_f1_cv}")
    print(f"Test F1 Score: {test_f1}")
    print(f"Test Log Loss: {test_log_loss}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"ROC AUC: {roc_auc}")

# End MLflow run
mlflow.end_run()

Model saved as 'random_forest_model.pkl'
Cross-Validation F1 Score: 0.8554521401420414
Test F1 Score: 0.3968436626664475
Test Log Loss: 0.7875727035027699
Confusion Matrix:
[[4048 1825]
 [1844 1207]]
ROC AUC: 0.5776791982240946


### Test single EEG on trained model

In [8]:
def preprocess_single_eeg(raw_eeg_data, band_ranges):
    """
    Preprocess a single EEG dataset and extract features.

    Parameters:
        raw_eeg_data: Raw EEG data object (e.g., mne.io.Raw)
        band_ranges: Dictionary of frequency band ranges
    
    Returns:
        feature_array: A numpy array of extracted features
    """
    # Preprocess the EEG data
    raw_eeg_data.filter(1., 40., fir_design='firwin')

    # Define epochs
    events, event_id = mne.events_from_annotations(raw_eeg_data)
    tmin, tmax = -0.2, 0.5
    epochs = mne.Epochs(raw_eeg_data, events, event_id, tmin=tmin, tmax=tmax, baseline=(None, 0), preload=True)

    # Compute PSD for the entire frequency range
    psd = epochs.compute_psd(method='multitaper', fmin=0.5, fmax=40)
    psd_data = psd.get_data()  # Shape: (n_epochs, n_channels, n_freqs)
    freqs = psd.freqs

    # Compute features for each frequency band
    band_powers = []
    for fmin, fmax in band_ranges.values():
        band_indices = (freqs >= fmin) & (freqs < fmax)
        band_power = psd_data[:, :, band_indices].mean(axis=(1, 2))  # Aggregate across channels and frequencies
        band_powers.append(band_power.mean())  # Average over epochs

    # Compute additional features
    epoch_data = epochs.get_data().mean(axis=1)  # Aggregate across channels
    mean_values = epoch_data.mean()
    var_values = epoch_data.var()
    skew_values = skew(epoch_data, axis=1).mean()
    kurtosis_values = kurtosis(epoch_data, axis=1).mean()
    line_length = np.sum(np.abs(np.diff(epoch_data, axis=1)), axis=1).mean()

    # Combine all features
    feature_array = np.array([
        *band_powers, mean_values, var_values, skew_values, kurtosis_values, line_length
    ])
    return feature_array


In [9]:
import mne
import os

# Define the dataset directory
data_dir = "./ds003523"

# Subject and session information
subject = "sub-001"
session = "ses-01"

# Construct the path to the EEG file
eeg_file_path = os.path.join(
    data_dir,
    subject,
    session,
    "eeg",
    f"{subject}_{session}_task-VisualWorkingMemory_eeg.set"
)

# Check if the file exists
if not os.path.exists(eeg_file_path):
    print(f"EEG file not found: {eeg_file_path}")
else:
    print(f"EEG file found: {eeg_file_path}")

EEG file found: ./ds003523/sub-001/ses-01/eeg/sub-001_ses-01_task-VisualWorkingMemory_eeg.set


In [10]:
raw = mne.io.read_raw_eeglab(eeg_file_path, preload=True)
# Plot the raw data (optional)
# raw.plot(duration=5, n_channels=30)

Reading /Users/ishabharti/EEG Project/ds003523/sub-001/ses-01/eeg/sub-001_ses-01_task-VisualWorkingMemory_eeg.fdt
Reading 0 ... 697174  =      0.000 ...  1394.348 secs...


  raw = mne.io.read_raw_eeglab(eeg_file_path, preload=True)


In [11]:


# Define the band ranges used during training
band_ranges = {
    'delta': (0.5, 4),
    'theta': (4, 8),
    'alpha': (8, 13),
    'beta': (13, 30),
    'gamma': (30, 100)
}

# Preprocess and extract features for the single EEG sample
single_features = preprocess_single_eeg(raw, band_ranges)

# Reshape to match the input format expected by the model (1 sample, n_features)
single_features = single_features.reshape(1, -1)
single_features

Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 1 - 40 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 1.00
- Lower transition bandwidth: 1.00 Hz (-6 dB cutoff frequency: 0.50 Hz)
- Upper passband edge: 40.00 Hz
- Upper transition bandwidth: 10.00 Hz (-6 dB cutoff frequency: 45.00 Hz)
- Filter length: 1651 samples (3.302 s)



[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.2s


Used Annotations descriptions: ['S  1', 'S  2', 'S  3', 'S 50', 'S 51', 'S 52', 'S100', 'S101', 'S200', 'S201', 'boundary']
Not setting metadata
511 matching events found
Setting baseline interval to [-0.2, 0.0] s
Applying baseline correction (mode: mean)
0 projection items activated
Using data from preloaded Raw for 511 events and 351 original time points ...
1 bad epochs dropped
    Using multitaper spectrum estimation with 7 DPSS windows


array([[ 3.19355231e-04,  1.44500796e-04,  1.04500588e-05,
         1.89622193e-06,  5.15565377e-07, -2.47271096e-05,
         9.23275691e-08, -9.91062469e-01,  1.81069777e+00,
         2.58580155e-03]])

In [12]:
import joblib
import numpy as np

model_path = "random_forest_model.pkl"
# Load the trained Random Forest model
try:
    model = joblib.load(model_path)
    print("Model loaded successfully!")
except FileNotFoundError:
    print(f"Model file not found at: {model_path}")
    raise

Model loaded successfully!


In [13]:
# Predict the class
predicted_class = model.predict(single_features)[0]

# Predict the probability of each class
predicted_probabilities = model.predict_proba(single_features)[0]

print(f"Predicted Class: {predicted_class}")
print(f"Predicted Probabilities: {predicted_probabilities}")


Predicted Class: 0
Predicted Probabilities: [0.81 0.19]




In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [16]:
model = Sequential([
    Dense(216, activation = 'relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
