# Load dataset (demo)

In [None]:
import os
import pandas as pd

# define data root
# Path to the ROAMM folder. Set the ROAMM_ROOT environment variable to point at
# your own copy of the dataset; the original local path is kept as the fallback
# so existing setups keep working unchanged.
roamm_root = os.environ.get(
    "ROAMM_ROOT",
    r"/Users/hsun11/Documents/GlassBrainLab/MindlessReading/ROAMM",
)
ml_data_root = os.path.join(roamm_root, 'subject_ml_data')

# =================================================================
# load a single run of ml data
subject_id = 's10014'
subject_dir = os.path.join(ml_data_root, subject_id)
run_number = 1
# NOTE: unpickling can execute arbitrary code -- only load .pkl files you trust.
df_sub_single_run = pd.read_pickle(
    os.path.join(subject_dir, f'{subject_id}_run{run_number}_ml_data.pkl')
)

# =================================================================
# load all runs for a subject
# sorted() makes the run order deterministic; os.listdir returns entries in
# arbitrary order, which would otherwise change the row order between machines.
pkl_files = sorted(f for f in os.listdir(subject_dir) if f.endswith('.pkl'))
# Collect the runs in a list and concatenate once: calling pd.concat inside the
# loop re-copies every previously loaded row on each iteration.
sub_run_frames = []
for pkl_file in pkl_files:
    df_sub_single_run = pd.read_pickle(os.path.join(subject_dir, pkl_file))
    sub_run_frames.append(df_sub_single_run)
# pd.concat raises on an empty list, so fall back to an empty frame as before.
df_sub_all_runs = pd.concat(sub_run_frames) if sub_run_frames else pd.DataFrame()


# =================================================================
# load all runs for all subjects
# Subject folders are the directories under ml_data_root whose name starts with 's'.
# sorted() keeps subject and run order deterministic (os.listdir order is arbitrary).
all_subjects = sorted(
    d for d in os.listdir(ml_data_root)
    if d.startswith('s') and os.path.isdir(os.path.join(ml_data_root, d))
)
bool_cols = ['is_blink', 'is_saccade', 'is_fixation', 'is_mw', 'first_pass_reading']
# Collect every run and concatenate once at the end; growing `df` with pd.concat
# inside the loop re-copies all previously loaded rows on each iteration.
run_frames = []
for subject_id in all_subjects:
    subject_dir = os.path.join(ml_data_root, subject_id)
    pkl_files = sorted(f for f in os.listdir(subject_dir) if f.endswith('.pkl'))

    # make sure each subject has 5 runs of data
    if len(pkl_files) != 5:
        raise ValueError(f"Subject {subject_id} has {len(pkl_files)} runs instead of 5")

    for pkl_file in pkl_files:
        df_sub_single_run = pd.read_pickle(os.path.join(subject_dir, pkl_file))
        # Keeping only first-pass reading samples is strongly recommended to save
        # memory. .copy() detaches the filtered rows from the original frame so the
        # column assignments below do not trigger SettingWithCopyWarning.
        df_sub_single_run = df_sub_single_run[df_sub_single_run['first_pass_reading'] == 1].copy()
        # add subject id to the dataframe
        df_sub_single_run['subject_id'] = subject_id
        # convert bool col explicitly to avoid pandas warning
        # (element-wise comparison; anything that is not True, including NaN, becomes False)
        for col in bool_cols:
            df_sub_single_run[col] = df_sub_single_run[col] == True
        # queue the run for the final concatenation
        run_frames.append(df_sub_single_run)

    print(f'Subject {subject_id} has been loaded.')

# pd.concat raises on an empty list, so fall back to an empty frame as before.
df = pd.concat(run_frames) if run_frames else pd.DataFrame()


Subject s10014 has been loaded.


# Load subject data

In [15]:
import os
import pandas as pd

# define data root
# Path to the ROAMM folder. Set the ROAMM_ROOT environment variable to point at
# your own copy of the dataset; the original local path is kept as the fallback.
roamm_root = os.environ.get(
    "ROAMM_ROOT",
    r"/Users/hsun11/Documents/GlassBrainLab/MindlessReading/ROAMM",
)
ml_data_root = os.path.join(roamm_root, 'subject_ml_data')
# define subject id
subject_id = 's10014'
subject_dir = os.path.join(ml_data_root, subject_id)
# load all runs for a subject
# sorted() keeps the run order deterministic (os.listdir order is arbitrary);
# row order matters later because downsampling windows are taken by position.
pkl_files = sorted(f for f in os.listdir(subject_dir) if f.endswith('.pkl'))
bool_cols = ['is_blink', 'is_saccade', 'is_fixation', 'is_mw', 'first_pass_reading']
# Collect the runs and concatenate once instead of re-copying `df` every iteration.
run_frames = []
for pkl_file in pkl_files:
    df_sub_single_run = pd.read_pickle(os.path.join(subject_dir, pkl_file))
    # keep first-pass reading only; .copy() avoids SettingWithCopyWarning on the
    # column assignments below
    df_sub_single_run = df_sub_single_run[df_sub_single_run['first_pass_reading'] == 1].copy()
    # convert bool col explicitly to avoid pandas warning
    # (element-wise comparison; anything that is not True, including NaN, becomes False)
    for col in bool_cols:
        df_sub_single_run[col] = df_sub_single_run[col] == True
    run_frames.append(df_sub_single_run)
# pd.concat raises on an empty list, so fall back to an empty frame as before.
df = pd.concat(run_frames) if run_frames else pd.DataFrame()

# Classifier on raw data

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler


# Feature groups used by the classifiers below.
# EEG: the first 64 columns of df are EEG channels.
eeg_cols = df.columns[:64].tolist()
# Eye tracking: blink-interpolated gaze position and pupil size for both eyes.
eye_cols = [
    'blink_interp_LX',
    'blink_interp_LY',
    'blink_interp_RX',
    'blink_interp_RY',
    'blink_interp_LPupil',
    'blink_interp_RPupil',
]


# Downsample data by averaging fixed-length, non-overlapping windows.
# NOTE: the default window_size=64 is 0.25 s at fs = 256 Hz; pass
# window_size=256 if 1-second windows are wanted.
def downsample_data(df, window_size=64, feature_cols=None):
    """Average feature columns over consecutive, non-overlapping windows.

    Rows are grouped by position into windows of ``window_size`` samples.
    A window is dropped when it is incomplete (the trailing remainder) or when
    its ``is_mw`` labels are not all identical.

    NOTE(review): windows are taken purely by row position, so if ``df`` is a
    concatenation of several runs or has had rows filtered out, a window may
    span samples that were not contiguous in time -- confirm this is acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``is_mw`` column and every column in ``feature_cols``.
    window_size : int, default 64
        Number of samples per window (not seconds).
    feature_cols : list of str, optional
        Columns to average. Defaults to the notebook-level
        ``eeg_cols + eye_cols`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per retained window with the mean of each feature column and
        the window's single ``is_mw`` label. The columns are always
        ``feature_cols + ['is_mw']``, even when no window is retained.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")
    if feature_cols is None:
        # Backward-compatible default: fall back to the notebook globals.
        feature_cols = eeg_cols + eye_cols
    feature_cols = list(feature_cols)

    downsampled_data = []

    # Only iterate over starts of complete windows; the trailing partial
    # window (if any) is discarded.
    n_full_windows = len(df) // window_size
    for start in range(0, n_full_windows * window_size, window_size):
        window = df.iloc[start:start + window_size]

        # Skip windows whose labels are mixed
        labels_in_window = window['is_mw'].unique()
        if len(labels_in_window) > 1:
            continue

        # Mean of every feature column, plus the single consistent label
        window_data = {col: window[col].mean() for col in feature_cols}
        window_data['is_mw'] = labels_in_window[0]
        downsampled_data.append(window_data)

    # Passing explicit columns keeps the schema stable when nothing was retained,
    # so downstream `result['is_mw']` does not raise KeyError on empty output.
    return pd.DataFrame(downsampled_data, columns=feature_cols + ['is_mw'])

# The window length is given in samples. With fs = 256 Hz, 64 samples is
# 0.25 s; the message is derived from the value actually passed so the log
# cannot drift from the computation.
SAMPLING_RATE_HZ = 256
DOWNSAMPLE_WINDOW = 64  # same value as downsample_data's default window_size
print(
    f"Downsampling data using {DOWNSAMPLE_WINDOW}-sample windows "
    f"({DOWNSAMPLE_WINDOW / SAMPLING_RATE_HZ:.2f} s at {SAMPLING_RATE_HZ} Hz)..."
)
print(f"Original data size: {len(df)}")
df_downsampled = downsample_data(df, window_size=DOWNSAMPLE_WINDOW)
print(f"Downsampled data size: {len(df_downsampled)}")
print("Data downsampling completed.")

# Balance the two classes by randomly dropping majority-class windows
print("Handling class imbalance...")
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(
    df_downsampled.loc[:, eeg_cols + eye_cols],  # features: EEG + eye tracking
    df_downsampled.loc[:, 'is_mw'],              # target: is_mw label
)
print(f"Class distribution:\n{y_resampled.value_counts()}")

def evaluate_feature_set(X, y, banner, label, test_size=0.2, random_state=42):
    """Train and report a scaled random-forest classifier on one feature set.

    Performs a stratified train/test split, fits a StandardScaler +
    RandomForestClassifier pipeline on the training part, and prints the
    accuracy, classification report and confusion matrix for the test part.

    NOTE(review): the split is random over windows. If neighbouring windows of
    the same recording are correlated, train and test are not independent and
    the reported accuracy may be optimistic -- consider a split grouped by
    run or subject.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target labels, also used for stratification.
    banner : str
        Section title printed between the separator lines.
    label : str
        Short prefix used in the printed metric lines (e.g. "EEG-only").
    test_size : float, default 0.2
        Fraction of samples held out for testing.
    random_state : int, default 42
        Seed used for both the split and the classifier.

    Returns
    -------
    tuple
        ``(pipeline, X_train, X_test, y_train, y_test, y_pred, accuracy)``.
    """
    print("\n" + "="*50)
    print(f"=== {banner} ===")
    print("="*50)

    # Split the data
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Create pipeline with scaling and classifier
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=random_state, class_weight='balanced'))
    ])

    # Train and evaluate
    pipeline.fit(X_tr, y_tr)
    y_hat = pipeline.predict(X_te)
    accuracy = accuracy_score(y_te, y_hat)

    print(f"{label} Accuracy: {accuracy:.3f}")
    print(f"\n{label} Classification Report:")
    print(classification_report(y_te, y_hat))
    print(f"\n{label} Confusion Matrix:")
    print(confusion_matrix(y_te, y_hat))

    return pipeline, X_tr, X_te, y_tr, y_te, y_hat, accuracy


# First, let's try EEG features only
X_eeg = X_resampled[eeg_cols].copy()
y_eeg = y_resampled
(eeg_pipeline, X_eeg_train, X_eeg_test,
 y_eeg_train, y_eeg_test, y_eeg_pred, eeg_accuracy) = evaluate_feature_set(
    X_eeg, y_eeg, "EEG Features Only", "EEG-only"
)

# Next, let's try eye tracking features only
X_eye = X_resampled[eye_cols].copy()
y_eye = y_resampled
(eye_pipeline, X_eye_train, X_eye_test,
 y_eye_train, y_eye_test, y_eye_pred, eye_accuracy) = evaluate_feature_set(
    X_eye, y_eye, "Eye Tracking Features Only", "Eye-only"
)

# Now let's try combined features
(combined_pipeline, X_train, X_test,
 y_train, y_test, y_pred, combined_accuracy) = evaluate_feature_set(
    X_resampled, y_resampled, "Combined EEG + Eye Tracking Features", "Combined"
)

# Summary comparison (accuracies were computed once inside evaluate_feature_set)
print("\n" + "="*50)
print("=== PERFORMANCE SUMMARY ===")
print("="*50)
print(f"EEG-only Accuracy:      {eeg_accuracy:.3f}")
print(f"Eye-only Accuracy:      {eye_accuracy:.3f}")
print(f"Combined Accuracy:      {combined_accuracy:.3f}")


Downsampling data using 1-second windows (256 samples)...
Original data size: 438227
Downsampled data size: 6771
Data downsampling completed.
Handling class imbalance...
Class distribution:
0.0    1386
1.0    1386
Name: is_mw, dtype: int64

=== EEG Features Only ===
EEG-only Accuracy: 0.831

EEG-only Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.82      0.83       278
         1.0       0.82      0.84      0.83       277

    accuracy                           0.83       555
   macro avg       0.83      0.83      0.83       555
weighted avg       0.83      0.83      0.83       555


EEG-only Confusion Matrix:
[[227  51]
 [ 43 234]]

=== Eye Tracking Features Only ===
Eye-only Accuracy: 0.805

Eye-only Classification Report:
              precision    recall  f1-score   support

         0.0       0.80      0.81      0.81       278
         1.0       0.81      0.80      0.80       277

    accuracy                           

# Comprehensive Model Comparison

This section implements and evaluates all the models from the research comparison, including:
- Traditional ML: KNN, GaussianNB, LinearSVC, RBF SVC/SVR, Linear/Ridge/Lasso/Elastic Net Regression
- Ensemble Methods: Random Forest, Gradient Boost, AdaBoost, XGBoost
- Deep Learning: CNN, PyramidalCNN, EEGNet, InceptionTime, Xception
