In [18]:
import os
import numpy as np
import pandas as pd
import scipy.io
import mne
from mne_bids import BIDSPath, write_raw_bids, make_dataset_description
import logging


In [19]:
# =============================================================================
# Configuration and Constants
# =============================================================================
# Everything a reader might need to tune lives in this cell; the functions below
# read these names as module-level globals.

# Set up logging: messages will include timestamp, log level, and message content
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)  # Shared logger used by the helper functions and the main block

# --- File and directory paths ---
# The EEG file for a subject is looked up as: DS_ROOT / (FN_FIRST + <subject ID> + FN_LAST + '.mat')
# NOTE(review): DS_ROOT is an absolute, machine-specific path; adjust it when running elsewhere.
DS_ROOT = '/volumes/hyijie_psy/CPP/data_low-level_1/data'    # Root directory containing raw .mat EEG files
FN_FIRST = 'Formatted_'                                      # Prefix of EEG .mat filename (before the subject ID)
FN_LAST = '.3_40_preproc'                                    # Suffix of EEG .mat filename (after the subject ID, before '.mat')
PREPROCESS_DATA_DIR = '../data/preprocessData/'              # Output root directory for the BIDS dataset
CHANNEL_CSV = '../data/channelName.csv'                      # CSV whose 'channel' column lists the EEG channel names
BEHAVIOR_RAW_CSV = '../data/behaviorData_low_1_raw.csv'      # Raw behavioral data input
BEHAVIOR_FINAL_CSV = '../data/behaviorData_low_1_final.csv'  # Cleaned behavioral data output

# --- Processing parameters ---
SAMPLING_RATE = 500                      # EEG sampling rate in Hz (passed to MNE as sfreq)
RT_LOWER_BOUND = 0.2                     # Minimum valid reaction time; trials with RT below this are dropped
RT_OUTLIER_STD_MULTIPLIER = 3            # Upper RT cutoff only: trials with RT > mean + 3*std are dropped

# --- BIDS event mapping ---
# Maps human-readable event names to integer event codes (1–6).
# Event codes are assigned by 1-based column position in the trigger matrix,
# so the order here must match the column order of that matrix.
EVENT_ID_MAP = {
    'fixation': 1,
    'boil': 2,
    'photodiode': 3,
    'stimulus': 4,
    'response': 5,
    'offset': 6  
}

# --- Subject and column identifiers ---
SUBJECT_IDS = [f'A{i:03d}' for i in range(1, 21)]  # Adult subject IDs: A001 to A020
BEHAVIOR_ID_COL = 'subj'      # Column name in behavior CSV that contains subject ID
TRIAL_NO_COL = 'trialno'      # Column name for trial number (1-based in raw data, shifted to 0-based during processing)


In [None]:

# =============================================================================
# Helper Functions
# =============================================================================

def load_behavior_data(filepath: str) -> pd.DataFrame:
    """
    Load raw behavioral data and keep only adult subjects (IDs starting with 'A').

    Parameters
    ----------
    filepath : str
        Path to the raw behavioral CSV file.

    Returns
    -------
    pd.DataFrame
        Copy of the rows whose subject ID starts with 'A'. The subject ID column
        is always named ``BEHAVIOR_ID_COL`` in the result: if the CSV does not
        contain that column, the second column (index 1) is assumed to hold the
        subject IDs and is renamed accordingly.

    Raises
    ------
    KeyError
        If the subject ID column is missing and the file has fewer than two
        columns, so no fallback column exists.
    """
    df = pd.read_csv(filepath)

    # Documented fallback: treat the second column as the subject ID column
    # when the expected header is absent.
    if BEHAVIOR_ID_COL not in df.columns:
        if df.shape[1] < 2:
            raise KeyError(
                f"Column '{BEHAVIOR_ID_COL}' not found in {filepath} and no second "
                f"column is available to use as subject ID."
            )
        fallback_col = df.columns[1]
        logger.warning(
            f"Column '{BEHAVIOR_ID_COL}' not found in {filepath}; "
            f"using second column '{fallback_col}' as subject ID."
        )
        df = df.rename(columns={fallback_col: BEHAVIOR_ID_COL})

    # Keep only rows where subject ID starts with 'A' (adults); astype(str)
    # guards against non-string IDs such as NaN or integers.
    is_adult = df[BEHAVIOR_ID_COL].astype(str).str.startswith('A')
    return df[is_adult].copy()


def validate_trial_alignment(trigger_trials: np.ndarray, behavior_trials: np.ndarray, sub_id: str) -> bool:
    """
    Report whether the EEG-side and behavior-side trial index arrays are identical.

    The two arrays are compared with ``np.array_equal``, i.e. they must have the
    same shape and the same values in the same order. The outcome is logged
    (INFO on match, WARNING on mismatch).

    Parameters
    ----------
    trigger_trials : np.ndarray
        Trial indices derived from the EEG trigger data.
    behavior_trials : np.ndarray
        Trial indices taken from the behavioral data.
    sub_id : str
        Subject ID, used only in the log messages.

    Returns
    -------
    bool
        True when both arrays match exactly, False otherwise.
    """
    indices_match = np.array_equal(trigger_trials, behavior_trials)

    # Guard clause: handle the failure case first.
    if not indices_match:
        logger.warning(f"Subject {sub_id}: trial indices mismatch! Skipping.")
        return False

    logger.info(f"Subject {sub_id}: behavioral and EEG trial indices aligned.")
    return True


def _build_event_array(trigger_samples: np.ndarray, epoch_starts: np.ndarray) -> tuple[np.ndarray, int]:
    """
    Turn per-trial trigger samples into an MNE-style (n_events, 3) integer event array.

    Parameters
    ----------
    trigger_samples : np.ndarray
        Array of shape (n_trials, n_event_types) with trigger positions relative
        to the start of each trial (treated as 1-based).
    epoch_starts : np.ndarray
        Array of shape (n_trials,) with the start sample of each trial in the
        concatenated recording (treated as 1-based).

    Returns
    -------
    tuple[np.ndarray, int]
        - Integer array with columns [sample, 0, event_code]; event_code is the
          1-based column position of the trigger. Rows are ordered trial by trial.
        - Number of triggers dropped because their timestamp was NaN/inf.
    """
    trigger_samples = np.asarray(trigger_samples, dtype=float)
    epoch_starts = np.asarray(epoch_starts, dtype=float)
    n_trials, n_event_types = trigger_samples.shape

    # Both inputs are 1-based, so each contributes one "-1" shift
    # to reach a 0-based sample index in the continuous recording.
    global_onsets = (trigger_samples + (epoch_starts - 1)[:, None]) - 1

    flat_onsets = global_onsets.ravel()  # row-major: all events of trial 0, then trial 1, ...
    event_codes = np.tile(np.arange(1, n_event_types + 1), n_trials)

    # Casting NaN/inf to int is undefined and yields garbage sample indices,
    # so such triggers are removed before the cast.
    finite = np.isfinite(flat_onsets)
    kept_onsets = flat_onsets[finite]
    events = np.column_stack(
        [kept_onsets, np.zeros(kept_onsets.shape[0]), event_codes[finite]]
    ).astype(int)
    return events, int((~finite).sum())


def process_subject(sub_id: str, behavior_df: pd.DataFrame) -> tuple[pd.DataFrame, str]:
    """
    Process a single subject: load EEG data, align with behavior, clean trials, and export to BIDS.

    Steps:
    1. Filter behavioral data for the subject and shift trial numbers to 0-based.
    2. Load the corresponding EEG .mat file.
    3. Exclude trials with a missing response trigger and verify that the remaining
       trial indices match the behavioral trial indices exactly.
    4. Keep only "good" trials (as marked in the .mat file).
    5. Remove RT outliers (RT < RT_LOWER_BOUND or RT > mean + k*std).
    6. Concatenate the per-trial EEG into one continuous array, scale it by 1e6,
       and create an MNE Raw object.
    7. Build the event array for the retained trials (triggers with missing
       timestamps are dropped and reported).
    8. Write the BIDS dataset (BrainVision format).

    Any exception raised along the way is logged with its traceback and turned
    into an empty result, so one failing subject does not stop the batch.

    Parameters
    ----------
    sub_id : str
        Subject ID (e.g., 'A001').
    behavior_df : pd.DataFrame
        Behavioral dataset containing at least the columns BEHAVIOR_ID_COL,
        TRIAL_NO_COL and 'RT'.

    Returns
    -------
    tuple[pd.DataFrame, str]
        - Cleaned behavioral data for this subject; an empty DataFrame if the
          subject was skipped, had no trials left, or processing failed.
        - Subject ID (for logging or aggregation).
    """
    try:
        # === Step 1: Filter behavioral data for this subject ===
        beh_sub = behavior_df[behavior_df[BEHAVIOR_ID_COL] == sub_id].copy()
        if beh_sub.empty:
            raise ValueError("No behavioral data found for this subject.")

        # Convert trial numbers from 1-based (MATLAB-style) to 0-based (Python-style)
        beh_sub[TRIAL_NO_COL] = beh_sub[TRIAL_NO_COL] - 1

        # === Step 2: Load EEG .mat file ===
        mat_path = os.path.join(DS_ROOT, FN_FIRST + sub_id + FN_LAST + '.mat')
        if not os.path.exists(mat_path):
            raise FileNotFoundError(f"EEG .mat file not found: {mat_path}")

        eeg_mat = scipy.io.loadmat(mat_path)
        # Structure:
        # - 'X': cell array of trials, each trial is (n_channels, n_timepoints)
        # - 'GoodTrial': logical array indicating valid trials
        # - 'Onsets.corrMatrix': trigger timestamps per trial (n_trials, n_events)
        # - 'trialEpochIdx': start/end sample indices for each trial (n_trials, 2)
        data_EEG = eeg_mat['X']
        good_trials_arr = eeg_mat['GoodTrial'][0][0][0].T
        triggers = eeg_mat['Onsets']['corrMatrix'][0, 0]
        trial_epoch_idx = eeg_mat['trialEpochIdx']

        # Convert arrays to DataFrames and add a 0-based trial number column so
        # every table can be filtered by the same key.
        good_trials = pd.DataFrame(good_trials_arr, columns=['is_good'])
        triggers_df = pd.DataFrame(triggers)
        trial_epoch_df = pd.DataFrame(trial_epoch_idx, columns=['start_idx', 'end_idx'])

        for table in (good_trials, triggers_df, trial_epoch_df):
            table[TRIAL_NO_COL] = np.arange(len(table))

        # === Step 3: Exclude trials with missing response triggers ===
        # Event codes are 1-based column positions, so the response trigger sits
        # in column (code - 1) of the trigger matrix.
        response_col = EVENT_ID_MAP['response'] - 1
        has_response = triggers_df.iloc[:, response_col].notna()
        valid_trial_indices = np.flatnonzero(has_response.to_numpy())

        beh_trial_indices = beh_sub[TRIAL_NO_COL].values
        # Validate alignment between EEG and behavior after filtering
        if not validate_trial_alignment(valid_trial_indices, beh_trial_indices, sub_id):
            return pd.DataFrame(), sub_id

        # Filter all data to include only valid-response trials
        triggers_valid = triggers_df[triggers_df[TRIAL_NO_COL].isin(valid_trial_indices)]
        trial_epoch_valid = trial_epoch_df[trial_epoch_df[TRIAL_NO_COL].isin(valid_trial_indices)]
        good_trials_valid = good_trials[good_trials[TRIAL_NO_COL].isin(valid_trial_indices)]
        beh_valid = beh_sub[beh_sub[TRIAL_NO_COL].isin(valid_trial_indices)]

        # === Step 4: Keep only "good" trials (as marked in .mat file) ===
        good_mask = good_trials_valid['is_good'] == 1
        good_trialnos = good_trials_valid.loc[good_mask, TRIAL_NO_COL]
        triggers_good = triggers_valid[triggers_valid[TRIAL_NO_COL].isin(good_trialnos)]
        trial_epoch_good = trial_epoch_valid[trial_epoch_valid[TRIAL_NO_COL].isin(good_trialnos)]
        beh_good = beh_valid[beh_valid[TRIAL_NO_COL].isin(good_trialnos)]

        # Safety check: ensure alignment is preserved after good-trial filtering.
        # Raised explicitly (not via `assert`) so it still runs under `python -O`.
        if not np.array_equal(
            triggers_good[TRIAL_NO_COL].values,
            beh_good[TRIAL_NO_COL].values
        ):
            raise ValueError(f"Post-good-trial alignment failed for {sub_id}")

        # === Step 5: Remove RT outliers ===
        # Only an upper statistical bound is applied; the lower bound is fixed.
        rt_mean = np.nanmean(beh_good['RT'])
        rt_std = np.nanstd(beh_good['RT'])
        rt_upper = rt_mean + RT_OUTLIER_STD_MULTIPLIER * rt_std

        # Rows with NaN RT fail both comparisons and are dropped here as well.
        beh_clean = beh_good[
            (beh_good['RT'] >= RT_LOWER_BOUND) &
            (beh_good['RT'] <= rt_upper)
        ].copy()

        if beh_clean.empty:
            logger.warning(f"Subject {sub_id}: no valid trials after RT filtering.")
            return pd.DataFrame(), sub_id

        clean_trialnos = beh_clean[TRIAL_NO_COL]
        triggers_clean = triggers_good[triggers_good[TRIAL_NO_COL].isin(clean_trialnos)]
        trial_epoch_clean = trial_epoch_good[trial_epoch_good[TRIAL_NO_COL].isin(clean_trialnos)]

        # === Step 6: Prepare continuous EEG data for BIDS ===
        # Concatenate ALL trials (not only the clean ones) into one continuous
        # array (n_channels, total_timepoints); only the events are restricted
        # to clean trials.
        eeg_continuous = np.concatenate(list(data_EEG[0]), axis=1)
        # NOTE(review): MNE's RawArray interprets EEG data as volts. Scaling by 1e6
        # means `raw` holds values 1e6 times those in the .mat file; confirm the
        # unit of 'X' and whether this factor should be 1e-6 (µV -> V) or dropped.
        # Behavior is intentionally left unchanged pending that confirmation.
        eeg_continuous_microvolts = eeg_continuous * 1e6

        # Load channel names and validate count
        channel_info = pd.read_csv(CHANNEL_CSV).dropna()
        ch_names = channel_info['channel'].tolist()
        if eeg_continuous_microvolts.shape[0] != len(ch_names):
            raise ValueError(
                f"Channel count mismatch for {sub_id}: "
                f"EEG has {eeg_continuous_microvolts.shape[0]} channels, "
                f"but {len(ch_names)} names provided."
            )

        # Create MNE Info and Raw object
        info = mne.create_info(ch_names=ch_names, sfreq=SAMPLING_RATE, ch_types='eeg')
        raw = mne.io.RawArray(eeg_continuous_microvolts, info)

        # === Step 7: Build event array ===
        # Shift per-trial trigger samples to positions in the continuous
        # recording; rows are [sample, 0, event_code].
        event_cols = [col for col in triggers_clean.columns if col != TRIAL_NO_COL]
        events, n_dropped = _build_event_array(
            triggers_clean[event_cols].to_numpy(),
            trial_epoch_clean['start_idx'].to_numpy(),
        )
        if n_dropped:
            logger.warning(
                f"Subject {sub_id}: dropped {n_dropped} event(s) with missing trigger timestamps."
            )

        # === Step 8: Write BIDS dataset ===
        bids_path = BIDSPath(
            subject=sub_id,
            task='randomDot',
            datatype='eeg',
            root=PREPROCESS_DATA_DIR
        )

        write_raw_bids(
            raw=raw,
            bids_path=bids_path,
            events=events,
            event_id=EVENT_ID_MAP,
            format='BrainVision',  # Common EEG format compatible with many tools
            allow_preload=True,    # Required because `raw` is an in-memory RawArray
            overwrite=True,
            verbose=False
        )

        logger.info(f"Subject {sub_id}: BIDS conversion completed.")
        return beh_clean, sub_id

    except Exception as e:
        # Deliberate best-effort: log the full traceback and continue with the
        # next subject instead of aborting the whole batch.
        logger.error(f"Error processing subject {sub_id}: {e}", exc_info=True)
        return pd.DataFrame(), sub_id


In [21]:
# =============================================================================
# Main Execution Block
# =============================================================================
# Orchestrates the entire preprocessing pipeline: load behavior once, convert
# every subject to BIDS, then write the pooled cleaned behavior table and the
# BIDS dataset description. The guard keeps the pipeline from running if this
# code is imported as a module.
if __name__ == "__main__":
    # Load and preprocess behavioral data once
    behavior_raw = load_behavior_data(BEHAVIOR_RAW_CSV)
    all_clean_behavior = []
    failed_subjects = []  # Subjects that returned no usable trials (skipped or errored)

    # Process each subject sequentially
    for sub in SUBJECT_IDS:
        clean_beh, _ = process_subject(sub, behavior_raw)
        if clean_beh.empty:
            failed_subjects.append(sub)
        else:
            all_clean_behavior.append(clean_beh)

    # Surface failures in one place instead of leaving them scattered in the log
    if failed_subjects:
        logger.warning(
            f"{len(failed_subjects)} subject(s) produced no cleaned data: "
            f"{', '.join(failed_subjects)}"
        )

    # Aggregate and save cleaned behavioral data if any subjects succeeded
    if all_clean_behavior:
        final_df = pd.concat(all_clean_behavior, ignore_index=True)

        # Save only the columns needed for downstream analysis; the subject ID
        # column is referenced through BEHAVIOR_ID_COL to stay consistent with
        # the rest of the pipeline.
        output_columns = [BEHAVIOR_ID_COL, 'coherence', 'RT', 'accuracy', 'keypress']
        selected = final_df[output_columns]
        selected.to_csv(BEHAVIOR_FINAL_CSV, index=False)
        logger.info(f"Final behavioral data saved to {BEHAVIOR_FINAL_CSV}")

        # Create BIDS dataset_description.json (required by BIDS specification)
        make_dataset_description(
            path=PREPROCESS_DATA_DIR,
            name='LowLevel-randomDot',
            dataset_type='raw',
            overwrite=True
        )
        logger.info("BIDS dataset description created.")
    else:
        logger.warning("No subjects processed successfully.")

2025-10-28 18:42:54,966 - INFO - Subject A001: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=491753
    Range : 0 ... 491752 =      0.000 ...   983.504 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:42:56,296 - INFO - Subject A001: BIDS conversion completed.
2025-10-28 18:42:57,402 - INFO - Subject A002: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=461226
    Range : 0 ... 461225 =      0.000 ...   922.450 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:42:58,735 - INFO - Subject A002: BIDS conversion completed.
2025-10-28 18:42:59,869 - INFO - Subject A003: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=449384
    Range : 0 ... 449383 =      0.000 ...   898.766 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:01,025 - INFO - Subject A003: BIDS conversion completed.
2025-10-28 18:43:02,053 - INFO - Subject A004: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=456058
    Range : 0 ... 456057 =      0.000 ...   912.114 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:03,084 - INFO - Subject A004: BIDS conversion completed.
2025-10-28 18:43:04,093 - INFO - Subject A005: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=438489
    Range : 0 ... 438488 =      0.000 ...   876.976 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:05,055 - INFO - Subject A005: BIDS conversion completed.
2025-10-28 18:43:06,186 - INFO - Subject A006: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=498464
    Range : 0 ... 498463 =      0.000 ...   996.926 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:07,402 - INFO - Subject A006: BIDS conversion completed.
2025-10-28 18:43:08,490 - INFO - Subject A007: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=471979
    Range : 0 ... 471978 =      0.000 ...   943.956 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:09,569 - INFO - Subject A007: BIDS conversion completed.
2025-10-28 18:43:10,603 - INFO - Subject A008: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=475293
    Range : 0 ... 475292 =      0.000 ...   950.584 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:11,696 - INFO - Subject A008: BIDS conversion completed.
2025-10-28 18:43:12,704 - INFO - Subject A009: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=447978
    Range : 0 ... 447977 =      0.000 ...   895.954 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:13,575 - INFO - Subject A009: BIDS conversion completed.
2025-10-28 18:43:14,606 - INFO - Subject A010: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=455350
    Range : 0 ... 455349 =      0.000 ...   910.698 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:15,613 - INFO - Subject A010: BIDS conversion completed.
2025-10-28 18:43:16,632 - INFO - Subject A011: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=436412
    Range : 0 ... 436411 =      0.000 ...   872.822 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:17,618 - INFO - Subject A011: BIDS conversion completed.
2025-10-28 18:43:18,685 - INFO - Subject A012: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=465449
    Range : 0 ... 465448 =      0.000 ...   930.896 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:19,703 - INFO - Subject A012: BIDS conversion completed.
2025-10-28 18:43:20,894 - INFO - Subject A013: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=512778
    Range : 0 ... 512777 =      0.000 ...  1025.554 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:22,225 - INFO - Subject A013: BIDS conversion completed.
2025-10-28 18:43:23,366 - INFO - Subject A014: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=420476
    Range : 0 ... 420475 =      0.000 ...   840.950 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:24,323 - INFO - Subject A014: BIDS conversion completed.
2025-10-28 18:43:25,562 - INFO - Subject A015: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=484636
    Range : 0 ... 484635 =      0.000 ...   969.270 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:26,798 - INFO - Subject A015: BIDS conversion completed.
2025-10-28 18:43:27,882 - INFO - Subject A016: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=442934
    Range : 0 ... 442933 =      0.000 ...   885.866 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:28,916 - INFO - Subject A016: BIDS conversion completed.
2025-10-28 18:43:30,014 - INFO - Subject A017: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=467765
    Range : 0 ... 467764 =      0.000 ...   935.528 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:31,199 - INFO - Subject A017: BIDS conversion completed.
2025-10-28 18:43:32,243 - INFO - Subject A018: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=445532
    Range : 0 ... 445531 =      0.000 ...   891.062 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:33,324 - INFO - Subject A018: BIDS conversion completed.
2025-10-28 18:43:34,437 - INFO - Subject A019: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=455674
    Range : 0 ... 455673 =      0.000 ...   911.346 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:35,568 - INFO - Subject A019: BIDS conversion completed.
2025-10-28 18:43:36,592 - INFO - Subject A020: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=432717
    Range : 0 ... 432716 =      0.000 ...   865.432 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-10-28 18:43:37,600 - INFO - Subject A020: BIDS conversion completed.
2025-10-28 18:43:37,639 - INFO - Final behavioral data saved to ../data/behaviorData_low_1_final.csv


Writing '../data/preprocessData/dataset_description.json'...


2025-10-28 18:43:37,640 - INFO - BIDS dataset description created.
