In [4]:
import os
import numpy as np
import pandas as pd
import scipy.io
import mne
from mne_bids import BIDSPath, write_raw_bids, make_dataset_description
import logging


In [5]:

# =============================================================================
# Notebook-wide settings: logging, file locations, task constants
# =============================================================================

# Every log record carries a timestamp, its severity level and the message text.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Input locations ---------------------------------------------------------
DS_ROOT_EEG = '/volumes/hyijie_psy/CPP/low_Manning_2021/eegData'
CHANNEL_CSV = '/volumes/hyijie_psy/CPP/low_Manning_2021/eegData/channelName.csv'
BEHAVIOR_RAW_CSV = '/volumes/hyijie_psy/CPP/low_Manning_2021/behaData/behaviorData_low_Manning_2021.csv'

# Per-subject EEG files are named FN_FIRST + <subject id> + FN_LAST + '.mat'.
FN_FIRST, FN_LAST = 'Formatted_', '.3_40_preproc'

# --- Output locations --------------------------------------------------------
PREPROCESS_DATA = '../data/preprocessData'
BEHAVIOR_CLEAN_CSV = '../data/behaviorData_low_Manning_2021_final.csv'
# The output root must exist before anything is written below it.
os.makedirs(PREPROCESS_DATA, exist_ok=True)

# --- Analysis parameters -----------------------------------------------------
SAMPLING_RATE = 500            # handed to MNE as the recording's sfreq
RT_OUTLIER_STD_MULTIPLIER = 3  # RTs further than this many SDs from the mean are dropped

# Event name -> integer code; codes count up from 1 in the order listed here.
EVENT_ID_MAP = {
    event_name: event_code
    for event_code, event_name in enumerate(
        ('fixation', 'boil', 'photodiode', 'stimulus', 'response', 'offset'),
        start=1,
    )
}

# Subject labels A001 ... A020.
SUBJECT_IDS = [f'A{number:03d}' for number in range(1, 21)]

# Column names used in the behavioral table.
BEHAVIOR_ID_COL = 'subj'
TRIAL_NO_COL = 'trialno'


In [None]:
# =============================================================================
# Load and preprocess behavioral data, then convert each subject to BIDS
# =============================================================================


def _keep_trials(frame, trial_numbers):
    """Return the rows of ``frame`` whose TRIAL_NO_COL value is in ``trial_numbers``."""
    return frame[frame[TRIAL_NO_COL].isin(trial_numbers)]


def _build_events(trigger_samples, epoch_start_idx):
    """Build an MNE-style ``(n_events, 3)`` integer event array.

    Parameters
    ----------
    trigger_samples : array-like, shape (n_trials, n_triggers)
        Per-trial trigger positions, treated as 1-based sample indices counted
        from the start of the trial's epoch. NaN marks a missing trigger.
    epoch_start_idx : array-like, shape (n_trials,)
        Start of each trial epoch in the concatenated recording, treated as a
        1-based sample index.

    Returns
    -------
    events : numpy.ndarray of int, shape (n_kept, 3)
        Columns are: 0-based sample index in the concatenated recording, a
        zero filler, and the event code. Codes are assigned by trigger column
        position (first column -> 1, second -> 2, ...). Rows are ordered
        trial by trial, and within a trial by trigger column.
    n_dropped : int
        Number of events discarded because their onset was NaN/inf; casting
        those to int would yield a meaningless sample index.
    """
    trigger_samples = np.asarray(trigger_samples, dtype=float)
    epoch_start_idx = np.asarray(epoch_start_idx, dtype=float)
    n_trials, n_triggers = trigger_samples.shape

    # Two separate 1-based -> 0-based shifts: one for the epoch start, one for
    # the within-epoch trigger position.
    onsets = (trigger_samples + (epoch_start_idx[:, None] - 1) - 1).ravel()
    codes = np.tile(np.arange(1, n_triggers + 1), n_trials)

    keep = np.isfinite(onsets)
    events = np.column_stack([
        onsets[keep],
        np.zeros(int(keep.sum())),
        codes[keep],
    ]).astype(int)
    return events, int((~keep).sum())


behavior_raw = pd.read_csv(BEHAVIOR_RAW_CSV)
behavior_raw = behavior_raw[behavior_raw[BEHAVIOR_ID_COL].astype(str).str.startswith('A')].copy()

# Channel names are the same file for every subject, so read them once.
channel_info = pd.read_csv(CHANNEL_CSV).dropna()
ch_names = channel_info['channel'].tolist()

# Trigger columns are assumed to follow the code order of EVENT_ID_MAP
# (column k holds the event with code k + 1), so the response column is
# derived from the map instead of being hard-coded.
response_col = EVENT_ID_MAP['response'] - 1

all_clean_behavior = []

# Process each subject
for sub_id in SUBJECT_IDS:
    logger.info(f"\nProcessing subject: {sub_id}")

    # === Step 1: Filter behavioral data for this subject ===
    beh_sub = behavior_raw[behavior_raw[BEHAVIOR_ID_COL] == sub_id].copy()
    assert not beh_sub.empty, f"No behavioral data found for subject {sub_id}."

    # Convert trial numbers from 1-based to 0-based
    beh_sub[TRIAL_NO_COL] = beh_sub[TRIAL_NO_COL] - 1

    # === Step 2: Load EEG .mat file ===
    eeg_path = os.path.join(DS_ROOT_EEG, FN_FIRST + sub_id + FN_LAST + '.mat')
    assert os.path.exists(eeg_path), f"EEG .mat file not found: {eeg_path}"

    # Extract the EEG data, the good-trial flags (a product of the earlier
    # preprocessing), the per-trial triggers, and the start/end sample index
    # of each trial epoch.
    data_mat = scipy.io.loadmat(eeg_path)
    data_EEG = data_mat['X']
    good_trials_arr = data_mat['GoodTrial'][0][0][0].T
    triggers = data_mat['Onsets']['corrMatrix'][0, 0]
    trial_epoch_idx = data_mat['trialEpochIdx']

    # Build dataframes
    good_trials = pd.DataFrame(good_trials_arr, columns=['is_good'])
    triggers_df = pd.DataFrame(triggers)
    trial_epoch_idx_df = pd.DataFrame(trial_epoch_idx, columns=['start_idx', 'end_idx'])

    # Trigger columns are identified before the trial-number column is added.
    trigger_cols = list(triggers_df.columns)
    unknown_codes = set(range(1, len(trigger_cols) + 1)) - set(EVENT_ID_MAP.values())
    assert not unknown_codes, \
        f"Subject {sub_id}: trigger matrix has {len(trigger_cols)} columns, " \
        f"but event codes {sorted(unknown_codes)} are missing from EVENT_ID_MAP."

    # Add a 0-based trial-number column to each frame; it is the key used to
    # keep the three frames and the behavioral table aligned.
    for df in [good_trials, triggers_df, trial_epoch_idx_df]:
        df[TRIAL_NO_COL] = np.arange(len(df))

    # === Step 3: Exclude trials with missing response triggers ===
    valid_resp = ~triggers_df.iloc[:, response_col].isna()
    valid_trial_indices = np.where(valid_resp)[0]
    beh_trial_indices = beh_sub[TRIAL_NO_COL].values

    # Validate alignment between behavior data and eeg data
    assert np.array_equal(valid_trial_indices, beh_trial_indices), \
        f"Subject {sub_id}: trial indices mismatch between EEG triggers and behavior."

    logger.info(f"Subject {sub_id}: behavioral and EEG trial indices aligned.")

    # Filter to valid-response trials
    triggers_valid = _keep_trials(triggers_df, valid_trial_indices)
    trial_epoch_idx_valid = _keep_trials(trial_epoch_idx_df, valid_trial_indices)
    good_trials_valid = _keep_trials(good_trials, valid_trial_indices)
    beh_valid = _keep_trials(beh_sub, valid_trial_indices)

    # === Step 4: Keep only "good" trials ===
    good_mask = good_trials_valid['is_good'] == 1
    good_trial_no = good_trials_valid.loc[good_mask, TRIAL_NO_COL]

    triggers_good = _keep_trials(triggers_valid, good_trial_no)
    trial_epoch_idx_good = _keep_trials(trial_epoch_idx_valid, good_trial_no)
    beh_good = _keep_trials(beh_valid, good_trial_no)

    # Ensure alignment preserved
    assert np.array_equal(triggers_good[TRIAL_NO_COL].values, beh_good[TRIAL_NO_COL].values), \
        f"Post-good-trial alignment failed for {sub_id}"

    # === Step 5: Remove RT outliers ===
    # Bounds are mean +/- RT_OUTLIER_STD_MULTIPLIER * SD, computed per subject
    # with NaN RTs ignored; NaN RTs fail both comparisons and are dropped too.
    rt_mean = np.nanmean(beh_good['RT'])
    rt_std = np.nanstd(beh_good['RT'])
    rt_lower = rt_mean - RT_OUTLIER_STD_MULTIPLIER * rt_std
    rt_upper = rt_mean + RT_OUTLIER_STD_MULTIPLIER * rt_std

    beh_clean = beh_good[
        (beh_good['RT'] >= rt_lower) &
        (beh_good['RT'] <= rt_upper)
    ]

    if beh_clean.empty:
        logger.warning(f"Subject {sub_id}: no valid trials after RT filtering. Skipping.")
        continue

    clean_trial_no = beh_clean[TRIAL_NO_COL]
    triggers_clean = _keep_trials(triggers_good, clean_trial_no)
    trial_epoch_idx_clean = _keep_trials(trial_epoch_idx_good, clean_trial_no)

    # Triggers and epoch offsets are combined row by row below, so they must
    # describe exactly the same trials in the same order.
    assert np.array_equal(triggers_clean[TRIAL_NO_COL].values,
                          trial_epoch_idx_clean[TRIAL_NO_COL].values), \
        f"Subject {sub_id}: trigger rows and trial epoch indices are misaligned."

    # === Step 6: Prepare continuous EEG data for BIDS ===
    # All trial epochs (including excluded ones) are joined along the time axis.
    eeg_continuous = np.concatenate(list(data_EEG[0]), axis=1)
    assert eeg_continuous.shape[0] == len(ch_names), \
        f"Channel count mismatch for {sub_id}: EEG has {eeg_continuous.shape[0]} channels, " \
        f"but {len(ch_names)} names provided."

    info = mne.create_info(ch_names=ch_names, sfreq=SAMPLING_RATE, ch_types='eeg')
    # NOTE(review): MNE interprets EEG channel data as volts; the unit stored
    # in 'X' is not visible here -- TODO confirm and rescale if it is microvolts.
    data_eeg_bids = mne.io.RawArray(eeg_continuous, info)

    # === Step 7: Build BIDS event array ===
    # MATLAB 1-based -> Python 0-based conversion happens inside _build_events.
    events, n_dropped = _build_events(
        triggers_clean[trigger_cols].to_numpy(dtype=float),
        trial_epoch_idx_clean['start_idx'].to_numpy(),
    )
    if n_dropped:
        logger.warning(
            f"Subject {sub_id}: dropped {n_dropped} event(s) with missing (NaN) trigger samples."
        )

    # === Step 8: Write BIDS dataset ===
    bids_path = BIDSPath(
        subject=sub_id,
        task='randomDot',
        datatype='eeg',
        root=PREPROCESS_DATA
    )

    write_raw_bids(
        raw=data_eeg_bids,
        bids_path=bids_path,
        events=events,
        event_id=EVENT_ID_MAP,
        format='BrainVision',
        allow_preload=True,
        overwrite=True,
        verbose=False
    )

    # === Step 9: Save cleaned behavioral data in BIDS format ===
    # Written next to the EEG output as sub-<id>/beh/sub-<id>_task-randomDot_beh.tsv
    bids_path_beh = os.path.join(PREPROCESS_DATA, 'sub-' + sub_id, 'beh')
    os.makedirs(bids_path_beh, exist_ok=True)
    beh_save_path = os.path.join(bids_path_beh, f"sub-{sub_id}_task-randomDot_beh.tsv")
    beh_clean.to_csv(beh_save_path, sep='\t', index=False, na_rep='n/a')

    logger.info(f"Subject {sub_id}: BIDS conversion completed.")
    all_clean_behavior.append(beh_clean)

# =============================================================================
# Final aggregation
# =============================================================================
# Stack the cleaned trials of every converted subject and keep only the
# columns written to the combined behavioral CSV.
if all_clean_behavior:
    final_df = pd.concat(all_clean_behavior, ignore_index=True)
    selected = final_df[[BEHAVIOR_ID_COL, 'coherence', 'RT', 'accuracy', 'keypress']]
    selected.to_csv(BEHAVIOR_CLEAN_CSV, index=False)
    logger.info(f"Final behavioral data saved to {BEHAVIOR_CLEAN_CSV}")

else:
    logger.warning("No subjects processed successfully.")

2025-11-25 08:29:03,976 - INFO - 
Processing subject: A001
2025-11-25 08:29:05,891 - INFO - Subject A001: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=491753
    Range : 0 ... 491752 =      0.000 ...   983.504 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-11-25 08:29:07,251 - INFO - Subject A001: BIDS c  onversion completed.
2025-11-25 08:29:07,253 - INFO - 
Processing subject: A002
2025-11-25 08:29:09,018 - INFO - Subject A002: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=461226
    Range : 0 ... 461225 =      0.000 ...   922.450 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-11-25 08:29:10,216 - INFO - Subject A002: BIDS c  onversion completed.
2025-11-25 08:29:10,216 - INFO - 
Processing subject: A003
2025-11-25 08:29:11,975 - INFO - Subject A003: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=449384
    Range : 0 ... 449383 =      0.000 ...   898.766 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-11-25 08:29:13,152 - INFO - Subject A003: BIDS c  onversion completed.
2025-11-25 08:29:13,152 - INFO - 
Processing subject: A004
2025-11-25 08:29:14,927 - INFO - Subject A004: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=456058
    Range : 0 ... 456057 =      0.000 ...   912.114 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-11-25 08:29:16,119 - INFO - Subject A004: BIDS c  onversion completed.
2025-11-25 08:29:16,120 - INFO - 
Processing subject: A005
2025-11-25 08:29:17,885 - INFO - Subject A005: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=438489
    Range : 0 ... 438488 =      0.000 ...   876.976 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-11-25 08:29:19,078 - INFO - Subject A005: BIDS c  onversion completed.
2025-11-25 08:29:19,079 - INFO - 
Processing subject: A006
2025-11-25 08:29:21,005 - INFO - Subject A006: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=498464
    Range : 0 ... 498463 =      0.000 ...   996.926 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-11-25 08:29:22,350 - INFO - Subject A006: BIDS c  onversion completed.
2025-11-25 08:29:22,350 - INFO - 
Processing subject: A007
2025-11-25 08:29:24,222 - INFO - Subject A007: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=471979
    Range : 0 ... 471978 =      0.000 ...   943.956 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-11-25 08:29:25,347 - INFO - Subject A007: BIDS c  onversion completed.
2025-11-25 08:29:25,347 - INFO - 
Processing subject: A008
2025-11-25 08:29:27,149 - INFO - Subject A008: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=475293
    Range : 0 ... 475292 =      0.000 ...   950.584 secs
Ready.


  write_raw_bids(
  write_raw_bids(
2025-11-25 08:29:28,330 - INFO - Subject A008: BIDS c  onversion completed.
2025-11-25 08:29:28,330 - INFO - 
Processing subject: A009
2025-11-25 08:29:30,094 - INFO - Subject A009: behavioral and EEG trial indices aligned.


Creating RawArray with float64 data, n_channels=128, n_times=447978
    Range : 0 ... 447977 =      0.000 ...   895.954 secs
Ready.


  write_raw_bids(
  write_raw_bids(
