# Create ROAMM

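This notebook builds ML-ready datasets: for each subject and run it aligns the EEG samples with eye-tracking events (fixations, saccades, blinks), gaze/pupil samples, and page-level reading and mind-wandering information, then saves one table per run.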

In [None]:
import mne
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d

def process_eeg_data(file_path):
    """
    Load one EEGLAB recording and extract the page start times.

    Parameters:
    file_path (str): Path to the EEG .set file

    Returns:
    tuple: (df_data, page_start_times, sampling_freq)
        df_data (pandas.DataFrame): one row per EEG sample, with one column
            per channel followed by the columns 'time' (sample times returned
            by MNE) and 'sfreq' (sampling frequency repeated on every row).
        page_start_times (list): start time of each page, computed as the
            sample index of every 'condition 10' event divided by the
            sampling frequency.
        sampling_freq (float): sampling frequency taken from raw.info['sfreq'].

    Raises:
    ValueError: if the recording has no 'condition 10' annotation, or if the
        number of 'condition 10' events is not exactly 10.
    """
    # Load EEG file
    raw = mne.io.read_raw_eeglab(file_path, preload=True)
    ch_names = raw.info['ch_names']
    fs = raw.info['sfreq']

    # Validate the page markers first so a bad recording fails before the
    # (large) sample tables are built.
    events, event_id = mne.events_from_annotations(raw)
    if 'condition 10' not in event_id:
        raise ValueError('condition 10 not found in event_id')

    condition10_code = event_id['condition 10']
    condition10_events = events[events[:, 2] == condition10_code]
    if len(condition10_events) != 10:
        raise ValueError(f'Expected 10 pages, found {len(condition10_events)}')

    # Column 0 of each event row is the sample index; convert it to time
    page_start_times = [page_sample / fs for page_sample in condition10_events[:, 0]]

    # Get the data as numpy array and convert to dataframes
    data, times = raw.get_data(return_times=True)
    eeg_data = pd.DataFrame(data.T, columns=ch_names)
    df_data = pd.DataFrame({'time': times})
    df_data['sfreq'] = fs

    # Clean up large variables to free memory
    del raw, data, times, events

    return pd.concat([eeg_data, df_data], axis=1), page_start_times, fs


def _nearest_sample_index(sample_time, t):
    """Return the index of the sample stamped exactly `t`, else of the closest sample."""
    exact = np.flatnonzero(sample_time == t)
    if exact.size:
        return int(exact[0])
    return int(np.argmin(np.abs(sample_time - t)))


def interpolate_blink(dfSamples, dfBlink, dfSaccade):
    """
    Linearly interpolate gaze position and pupil size over blink periods.

    For every blink, two anchor times (t1, t2) are taken from the saccades
    around it, and all samples strictly between t1 and t2 are replaced by a
    straight line between the sample values at the two anchors. `dfSamples`
    is modified in place. Contains adjustments recommended through
    conversation with Dr. J.

    Parameters
    ----------
    dfSamples : pandas.DataFrame
        Sample-level eye data containing timestamp column `tSample` and
        columns for each eye named like `LX`, `LY`, `LPupil`, `RX`, `RY`, `RPupil`.
    dfBlink : pandas.DataFrame
        Blink events with columns `tStart`, `tEnd`, and `eye` (values 'L'/'R').
    dfSaccade : pandas.DataFrame
        Saccade events with columns `tStart`, `tEnd`, and `eye` used to
        identify saccades that overlap or surround blinks.

    Returns
    -------
    pandas.DataFrame
        The same `dfSamples` object, with position and pupil columns replaced
        by interpolated values inside the blink-related intervals.

    Raises
    ------
    ValueError
        If a blink has no enclosing saccade and there is no saccade ending
        before it ("t1 are Na") or no saccade starting after it ("t2 are Na").

    Notes
    -----
    - If a saccade fully encloses the blink, its start and end are used as
      (t1, t2); otherwise t1 is the end of the last saccade before the blink
      and t2 is the start of the first saccade after it. Using the enclosing
      saccade avoids interpolating across long fixations.
    - Only blinks lying strictly inside the span covered by the same eye's
      saccades are processed; blinks that start before the first sample or end
      after the last sample are skipped.
    - Anchor values are read from a snapshot of the columns taken before any
      interpolation, so overlapping intervals never chain off each other.
    - When t1/t2 do not coincide with a sample timestamp, the nearest sample
      is used as the anchor.
    """
    # extracted from reading_analysis.py (author: HS)
    # http://dx.doi.org/10.6084/m9.figshare.688002

    # get time array from dfSamples
    sample_time = dfSamples['tSample'].to_numpy()
    if sample_time.size == 0:
        # nothing to interpolate
        return dfSamples

    # interpolate data for LEFT and RIGHT eye separately
    for eye in ['L', 'R']:
        # extract blink and saccade information for one eye
        dfBlink_ = dfBlink[dfBlink['eye'] == eye]
        dfSaccade_ = dfSaccade[dfSaccade['eye'] == eye]

        # keep only blinks strictly inside the span covered by saccades
        # (with no saccades both bounds are NaN and every blink is dropped)
        t_start = dfSaccade_['tStart'].min()
        t_end = dfSaccade_['tEnd'].max()
        within_saccades = (dfBlink_['tStart'] > t_start) & (dfBlink_['tEnd'] < t_end)
        dfBlink_ = dfBlink_[within_saccades]

        # snapshot the columns as arrays; anchors are read from these copies
        col_names = [f'{eye}X', f'{eye}Y', f'{eye}Pupil']
        data_to_interpolate = [np.array(dfSamples[col_name]) for col_name in col_names]

        for b_start, b_end in zip(dfBlink_['tStart'], dfBlink_['tEnd']):
            # skip blinks that are (even partly) out of range of dfSamples
            if (b_start < sample_time[0]) or (b_end > sample_time[-1]):
                continue

            # saccades that enclose the blink
            sac = dfSaccade_[
                (dfSaccade_["tStart"] < b_start) &
                (dfSaccade_["tEnd"] > b_end)
            ]

            if not sac.empty:
                # use the (last listed) enclosing saccade
                t1 = sac["tStart"].iloc[-1]
                t2 = sac["tEnd"].iloc[-1]
            else:
                # end of the last saccade before the blink
                previous_sac = dfSaccade_[dfSaccade_["tEnd"] < b_start]
                if previous_sac.empty:
                    raise ValueError("t1 are Na")
                t1 = previous_sac["tEnd"].max()

                # start of the first saccade after the blink
                after_sac = dfSaccade_[dfSaccade_["tStart"] > b_end]
                if after_sac.empty:
                    raise ValueError("t2 are Na")
                t2 = after_sac["tStart"].min()

            # the anchors must lie strictly inside the sampled time range
            if not ((t1 > sample_time[0]) and (t2 < sample_time[-1])):
                continue

            # locate the anchor samples (exact timestamp if present, else nearest)
            y_ind = [_nearest_sample_index(sample_time, t) for t in (t1, t2)]
            x = sample_time[y_ind]
            if x[0] >= x[1]:
                # both anchors collapse onto one timestamp: no line to draw
                continue

            # samples to overwrite; identical for every column of this blink
            mask = (sample_time > t1) & (sample_time < t2)
            time_to_interpolate = sample_time[mask]

            for col_name, col_data in zip(col_names, data_to_interpolate):
                # straight line between the two anchor values
                interp_f = interp1d(x, col_data[y_ind])
                # update the dfSamples in place
                dfSamples.loc[mask, col_name] = interp_f(time_to_interpolate)

    return dfSamples


def convert_eyelink_to_image_pixel(x_eyelink, y_eyelink):
    '''
    Convert EyeLink coordinates to pixel coordinates of the page image
    (1900 x 1442 pixels) displayed at pos (0, 0) w/ size (1.3, 0.99) in
    PsychoPy.

    Parameters
    ----------
    x_eyelink : float
        DESCRIPTION. Horizontal position in EyeLink coordinate units
    y_eyelink : float
        DESCRIPTION. Vertical position in EyeLink coordinate units

    Returns
    -------
    x_pixel : float
        DESCRIPTION. Horizontal position in image pixel units
    y_pixel : float
        DESCRIPTION. Vertical position in image pixel units

    '''
    # NOTE(review): 258 and 5.4 look like the left/top margins of the image on
    # a 1920 x 1080 screen ((1920 - 1080*1.3)/2 and (1080 - 1080*0.99)/2) - confirm.
    x_from_image_edge = x_eyelink - 258
    y_from_image_edge = y_eyelink - 5.4

    # on-screen size of the displayed image
    displayed_width = 1080 * 1.3
    displayed_height = 1080 * 0.99

    # rescale from on-screen size to the native image resolution
    x_pixel = x_from_image_edge * 1900 / displayed_width
    y_pixel = y_from_image_edge * 1442 / displayed_height
    return x_pixel, y_pixel


def find_match(dfWords, coord_info, dist_max=1000):
    '''
    Match a point (e.g. a click or a fixation) to the closest word.

    The distance to a word is the Euclidean distance from the point to the
    word's bounding box; it is 0 when the point lies inside the box.

    Parameters
    ----------
    dfWords : DataFrame
        DESCRIPTION. The dataframe for a single page. It should at least contain
        columns 'center_x', 'center_y', 'width', and 'height', which are
        coordinate information of words in pixel unit
    coord_info : Tuple
        DESCRIPTION. The coordinate info (x, y) of the point in pixel unit
    dist_max : float, optional
        DESCRIPTION. A word is matched only if its distance to the point is
        strictly smaller than this value. The default is 1000.

    Returns
    -------
    matched_index: int
        DESCRIPTION. The row POSITION (0-based, for use with `.iloc`) of the
        closest word in the input dataframe, or -1 when the dataframe is
        empty or no word is closer than `dist_max`.

    '''
    # no words on this page: nothing can match
    if len(dfWords) == 0:
        return -1

    # compute the start and end positions of words
    half_width = dfWords['width'].to_numpy() / 2
    half_height = dfWords['height'].to_numpy() / 2
    center_x = dfWords['center_x'].to_numpy()
    center_y = dfWords['center_y'].to_numpy()
    words_x_start = center_x - half_width
    words_x_end = center_x + half_width
    words_y_start = center_y - half_height
    words_y_end = center_y + half_height

    # get x and y for the point
    pos_x, pos_y = coord_info

    # per-axis gap between the point and each bounding box; the signed
    # distances to both edges are negative inside the box, so clamp at 0
    gap_x = np.maximum(np.maximum(words_x_start - pos_x, pos_x - words_x_end), 0)
    gap_y = np.maximum(np.maximum(words_y_start - pos_y, pos_y - words_y_end), 0)

    # calculate the distance using x and y
    dist = np.sqrt(np.square(gap_x) + np.square(gap_y))

    # accept the closest word only if it is within the threshold
    if np.min(dist) < dist_max:
        return int(np.argmin(dist))
    return -1

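Find all subject folders under the root directory on VACC and build the per-run ML datasets for each of them (the subject-ID filter at the top of the loop is only there for debugging).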

In [5]:
import os
import re
import glob
import pandas as pd

root_dir = r"/gpfs1/pi/djangraw/mindless_reading/data/"
# Regular expression to match subject folder names (s followed by exactly 5 digits)
subject_pattern = re.compile(r"^s\d{5}$")

# Get the list of subject folders that match the pattern
subjects = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d)) and subject_pattern.match(d)]

# define res path (relative)
res_path = r"res/"


def _first_match(pattern):
    """Return the first path matching a glob pattern; raise FileNotFoundError if none."""
    matching_files = glob.glob(pattern)
    if not matching_files:
        raise FileNotFoundError(f'No file matches pattern: {pattern}')
    # NOTE(review): if several files match, the one glob lists first is used
    return matching_files[0]


for sub_id in sorted(subjects):
    if int(sub_id[1:]) < 10115:  # debug purpose
        continue
    print('Start processing subject: ', sub_id)
    subject_path = os.path.join(root_dir, sub_id)
    # define path to save
    path_to_save = os.path.join(subject_path, 'ml_data')
    os.makedirs(path_to_save, exist_ok=True)

    # load subject page info dataframe
    file_path = os.path.join(subject_path, f'{sub_id}_R_features_default_sr.csv')
    df_page_info = pd.read_csv(file_path)

    # find the eye-tracking folder of each run (names contain _r1_ ... _r5_),
    # ordered by run number
    eye_folder = os.path.join(subject_path, 'eye')
    folders = [f for f in os.listdir(eye_folder) if os.path.isdir(os.path.join(eye_folder, f))]
    r_folders = [f for f in folders if re.search(r'_r[1-5]_', f)]
    r_folders_sorted = sorted(r_folders, key=lambda x: int(re.search(r'_r(\d+)_', x).group(1)))

    for folder in r_folders_sorted:
        folder_path = os.path.join(eye_folder, folder)
        # get run number
        run_num = int(re.search(r'_r(\d+)_', folder).group(1))
        print('Run number: ', run_num)

        # load EEG file for the current run
        file_path = os.path.join(subject_path, 'eeg', 'ICAPruned', f'MR_{sub_id}_r{run_num}_ICAPruned.set')
        df_data, page_start_eeg, fs = process_eeg_data(file_path)

        # get the current run info
        df_page_run = df_page_info[df_page_info['run']==run_num].copy()
        # the page table must describe exactly the pages found in the EEG;
        # otherwise the subtraction below would broadcast or fail cryptically
        if len(df_page_run) != len(page_start_eeg):
            raise ValueError(
                f'Run {run_num}: EEG has {len(page_start_eeg)} page events but '
                f'the page info table has {len(df_page_run)} rows.'
            )
        eyelink_start_time = df_page_run['task_start'].iloc[0]
        # page start time in eyelink, relative to the start of the task
        page_start_eye = df_page_run['page_start'] - eyelink_start_time

        # make sure time offsets between eeg and eyelink are consistent
        time_offsets = np.array(page_start_eeg) - np.array(page_start_eye)
        max_offset_allowed = 0.05
        if np.max(time_offsets) - np.min(time_offsets) > max_offset_allowed:
            raise ValueError('Time offsets between EEG and eyelink not consistent.')
        # use the mean to be the time offset (in seconds) for the current run
        time_offset = np.mean(time_offsets)

        # update eyelink time to match df_data array
        df_page_run['page_start_eeg'] = df_page_run['page_start'] - eyelink_start_time + time_offset
        df_page_run['page_end_eeg'] = df_page_run['page_end'] - eyelink_start_time + time_offset
        df_page_run['mw_onset'] = df_page_run['mw_onset'] - eyelink_start_time + time_offset
        df_page_run['mw_offset'] = df_page_run['mw_offset'] - eyelink_start_time + time_offset

        # get story name
        story_name = df_page_run['reading'].iloc[0]
        story_name = story_name.lower().replace(' ', '_')

        # load word coordinate file
        df_word = pd.read_csv(_first_match(os.path.join(res_path, f'{story_name}*.csv')))

        # load fixation, blink and saccade event files
        df_fix = pd.read_csv(_first_match(os.path.join(folder_path, '*Fixation.csv')))
        df_blink = pd.read_csv(_first_match(os.path.join(folder_path, '*Blink.csv')))
        df_sacc = pd.read_csv(_first_match(os.path.join(folder_path, '*Saccade.csv')))

        # load raw eye sample file (gaze position and pupil size)
        df_sample = pd.read_csv(_first_match(os.path.join(folder_path, '*Sample.csv')))

        # interpolate eye samples during blinks on a copy, so the raw columns
        # are kept next to the interpolated ones
        df_interp_sample = df_sample.copy()
        interpolate_blink(df_interp_sample, df_blink, df_sacc)
        # copy interpolated data into the samples dataframe
        for col in ['LX', 'LY', 'LPupil', 'RX', 'RY', 'RPupil']:
            df_sample[f'blink_interp_{col}'] = df_interp_sample[col]

        # loop through each page for matching fixation and word and inserting page-level information
        for _, row in df_page_run.iterrows():
            # get the current page number
            page_num = row['page']
            # page start and end time, converted s -> ms to compare with the
            # (still unconverted) fixation times
            page_start = row['page_start'] * 1000
            page_end = row['page_end'] * 1000

            # match fixation to the reading word
            # find the fixation and words for the current page
            df_word_page = df_word[df_word['page']==page_num]
            mask = (df_fix['tStart'] >= page_start) & (df_fix['tEnd'] <= page_end)
            df_fix_page = df_fix[mask]
            # loop through each fixation and match it to the closest word
            for row_index, fix in df_fix_page.iterrows():
                # get the fixation position x and y
                x_eyelink = fix['xAvg']
                y_eyelink = fix['yAvg']
                # convert position x and y into image pixel unit
                fix_x, fix_y = convert_eyelink_to_image_pixel(x_eyelink, y_eyelink)

                # find the position of the matched word (-1 if none)
                matched_index = find_match(df_word_page, (fix_x, fix_y))

                if matched_index >= 0:
                    # store matched word info into fixations
                    df_fix.at[row_index, 'fixed_word'] = df_word_page['words'].iloc[matched_index]
                    df_fix.at[row_index, 'fixed_word_key'] = df_word_page['word_key'].iloc[matched_index]

            # insert page start and end time to df_data
            page_mask = (df_data['time'] >= row['page_start_eeg']) & (df_data['time'] <= row['page_end_eeg'])
            df_data.loc[page_mask, 'first_pass_reading'] = True
            df_data.loc[page_mask, 'page_num'] = page_num
            df_data.loc[page_mask, 'page_start'] = row['page_start_eeg']
            df_data.loc[page_mask, 'page_end'] = row['page_end_eeg']
            df_data.loc[page_mask, 'page_dur'] = row['page_end_eeg'] - row['page_start_eeg']
            # insert mw start and end time to df_data
            mw_mask = (df_data['time'] >= row['mw_onset']) & (df_data['time'] <= row['mw_offset'])
            df_data.loc[mw_mask, 'is_mw'] = True
            df_data.loc[mw_mask, 'mw_onset'] = row['mw_onset']
            df_data.loc[mw_mask, 'mw_offset'] = row['mw_offset']
            df_data.loc[mw_mask, 'mw_dur'] = row['mw_offset'] - row['mw_onset']

        print('Aligning eye-tracking data into EEG time...')
        # insert current run information
        df_data['run_num'] = run_num
        df_data['story_name'] = story_name
        # sort df_data by time (merge_asof below requires sorted keys)
        df_data = df_data.sort_values("time")

        # insert fixations, saccades, and blinks to df_data
        for df_eye, event_type in zip([df_fix, df_blink, df_sacc], ['fix', 'blink', 'sacc']):
            # work on a copy so the loaded event tables are left untouched
            df_eye = df_eye.copy()
            # align event times to eeg
            # 1. /1000: ms -> s
            # 2. - eyelink_start_time: take off the run start time
            # 3. + time_offset: add on the time offset between eeg and eye
            for col in ['tStart', 'tEnd']:
                df_eye[col] = df_eye[col] / 1000 - eyelink_start_time + time_offset
            # drop events that start before the EEG recording
            df_eye = df_eye[df_eye['tStart'] >= 0]

            # separate left and right eye
            for eye in ['L', 'R']:
                # Filter by eye and sort
                df_eye_filtered = df_eye[df_eye['eye'] == eye].copy().sort_values('tStart')
                if len(df_eye_filtered) == 0:
                    continue
                # attach to every EEG sample the latest event that started at
                # or before that sample
                df_merged = pd.merge_asof(
                    df_data,
                    df_eye_filtered,
                    left_on='time',
                    right_on='tStart',
                    direction='backward',
                )

                # keep only samples still inside that event (tStart <= time <= tEnd)
                mask = (df_merged['tStart'].notna()) & (df_merged['time'] <= df_merged['tEnd'])
                # initialize is_{type} column if not exists
                if f'is_{event_type}' not in df_data.columns:
                    df_data[f'is_{event_type}'] = False
                # mark events
                df_data.loc[mask, f'is_{event_type}'] = True
                # add event columns
                for col in df_eye_filtered.columns:
                    new_col_name = f'{event_type}_{eye}_{col}'
                    df_data.loc[mask, new_col_name] = df_merged.loc[mask, col]

        # insert gaze position and pupil size to df_data
        df_sample['tSample'] = df_sample['tSample'] / 1000 - eyelink_start_time + time_offset
        df_sample = df_sample[df_sample['tSample'] >= 0]
        df_sample = df_sample.sort_values('tSample')
        # merge on nearest time within tolerance
        df_data = pd.merge_asof(
            df_data,
            df_sample,
            left_on='time',
            right_on='tSample',
            direction='nearest',
            tolerance=0.01
        )

        # save the dataset for the current run as csv
        dataset_name = f'{sub_id}_run{run_num}_ml_data.csv'
        df_data.to_csv(os.path.join(path_to_save, dataset_name), index=False)

        # save as pickle
        dataset_name = f'{sub_id}_run{run_num}_ml_data.pkl'
        df_data.to_pickle(os.path.join(path_to_save, dataset_name))

        # save dataset during first-pass reading only
        dataset_name = f'{sub_id}_run{run_num}_ml_data_firstpass.csv'
        df_data_firstpass = df_data[df_data['first_pass_reading']==True].copy()
        df_data_firstpass.to_csv(os.path.join(path_to_save, dataset_name), index=False)

        print(f'Subject {sub_id} run {run_num} has done! Data saved to {os.path.join(path_to_save, dataset_name)}\n')


Start processing subject:  s10115
Run number:  1
Used Annotations descriptions: ['30', '40', 'MW_offset', 'MW_onset', 'condition 1', 'condition 10', 'condition 5', 'control_onset', 'control_sr', 's10115_1_0_control_onset', 's10115_1_0_control_sr', 's10115_1_1_MW_onset', 's10115_1_1_self_report', 's10115_1_2_control_onset', 's10115_1_2_control_sr', 's10115_1_3_control_onset', 's10115_1_3_control_sr', 's10115_1_4_MW_onset', 's10115_1_4_self_report', 's10115_1_5_control_onset', 's10115_1_5_control_sr', 's10115_1_6_control_onset', 's10115_1_6_control_sr', 's10115_1_7_control_onset', 's10115_1_7_control_sr', 's10115_1_8_MW_onset', 's10115_1_8_self_report', 's10115_1_9_MW_onset', 's10115_1_9_self_report', 'self_report']
Aligning eye-tracking data into EEG time...
Subject s10115 run 1 has done! Data saved to /gpfs1/pi/djangraw/mindless_reading/data/s10115/ml_data/s10115_run1_ml_data_firstpass.csv

Run number:  2
Used Annotations descriptions: ['30', '40', 'MW_offset', 'MW_onset', 'condition 1

# Label sentences

In [None]:
import os
import pandas as pd
from glob import glob
import spacy
import numpy as np

def row_for_char(c):
    """
    Return the df row position whose span contains character position c.

    Looks up the module-level array `span_starts` (start character of each
    row's word in the joined text, in ascending order). If c lands in the
    whitespace after a word, the nearest previous row is returned; if c lies
    before the first span, row 0 is returned.
    """
    # rightmost span_start <= c (-1 when c precedes every span)
    i = np.searchsorted(span_starts, c, side="right") - 1
    # Whether c is inside span i or in the whitespace that follows it, row i
    # is the answer, so no check against the span end is needed.
    return max(i, 0)
# --------------------
# Load spaCy (tokenizer + rule-based sentence splitter only)
# --------------------
nlp = spacy.load("en_core_web_sm", exclude=["tagger", "parser", "lemmatizer", "ner"])
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

root = "/gpfs1/pi/djangraw/hsun11/roamm_ml/res"
csv_files = sorted(glob(os.path.join(root, "*.csv")))

# NOTE(review): this folder is created but nothing is written into it; the
# results overwrite the input CSVs at the end of the loop - confirm intent.
out_dir = os.path.join(root, "with_sentences")
os.makedirs(out_dir, exist_ok=True)

for filepath in csv_files:
    df = pd.read_csv(filepath)

    # One token per row; missing words become empty strings
    words = df["words"].fillna("").astype(str).tolist()

    # Character span (start, end) of every row's word inside the text obtained
    # by joining all words with a single space
    spans = []
    pos = 0
    for w in words:
        spans.append((pos, pos + len(w)))
        # skip over the word and the joining space
        pos += len(w) + 1

    # joined text without trailing whitespace
    text = " ".join(words).rstrip()
    doc = nlp(text)

    # Text of each sentence found by spaCy
    sent_texts = [sent.text.strip() for sent in doc.sents]

    # Span boundaries as arrays; row_for_char() reads these module-level names
    span_starts = np.array([first for first, _ in spans], dtype=int)
    span_ends   = np.array([last for _, last in spans], dtype=int)

    # Sentence id of each df row (-1 = not assigned yet)
    sent_id_per_row = np.full(len(df), -1, dtype=int)

    # Give every row the id of the first sentence that has a token in it.
    # If several spaCy tokens fall in one row (e.g. "can't" split in two),
    # only the first one sets the id.
    for sid, sent in enumerate(doc.sents):
        for tok in sent:
            if not tok.is_space:
                r = row_for_char(tok.idx)
                if sent_id_per_row[r] == -1:
                    sent_id_per_row[r] = sid

    # Rows that received no token (e.g. empty words) inherit the id of the
    # previous row; a leading unassigned row gets 0
    last_id = 0
    for i, current_id in enumerate(sent_id_per_row):
        if current_id == -1:
            sent_id_per_row[i] = last_id
        else:
            last_id = current_id

    df["sentence_id"] = sent_id_per_row
    df["sentence"] = df["sentence_id"].map(lambda sentence_id: sent_texts[sentence_id])

    # write back to the file that was read
    df.to_csv(filepath, index=False)