In [9]:
import subprocess
import pandas as pd
import numpy as np
import os
import shutil
import re

In [10]:
def find_besteye(df_msg, default='R'):
    """
    Pick the eye with the smaller error reported by the last pair of validation messages.

    Args:
        df_msg (pd.DataFrame): Messages table; only its 'text' column is read.
        default (str): Eye returned when there is no usable pair of validation messages.

    Returns:
        str: 'L' when the left-eye error is strictly smaller than the right-eye error, 'R' otherwise.
            `default` is returned when fewer than two 'CAL VALIDATION' messages exist or when either of
            the last two contains 'ABORTED'.

    Raises:
        ValueError: If one of the two messages has no 'ERROR' token, or the token that follows it is not
            a number.
    """
    is_validation = df_msg['text'].str.contains('CAL VALIDATION', na=False)
    # Only the last two validation messages are compared (one is expected per eye).
    val_texts = df_msg.loc[is_validation, 'text'].astype(str).tolist()[-2:]

    # With fewer than two messages, or an aborted validation, there is nothing to compare.
    if len(val_texts) < 2 or any('ABORTED' in text for text in val_texts):
        return default

    # The left-eye message is whichever of the two mentions 'LEFT'; the other one is taken as the right eye.
    left_index = int('LEFT' in val_texts[1])
    right_index = 1 - left_index

    def _error_value(text):
        # The error magnitude is the token that immediately follows the word 'ERROR'.
        tokens = text.split()
        return float(tokens[tokens.index('ERROR') + 1])

    left_error = _error_value(val_texts[left_index])
    right_error = _error_value(val_texts[right_index])

    return 'L' if left_error < right_error else 'R'


def filter_msgs(df_msg, cutout='validation'):
    """
    Drop every message that precedes the first one containing `cutout`.

    Args:
        df_msg (pd.DataFrame): Messages table; only its 'text' column is searched.
        cutout (str): Pattern handed to `str.contains` to locate the first message to keep.

    Returns:
        pd.DataFrame: The rows of `df_msg` from the first matching message (inclusive) to the end.

    Raises:
        IndexError: If no message contains `cutout`.
    """
    matches = df_msg['text'].str.contains(cutout, na=False).to_numpy(dtype=bool)
    if not matches.any():
        raise IndexError('no message contains %r' % cutout)

    # Slice by position rather than by index label, so the result is also right
    # when df_msg does not carry a default 0..n-1 index (e.g. after filtering).
    first_position = int(matches.argmax())

    return df_msg.iloc[first_position:]


def is_binocular(df_fix):
    """Tell whether the 'eye' column of the fixation table holds more than one distinct value."""
    distinct_eyes = df_fix['eye'].nunique(dropna=False)
    return distinct_eyes > 1


def keep_besteye(df_fix, df_msg, default='R'):
    """
    Restrict a binocular fixation table to its best eye.

    Args:
        df_fix (pd.DataFrame): Fixations table with an 'eye' column.
        df_msg (pd.DataFrame): Messages table, forwarded to `find_besteye`.
        default (str): Eye reported for monocular data, and fallback passed to `find_besteye`.

    Returns:
        tuple: (fixations, eye). Monocular tables are returned untouched together with `default`;
            binocular tables are reduced to the rows of the eye chosen by `find_besteye`.
    """
    if not is_binocular(df_fix):
        return df_fix, default

    chosen_eye = find_besteye(df_msg, default)
    return df_fix[df_fix['eye'] == chosen_eye], chosen_eye


def extract_calpoints(df_msg, best_eye, legend='Calibration points', npoints=9):
    """
    Collect the calibration point coordinates listed after a calibration header message.

    Args:
        df_msg (pd.DataFrame): Messages table with a 'text' column and an integer index.
        best_eye (str): When at least two header messages exist, 'L' selects the second-to-last one and
            any other value selects the last one.
        legend (str): Pattern identifying the header message.
        npoints (int): Number of index labels following the header that are read as points.

    Returns:
        pd.DataFrame: Columns 'x' and 'y', one row per point; empty when no header message is found.
    """
    header_rows = df_msg[df_msg['text'].str.contains(legend)]
    coordinates = []

    if not header_rows.empty:
        if len(header_rows) < 2:
            header_position = 0
        elif best_eye == 'L':
            header_position = -2
        else:
            header_position = -1
        header_label = header_rows.index[header_position]

        # The points are the messages whose labels directly follow the header's label.
        first_label, last_label = header_label + 1, header_label + npoints
        point_texts = df_msg.loc[first_label:last_label]['text'].to_numpy()

        coordinates = []
        for text in point_texts:
            # Tokens 1 and 2 hold x and y; commas are removed before the float conversion.
            coordinates.append([float(token.replace(',', '')) for token in text.split()[1:3]])

    return pd.DataFrame(coordinates, columns=['x', 'y'])


def extract_valpoints(df_msg, best_eye, legend='VALIDATE', npoints=9):
    """
    Read the validation targets and their pixel offsets from the validation messages.

    Args:
        df_msg (pd.DataFrame): Messages table with a 'text' column.
        best_eye (str): 'R' keeps the messages mentioning 'RIGHT', any other value those mentioning
            'LEFT'; only applied when more than `npoints` messages match `legend`.
        legend (str): Pattern identifying the validation messages.
        npoints (int): Message count above which the per-eye filter is applied.

    Returns:
        tuple: (valpoints, valoffsets), two DataFrames with columns 'x' and 'y'. The first holds the
            integer target positions found between 'at' and 'OFFSET'; the second holds the float pair
            that precedes 'pix' after 'OFFSET'.
    """
    selected = df_msg[df_msg['text'].str.contains(legend)]
    if len(selected) > npoints:
        eye_word = 'LEFT' if best_eye != 'R' else 'RIGHT'
        selected = selected[selected['text'].str.contains(eye_word)]
    texts = selected['text'].to_numpy()

    # Everything that follows the first 'at' carries both the target position and the offset.
    after_at = [text.split('at')[1] for text in texts]
    position_tokens = [tail.split('OFFSET')[0].split(',') for tail in after_at]

    regexp = r'(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*pix'
    offset_matches = [re.findall(regexp, tail.split('OFFSET')[1]) for tail in after_at]

    offsets = []
    for found in offset_matches:
        first_match = found[0]
        offsets.append((float(first_match[0]), float(first_match[1])))

    points = []
    for tokens in position_tokens:
        points.append((int(tokens[0]), int(tokens[1])))

    valpoints = pd.DataFrame(points, columns=['x', 'y']).astype(int)
    valoffsets = pd.DataFrame(offsets, columns=['x', 'y']).astype(float)

    return valpoints, valoffsets

In [11]:
def dataset_to_bids(folder_path, dataset_name, session_substrings=2):
    """
    Arrange the recording files of a flat folder into a BIDS-like folder tree.

    File names are read as ``<subject>_<session...>_<rest>.<ext>``. Subjects are defined by the EDF
    files found in `folder_path`; for each of them, the files are moved to
    ``<dataset>/sub-<subject>/ses-<session>/<modality>/`` where the modality folder depends on the
    extension: '.edf' -> 'ET', '.bdf' -> 'EEG', '.log' and '.csv' -> 'Psycopy'.
    Extensions are matched case-insensitively.

    Args:
        folder_path (str): Path to the folder containing the dataset files.
        dataset_name (str): Name of the dataset folder, created inside `folder_path`.
        session_substrings (int): End (exclusive) of the underscore-separated tokens that make up the
            session label; tokens 1 to session_substrings - 1 are joined with "_".

    Returns:
        str: Path to the dataset folder.
    """
    modality_by_extension = {".edf": "ET", ".bdf": "EEG", ".log": "Psycopy", ".csv": "Psycopy"}

    # List the folder before creating the dataset folder inside it.
    files = os.listdir(folder_path)

    # Create a new folder for the BIDS dataset
    bids_folder_path = os.path.join(folder_path, dataset_name)
    os.makedirs(bids_folder_path, exist_ok=True)

    # Subjects are the first token of every EDF file name (a set, so each subject is handled once).
    subject_ids = set()
    for file in files:
        stem, extension = os.path.splitext(file)
        if extension.lower() == ".edf":
            subject_ids.add(stem.split("_")[0])
    for subject_id in sorted(subject_ids):
        os.makedirs(os.path.join(bids_folder_path, "sub-" + subject_id), exist_ok=True)

    # Move every recognised file to the session folder of the subject named in the file itself.
    for file in files:
        stem, extension = os.path.splitext(file)
        tag = modality_by_extension.get(extension.lower())
        if tag is None:
            continue
        file_path = os.path.join(folder_path, file)
        if not os.path.isfile(file_path):
            continue
        # Split the stem, not the full name, so the extension never leaks into the labels.
        name_parts = stem.split("_")
        subject_id = name_parts[0]
        if subject_id not in subject_ids:
            continue
        session_id = "_".join(name_parts[1:session_substrings])
        move_file_to_bids_folder(file_path, bids_folder_path, subject_id, session_id, tag)
    return bids_folder_path

def move_file_to_bids_folder(file_path, bids_folder_path, subject_id, session_id,tag):
    """
    Move a file into ``<bids_folder_path>/sub-<subject_id>/ses-<session_id>/<tag>/``.

    Args:
        file_path (str): File to move.
        bids_folder_path (str): Root folder of the dataset.
        subject_id (str): Subject label, appended to "sub-".
        session_id (str): Session label, appended to "ses-".
        tag (str): Name of the innermost folder.

    The destination folder is created when missing.
    """
    subject_dir = "sub-" + subject_id
    session_dir = "ses-" + session_id
    destination = os.path.join(bids_folder_path, subject_dir, session_dir, tag)
    os.makedirs(destination, exist_ok=True)
    shutil.move(file_path, destination)
    

def convert_edf_to_ascii(edf_file_path, output_dir):
    """
    Convert an EDF file to ASCII format using edf2asc.

    The conversion is skipped when the ASCII file already exists in `output_dir`; in that case
    edf2asc is not needed and is not looked for.

    Args:
        edf_file_path (str): Path to the input EDF file.
        output_dir (str): Directory to save the ASCII file. Must not be None.

    Returns:
        str: Path to the ASCII file (same base name as the EDF file, '.asc' extension).

    Raises:
        ValueError: If `output_dir` is None.
        FileNotFoundError: If a conversion is needed and edf2asc is not on the system PATH.
    """
    # Validate the arguments before looking at the environment
    if output_dir is None:
        raise ValueError("Output directory must be specified.")

    # Generate output file path
    edf_file_name = os.path.basename(edf_file_path)
    ascii_file_name = os.path.splitext(edf_file_name)[0] + ".asc"
    ascii_file_path = os.path.join(output_dir, ascii_file_name)

    # A previously generated file is reused as is
    if os.path.exists(ascii_file_path):
        return ascii_file_path

    # edf2asc is only required when a conversion actually has to run
    if not shutil.which("edf2asc"):
        raise FileNotFoundError("edf2asc not found. Please make sure EyeLink software is installed and accessible in the system PATH.")

    # Run edf2asc command with the -f flag.
    # NOTE(review): the exit status is not checked; confirm what edf2asc returns on success before adding check=True.
    subprocess.run(["edf2asc", "-f", edf_file_path, ascii_file_path])

    return ascii_file_path



def parse_edf_eyelink(edf_file_path, msg_keywords,derivatives_folder,keep_ascii=True):
    """
    Parse an EDF file generated by an EyeLink system and store its contents as HDF5 tables.

    The EDF file is converted to ASCII with `convert_edf_to_ascii`, every line of the ASCII file is
    labelled with a type, and each type is loaded into its own table. Lines found before the first
    START that follows a calibration are labelled 'Non_calibrated_samples' and are not loaded.

    Args:
        edf_file_path (str): Path to the input EDF file.
        msg_keywords (list of str): Keywords used to select MSG lines; a MSG line is kept when it
            contains at least one of them. When empty, every MSG line is kept.
        derivatives_folder (str): Folder where the ASCII file and the HDF5 tables are written.
        keep_ascii (bool): Whether the intermediate ASCII file is kept after parsing.

    Returns:
        None. These files are written to `derivatives_folder` (each one overwritten if present):
            - header.hdf5: header lines, one string per line
            - msg.hdf5: selected MSG lines ('time', 'text')
            - calib.hdf5: calibration lines
            - samples.hdf5: raw samples ('tSample' plus X, Y and pupil columns for both eyes)
            - fix.hdf5, sacc.hdf5, blink.hdf5: fixation, saccade and blink events
    """
    # Convert EDF to ASCII
    ascii_file_path = convert_edf_to_ascii(edf_file_path,derivatives_folder)

    # ===== READ IN FILES ===== #
    # Read in EyeLink file; line endings are kept so that the line count matches the file.
    # A plain list is enough here: lines are only accessed by integer position.
    with open(ascii_file_path,'r') as f:
        fileTxt0 = f.read().splitlines(True) # split into lines


    # Separate lines into samples and messages
    print('Sorting lines...')
    nLines = len(fileTxt0)
    lineType = np.array(['OTHER']*nLines,dtype='object')


    # TODO: consider using MNE's parser, particularly for calibration.
    # For samples, only what comes after START and before END should be kept.


    calibration_flag = False
    start_flag = False
    for iLine, line in enumerate(fileTxt0):
        tokens = line.split()
        first_token = tokens[0] if tokens else ''
        # Whitespace-only lines have no first token to inspect, so they are treated as empty too.
        if len(line)<2 or not tokens:
            lineType[iLine] = 'EMPTY'
        elif line.startswith('*'):
            lineType[iLine] = 'HEADER'
        # If there is a !CAL in the line, it is a calibration line
        elif '!CAL' in line:
            lineType[iLine] = 'Calibration'
            calibration_flag = True
        # The first START after a calibration closes the calibration section (the line itself stays 'OTHER')
        elif first_token == 'START' and calibration_flag:
            calibration_flag = False
            start_flag = True
        elif calibration_flag:
            lineType[iLine] = 'Calibration'
        elif not start_flag: # Data before the first successful calibration is discarded.
            # After the first successful calibration, EVERY sample is taken into account.
            lineType[iLine] = 'Non_calibrated_samples'
        # An empty keyword list means "no filtering": every MSG line is kept
        elif first_token == 'MSG' and (not msg_keywords or any(keyword in line for keyword in msg_keywords)):
            lineType[iLine] = 'MSG'
        elif first_token == 'ESACC':
            lineType[iLine] = 'ESACC'
        elif first_token == 'EFIX':
            lineType[iLine] = 'EFIX'
        elif first_token == 'EBLINK':
            lineType[iLine] = 'EBLINK'
        # Sample lines start with a number
        elif first_token[0].isdigit() or first_token.startswith('-'):
            lineType[iLine] = 'SAMPLE'
        else:
            lineType[iLine] = 'OTHER'


    # ===== PARSE EYELINK FILE ===== #
    # Each table below is read from the ASCII file by skipping every line of any other type.
    # Import Header
    print('Parsing header...')
    dfHeader = pd.read_csv(ascii_file_path,skiprows=np.nonzero(lineType!='HEADER')[0],header=None,sep=r'\s+')
    # Merge columns into single strings
    dfHeader = dfHeader.apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)


    # Import Calibration
    print('Parsing calibration...')
    iCal = np.nonzero(lineType!='Calibration')[0]
    dfCalib = pd.read_csv(ascii_file_path,skiprows=iCal,names=np.arange(9))



    # Import Message
    print('Parsing messages...')
    i_msg = np.nonzero(lineType == 'MSG')[0]
    t_msg = []
    txt_msg = []
    for i in range(len(i_msg)):
        # separate MSG prefix and timestamp from rest of message
        info = fileTxt0[i_msg[i]].split()
        # extract info
        t_msg.append(int(info[1]))
        txt_msg.append(' '.join(info[2:]))
    dfMsg = pd.DataFrame({'time': t_msg, 'text': txt_msg})

    # Import Fixations
    print('Parsing fixations...')
    i_not_efix = np.nonzero(lineType != 'EFIX')[0]
    df_fix = pd.read_csv(ascii_file_path, skiprows=i_not_efix, header=None, sep=r'\s+', usecols=range(1, 8),
                         low_memory=False)
    df_fix.columns = ['eye', 'tStart', 'tEnd', 'duration', 'xAvg', 'yAvg', 'pupilAvg']

    # Saccades
    print('Parsing saccades...')
    i_not_esacc = np.nonzero(lineType != 'ESACC')[0]
    df_sacc = pd.read_csv(ascii_file_path, skiprows=i_not_esacc, header=None, sep=r'\s+', usecols=range(1, 11),
                          low_memory=False)
    df_sacc.columns = ['eye', 'tStart', 'tEnd', 'duration', 'xStart', 'yStart', 'xEnd', 'yEnd', 'ampDeg', 'vPeak']

    # Blinks (the table stays empty when the file has no EBLINK line)
    print('Parsing blinks...')
    df_blink = pd.DataFrame()
    i_not_eblink = np.nonzero(lineType != 'EBLINK')[0]
    if len(i_not_eblink) < nLines:
        df_blink = pd.read_csv(ascii_file_path, skiprows=i_not_eblink, header=None, sep=r'\s+', usecols=range(1, 5),
                               low_memory=False)
        df_blink.columns = ['eye', 'tStart', 'tEnd', 'duration']

    # determine sample columns based on eyes recorded in file
    eyes_in_file = np.unique(df_fix.eye)
    if eyes_in_file.size == 2:
        cols = ['tSample', 'LX', 'LY', 'LPupil', 'RX', 'RY', 'RPupil']
    else:
        eye = eyes_in_file[0]
        print('monocular data detected (%c eye).' % eye)
        cols = ['tSample', '%cX' % eye, '%cY' % eye, '%cPupil' % eye]

    # Import samples
    i_not_sample = np.nonzero(lineType != 'SAMPLE')[0]
    dfSamples = pd.read_csv(ascii_file_path, skiprows=i_not_sample, header=None, sep=r'\s+',
                                usecols=range(0, len(cols)), low_memory=False)
    dfSamples.columns = cols
    # Convert values to numbers; columns of an eye that was not recorded are filled with NaN
    for eye in ['L', 'R']:
        if eye in eyes_in_file:
            dfSamples['%cX' % eye] = pd.to_numeric(dfSamples['%cX' % eye], errors='coerce')
            dfSamples['%cY' % eye] = pd.to_numeric(dfSamples['%cY' % eye], errors='coerce')
            dfSamples['%cPupil' % eye] = pd.to_numeric(dfSamples['%cPupil' % eye], errors='coerce')
        else:
            dfSamples['%cX' % eye] = np.nan
            dfSamples['%cY' % eye] = np.nan
            dfSamples['%cPupil' % eye] = np.nan

    dict_events = {'fix': df_fix, 'sacc': df_sacc, 'blink': df_blink}
    if not keep_ascii:
        os.remove(ascii_file_path)

    # Save each data structure in its own HDF5 file, in the derivatives folder
    dfHeader.to_hdf(os.path.join(derivatives_folder, 'header.hdf5'), key='dfHeader', mode='w')
    dfMsg.to_hdf(os.path.join(derivatives_folder, 'msg.hdf5'), key='dfMsg', mode='w')
    dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')
    dfSamples.to_hdf(os.path.join(derivatives_folder, 'samples.hdf5'), key='dfSamples', mode='w')
    for key, value in dict_events.items():
        value.to_hdf(os.path.join(derivatives_folder, key + '.hdf5'), key=key, mode='w')


def compute_derivatives_for_dataset(bids_dataset_folder, msg_keywords):
    """
    Compute derivatives for every EDF file of a dataset.

    The dataset is walked as ``<dataset>/sub-*/ses-*/ET/*.edf``. Each EDF file found is handed to
    `parse_edf_eyelink`, whose output goes to ``<dataset>_derivatives/<subject>/<session>/``.
    Entries that are not 'sub-'/'ses-' folders, and sessions without an 'ET' folder, are skipped.

    Args:
        bids_dataset_folder (str): Path to the dataset folder.
        msg_keywords (list of str): List of strings representing keywords to filter MSG lines,
            forwarded unchanged to `parse_edf_eyelink`.

    Returns:
        None
    """
    derivatives_folder = bids_dataset_folder + "_derivatives"
    # Create the derivatives folder
    os.makedirs(derivatives_folder, exist_ok=True)

    # Subject folders only; sorted so that the processing order is reproducible
    subjects = sorted(entry for entry in os.listdir(bids_dataset_folder) if entry.startswith("sub-"))

    # Compute derivatives for each EDF file in the dataset
    for subject in subjects:
        subject_path = os.path.join(bids_dataset_folder, subject)
        if not os.path.isdir(subject_path):
            continue
        # Session folders only
        sessions = sorted(entry for entry in os.listdir(subject_path) if entry.startswith("ses-"))
        for session in sessions:
            eyetracking_folder = os.path.join(subject_path, session, 'ET')
            if not os.path.isdir(eyetracking_folder):
                continue
            for file in sorted(os.listdir(eyetracking_folder)):
                if file.lower().endswith(".edf"):
                    # The EDF file lives in the 'ET' folder it was listed from
                    edf_file_path = os.path.join(eyetracking_folder, file)
                    derivatives_folder_path = os.path.join(derivatives_folder, subject, session)
                    os.makedirs(derivatives_folder_path, exist_ok=True)
                    parse_edf_eyelink(edf_file_path, msg_keywords, derivatives_folder_path)

In [12]:
# Example usage:
# The current working directory is used as the folder that holds the dataset files.
current_folder = os.getcwd()
bids_dataset_folder = dataset_to_bids(current_folder, "example_dataset")
# Keywords handed to the parser to select which MSG lines are kept.
msg_keywords = ["begin","end","press"]
compute_derivatives_for_dataset(bids_dataset_folder, msg_keywords)



Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...
monocular data detected (L eye).


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')


Sorting lines...
Parsing header...
Parsing calibration...
Parsing messages...
Parsing fixations...
Parsing saccades...
Parsing blinks...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index([0, 1, 2, 3], dtype='int64')]

  dfCalib.to_hdf(os.path.join(derivatives_folder, 'calib.hdf5'), key='dfCalib', mode='w')
