In [19]:
import subprocess
import pandas as pd
import numpy as np
import os
import shutil

def convert_edf_to_ascii(edf_file_path, output_dir=None):
    """
    Convert an EDF file to ASCII format using edf2asc.

    Args:
        edf_file_path (str): Path to the input EDF file.
        output_dir (str): Directory to save the ASCII file. If None, the ASCII file will be saved in the same directory as the input EDF file.

    Returns:
        str: Path to the generated ASCII file.
    """
    # Check if edf2asc is installed
    if not shutil.which("edf2asc"):
        raise FileNotFoundError("edf2asc not found. Please make sure EyeLink software is installed and accessible in the system PATH.")

    # Set output directory
    if output_dir is None:
        output_dir = os.path.dirname(edf_file_path)

    # Generate output file path
    edf_file_name = os.path.basename(edf_file_path)
    ascii_file_name = os.path.splitext(edf_file_name)[0] + ".asc"
    ascii_file_path = os.path.join(output_dir, ascii_file_name)

    # Run edf2asc command with the -f flag, only run it if the file does not already exist
    if not os.path.exists(ascii_file_path):
        subprocess.run(["edf2asc", "-f", edf_file_path, ascii_file_path])

    return ascii_file_path



def parse_edf_eyelink(edf_file_path, msg_keywords):
    """
    Parse an EDF file generated by EyeLink system.

    Args:
        edf_file_path (str): Path to the input EDF file.
        msg_keywords (list of str): List of strings representing keywords to filter MSG lines.

    Returns:
        tuple: A tuple containing five pandas DataFrames:
            - Header information DataFrame
            - MSG lines DataFrame filtered by msg_keywords
            - Calibration information DataFrame
            - EyeLink events DataFrame
            - Raw sample data DataFrame
    """
    # Convert EDF to ASCII
    ascii_file_path = convert_edf_to_ascii(edf_file_path)

    # ===== READ IN FILES ===== #
    # Read in EyeLink file
    
    f = open(ascii_file_path,'r')
    fileTxt0 = f.read().splitlines(True) # split into lines
    fileTxt0 = list(filter(None, fileTxt0)) #  remove emptys
    fileTxt0 = np.array(fileTxt0) # concert to np array for simpler indexing
    f.close()


    # Separate lines into samples and messages
    print('Sorting lines...')
    nLines = len(fileTxt0)
    lineType = np.array(['OTHER']*nLines,dtype='object')


    # Usar lo de mne, particularmente para calibration.
    # En sample tendría que filtrar lo que viene después de START y antes de END.
    

    calibration_flag = False
    for iLine in range(nLines):
        if len(fileTxt0[iLine])<3:
            lineType[iLine] = 'EMPTY'
        elif fileTxt0[iLine].startswith('*'):
            lineType[iLine] = 'HEADER'
        # If there is a !CAL in the line, it is a calibration line
        elif '!CAL' in fileTxt0[iLine]:
            lineType[iLine] = 'Calibration'
            calibration_flag = True
        elif fileTxt0[iLine].split()[0] == 'START' and calibration_flag:
            calibration_flag = False
        elif calibration_flag:
            lineType[iLine] = 'Calibration'
        elif fileTxt0[iLine].split()[0] == 'MSG' and any(keyword in fileTxt0[iLine] for keyword in msg_keywords):
            lineType[iLine] = 'MSG'
        elif fileTxt0[iLine].split()[0][0].isdigit() or fileTxt0[iLine].split()[0].startswith('-'):
            lineType[iLine] = 'SAMPLE'
        elif fileTxt0[iLine].split()[0] != 'MSG' and not fileTxt0[iLine].split()[0][0].isdigit():
            lineType[iLine] = 'EVENTS'
        else:
            lineType[iLine] = fileTxt0[iLine].split()[0]
        
    # Print the amount of each type of line
    print('Amount of each line type:')
    print(pd.Series(lineType).value_counts())
    
    
    # ===== PARSE EYELINK FILE ===== #
    # Import Header
    print('Parsing header...')
    dfHeader = pd.read_csv(ascii_file_path,skiprows=np.nonzero(lineType!='HEADER')[0],header=None,sep='\s+')


    # Import Calibration
    print('Parsing calibration...')
    iCal = np.nonzero(lineType=='Calibration')[0]
    dfCalib = pd.read_csv(ascii_file_path,skiprows=iCal,header=None,sep='\s+')

    # IF amount of MSG lines is > 0:
    if np.sum(lineType=='MSG')>0:
        # Import MSG
        print('Parsing messages...')
        dfMsg = pd.read_csv(ascii_file_path,skiprows=np.nonzero(lineType!='MSG')[0],header=None,sep='\s+')
    else:
        dfMsg = pd.DataFrame()



    print('Parsing samples...')

    iNotSample = np.nonzero(lineType!='SAMPLE')[0]
    dfSamples = pd.read_csv(ascii_file_path,skiprows=iNotSample,header=None,sep='\s+',low_memory=False)

    print('Parsing events...')

    dfEvents = pd.read_csv(ascii_file_path,skiprows=np.nonzero(lineType!='EVENTS')[0],header=None,sep='\s+')

    return dfHeader, dfMsg, dfCalib, dfSamples, dfEvents



In [20]:
# Example usage:
edf_file_path = "/home/gonzalo/Escritorio/edf_nuevo_dataset/ab01_second_half_2023-10-09_10h50.06.034.asc"
msg_keywords = []
header_df, msg_df, calib_df, events_df, raw_sample_df = parse_edf_eyelink(edf_file_path, msg_keywords)

# Print the first few rows of each DataFrame
print("Header DataFrame:")
print(header_df.head())
print("\nMSG DataFrame:")
print(msg_df.head())
print("\nCalibration DataFrame:")
print(calib_df.head())
print("\nEyeLink Events DataFrame:")
print(events_df.head())
print("\nRaw Sample Data DataFrame:")
print(raw_sample_df.head())

Sorting lines...
Amount of each line type:
SAMPLE         828742
EVENTS          18834
MSG              1388
Calibration       549
EMPTY              61
HEADER             10
OTHER               5
Name: count, dtype: int64
Parsing header...
Parsing calibration...


ParserError: Error tokenizing data. C error: Expected 3 fields in line 459120, saw 12
