# Parse EyeLink Data and Prep for LSTM Model Input

## Enable Cloud or Colab Analysis

In [1]:
import os
'''
Initial setup
'''
project_id = ''
bucket_name = 'nsf-neurogesture'
import_folder = bucket_name+'/all_eyelink_data.npy'
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
elif not IN_COLAB:
    root_data_dir = '/home/jupyter/nsf-neurogesture/data_drive_sync'
    
    # 1. install command: curl https://rclone.org/install.sh | sudo bash
    # 2. follow instructions to set up credentials, use remote login in last step: https://rclone.org/drive/
    !rclone copy nsf_neurogesture:data {root_data_dir} -P

[2K[1GTransferred:   	         0 / 0 Bytes, -, 0 Bytes/s, ETA -
Checks:                 2 / 2, 100%
Elapsed time:         0.0s[2K[1A[2K[1A[2K[1GTransferred:   	         0 / 0 Bytes, -, 0 Bytes/s, ETA -
Checks:                 2 / 2, 100%
Elapsed time:         0.0s[2K[1A[2K[1A[2K[1GTransferred:   	         0 / 0 Bytes, -, 0 Bytes/s, ETA -
Checks:                15 / 15, 100%
Elapsed time:         0.0s[2K[1A[2K[1A[2K[1GTransferred:   	         0 / 0 Bytes, -, 0 Bytes/s, ETA -
Checks:               198 / 198, 100%
Elapsed time:         0.0s


In [None]:
# ParseEyeLinkAsc.py
# - Reads in .asc data files from EyeLink and produces pandas dataframes for further analysis
#
# Created 7/31/18-8/15/18 by DJ.


def ParseEyeLinkAsc(elFilename):
    # dfTrial,dfMsg,dfFix,dfSacc,dfBlink,dfSamples = ParseEyeLinkAsc(elFilename)
    # -Reads an EyeLink .asc file and produces readable dataframes for further analysis.
    #
    # INPUTS:
    # -elFilename is the path of an EyeLink .asc data file.
    #
    # OUTPUTS:
    # -dfTrial contains one row per START/END pair: tStart, tEnd, xRes, yRes (absolute timestamps)
    # -dfMsg contains MSG lines: time (absolute timestamp) and text
    # -dfFix contains EFIX lines: eye, tStart, tEnd, duration, xAvg, yAvg, pupilAvg
    # -dfSacc contains ESACC lines: eye, tStart, tEnd, duration, xStart, yStart, xEnd, yEnd, ampDeg, vPeak
    # -dfBlink contains EBLINK lines: eye, tStart, tEnd, duration
    # -dfSamples contains the first four fields of every sample line: tSample, LX, LY, LPupil
    #  (only samples after the last line containing '!CAL' are kept, when such a line exists)
    # Times in dfFix, dfSacc, dfBlink and dfSamples are made relative to the first START timestamp.
    #
    # Created 7/31/18-8/15/18 by DJ.

    # Import packages
    import numpy as np
    import pandas as pd

    # ===== READ IN FILES ===== #
    # The context manager closes the file even if reading fails.
    with open(elFilename, 'r') as f:
        fileTxt0 = f.read().split("\n")  # split into lines; empties are kept so indices match file rows

    # ===== CLASSIFY LINES ===== #
    nLines = len(fileTxt0)
    lineType = np.array(['OTHER']*nLines, dtype='object')
    iStartRec = None
    for iLine, line in enumerate(fileTxt0):
        tokens = line.split()
        if len(line) < 3 or not tokens:
            # very short or whitespace-only lines carry no data (and have no first token to inspect)
            lineType[iLine] = 'EMPTY'
        elif line.startswith(('*', '>>>>>')):
            lineType[iLine] = 'COMMENT'
        elif tokens[0][0].isdigit() or tokens[0].startswith('-'):
            lineType[iLine] = 'SAMPLE'
        else:
            # event lines are labelled by their leading keyword (START, END, MSG, EFIX, ...)
            lineType[iLine] = tokens[0]
        if '!CAL' in line:  # TODO: Find more general way of determining if recording has started
            iStartRec = iLine+1

    def _read_rows(keep_mask, usecols, columns):
        # Re-read the file with pandas, skipping every row not selected by keep_mask.
        # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
        skip = np.nonzero(~keep_mask)[0]
        df = pd.read_csv(elFilename, skiprows=skip, header=None, sep=r'\s+', usecols=usecols)
        df.columns = columns
        return df

    # ===== PARSE EYELINK FILE ===== #
    # Trials
    dfTrialStart = _read_rows(lineType == 'START', [1], ['tStart'])
    dfTrialEnd = _read_rows(lineType == 'END', [1, 5, 6], ['tEnd', 'xRes', 'yRes'])
    dfTrial = pd.concat([dfTrialStart, dfTrialEnd], axis=1)

    # Messages: "MSG <time> <free text>"
    tMsg = []
    txtMsg = []
    for iLine in np.nonzero(lineType == 'MSG')[0]:
        info = fileTxt0[iLine].split()
        tMsg.append(int(info[1]))
        txtMsg.append(' '.join(info[2:]))
    dfMsg = pd.DataFrame({'time': tMsg, 'text': txtMsg})

    # Fixations
    dfFix = _read_rows(lineType == 'EFIX', range(1, 8),
                       ['eye', 'tStart', 'tEnd', 'duration', 'xAvg', 'yAvg', 'pupilAvg'])

    # Saccades
    dfSacc = _read_rows(lineType == 'ESACC', range(1, 11),
                        ['eye', 'tStart', 'tEnd', 'duration', 'xStart', 'yStart', 'xEnd', 'yEnd', 'ampDeg',
                         'vPeak'])

    # Blinks
    dfBlink = _read_rows(lineType == 'EBLINK', range(1, 5), ['eye', 'tStart', 'tEnd', 'duration'])

    # Samples: drop everything before the recording start marker when one was found
    isSample = lineType == 'SAMPLE'
    if iStartRec is not None:
        isSample = np.logical_and(isSample, np.arange(nLines) >= iStartRec)
    dfSamples = _read_rows(isSample, range(0, 4), ['tSample', 'LX', 'LY', 'LPupil'])
    # Missing gaze positions are written as non-numeric text; coerce them to NaN
    dfSamples['LX'] = pd.to_numeric(dfSamples['LX'], errors='coerce')
    dfSamples['LY'] = pd.to_numeric(dfSamples['LY'], errors='coerce')

    # make time relative to the first trial start
    tZero = dfTrialStart['tStart'][0]
    for dfEvent in (dfFix, dfSacc, dfBlink):
        dfEvent['tStart'] = dfEvent['tStart'] - tZero
        dfEvent['tEnd'] = dfEvent['tEnd'] - tZero
    dfSamples['tSample'] = dfSamples['tSample'] - tZero
    # Return new compilation dataframe
    return dfTrial,dfMsg,dfFix,dfSacc,dfBlink,dfSamples

## Project-specific Aggregation

In [None]:
import pandas as pd
import collections
import math, sys
import glob
import numpy as np
import os
sys.path.insert(0, 'scripts')
import time
from sklearn import preprocessing
from functools import reduce
import xarray as xr
import pickle

def load_questions(vid_labels, filter_out=True):
    '''
    Read the per-video slide/question CSVs and collect the answer regions of each question.

    For every label a file '<root_data_dir>/raw_data/slides/<label>_sqg_v2.csv' is read. Rows whose
    'question_on' is NaN are ignored. When filter_out is True and the file has a 'filter_out' column,
    only rows where that column equals 0 are kept.

    :param vid_labels: sequence of video labels used to build the file names and the result keys
    :param filter_out: whether to honour the optional 'filter_out' column
    :return: dictionary keyed by video label; each value maps question number to a list of
             (question_on, question_off) tuples, ordered by question number and onset
    '''
    questions_by_vid = {label: None for label in vid_labels}
    csv_paths = [root_data_dir+'/raw_data/slides/' + label + '_sqg_v2.csv' for label in vid_labels]
    for position, csv_path in enumerate(csv_paths):
        # order rows by question and then by onset so the spans come out sorted
        slides = pd.read_csv(csv_path).sort_values(by=['questionno', 'question_on'])
        has_filter_col = 'filter_out' in slides.columns
        spans = collections.defaultdict(list)
        for _, row in slides.iterrows():
            if math.isnan(row['question_on']):
                continue
            question_no = int(row['questionno'])
            # a row is dropped only when filtering is requested, the column exists, and it is not 0
            if filter_out and has_filter_col and row['filter_out'] != 0:
                continue
            spans[question_no].append((row['question_on'], row['question_off']))
        questions_by_vid[vid_labels[position]] = spans
    return questions_by_vid


def loadFiles(input_directory):
    '''
        Parse every file found under <input_directory>/<vid_type>/<cond abbreviation>/.
        :param input_directory: root directory holding the asc files
        :return: dictionary keyed by (vid_type, full condition name, participant id) whose values are the
                 (dfTrial, dfMsg, dfFix, dfSacc, dfBlink, dfSamples) tuples from ParseEyeLinkAsc. The
                 participant id is the last two characters of the file path before '.asc'.
    '''
    cond_names = {'bic': 'bicycle', 'per': 'perspective', 'tar': 'tarmac'}
    parsed = {}
    # iteration order: video type first, then condition, then whatever order glob returns
    for vid_type in ('full', 'dual', 'single'):
        for abbrev, cond_name in cond_names.items():
            pattern = input_directory + '/' + vid_type + '/' + abbrev + '/*'
            print(pattern)
            for asc_path in glob.glob(pattern):
                pid = asc_path.split('.asc')[0][-2:]
                trial, msg, fix, sacc, blink, samples = ParseEyeLinkAsc(asc_path)
                parsed[(vid_type, cond_name, pid)] = (trial, msg, fix, sacc, blink, samples)
                print('read asc: ', vid_type, cond_name, pid)
    return parsed


def merge_fixations_assessments():
    '''
    Collect the assessment scores of all participants into one dataframe.

    Every CSV in '<root_data_dir>/raw_data/assessments/' is read. The video type and condition are taken
    from the file name, which is expected to end in '<vid_type>_<cond>.csv'. Rows without a
    'ParticipantID' are skipped.

    :return: dataframe with one row per participant and file: vid_type, cond, pid and the seven scores
    '''
    # NOTE(review): the last column is named ' q7' (leading space). It is kept unchanged because code
    # outside this file may rely on it -- confirm whether 'q7' was intended.
    df = pd.DataFrame(columns=['vid_type', 'cond', 'pid', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', ' q7'])
    score_files = glob.glob(root_data_dir+'/raw_data/assessments/*.csv')
    row_num = 0
    for file in score_files:
        print('reading score file: ', file)
        # Parse the base name only, so the result does not depend on how deep root_data_dir is
        # or on which path separator the platform uses.
        name_parts = os.path.basename(file).split('_')
        vid_type = name_parts[-2]
        cond = name_parts[-1].split('.')[0]
        incsv = pd.read_csv(file)
        for _, row in incsv.iterrows():
            if not pd.isnull(row['ParticipantID']):
                df.loc[row_num] = [vid_type, cond, row['ParticipantID'], row['Q1'], row['Q2'], row['Q3'], row['Q4'],
                                   row['Q5'], row['Q6'], row['Q7']]
                row_num += 1
    return df


def load_aois():
    '''
    Load the area-of-interest bounds table.

    Reads '<root_data_dir>/raw_data/slides/eye_tracking_bounds.csv' with the time and coordinate columns
    forced to float64, and multiplies 'start_s' and 'end_s' by 1000.
    :return: the table as a numpy record array (DataFrame.to_records())
    '''
    bounds_path = root_data_dir+'/raw_data/slides/eye_tracking_bounds.csv'
    float_columns = ('start_s', 'end_s', 'x_start', 'x_end', 'y_start', 'y_end')
    aois = pd.read_csv(bounds_path, dtype={column: 'float64' for column in float_columns})
    # scale the two time columns by 1000 (the lookup times used elsewhere are in ms)
    for time_column in ('start_s', 'end_s'):
        aois[time_column] = aois[time_column] * 1000
    return aois.to_records()


def aoi_lookup(aois_df, lookup_time, lookup_x, lookup_y, vid_type, cond):
    '''
    Find the area of interest that contains a point at a given time.

    :param aois_df: table of aoi bounds with fields vid_type, cond, start_s, end_s, x_start, x_end,
                    y_start, y_end and type
    :param lookup_time: typically fixation or saccade times, in ms
    :param lookup_x: x coordinate
    :param lookup_y: y coordinate
    :param vid_type: value matched against the 'vid_type' field
    :param cond: value matched against the 'cond' field
    :return: the 'type' of the single matching row; np.nan when no row or more than one row matches
    '''
    # every interval is closed at its start and open at its end
    same_video = (aois_df['vid_type'] == vid_type) & (aois_df['cond'] == cond)
    in_time = (aois_df['start_s'] <= lookup_time) & (aois_df['end_s'] > lookup_time)
    in_x = (aois_df['x_start'] <= lookup_x) & (aois_df['x_end'] > lookup_x)
    in_y = (aois_df['y_start'] <= lookup_y) & (aois_df['y_end'] > lookup_y)

    hits = aois_df[same_video & in_time & in_x & in_y]['type']
    if hits.any() and hits.size == 1:
        return hits[0]
    return np.nan


def find_nearest(array, value):
    '''
    Return the position of the element closest to value.
    :param array: array-like of numbers
    :param value: number to look for
    :return: index of the first element with the smallest absolute difference to value
    '''
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()


def get_max_length(x):
    '''
    Return the length of the longest sequence stored in a dictionary of lists of sequences.
    :param x: dictionary whose values are iterables of sized objects
    :return: the largest len() found, or float('-inf') when there are no sequences at all
    '''
    lengths = (len(sequence) for key in x.keys() for sequence in x[key])
    return max(lengths, default=float('-inf'))


def replace_periods(sub_df, start_idx, end_idx, var):
    '''
    Replaces '.' (missing data, typically in saccade data) with 0's.

    :param sub_df: dataframe holding the column
    :param start_idx: first row position of the slice
    :param end_idx: row position one past the end of the slice
    :param var: name of the column to read
    :return: new Series with rows start_idx:end_idx of the column, where every '.' is replaced by 0
    '''
    window = sub_df[var][start_idx:end_idx]
    # mask() builds a new Series, so the caller's dataframe is never written through the slice
    return window.mask(window == '.', 0)


def create_tf_dataset(input_data, vid_dict, conds, model_type, aois_df, toi_in_duration, toi_out_duration, output_var,
                      toy=False):
    '''
        Cuts every recording into consecutive blocks of (input window + output window) and builds a
        labelled feature array from them.

        Fixation, saccade and blink features are taken from the input window of each block; the label of
        the block is the nan-mean of output_var over the output window that follows it.

        :param input_data: mapping keyed by (cond, vid, pp) tuples; each value is indexed by position and
                positions 2, 3, 4 and 5 are read (presumably the dfFix, dfSacc, dfBlink and dfSamples
                frames returned by ParseEyeLinkAsc -- confirm against the code that wrote the data)
        :param vid_dict: iterable of names matched against the second element of each key; only iterated
        :param conds: iterable of names matched against the first element of each key
        :param model_type: 'logistic' gives one nan-mean per feature and block (2-D output); any other
                value gives NaN-padded sequences (3-D output)
        :param aois_df: aoi bounds table handed to aoi_lookup
        :param toi_in_duration: length of the input window, in the time unit of the data
        :param toi_out_duration: length of the output window, in the same unit
        :param output_var: tuple (dataframe position, column name) of the variable averaged into the label
        :param toy: when True every saccade duration is replaced by label * 300
        :return: (xarray.DataArray of features, list of labels, list of feature names,
                  list of '<cond>-<vid>-<pp>-<block>' identifiers)
    '''
    def calc_label(output_var, df, out_start, out_end, in_start, in_end):
        '''
        :param output_var: tuple of df number and the variable contained within that is averaged
        :param df: collection of dataframes of one recording
        :param out_start: start of the output window
        :param out_end: end of the output window
        :param in_start: start of the input window (currently unused)
        :param in_end: end of the input window (currently unused)
        :return avg_val: nan-mean of the variable over the output window
        '''
        sub_df = df[output_var[0]]
        start_idx = find_nearest(sub_df['tStart'], out_start)
        end_idx = find_nearest(sub_df['tStart'], out_end)
        return np.nanmean(sub_df[output_var[1]][start_idx:end_idx])

    block_name = 'vid_type-topic-pp-block'
    all_data = list(input_data.keys())
    output = collections.defaultdict(list)
    labels = []
    cond_list = []
    total_input_output_time = toi_in_duration + toi_out_duration
    for cond in conds:
        for vid in vid_dict:
            pps = [item[2] for item in all_data if cond in item and vid in item]
            for pp in pps:
                df = input_data[(cond, vid, pp)]
                # time of the last sample = length of the recording
                total_time = df[5]['tSample'][-1:].values[0]
                # int() keeps range() working when the sample times are floats
                nbins = int(total_time // total_input_output_time)
                for block_no in range(1, nbins+1):
                    in_start = (block_no-1)*total_input_output_time  # input start
                    in_end = in_start+toi_in_duration
                    out_start = in_end  # output label averaged from this time
                    out_end = block_no*total_input_output_time
                    labels.append(calc_label(output_var, df, out_start, out_end, in_start, in_end))
                    relevant_dfs = [2, 3, 4, 5]
                    for df_no in relevant_dfs:
                        sub_df = df[df_no]
                        # frames without a 'tStart' column contribute no features
                        if 'tStart' in sub_df:
                            start_idx = find_nearest(sub_df['tStart'], in_start)
                            end_idx = find_nearest(sub_df['tStart'], in_end)
                            if df_no == 2:
                                output['fixation_start'].append(sub_df['tStart'][start_idx:end_idx])
                                output['fixation_duration'].append(sub_df['duration'][start_idx:end_idx])
                                output['fixation_xAvg'].append(sub_df['xAvg'][start_idx:end_idx])
                                output['fixation_yAvg'].append(sub_df['yAvg'][start_idx:end_idx])
                                output['fixation_pupilAvg'].append(sub_df['pupilAvg'][start_idx:end_idx])
                                # diff values
                                output['fixation_start_diff'].append(np.diff(sub_df['tStart'][start_idx:end_idx]))
                                output['fixation_xAvg_diff'].append(np.diff(sub_df['xAvg'][start_idx:end_idx]))
                                output['fixation_yAvg_diff'].append(np.diff(sub_df['yAvg'][start_idx:end_idx]))
                                # NOTE(review): aoi_lookup receives (vid, cond) as (vid_type, cond) --
                                # confirm this matches the column contents of the aoi table.
                                tmp_aoi = sub_df[start_idx:end_idx].apply(
                                    lambda x: aoi_lookup(aois_df, x.tStart, x.xAvg, x.yAvg, vid, cond), axis=1)
                                output['fixation_aoi'].append(tmp_aoi)
                            elif df_no == 3:
                                output['saccades_start'].append(sub_df['tStart'][start_idx:end_idx])
                                if toy:
                                    output['saccades_duration'].append([labels[-1] * 300] * (end_idx - start_idx))
                                else:
                                    output['saccades_duration'].append(sub_df['duration'][start_idx:end_idx])
                                output['saccades_xEnd_minus_xStart'].append(
                                    replace_periods(sub_df, start_idx, end_idx, 'xEnd').astype(float)
                                    - replace_periods(sub_df, start_idx, end_idx, 'xStart').astype(float))
                                output['saccades_yEnd_minus_yStart'].append(
                                    replace_periods(sub_df, start_idx, end_idx, 'yEnd').astype(float)
                                    - replace_periods(sub_df, start_idx, end_idx, 'yStart').astype(float))
                                output['saccades_ampDeg'].append(sub_df['ampDeg'][start_idx:end_idx])
                                output['saccades_vPeak'].append(sub_df['vPeak'][start_idx:end_idx])
                                # diff values
                                output['saccades_start_diff'].append(np.diff(sub_df['tStart'][start_idx:end_idx]))
                            elif df_no == 4:
                                output['blink_start'].append(sub_df['tStart'][start_idx:end_idx])
                                output['blink_duration'].append(sub_df['duration'][start_idx:end_idx])
                                # diff values
                                output['blink_start_diff'].append(np.diff(sub_df['tStart'][start_idx:end_idx]))
                    cond_list.append(cond + '-' + vid + '-' + pp + '-' + str(block_no))
    feature_list = []
    if model_type == 'logistic':
        # one scalar per (block, feature)
        # NOTE(review): 'fixation_aoi' holds categorical values; np.nanmean may not accept them -- confirm
        tf_out = np.full((len(output['fixation_start']), len(output)), fill_value=np.nan, dtype='float32')
        for feature, (key, values) in enumerate(output.items()):
            for sample_no, l in enumerate(values):
                tf_out[sample_no, feature] = np.nanmean(l)
            feature_list.append(key)
        # the array is 2-D here, so there is no sequence-position coordinate
        coords = [(block_name, cond_list), ('feature', feature_list)]
    else:
        # sequences are left-aligned and padded with NaN up to the longest one
        max_seq_len = get_max_length(output)
        tf_out = np.full((len(output['fixation_start']), max_seq_len, len(output)), fill_value=np.nan, dtype=object)
        for feature, (key, values) in enumerate(output.items()):
            for sample_no, l in enumerate(values):
                tf_out[sample_no, 0:len(l), feature] = l
            feature_list.append(key)
        coords = [(block_name, cond_list),
                  ('block', np.linspace(1, max_seq_len, max_seq_len, endpoint=True, dtype='int64')),
                  ('feature', feature_list)]
    tf_out = xr.DataArray(tf_out, coords=coords)
    return tf_out, labels, feature_list, cond_list

# this is to cover for any differences in the long-form fixation regions, by using the sqgs instead
vid_labels = ['tarmac', 'bicycle', 'perspective']
conditions = ['full', 'dual', 'single']

vid_dict = load_questions(vid_labels, filter_out=True)
aois_df = load_aois()
labels_df = merge_fixations_assessments()
# create npy file data
vars_list = ['dfTrial', 'dfMsg', 'dfFix', 'dfSacc', 'dfBlink', 'dfSamples']
model_type = 'lstm'
t0 = time.time()

# load existing data
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files produced by this project.
all_eyelink_data = np.load(root_data_dir+'/all_eyelink_data.npy', allow_pickle=True)

# create tensor
toi_duration_input, toi_duration_output, output_var = 15000, 5000, (2, 'pupilAvg')
# duration in ms, output_var in df#, varname
# vid_labels (not vid_dict) is passed as the second argument: create_tf_dataset only iterates it for names
tf_data, labels, feature_list, cond_list = create_tf_dataset(all_eyelink_data.item(), vid_labels, conditions,
                                                             model_type, aois_df, toi_duration_input, toi_duration_output,
                                                             output_var)

# save models to run in lstm_model.py (as of 06/10/19)
# the context manager flushes and closes the pickle file even if dumping fails
with open(root_data_dir+'/processed_data/processed_tensor.pkl', "wb") as tensor_file:
    pickle.dump(tf_data, tensor_file)
np.save(root_data_dir+'/processed_data/labels', labels)
np.save(root_data_dir+'/processed_data/feature_list', feature_list)
print('saved array in processed_data')