In [18]:
import pandas as pd
import numpy as np

In [19]:
# Relative locations of the three interest-area CSV exports under "full/".
# NOTE(review): the Q / A / QA suffixes presumably denote question, answer and
# combined reports - confirm against the data source.
ia_Q_path = "full/ia_Q.csv"
ia_A_path = "full/ia_A.csv"
ia_QA_path = "full/ia_QA.csv"

In [20]:
# Only the ia_A file is loaded; the ia_Q and ia_QA reads below are commented
# out and therefore never executed.
#df_Q = pd.read_csv(ia_Q_path)
df_A = pd.read_csv(ia_A_path)
#df_QA = pd.read_csv(ia_QA_path)

In [21]:
# Drop repeated-reading and practice trials.
df_A_filtered = df_A.loc[
    lambda frame: (frame['repeated_reading_trial'] == False) & (frame['practice_trial'] == False)
]

# Split the remaining rows on the question_preview flag:
# "hunters" have it set to True, "gatherers" to False.
df_A_hunters = df_A_filtered.loc[lambda frame: frame['question_preview'] == True]
df_A_gatherers = df_A_filtered.loc[lambda frame: frame['question_preview'] == False]

In [22]:
def create_area_label(df):
    """Label every interest-area row as belonging to the question or to answer A-D.

    The question and the four answers are whitespace-tokenised and treated as
    one concatenated word sequence (question, answer_1, ..., answer_4).
    ``IA_ID - 1`` is used as the zero-based position in that sequence and is
    compared against the cumulative token counts to decide the area.

    The token counts used for a row are those of the first row of its
    (TRIAL_INDEX, participant_id) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``TRIAL_INDEX``, ``participant_id``, ``IA_ID``,
        ``question`` and ``answer_1`` .. ``answer_4``.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` indexed by (TRIAL_INDEX, participant_id) with the text
        columns normalised to strings (missing -> ''), the ``*_tokens`` and
        ``*_len`` helper columns, and ``area_label``.  Rows keep their input
        order.  Rows whose TRIAL_INDEX or participant_id is missing are
        dropped, as a groupby on those keys would do.

    Notes
    -----
    Every position past the end of answer_3 is labelled ``'answer_D'``; the
    length of answer_4 is not used as an upper bound.  ``'unknown'`` is only
    produced when none of the comparisons hold (e.g. a missing ``IA_ID``).
    """
    group_keys = ['TRIAL_INDEX', 'participant_id']
    text_cols = ['question', 'answer_1', 'answer_2', 'answer_3', 'answer_4']
    prefixes = ['question', 'a', 'b', 'c', 'd']

    # Work on a private copy so the caller's DataFrame is never altered.
    df = df.dropna(subset=group_keys).copy()

    for col in text_cols:
        df[col] = df[col].fillna('').astype(str)

    for col, prefix in zip(text_cols, prefixes):
        df[f'{prefix}_tokens'] = df[col].str.split()

    for prefix in prefixes:
        df[f'{prefix}_len'] = df[f'{prefix}_tokens'].apply(len)

    # Per-group reference lengths (first row of each trial/participant group),
    # broadcast back to every row so the comparison below is fully vectorised.
    boundary_cols = ['question_len', 'a_len', 'b_len', 'c_len']
    first_lens = df.groupby(group_keys)[boundary_cols].transform('first')

    # Last zero-based position occupied by the question, A, B and C respectively.
    ends = first_lens.cumsum(axis=1).to_numpy() - 1
    q_end, a_end, b_end, c_end = ends.T

    position = df['IA_ID'].to_numpy() - 1

    conditions = [
        position <= q_end,
        (position > q_end) & (position <= a_end),
        (position > a_end) & (position <= b_end),
        (position > b_end) & (position <= c_end),
        position > c_end,
    ]
    choices = ['question', 'answer_A', 'answer_B', 'answer_C', 'answer_D']
    df['area_label'] = np.select(conditions, choices, default='unknown')

    return df.set_index(group_keys)

In [23]:
def create_correct_answer(df):
    """Return a copy of `df` with an integer `is_correct` column.

    `is_correct` is 1 where `selected_answer_position` equals
    `correct_answer_position` and 0 otherwise.  The input frame is not changed.
    """
    answered_correctly = df['selected_answer_position'] == df['correct_answer_position']
    return df.assign(is_correct=answered_correctly.astype(int))

In [24]:
def create_mean_area_dwell_time(df):
    """Average `IA_DWELL_TIME` per (TRIAL_INDEX, participant_id, area_label).

    Returns one row per group with the three key columns and `mean_dwell_time`.
    """
    keys = ['TRIAL_INDEX', 'participant_id', 'area_label']
    per_area = df.groupby(keys, as_index=False)['IA_DWELL_TIME'].mean()
    return per_area.rename(columns={'IA_DWELL_TIME': 'mean_dwell_time'})

In [25]:
def create_mean_area_fixation_count(df):
    """Average `IA_FIXATION_COUNT` per (TRIAL_INDEX, participant_id, area_label).

    Returns one row per group with the three key columns and
    `mean_fixations_count`.
    """
    keys = ['TRIAL_INDEX', 'participant_id', 'area_label']
    per_area = df.groupby(keys, as_index=False)['IA_FIXATION_COUNT'].mean()
    return per_area.rename(columns={'IA_FIXATION_COUNT': 'mean_fixations_count'})


In [26]:
def create_mean_first_fixation_duration(df):
    """Average first-fixation duration per (TRIAL_INDEX, participant_id, area_label).

    `IA_FIRST_FIXATION_DURATION` uses the placeholder '.' for missing values;
    those entries are counted as 0 before the column is cast to int, so they
    pull the group mean towards zero.

    The conversion is done on a local frame: no helper column is written back
    to `df`.

    Returns one row per group with the three key columns and
    `mean_first_fixation_duration`.

    Raises whatever `astype(int)` raises if a value other than '.' cannot be
    converted to an integer.
    """
    keys = ['TRIAL_INDEX', 'participant_id', 'area_label']
    durations = df['IA_FIRST_FIXATION_DURATION'].replace('.', 0).astype(int)

    # Attach the numeric column to a key-only projection instead of to `df`
    # itself, so the caller's frame keeps its original columns.
    working = df[keys].assign(IA_FIRST_FIXATION_DURATION_INT=durations)
    return working.groupby(keys, as_index=False).agg(
        mean_first_fixation_duration=('IA_FIRST_FIXATION_DURATION_INT', 'mean')
    )

In [27]:
def create_skip_rate(df):
    """Mean of `IA_SKIP` per (TRIAL_INDEX, participant_id, area_label).

    Returns one row per group with the three key columns and `skip_rate`.
    """
    keys = ['TRIAL_INDEX', 'participant_id', 'area_label']
    per_area = df.groupby(keys, as_index=False)['IA_SKIP'].mean()
    return per_area.rename(columns={'IA_SKIP': 'skip_rate'})

In [28]:
def create_dwell_proportions(df):
    """Share of each trial's dwell time that falls in each labelled area.

    For every (participant_id, TRIAL_INDEX, area_label) the result holds:
      * total_area_dwell_time  - sum of IA_DWELL_TIME in that area,
      * total_dwell_time       - sum of the area totals over the whole trial,
      * area_dwell_proportion  - the first divided by the second.
    """
    trial_keys = ['participant_id', 'TRIAL_INDEX']

    per_area = (
        df.groupby(trial_keys + ['area_label'], as_index=False)['IA_DWELL_TIME']
        .sum()
        .rename(columns={'IA_DWELL_TIME': 'total_area_dwell_time'})
    )

    # Broadcast the per-trial total back onto every area row of that trial.
    trial_total = per_area.groupby(trial_keys)['total_area_dwell_time'].transform('sum')

    return per_area.assign(
        total_dwell_time=trial_total,
        area_dwell_proportion=per_area['total_area_dwell_time'] / trial_total,
    )

In [29]:
def create_last_area_visited(df, n_last=5):
    """Find the area a participant looked at last in each trial.

    Within every (participant_id, TRIAL_INDEX) the `n_last` interest areas with
    the latest `IA_LAST_FIXATION_TIME` are selected, and the most frequent
    `area_label` among them is reported as `last_area_visited`.

    The '.' placeholder in `IA_LAST_FIXATION_TIME` is counted as 0, so such
    rows rank last.  The conversion is done on a local frame: no helper column
    is written back to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `participant_id`, `TRIAL_INDEX`, `area_label` and
        `IA_LAST_FIXATION_TIME`.
    n_last : int, default 5
        How many of the most recently fixated interest areas vote on the label.

    Returns
    -------
    pandas.DataFrame
        Columns `participant_id`, `TRIAL_INDEX`, `last_area_visited`; one row
        per participant/trial.
    """
    trial_keys = ['participant_id', 'TRIAL_INDEX']
    last_fixation_time = df['IA_LAST_FIXATION_TIME'].replace('.', 0).astype(int)

    # Rank on a narrow projection of `df` so the caller's frame is not altered.
    ranked = (
        df[trial_keys + ['area_label']]
        .assign(IA_LAST_FIXATION_TIME_INT=last_fixation_time)
        .sort_values(
            by=trial_keys + ['IA_LAST_FIXATION_TIME_INT'],
            ascending=[True, True, False],
        )
    )
    top_fixations = ranked.groupby(trial_keys).head(n_last)

    last_area = (
        top_fixations.groupby(trial_keys)['area_label']
        .agg(lambda labels: labels.value_counts().idxmax())
        .reset_index()
        .rename(columns={'area_label': 'last_area_visited'})
    )
    return last_area

In [30]:
# Hunters: add the is_correct flag, label each interest area, then turn the
# (TRIAL_INDEX, participant_id) index back into ordinary columns.
df_base_features_h = create_area_label(create_correct_answer(df_A_hunters)).reset_index()

# Gatherers: same pipeline.
df_base_features_g = create_area_label(create_correct_answer(df_A_gatherers)).reset_index()


In [31]:
def generate_new_row_features(functions, df, default_join_columns=['TRIAL_INDEX', 'participant_id', 'area_label']):
    """Left-join the output of several feature generators onto a copy of `df`.

    Parameters
    ----------
    functions : iterable of (callable, dict)
        Each callable is invoked as ``callable(df)`` on the original input
        (not on the accumulating result) and must return a DataFrame that
        contains the join columns.  The dict may carry a ``'join_columns'``
        entry overriding `default_join_columns` for that generator; other
        entries are ignored.
    df : pandas.DataFrame
        Base table.
    default_join_columns : list of str
        Merge keys used when a generator does not supply its own.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` extended with every generator's columns.
    """
    enriched = df.copy()

    for generator, options in functions:
        print(generator)
        keys = options.get('join_columns', default_join_columns)
        enriched = enriched.merge(generator(df), on=keys, how='left')

    return enriched

In [32]:
# (generator, options) pairs consumed by generate_new_row_features.
# The first five generators use the default join columns; the last one returns
# one row per trial and therefore joins on TRIAL_INDEX / participant_id only.
per_row_feature_generators = [
    *(
        (generator, {})
        for generator in (
            create_mean_area_dwell_time,
            create_mean_area_fixation_count,
            create_mean_first_fixation_duration,
            create_skip_rate,
            create_dwell_proportions,
        )
    ),
    (create_last_area_visited, {'join_columns': ['TRIAL_INDEX', 'participant_id']}),
]

In [33]:
# Attach every per-row feature to the hunters table, then to the gatherers table.
df_with_features_h = generate_new_row_features(
    functions=per_row_feature_generators,
    df=df_base_features_h,
)
df_with_features_g = generate_new_row_features(
    functions=per_row_feature_generators,
    df=df_base_features_g,
)

<function create_mean_area_dwell_time at 0x000001EA2CB6EC20>
<function create_mean_area_fixation_count at 0x000001EA7F9D09D0>
<function create_mean_first_fixation_duration at 0x000001EA0D013640>
<function create_skip_rate at 0x000001EA7DFEE050>
<function create_dwell_proportions at 0x000001EA7F9D0C10>
<function create_last_area_visited at 0x000001EA0D00F7F0>
<function create_mean_area_dwell_time at 0x000001EA2CB6EC20>
<function create_mean_area_fixation_count at 0x000001EA7F9D09D0>
<function create_mean_first_fixation_duration at 0x000001EA0D013640>
<function create_skip_rate at 0x000001EA7DFEE050>
<function create_dwell_proportions at 0x000001EA7F9D0C10>
<function create_last_area_visited at 0x000001EA0D00F7F0>


In [34]:
# Show the hunters feature table as this cell's output.
df_with_features_h

Unnamed: 0,TRIAL_INDEX,participant_id,EYE_REPORTED,EYE_TRACKED,GROUPING_VARIABLES,IA_AREA,IA_AVERAGE_FIX_PUPIL_SIZE,IA_BOTTOM,IA_DWELL_TIME,IA_DWELL_TIME_%,...,d_len,area_label,mean_dwell_time,mean_fixations_count,mean_first_fixation_duration,skip_rate,total_area_dwell_time,total_dwell_time,area_dwell_proportion,last_area_visited
0,4,l42_2070,RIGHT,Right,RECORDING_SESSION,10165.0,.,261,0,0.0000,...,8,question,41.250000,0.250000,41.250000,0.750000,165,5681,0.029044,answer_D
1,4,l42_2070,RIGHT,Right,RECORDING_SESSION,6099.0,1078.00,261,165,0.0280,...,8,question,41.250000,0.250000,41.250000,0.750000,165,5681,0.029044,answer_D
2,4,l42_2070,RIGHT,Right,RECORDING_SESSION,14231.0,.,261,0,0.0000,...,8,question,41.250000,0.250000,41.250000,0.750000,165,5681,0.029044,answer_D
3,4,l42_2070,RIGHT,Right,RECORDING_SESSION,28569.0,.,261,0,0.0000,...,8,question,41.250000,0.250000,41.250000,0.750000,165,5681,0.029044,answer_D
4,4,l42_2070,RIGHT,Right,RECORDING_SESSION,4180.0,1139.00,492,184,0.0313,...,8,answer_A,173.000000,1.000000,94.285714,0.571429,1211,5681,0.213167,answer_D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380345,59,l10_39,LEFT,Left,RECORDING_SESSION,8208.0,416.00,1281,258,0.0426,...,12,answer_D,142.833333,0.666667,99.083333,0.666667,1714,5707,0.300333,answer_D
380346,59,l10_39,LEFT,Left,RECORDING_SESSION,10260.0,.,1281,0,0.0000,...,12,answer_D,142.833333,0.666667,99.083333,0.666667,1714,5707,0.300333,answer_D
380347,59,l10_39,LEFT,Left,RECORDING_SESSION,6156.0,.,1281,0,0.0000,...,12,answer_D,142.833333,0.666667,99.083333,0.666667,1714,5707,0.300333,answer_D
380348,59,l10_39,LEFT,Left,RECORDING_SESSION,8208.0,439.00,1281,209,0.0345,...,12,answer_D,142.833333,0.666667,99.083333,0.666667,1714,5707,0.300333,answer_D
