In [2]:
import pandas as pd
import numpy as np

In [3]:
ia_Q_path = "full/ia_Q.csv"
ia_A_path = "full/ia_A.csv"
ia_QA_path = "full/ia_QA.csv"

In [4]:
#df_Q = pd.read_csv(ia_Q_path)
df_A = pd.read_csv(ia_A_path)
#df_QA = pd.read_csv(ia_QA_path)

In [5]:
df_A_filtered = df_A[(df_A['repeated_reading_trial'] == False) & ((df_A['practice_trial'] == False))]

df_A_hunters = df_A_filtered[df_A_filtered['question_preview'] == True]
df_A_gatherers = df_A_filtered[df_A_filtered['question_preview'] == False]

In [6]:
def create_area_label(df):
    for col in ['question', 'answer_1', 'answer_2', 'answer_3', 'answer_4']:
        df[col] = df[col].fillna('').astype(str)

    df['question_tokens'] = df['question'].str.split()
    df['a_tokens'] = df['answer_1'].str.split()
    df['b_tokens'] = df['answer_2'].str.split()
    df['c_tokens'] = df['answer_3'].str.split()
    df['d_tokens'] = df['answer_4'].str.split()

    df['question_len'] = df['question_tokens'].apply(len)
    df['a_len'] = df['a_tokens'].apply(len)
    df['b_len'] = df['b_tokens'].apply(len)
    df['c_len'] = df['c_tokens'].apply(len)
    df['d_len'] = df['d_tokens'].apply(len)

    def assign_area(group):
        q_len = group['question_len'].iloc[0]
        a_len = group['a_len'].iloc[0]
        b_len = group['b_len'].iloc[0]
        c_len = group['c_len'].iloc[0]
        d_len = group['d_len'].iloc[0]

        q_end = q_len - 1
        a_end = q_len + a_len - 1
        b_end = q_len + a_len + b_len - 1
        c_end = q_len + a_len + b_len + c_len - 1

        index_id = group['IA_ID'] - 1

        conditions = [
            (index_id <= q_end),
            (index_id > q_end) & (index_id <= a_end),
            (index_id > a_end) & (index_id <= b_end),
            (index_id > b_end) & (index_id <= c_end),
            (index_id > c_end)
        ]

        choices = ['question', 'answer_1', 'answer_2', 'answer_3', 'answer_4']
        group['area_label'] = np.select(conditions, choices, default='unknown')
        return group

    df_area_split = df.set_index(['TRIAL_INDEX', 'participant_id']).groupby(['TRIAL_INDEX', 'participant_id'], group_keys=False).apply(assign_area)
    return df_area_split

In [7]:
def create_correct_answer(df):
    df = df.copy()
    df['is_correct'] = (df['selected_answer_position'] == df['correct_answer_position']).astype(int)
    return df

In [8]:
def create_mean_area_dwell_time(df):
    return df.groupby(['TRIAL_INDEX', 'participant_id','area_label'], as_index=False).agg(mean_dwell_time=("IA_DWELL_TIME", "mean"))

In [9]:
def create_mean_area_fixation_count(df):
    return df.groupby(['TRIAL_INDEX', 'participant_id','area_label'], as_index=False).agg(mean_fixations_count=("IA_FIXATION_COUNT", "mean"))


In [10]:
def create_mean_first_fixation_duration(df):
    df['IA_FIRST_FIXATION_DURATION_INT'] = df["IA_FIRST_FIXATION_DURATION"].replace('.', 0).astype(int)
    return df.groupby(['TRIAL_INDEX', 'participant_id','area_label'], as_index=False).agg(mean_first_fixation_duration=("IA_FIRST_FIXATION_DURATION_INT", "mean"))

In [11]:
def create_skip_rate(df):
    return df.groupby(['TRIAL_INDEX', 'participant_id','area_label'], as_index=False).agg(skip_rate=("IA_SKIP", "mean"))

In [None]:
def create_dwell_proportions(df):
    aggregated_df = (
        df.groupby(['participant_id', 'TRIAL_ID', 'area_label'], as_index=False)
        .agg({'IA_DWELL_TIME': 'sum'})
        .rename(columns={'IA_DWELL_TIME': 'total_area_dwell_time'})
    )

    # Step 2: Calculate total dwell time per participant and trial
    aggregated_df['total_dwell_time'] = aggregated_df.groupby(['participant_id', 'TRIAL_ID'])['total_area_dwell_time'].transform('sum')

    # Step 3: Calculate the proportion of time spent in each area
    aggregated_df['area_dwell_proportion'] = aggregated_df['total_area_dwell_time'] / aggregated_df['total_dwell_time']

    return aggregated_df

In [12]:
df_base_features_h = create_correct_answer(df_A_hunters)
df_base_features_h = create_area_label(df_base_features_h).reset_index()

df_base_features_g = create_correct_answer(df_A_gatherers)
df_base_features_g = create_area_label(df_base_features_g).reset_index()


In [13]:
df_base_features_h

Unnamed: 0,TRIAL_INDEX,participant_id,EYE_REPORTED,EYE_TRACKED,GROUPING_VARIABLES,IA_AREA,IA_AVERAGE_FIX_PUPIL_SIZE,IA_BOTTOM,IA_DWELL_TIME,IA_DWELL_TIME_%,...,a_tokens,b_tokens,c_tokens,d_tokens,question_len,a_len,b_len,c_len,d_len,area_label
0,4,l42_2070,RIGHT,Right,RECORDING_SESSION,10165.0,.,261,0,0.0000,...,"[A, company, that, will, soon, sell, e-bikes]","[A, company, founded, nearly, 20, years, ago]","[A, company, known, for, selling, non-electric...","[A, company, that, is, headed, by, Larry, Pizzi]",4,7,7,7,8,question
1,4,l42_2070,RIGHT,Right,RECORDING_SESSION,6099.0,1078.00,261,165,0.0280,...,"[A, company, that, will, soon, sell, e-bikes]","[A, company, founded, nearly, 20, years, ago]","[A, company, known, for, selling, non-electric...","[A, company, that, is, headed, by, Larry, Pizzi]",4,7,7,7,8,question
2,4,l42_2070,RIGHT,Right,RECORDING_SESSION,14231.0,.,261,0,0.0000,...,"[A, company, that, will, soon, sell, e-bikes]","[A, company, founded, nearly, 20, years, ago]","[A, company, known, for, selling, non-electric...","[A, company, that, is, headed, by, Larry, Pizzi]",4,7,7,7,8,question
3,4,l42_2070,RIGHT,Right,RECORDING_SESSION,28569.0,.,261,0,0.0000,...,"[A, company, that, will, soon, sell, e-bikes]","[A, company, founded, nearly, 20, years, ago]","[A, company, known, for, selling, non-electric...","[A, company, that, is, headed, by, Larry, Pizzi]",4,7,7,7,8,question
4,4,l42_2070,RIGHT,Right,RECORDING_SESSION,4180.0,1139.00,492,184,0.0313,...,"[A, company, that, will, soon, sell, e-bikes]","[A, company, founded, nearly, 20, years, ago]","[A, company, known, for, selling, non-electric...","[A, company, that, is, headed, by, Larry, Pizzi]",4,7,7,7,8,answer_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380345,59,l10_39,LEFT,Left,RECORDING_SESSION,8208.0,416.00,1281,258,0.0426,...,"[He, is, a, lobbyist, who, works, for, the, BPI]","[He, is, a, politician, who, is, endorsed, by,...","[He, is, a, resident, of, Downing, Street, who...","[He, is, a, conservative, businessman, who, is...",12,9,10,14,12,answer_4
380346,59,l10_39,LEFT,Left,RECORDING_SESSION,10260.0,.,1281,0,0.0000,...,"[He, is, a, lobbyist, who, works, for, the, BPI]","[He, is, a, politician, who, is, endorsed, by,...","[He, is, a, resident, of, Downing, Street, who...","[He, is, a, conservative, businessman, who, is...",12,9,10,14,12,answer_4
380347,59,l10_39,LEFT,Left,RECORDING_SESSION,6156.0,.,1281,0,0.0000,...,"[He, is, a, lobbyist, who, works, for, the, BPI]","[He, is, a, politician, who, is, endorsed, by,...","[He, is, a, resident, of, Downing, Street, who...","[He, is, a, conservative, businessman, who, is...",12,9,10,14,12,answer_4
380348,59,l10_39,LEFT,Left,RECORDING_SESSION,8208.0,439.00,1281,209,0.0345,...,"[He, is, a, lobbyist, who, works, for, the, BPI]","[He, is, a, politician, who, is, endorsed, by,...","[He, is, a, resident, of, Downing, Street, who...","[He, is, a, conservative, businessman, who, is...",12,9,10,14,12,answer_4


In [14]:
def generate_new_row_features(functions, df, join_columns=['TRIAL_INDEX', 'participant_id', 'area_label']):
    result_df = df.copy()

    for func in functions:
        print(func)
        new_features_df = func(df)
        result_df = result_df.merge(new_features_df, on=join_columns, how='left')
        print(result_df)

    return result_df

In [15]:
per_row_feature_generators = [create_mean_area_dwell_time, create_mean_area_fixation_count,
                              create_mean_first_fixation_duration, create_skip_rate]
df_with_features_h = generate_new_row_features(per_row_feature_generators, df_base_features_h)
df_with_features_g = generate_new_row_features(per_row_feature_generators, df_base_features_g)

<function create_mean_area_dwell_time at 0x000001975CDC20E0>
        TRIAL_INDEX participant_id EYE_REPORTED EYE_TRACKED  \
0                 4       l42_2070        RIGHT       Right   
1                 4       l42_2070        RIGHT       Right   
2                 4       l42_2070        RIGHT       Right   
3                 4       l42_2070        RIGHT       Right   
4                 4       l42_2070        RIGHT       Right   
...             ...            ...          ...         ...   
380345           59         l10_39         LEFT        Left   
380346           59         l10_39         LEFT        Left   
380347           59         l10_39         LEFT        Left   
380348           59         l10_39         LEFT        Left   
380349           59         l10_39         LEFT        Left   

       GROUPING_VARIABLES  IA_AREA IA_AVERAGE_FIX_PUPIL_SIZE  IA_BOTTOM  \
0       RECORDING_SESSION  10165.0                         .        261   
1       RECORDING_SESSION   6099

In [16]:
df_with_features_h

Unnamed: 0,TRIAL_INDEX,participant_id,EYE_REPORTED,EYE_TRACKED,GROUPING_VARIABLES,IA_AREA,IA_AVERAGE_FIX_PUPIL_SIZE,IA_BOTTOM,IA_DWELL_TIME,IA_DWELL_TIME_%,...,question_len,a_len,b_len,c_len,d_len,area_label,mean_dwell_time,mean_fixations_count,mean_first_fixation_duration,skip_rate
0,4,l42_2070,RIGHT,Right,RECORDING_SESSION,10165.0,.,261,0,0.0000,...,4,7,7,7,8,question,41.250000,0.250000,41.250000,0.750000
1,4,l42_2070,RIGHT,Right,RECORDING_SESSION,6099.0,1078.00,261,165,0.0280,...,4,7,7,7,8,question,41.250000,0.250000,41.250000,0.750000
2,4,l42_2070,RIGHT,Right,RECORDING_SESSION,14231.0,.,261,0,0.0000,...,4,7,7,7,8,question,41.250000,0.250000,41.250000,0.750000
3,4,l42_2070,RIGHT,Right,RECORDING_SESSION,28569.0,.,261,0,0.0000,...,4,7,7,7,8,question,41.250000,0.250000,41.250000,0.750000
4,4,l42_2070,RIGHT,Right,RECORDING_SESSION,4180.0,1139.00,492,184,0.0313,...,4,7,7,7,8,answer_1,173.000000,1.000000,94.285714,0.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380345,59,l10_39,LEFT,Left,RECORDING_SESSION,8208.0,416.00,1281,258,0.0426,...,12,9,10,14,12,answer_4,142.833333,0.666667,99.083333,0.666667
380346,59,l10_39,LEFT,Left,RECORDING_SESSION,10260.0,.,1281,0,0.0000,...,12,9,10,14,12,answer_4,142.833333,0.666667,99.083333,0.666667
380347,59,l10_39,LEFT,Left,RECORDING_SESSION,6156.0,.,1281,0,0.0000,...,12,9,10,14,12,answer_4,142.833333,0.666667,99.083333,0.666667
380348,59,l10_39,LEFT,Left,RECORDING_SESSION,8208.0,439.00,1281,209,0.0345,...,12,9,10,14,12,answer_4,142.833333,0.666667,99.083333,0.666667
