In [25]:
import pandas as pd
import numpy as np
import ast
import itertools
from itertools import combinations

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from scipy.stats import mannwhitneyu

import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import ConvergenceWarning
import warnings

import networkx as nx
from matplotlib.lines import Line2D
import math

### Loading Data

In [2]:
ia_A_path = "full/ia_A.csv"
df_A = pd.read_csv(ia_A_path)

In [3]:
fix_A_path = "full/fixations_A.csv"
df_fix_A = pd.read_csv(fix_A_path)

  df_fix_A = pd.read_csv(fix_A_path)


### Data Preprocessing

In [4]:
df_A_filtered = df_A[(df_A['repeated_reading_trial'] == False) & ((df_A['practice_trial'] == False))]

article_col='article_id'
difficulty_col='difficulty_level'
batch_col='article_batch'
paragraph_col='paragraph_id'

df_A_filtered['text_id'] = (
    df_A_filtered[article_col].astype(str) + '_' +
    df_A_filtered[difficulty_col].astype(str) + '_' +
    df_A_filtered[batch_col].astype(str) + '_' +
    df_A_filtered[paragraph_col].astype(str)
)

df_A_hunters = df_A_filtered[df_A_filtered['question_preview'] == True].copy()
df_A_gatherers = df_A_filtered[df_A_filtered['question_preview'] == False].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_A_filtered['text_id'] = (


In [5]:
df_fix_filtered = df_fix_A[(df_fix_A['repeated_reading_trial'] == False) & ((df_fix_A['practice_trial'] == False))]

df_fix_hunters = df_fix_filtered[df_fix_filtered['question_preview'] == True].copy()
df_fix_gatherers = df_fix_filtered[df_fix_filtered['question_preview'] == False].copy()

In [52]:
def get_fix_columns(df):
    df_fix_seq = (
        df.groupby(['participant_id', 'TRIAL_INDEX'], sort=False)
        .agg({
            'CURRENT_FIX_INTEREST_AREA_INDEX': list,
            'CURRENT_FIX_DURATION': list
        })
        .reset_index()
        .rename(columns={
            'CURRENT_FIX_INTEREST_AREA_INDEX': 'fix_seq',
            'CURRENT_FIX_DURATION': 'fix_duration'
        })
        )

    def remove_consecutive_duplicates_with_duration(seq, durations):
        short_seq = []
        short_durations = []
        for fixation, group in itertools.groupby(zip(seq, durations), key=lambda x: x[0]):
            total_duration = sum(item[1] for item in group)
            short_seq.append(fixation)
            short_durations.append(total_duration)
        return short_seq, short_durations

    df_fix_seq[['short_fix_seq', 'short_fix_duration']] = df_fix_seq.apply(
        lambda row: pd.Series(remove_consecutive_duplicates_with_duration(row['fix_seq'], row['fix_duration'])),
        axis=1
        )
    return df_fix_seq

In [53]:
df_h_fix = get_fix_columns(df_fix_hunters)
df_g_fix = get_fix_columns(df_fix_gatherers)

In [55]:
df_A_hunters = df_A_hunters.merge(df_h_fix, on=['TRIAL_INDEX', 'participant_id'], how='left')
df_A_gatherers = df_A_gatherers.merge(df_g_fix, on=['TRIAL_INDEX', 'participant_id'], how='left')

In [62]:
df_A_gatherers[df_A_gatherers['short_fix_seq'] != df_A_gatherers['INTEREST_AREA_FIXATION_SEQUENCE'].apply(ast.literal_eval)][['INTEREST_AREA_FIXATION_SEQUENCE', 'short_fix_seq']]

Unnamed: 0,INTEREST_AREA_FIXATION_SEQUENCE,short_fix_seq


### Creating Features

In [65]:
def create_area_screen_loc(df):
    df = df.copy()
    for col in ['question', 'answer_1', 'answer_2', 'answer_3', 'answer_4']:
        df[col] = df[col].fillna('').astype(str)

    df['question_tokens'] = df['question'].str.split()
    df['1_tokens'] = df['answer_1'].str.split()
    df['2_tokens'] = df['answer_2'].str.split()
    df['3_tokens'] = df['answer_3'].str.split()
    df['4_tokens'] = df['answer_4'].str.split()

    df['question_len'] = df['question_tokens'].apply(len)
    df['1_len'] = df['1_tokens'].apply(len)
    df['2_len'] = df['2_tokens'].apply(len)
    df['3_len'] = df['3_tokens'].apply(len)
    df['4_len'] = df['4_tokens'].apply(len)

    def assign_area(group):
        q_len = group['question_len'].iloc[0]
        first_len = group['1_len'].iloc[0]
        second_len = group['2_len'].iloc[0]
        third_len = group['3_len'].iloc[0]
        fourth_len = group['4_len'].iloc[0]

        q_end = q_len - 1
        first_end = q_len + first_len - 1
        second_end = q_len + first_len + second_len - 1
        third_end = q_len + first_len + second_len + third_len - 1
        fourth_end = q_len + first_len + second_len + third_len + fourth_len

        index_id = group['IA_ID'] - 1

        conditions = [
            (index_id <= q_end),
            (index_id > q_end) & (index_id <= first_end),
            (index_id > first_end) & (index_id <= second_end),
            (index_id > second_end) & (index_id <= third_end),
            (index_id > third_end) & (index_id <= fourth_end)
        ]

        choices = ['question', 'answer_0', 'answer_1', 'answer_2', 'answer_3']
        group['area_screen_loc'] = np.select(conditions, choices, default='unknown')
        return group

    df_area_split = df.set_index(['TRIAL_INDEX', 'participant_id']).groupby(['TRIAL_INDEX', 'participant_id'], group_keys=False).apply(assign_area)
    return df_area_split

In [66]:
def create_area_label(df):
    def get_screen_loc(row):
        if row['area_screen_loc'] == 'question':
            return 'question'
        elif row['area_screen_loc'].startswith('answer_'):
            answers_order = ast.literal_eval(row['answers_order'])
            idx = int(row['area_screen_loc'].split('_')[1])
            return f'answer_{answers_order[idx]}'
        return None

    df['area_label'] = df.apply(get_screen_loc, axis=1)
    return df

In [69]:
def process_dataframe(df, functions):
    for func in functions:
        print(func)
        df = func(df)
    return df.reset_index()

processing_functions = [
    create_area_screen_loc,
    create_area_label,
]

df_base_features_h = process_dataframe(df_A_hunters, processing_functions)
df_base_features_g = process_dataframe(df_A_gatherers, processing_functions)

<function create_area_screen_loc at 0x000001E40CEB63B0>
<function create_area_label at 0x000001E40CEB5630>
<function create_area_screen_loc at 0x000001E40CEB63B0>
<function create_area_label at 0x000001E40CEB5630>


In [74]:
def create_fixation_sequence_tags(df):
    result = []
    for (trial_index, participant_id), group in df.groupby(['TRIAL_INDEX', 'participant_id']):
        group_ids = set(group['IA_ID'].unique())

        id_to_label = dict(zip(group['IA_ID'], group['area_label']))
        id_to_location = dict(zip(group['IA_ID'], group['area_screen_loc']))

        sequence_str = group['INTEREST_AREA_FIXATION_SEQUENCE'].iloc[0]
        sequence = eval(sequence_str)

        label_sequence = []
        location_sequence = []

        for ia_id in sequence:
            if ia_id in group_ids:
                label_sequence.append(id_to_label[ia_id])
                location_sequence.append(id_to_location[ia_id])
        result.append({
            'TRIAL_INDEX': trial_index,
            'participant_id': participant_id,
            'fix_by_label': label_sequence[1:],
            'fix_by_loc': location_sequence[1:]
        })

    return pd.DataFrame(result)

In [80]:
def create_simplified_fixation_tags(df):
    result = []
    for (trial_index, participant_id), group in df.groupby(['TRIAL_INDEX', 'participant_id']):
        group_ids = set(group['IA_ID'].unique())

        id_to_label = dict(zip(group['IA_ID'], group['area_label']))
        id_to_location = dict(zip(group['IA_ID'], group['area_screen_loc']))

        sequence_str = group['INTEREST_AREA_FIXATION_SEQUENCE'].iloc[0]
        sequence = eval(sequence_str)

        durations = group['short_fix_duration'].iloc[0]

        valid_fixations = []
        for ia_id, dur in zip(sequence, durations):
            if ia_id not in group_ids:
                continue
            valid_fixations.append((ia_id, id_to_label[ia_id], id_to_location[ia_id], dur))

        simpl_labels = []
        simpl_locations = []
        simpl_durations = []

        for label, group_iter in itertools.groupby(valid_fixations, key=lambda x: x[1]):
            group_list = list(group_iter)
            simpl_labels.append(label)
            simpl_locations.append(group_list[0][2])
            avg_duration = sum(item[3] for item in group_list) / len(group_list)
            simpl_durations.append(avg_duration)

        result.append({
            'TRIAL_INDEX': trial_index,
            'participant_id': participant_id,
            'simpl_fix_by_label': simpl_labels[1:],
            'simpl_fix_by_loc': simpl_locations[1:],
            'simpl_fix_duration': simpl_durations[1:]
        })

    return pd.DataFrame(result)


In [76]:
def generate_new_row_features(functions, df, default_join_columns=['TRIAL_INDEX', 'participant_id', 'area_label']):
    result_df = df.copy()

    for func_tuple in functions:
        func, func_kwargs = func_tuple
        print(func)

        join_columns = func_kwargs.get('join_columns', default_join_columns)

        new_features_df = func(result_df)
        result_df = result_df.merge(new_features_df, on=join_columns, how='left')

    return result_df

In [81]:
per_row_feature_generators = [
    (create_fixation_sequence_tags, {'join_columns': ['TRIAL_INDEX', 'participant_id']}),
    (create_simplified_fixation_tags, {'join_columns': ['TRIAL_INDEX', 'participant_id']}),

]

In [82]:
df_with_features_h = generate_new_row_features(per_row_feature_generators, df_base_features_h)
df_with_features_g = generate_new_row_features(per_row_feature_generators, df_base_features_g)

<function create_fixation_sequence_tags at 0x000001E7A3FA57E0>
<function create_simplified_fixation_tags at 0x000001E2B265A440>
<function create_fixation_sequence_tags at 0x000001E7A3FA57E0>
<function create_simplified_fixation_tags at 0x000001E2B265A440>


In [84]:
df_with_features_h[['simpl_fix_duration', 'simpl_fix_by_label']]

Unnamed: 0,simpl_fix_duration,simpl_fix_by_label
0,"[165.0, 172.0, 122.5, 137.66666666666666, 175....","[answer_B, answer_D, answer_A, answer_C, answe..."
1,"[165.0, 172.0, 122.5, 137.66666666666666, 175....","[answer_B, answer_D, answer_A, answer_C, answe..."
2,"[165.0, 172.0, 122.5, 137.66666666666666, 175....","[answer_B, answer_D, answer_A, answer_C, answe..."
3,"[165.0, 172.0, 122.5, 137.66666666666666, 175....","[answer_B, answer_D, answer_A, answer_C, answe..."
4,"[165.0, 172.0, 122.5, 137.66666666666666, 175....","[answer_B, answer_D, answer_A, answer_C, answe..."
...,...,...
380345,"[220.2, 203.66666666666666, 285.6666666666667,...","[answer_A, answer_C, answer_B, answer_A]"
380346,"[220.2, 203.66666666666666, 285.6666666666667,...","[answer_A, answer_C, answer_B, answer_A]"
380347,"[220.2, 203.66666666666666, 285.6666666666667,...","[answer_A, answer_C, answer_B, answer_A]"
380348,"[220.2, 203.66666666666666, 285.6666666666667,...","[answer_A, answer_C, answer_B, answer_A]"
