# IMPORT LIBRARIES

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.dummy import DummyClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score

from sklearn.model_selection import GroupKFold
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, make_scorer

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# SET AND RESET ROWS AND COLS

In [87]:
# Lift pandas' display limits so DataFrames are shown with every column and
# every row (None means "no limit").
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)

In [98]:
# Put the column/row display limits back to pandas' defaults.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.reset_option(option_name)

# READ CSV FILE

In [97]:
# Load the five input tables from the working directory, in the order listed.
df_eye, df_comprehension, df_fixation, df_question, df_overall = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Eye_data.csv',
        'comprehension_data.csv',
        'fixation_all_paragraph.csv',
        'Question_data.csv',
        'Overall_data_m.csv',
    )
)
# Display the eye-tracking table as the cell output.
df_eye

Unnamed: 0,Participant,Paragraph,Gazes,AOIGazes,OffScreenGazesPix,OffScreenGazesProp
0,593890eac6aa16000101f037,1,699,138,181,301
1,593890eac6aa16000101f037,2,727,98,260,352
2,593890eac6aa16000101f037,3,947,248,220,356
3,593890eac6aa16000101f037,4,799,171,282,422
4,593890eac6aa16000101f037,5,163,12,4,4
5,593890eac6aa16000101f037,6,452,176,42,84
6,593890eac6aa16000101f037,7,449,91,90,134
7,593890eac6aa16000101f037,8,188,57,0,1
8,593890eac6aa16000101f037,9,701,297,34,124
9,593890eac6aa16000101f037,10,145,39,5,11


# ANALYSIS

In the fixation data, there are 397 null values in the `cluster_sd_duration` column and 1373 null values in the `cluster_skew_duration` column. There are no null values in the eye and comprehension data.
After merging the eye, fixation and comprehension data, null values remain in `cluster_sd_duration` (87) and `cluster_skew_duration` (298).

In [110]:
# Pair each raw table with the label printed next to its participant and
# instance counts (counted before any mapping or merging is applied).
dataframes = list(zip(
    ("df_eye_data", "df_question_data", "df_fixation_data"),
    (df_eye, df_question, df_fixation),
))
def calculate_statistics(df):
    """Return ``(num_participants, num_instances)`` for ``df``.

    ``num_participants`` is the number of distinct values in the
    ``'Participant'`` column; ``num_instances`` is the number of rows.
    """
    return df['Participant'].nunique(), len(df)

# Print the participant and row counts of every (label, frame) pair.
# No running index is needed, so the pairs are unpacked directly.
for name, df in dataframes:
    num_participants, num_instances = calculate_statistics(df)
    print(f"{name}:")
    print(f"  Number of participants = {num_participants}")
    print(f"  Number of instances = {num_instances}")


df_eye_data:
  Number of participants = 354
  Number of instances = 14160
df_question_data:
  Number of participants = 354
  Number of instances = 3540
df_fixation_data:
  Number of participants = 338
  Number of instances = 13483


In [99]:
# Rename 'Number' -> 'Question' (the key later used to join the question table
# to the question/paragraph mapping) and 'RT' -> 'Question_RT'.
df_question = df_question.rename(
    {'Number': 'Question', 'RT': 'Question_RT'},
    axis='columns',
)
df_question

Unnamed: 0,Participant,Question,Question_RT,Correct
0,593890eac6aa16000101f037,1,4.9896,1
1,593890eac6aa16000101f037,2,5.8257,1
2,593890eac6aa16000101f037,3,8.4852,0
3,593890eac6aa16000101f037,4,10.0168,1
4,593890eac6aa16000101f037,5,18.9601,1
...,...,...,...,...
3535,63e53dc0bb780ac38cdcf77e,6,12.2569,1
3536,63e53dc0bb780ac38cdcf77e,7,14.1803,1
3537,63e53dc0bb780ac38cdcf77e,8,5.9074,1
3538,63e53dc0bb780ac38cdcf77e,9,7.4726,1


In [24]:
# Count the missing values in each column of the fixation table.
df_fixation.isnull().sum()

Participant                 0
Paragraph                   0
cluster_num_clusters        0
cluster_avg_duration        0
cluster_sd_duration       397
cluster_skew_duration    1373
dispersion                  0
dtype: int64

## Mapping Questions to Paragraphs

In [100]:
# Question number -> list of paragraph numbers mapped to that question.
mapping = {
    1: [1],
    2: [2],
    3: [13],
    4: [35, 36],
    5: [37, 38, 39],
    6: [34, 35],
    7: [37, 38, 39],
    8: [4],
    9: [2, 3],
    10: [4, 6, 13, 30],
}

# Flatten into one record per (question, paragraph) pair so the mapping can be
# joined to the other tables like any other DataFrame.
expanded_mapping = [
    {'Question': question, 'Paragraph': paragraph}
    for question, paragraphs in mapping.items()
    for paragraph in paragraphs
]

df_mapping = pd.DataFrame(expanded_mapping)
df_mapping

Unnamed: 0,Question,Paragraph
0,1,1
1,2,2
2,3,13
3,4,35
4,4,36
5,5,37
6,5,38
7,5,39
8,6,34
9,6,35


In [101]:
# Inner-join the question table with the question/paragraph mapping, giving
# one row per question row and mapped paragraph.
question_mapping = df_question.merge(df_mapping, how='inner', on='Question')
question_mapping

Unnamed: 0,Participant,Question,Question_RT,Correct,Paragraph
0,593890eac6aa16000101f037,1,4.9896,1,1
1,596e1af7a09655000197d4bb,1,6.5785,1,1
2,5af835d8e19f8c00019e6dc0,1,12.6838,1,1
3,5b07b71c68eff50001d1c859,1,3.5650,1,1
4,5bd7971b0aac450001f951aa,1,11.2007,1,1
...,...,...,...,...,...
7075,63d3fa78d12b38b131ef6b76,10,7.6856,1,30
7076,63e53dc0bb780ac38cdcf77e,10,5.4244,1,4
7077,63e53dc0bb780ac38cdcf77e,10,5.4244,1,6
7078,63e53dc0bb780ac38cdcf77e,10,5.4244,1,13


In [102]:
# Inner-join the eye table onto the question/paragraph rows; rows without a
# matching (Participant, Paragraph) pair in both tables are dropped.
gaze_question_mapping = question_mapping.merge(
    df_eye,
    how='inner',
    on=['Participant', 'Paragraph'],
)
gaze_question_mapping

Unnamed: 0,Participant,Question,Question_RT,Correct,Paragraph,Gazes,AOIGazes,OffScreenGazesPix,OffScreenGazesProp
0,593890eac6aa16000101f037,1,4.9896,1,1,699,138,181,301
1,596e1af7a09655000197d4bb,1,6.5785,1,1,416,45,156,212
2,5af835d8e19f8c00019e6dc0,1,12.6838,1,1,830,136,136,253
3,5b07b71c68eff50001d1c859,1,3.5650,1,1,262,93,47,53
4,5bd7971b0aac450001f951aa,1,11.2007,1,1,621,187,32,74
...,...,...,...,...,...,...,...,...,...
7075,63d17cf704d2d7053d56b962,10,8.7585,1,30,772,533,4,33
7076,63d3fa78d12b38b131ef6b76,10,7.6856,1,6,794,274,99,130
7077,63d3fa78d12b38b131ef6b76,10,7.6856,1,30,664,201,65,99
7078,63e53dc0bb780ac38cdcf77e,10,5.4244,1,6,1331,961,0,0


In [103]:
# Inner-join the fixation table on the same (Participant, Paragraph) key;
# rows with no match in the fixation table are dropped.
gaze_fix_que_merge = gaze_question_mapping.merge(
    df_fixation,
    how='inner',
    on=['Participant', 'Paragraph'],
)
gaze_fix_que_merge

Unnamed: 0,Participant,Question,Question_RT,Correct,Paragraph,Gazes,AOIGazes,OffScreenGazesPix,OffScreenGazesProp,cluster_num_clusters,cluster_avg_duration,cluster_sd_duration,cluster_skew_duration,dispersion
0,593890eac6aa16000101f037,1,4.9896,1,1,699,138,181,301,6,0.048183,0.013313,-2.332043,0.537245
1,596e1af7a09655000197d4bb,1,6.5785,1,1,416,45,156,212,51,0.050455,0.009926,1.132722,0.508165
2,5af835d8e19f8c00019e6dc0,1,12.6838,1,1,830,136,136,253,138,0.033606,0.020581,0.864703,0.453232
3,5b07b71c68eff50001d1c859,1,3.5650,1,1,262,93,47,53,14,0.080571,0.035667,2.975477,0.355419
4,5bd7971b0aac450001f951aa,1,11.2007,1,1,621,187,32,74,38,0.080797,0.015477,-3.901375,0.471612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6741,63d17cf704d2d7053d56b962,10,8.7585,1,30,772,533,4,33,58,0.080571,0.032136,0.319404,0.465988
6742,63d3fa78d12b38b131ef6b76,10,7.6856,1,6,794,274,99,130,218,0.030396,0.003170,1.285993,0.395356
6743,63d3fa78d12b38b131ef6b76,10,7.6856,1,30,664,201,65,99,177,0.031052,0.005820,2.873504,0.361958
6744,63e53dc0bb780ac38cdcf77e,10,5.4244,1,6,1331,961,0,0,165,0.021177,0.013164,2.234888,0.320407


## Standardize

In [105]:
# Z-score every column except the four listed below, which are carried over
# unchanged.
columns_to_exclude = ['Participant', 'Question', 'Correct', 'Paragraph']
df_to_standardize = gaze_fix_que_merge.drop(columns_to_exclude, axis=1)

# NOTE(review): the scaler is fitted on all merged rows; if a train/test split
# happens later, confirm that fitting on the full data is intended.
scaler = StandardScaler()
scaled_features = scaler.fit(df_to_standardize).transform(df_to_standardize)

# Wrap the scaled array back into a DataFrame with the original column names.
df_standardized = pd.DataFrame(data=scaled_features, columns=df_to_standardize.columns)

# Put the untouched columns first, followed by the standardized ones; the
# index is reset so both parts line up row by row.
gaze_fix_que_z = pd.concat(
    [
        gaze_fix_que_merge.loc[:, columns_to_exclude].reset_index(drop=True),
        df_standardized,
    ],
    axis='columns',
)

gaze_fix_que_z

Unnamed: 0,Participant,Question,Correct,Paragraph,Question_RT,Gazes,AOIGazes,OffScreenGazesPix,OffScreenGazesProp,cluster_num_clusters,cluster_avg_duration,cluster_sd_duration,cluster_skew_duration,dispersion
0,593890eac6aa16000101f037,1,1,1,-0.708886,-0.119902,-0.504823,0.137953,0.404284,-0.534860,-0.071420,-0.105832,-1.350325,0.351437
1,596e1af7a09655000197d4bb,1,1,1,-0.544851,-0.554031,-0.753242,0.037180,0.079620,-0.181841,-0.068585,-0.122042,0.339559,0.196440
2,5af835d8e19f8c00019e6dc0,1,1,1,0.085448,0.081056,-0.510166,-0.043439,0.229185,0.500662,-0.089613,-0.071052,0.208836,-0.096366
3,5b07b71c68eff50001d1c859,1,1,1,-0.855959,-0.790271,-0.625026,-0.402191,-0.500398,-0.472101,-0.030999,0.001147,1.238332,-0.617727
4,5bd7971b0aac450001f951aa,1,1,1,-0.067665,-0.239556,-0.373936,-0.462655,-0.423792,-0.283825,-0.030717,-0.095477,-2.115741,0.001606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6741,63d17cf704d2d7053d56b962,10,1,30,-0.319792,-0.007918,0.550288,-0.575521,-0.573356,-0.126927,-0.031000,-0.015750,-0.057124,-0.028372
6742,63d3fa78d12b38b131ef6b76,10,1,6,-0.430556,0.025831,-0.141545,-0.192583,-0.219508,1.128251,-0.093619,-0.154377,0.414314,-0.404858
6743,63d3fa78d12b38b131ef6b76,10,1,30,-0.430556,-0.173593,-0.336540,-0.329634,-0.332594,0.806612,-0.092801,-0.141693,1.188597,-0.582875
6744,63e53dc0bb780ac38cdcf77e,10,1,6,-0.663998,0.849603,1.693547,-0.591645,-0.693737,0.712473,-0.105125,-0.106548,0.877122,-0.804346


In [106]:
# gaze_fix_que_z.to_csv('gaze_fix_que_z.csv', index=False)

In [107]:
# Missing values per column before any rows are removed. Printed explicitly
# because a notebook cell only auto-displays its final expression.
print(gaze_fix_que_z.isnull().sum())

# Keep only complete rows, then confirm that no missing values remain.
gaze_fix_que_no_null_z = gaze_fix_que_z.dropna()
gaze_fix_que_no_null_z.isnull().sum()


Participant              0
Question                 0
Correct                  0
Paragraph                0
Question_RT              0
Gazes                    0
AOIGazes                 0
OffScreenGazesPix        0
OffScreenGazesProp       0
cluster_num_clusters     0
cluster_avg_duration     0
cluster_sd_duration      0
cluster_skew_duration    0
dispersion               0
dtype: int64

In [108]:
# gaze_fix_que_no_null_z.to_csv('gaze_fix_que_no_null_z.csv', index=False)

In [109]:
# Distinct participants and total rows in the merged, standardized table
# (counted before rows with missing values are dropped).
num_participants = gaze_fix_que_z['Participant'].nunique()
num_instances = len(gaze_fix_que_z)

print("Number of participants in merged data = ", num_participants)
print("Number of instances in merged data = ", num_instances)

Number of participants in merged data =  338
Number of instances in merged data =  6746
