# IMPORT LIBRARIES

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler


# SET AND RESET ROWS AND COLS

In [9]:
# Lift pandas' display limits: a value of None removes the cap on shown columns and rows.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [None]:
# Put both display limits back to the pandas defaults.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.reset_option(_display_option)

# READ FILES

In [36]:
# Core tables, read in order: NLP text features (df1), gaze-fixation question
# data (df2), and the neurotypical (df3) / neurodivergent (df4) participant lists.
df1, df2, df3, df4 = map(
    pd.read_csv,
    (
        "NLP_text.csv",
        "gaze_fix_que_z.csv",
        "neurotypical_participants.csv",
        "neurodivergent_participants.csv",
    ),
)

# Per-diagnosis participant lists (used in the NEURODIVERGENT DIAGNOSIS section).
df_d1, df_d2, df_d3, df_d4, df_d5, df_d6, df_d7_d8 = map(
    pd.read_csv,
    (
        "d1.csv",
        "d2.csv",
        "d3.csv",
        "d4.csv",
        "d5.csv",
        "d6.csv",
        "d7_d8.csv",
    ),
)

In [24]:
# Count number of participants and instances for gaze fixation question data
# (one row of df2 = one instance; participants are the distinct 'Participant' values).
num_instances = len(df2)
num_participants = df2['Participant'].nunique()

print(
    f"  Number of participants = {num_participants}",
    f"  Number of instances = {num_instances}",
    sep="\n",
)


  Number of participants = 338
  Number of instances = 6746


# ANALYZE - STANDARDIZE, MERGE MULTIPLE FILES, COUNT NUMBER OF PARTICIPANTS & INSTANCES

## DATA ON ALL PARTICIPANTS

In [25]:
# Standardize columns from NLP data
# The text, identifier and label columns are set aside; every remaining column is scaled.
columns_to_exclude = ['ParaText', 'Paragraph', 'Probe', 'sentiment']
df_to_standardize = df1.drop(columns=columns_to_exclude)

# Fit the scaler on the feature columns, then transform that same frame.
scaler = StandardScaler().fit(df_to_standardize)
scaled_features = scaler.transform(df_to_standardize)

# Re-attach the column labels to the scaled values and put the untouched
# columns back in front (index reset so both parts align row by row).
df_standardized = pd.DataFrame(scaled_features, columns=df_to_standardize.columns)
df1 = pd.concat(
    [df1[columns_to_exclude].reset_index(drop=True), df_standardized],
    axis=1,
)

df1

Unnamed: 0,ParaText,Paragraph,Probe,sentiment,wordCount_with_stopwords,syllableCount_with_stopwords,ease_of_reading,wordCount_without_stopwords,syllableCount_without_stopwords,CD,...,IN,JJS,MD,VB,RB,VBZ,RBS,JJR,WRB,RBR
0,one day early 1900s prominent american busines...,1,N,1,-0.29655,-0.416297,0.353859,-0.118287,-0.344358,2.0,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
1,hopkins time one nations famous advertising ex...,2,N,-1,1.043343,1.16208,-1.191968,0.957051,1.123398,0.75,...,0.70858,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
2,hopkins greatest contribution would helping cr...,3,N,1,0.333988,0.688567,-1.035896,0.598605,0.935224,-0.5,...,-0.783168,6.244998,1.745743,1.008097,0.11341,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
3,discovered story claude hopkins years ago repo...,4,Y,1,1.358611,1.609287,-0.903943,1.673943,1.913728,-0.5,...,2.200329,-0.160128,-0.436436,-0.604858,2.381602,3.689324,-0.160128,-0.229416,-0.333333,-0.160128
4,hopkins start america brushing,5,N,0,-1.557625,-1.547467,0.053774,-1.62376,-1.548671,-0.5,...,0.70858,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
5,taking advantage quirk neurology habits wouldn...,6,N,0,0.018719,0.083522,-0.572644,0.025091,0.107259,-0.5,...,0.70858,-0.160128,1.745743,1.008097,1.247506,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
6,historical outliers seemed intuit accidentally...,7,N,1,-0.178324,0.293973,-2.754821,-0.046598,0.521241,-0.5,...,-0.783168,-0.160128,-0.436436,-0.604858,0.11341,0.527046,6.244998,-0.229416,-0.333333,-0.160128
7,craving turns powers habit,8,N,0,-1.439399,-1.521161,2.238789,-1.62376,-1.62394,-0.5,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,0.527046,-0.160128,-0.229416,-0.333333,-0.160128
8,hopkins signed promote pepsodent realized need...,9,N,0,0.570439,0.372891,0.656781,0.240159,0.144894,0.75,...,0.70858,-0.160128,-0.436436,1.008097,0.680458,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
9,gave appealing idea resolved advertise toothpa...,10,Y,1,-1.163539,-1.179179,0.546112,-1.337003,-1.247592,-0.5,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128


In [26]:
# Attach the NLP feature columns of df1 to every gaze/question row of df2
# through the shared 'Paragraph' key (default inner merge).
correctness_all = df2.merge(df1, on='Paragraph')
correctness_all

Unnamed: 0,Participant,Question,Correct,Paragraph,Question_RT,Gazes,AOIGazes,OffScreenGazesPix,OffScreenGazesProp,cluster_num_clusters,...,IN,JJS,MD,VB,RB,VBZ,RBS,JJR,WRB,RBR
0,593890eac6aa16000101f037,1,1,1,-0.708886,-0.119902,-0.504823,0.137953,0.404284,-0.534860,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
1,596e1af7a09655000197d4bb,1,1,1,-0.544851,-0.554031,-0.753242,0.037180,0.079620,-0.181841,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
2,5af835d8e19f8c00019e6dc0,1,1,1,0.085448,0.081056,-0.510166,-0.043439,0.229185,0.500662,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
3,5b07b71c68eff50001d1c859,1,1,1,-0.855959,-0.790271,-0.625026,-0.402191,-0.500398,-0.472101,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
4,5bd7971b0aac450001f951aa,1,1,1,-0.067665,-0.239556,-0.373936,-0.462655,-0.423792,-0.283825,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6741,6295270587394aac7ce600db,10,1,30,-0.612038,-0.736580,-0.809336,-0.079717,-0.183029,-0.056323,...,-0.783168,-0.160128,-0.436436,-0.604858,0.680458,-0.527046,-0.160128,-0.229416,3.000000,-0.160128
6742,63038702fea51556c37c36cf,10,0,30,-0.305793,-0.560167,-0.873444,0.706314,0.531961,-0.542705,...,-0.783168,-0.160128,-0.436436,-0.604858,0.680458,-0.527046,-0.160128,-0.229416,3.000000,-0.160128
6743,63d17cf704d2d7053d56b962,10,1,30,-0.319792,-0.007918,0.550288,-0.575521,-0.573356,-0.126927,...,-0.783168,-0.160128,-0.436436,-0.604858,0.680458,-0.527046,-0.160128,-0.229416,3.000000,-0.160128
6744,63d3fa78d12b38b131ef6b76,10,1,30,-0.430556,-0.173593,-0.336540,-0.329634,-0.332594,0.806612,...,-0.783168,-0.160128,-0.436436,-0.604858,0.680458,-0.527046,-0.160128,-0.229416,3.000000,-0.160128


In [34]:
# Count number of participants and instances after merging for correctness_all
num_participants = correctness_all['Participant'].nunique()
num_instances = len(correctness_all)
# Base rate = mean of the 'Correct' column of the merged frame itself.
# This used to read `df['Correct']`, but no cell above defines `df`; that name
# is only bound by the summary loops further down, so the value depended on
# which cell happened to run last (and raised NameError on a fresh kernel).
base_rate = correctness_all['Correct'].mean()

print(f"  Number of participants = {num_participants}")
print(f"  Number of instances = {num_instances}")
print(f"  Base Rate = {base_rate:.2f}")


  Number of participants = 338
  Number of instances = 6746
  Base Rate = 0.79


In [50]:
# # save file
# correctness_all.to_csv('correctness_all_data.csv', index=False)

## NEUROTYPICAL AND NEURODIVERGENT DATA

In [28]:
# NEUROTYPICAL DATA
# Inner merge against the 'Participant' column of df3: only rows whose
# participant appears in that list are kept, and no extra columns are added.
neurotypical_ids = df3['Participant']
correctness_neurotypical = correctness_all.merge(neurotypical_ids, on='Participant')
correctness_neurotypical

Unnamed: 0,Participant,Question,Correct,Paragraph,Question_RT,Gazes,AOIGazes,OffScreenGazesPix,OffScreenGazesProp,cluster_num_clusters,...,IN,JJS,MD,VB,RB,VBZ,RBS,JJR,WRB,RBR
0,5af835d8e19f8c00019e6dc0,1,1,1,0.085448,0.081056,-0.510166,-0.043439,0.229185,0.500662,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
1,5af835d8e19f8c00019e6dc0,2,1,2,-0.266036,1.165612,0.053451,0.303221,1.028077,1.959807,...,0.708580,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
2,5af835d8e19f8c00019e6dc0,9,0,2,0.814927,1.165612,0.053451,0.303221,1.028077,1.959807,...,0.708580,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
3,5af835d8e19f8c00019e6dc0,3,1,13,-0.791022,2.061483,-0.536877,1.826912,2.523721,1.324373,...,0.708580,-0.160128,-0.436436,1.008097,0.113410,2.635231,-0.160128,4.358899,-0.333333,-0.160128
4,5af835d8e19f8c00019e6dc0,10,1,13,0.714828,2.061483,-0.536877,1.826912,2.523721,1.324373,...,0.708580,-0.160128,-0.436436,1.008097,0.113410,2.635231,-0.160128,4.358899,-0.333333,-0.160128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3415,63d41088860131e49a85cfac,8,1,4,-0.663110,0.309625,-0.117504,-0.079717,-0.055352,-0.566240,...,2.200329,-0.160128,-0.436436,-0.604858,2.381602,3.689324,-0.160128,-0.229416,-0.333333,-0.160128
3416,63d41088860131e49a85cfac,10,0,4,-0.725218,0.309625,-0.117504,-0.079717,-0.055352,-0.566240,...,2.200329,-0.160128,-0.436436,-0.604858,2.381602,3.689324,-0.160128,-0.229416,-0.333333,-0.160128
3417,63d41088860131e49a85cfac,9,1,3,-0.143720,-0.187399,-0.275103,-0.349789,-0.274227,-0.558395,...,-0.783168,6.244998,1.745743,1.008097,0.113410,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
3418,63d41088860131e49a85cfac,10,0,6,-0.725218,-0.544827,-0.459413,-0.386067,-0.398256,-0.566240,...,0.708580,-0.160128,1.745743,1.008097,1.247506,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128


In [29]:
# NEURODIVERGENT DATA
# Inner merge against the 'Participant' column of df4: only rows whose
# participant appears in that list are kept, and no extra columns are added.
neurodivergent_ids = df4['Participant']
correctness_neurodivergent = correctness_all.merge(neurodivergent_ids, on='Participant')
correctness_neurodivergent

Unnamed: 0,Participant,Question,Correct,Paragraph,Question_RT,Gazes,AOIGazes,OffScreenGazesPix,OffScreenGazesProp,cluster_num_clusters,...,IN,JJS,MD,VB,RB,VBZ,RBS,JJR,WRB,RBR
0,593890eac6aa16000101f037,1,1,1,-0.708886,-0.119902,-0.504823,0.137953,0.404284,-0.534860,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
1,593890eac6aa16000101f037,2,1,2,-0.622569,-0.076949,-0.611670,0.456397,0.590328,-0.511326,...,0.708580,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
2,593890eac6aa16000101f037,9,0,2,-0.296946,-0.076949,-0.611670,0.456397,0.590328,-0.511326,...,0.708580,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
3,593890eac6aa16000101f037,3,0,13,-0.348007,-0.015588,0.560972,-0.486841,-0.197621,-0.487791,...,0.708580,-0.160128,-0.436436,1.008097,0.113410,2.635231,-0.160128,4.358899,-0.333333,-0.160128
4,593890eac6aa16000101f037,10,1,13,-0.642050,-0.015588,0.560972,-0.486841,-0.197621,-0.487791,...,0.708580,-0.160128,-0.436436,1.008097,0.113410,2.635231,-0.160128,4.358899,-0.333333,-0.160128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3321,63e53dc0bb780ac38cdcf77e,8,1,4,-0.614134,5.735475,5.908648,1.758386,2.935936,4.274042,...,2.200329,-0.160128,-0.436436,-0.604858,2.381602,3.689324,-0.160128,-0.229416,-0.333333,-0.160128
3322,63e53dc0bb780ac38cdcf77e,10,1,4,-0.663998,5.735475,5.908648,1.758386,2.935936,4.274042,...,2.200329,-0.160128,-0.436436,-0.604858,2.381602,3.689324,-0.160128,-0.229416,-0.333333,-0.160128
3323,63e53dc0bb780ac38cdcf77e,9,1,3,-0.452546,1.616615,0.854801,1.073128,1.867097,1.065492,...,-0.783168,6.244998,1.745743,1.008097,0.113410,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
3324,63e53dc0bb780ac38cdcf77e,10,1,6,-0.663998,0.849603,1.693547,-0.591645,-0.693737,0.712473,...,0.708580,-0.160128,1.745743,1.008097,1.247506,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128


In [33]:
# Count number of participants and instances after filtering for neurotypical and neurodivergent data

# (label, frame) pairs for the two participant groups summarised below.
dataframes = list(zip(
    ("neurotypical data", "neurodivergent data"),
    (correctness_neurotypical, correctness_neurodivergent),
))
def calculate_statistics(df):
    """Summarise one correctness table.

    Parameters
    ----------
    df : DataFrame with a 'Participant' column and a 'Correct' column.

    Returns
    -------
    tuple
        (number of distinct 'Participant' values, number of rows,
        mean of the 'Correct' column).
    """
    return df['Participant'].nunique(), len(df), df['Correct'].mean()

# Print the summary of every (label, frame) pair. Plain iteration is enough:
# the counter that enumerate() produced here was never used.
for name, df in dataframes:
    num_participants, num_instances, base_rate = calculate_statistics(df)
    print(f"{name}:")
    print(f"  Number of participants = {num_participants}")
    print(f"  Number of instances = {num_instances}")
    print(f"  Base Rate: {base_rate:.2f}")

neurotypical data:
  Number of participants = 171
  Number of instances = 3420
  Base Rate: 0.78
neurodivergent data:
  Number of participants = 167
  Number of instances = 3326
  Base Rate: 0.79


In [51]:
# # save file
# correctness_neurotypical.to_csv('correctness_neurotypical_data.csv', index=False)
# correctness_neurodivergent.to_csv('correctness_neurodivergent_data.csv', index=False)

# NEURODIVERGENT DIAGNOSIS

In [37]:
# Keep only the neurodivergent rows whose participant is listed in df_d1.
correctness_d1 = correctness_neurodivergent.merge(df_d1['Participant'], on='Participant')
correctness_d1

Unnamed: 0,Participant,Question,Correct,Paragraph,Question_RT,Gazes,AOIGazes,OffScreenGazesPix,OffScreenGazesProp,cluster_num_clusters,...,IN,JJS,MD,VB,RB,VBZ,RBS,JJR,WRB,RBR
0,593890eac6aa16000101f037,1,1,1,-0.708886,-0.119902,-0.504823,0.137953,0.404284,-0.534860,...,-0.783168,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
1,593890eac6aa16000101f037,2,1,2,-0.622569,-0.076949,-0.611670,0.456397,0.590328,-0.511326,...,0.708580,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
2,593890eac6aa16000101f037,9,0,2,-0.296946,-0.076949,-0.611670,0.456397,0.590328,-0.511326,...,0.708580,-0.160128,-0.436436,-0.604858,-1.020686,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
3,593890eac6aa16000101f037,3,0,13,-0.348007,-0.015588,0.560972,-0.486841,-0.197621,-0.487791,...,0.708580,-0.160128,-0.436436,1.008097,0.113410,2.635231,-0.160128,4.358899,-0.333333,-0.160128
4,593890eac6aa16000101f037,10,1,13,-0.642050,-0.015588,0.560972,-0.486841,-0.197621,-0.487791,...,0.708580,-0.160128,-0.436436,1.008097,0.113410,2.635231,-0.160128,4.358899,-0.333333,-0.160128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1381,63e53dc0bb780ac38cdcf77e,8,1,4,-0.614134,5.735475,5.908648,1.758386,2.935936,4.274042,...,2.200329,-0.160128,-0.436436,-0.604858,2.381602,3.689324,-0.160128,-0.229416,-0.333333,-0.160128
1382,63e53dc0bb780ac38cdcf77e,10,1,4,-0.663998,5.735475,5.908648,1.758386,2.935936,4.274042,...,2.200329,-0.160128,-0.436436,-0.604858,2.381602,3.689324,-0.160128,-0.229416,-0.333333,-0.160128
1383,63e53dc0bb780ac38cdcf77e,9,1,3,-0.452546,1.616615,0.854801,1.073128,1.867097,1.065492,...,-0.783168,6.244998,1.745743,1.008097,0.113410,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128
1384,63e53dc0bb780ac38cdcf77e,10,1,6,-0.663998,0.849603,1.693547,-0.591645,-0.693737,0.712473,...,0.708580,-0.160128,1.745743,1.008097,1.247506,-0.527046,-0.160128,-0.229416,-0.333333,-0.160128


In [44]:
# NEURODIVERGENT DIAGNOSIS
def diagnosis(data1, data2, participant):
    """Restrict ``data1`` to the rows whose key also occurs in ``data2``.

    Parameters
    ----------
    data1 : DataFrame to filter.
    data2 : DataFrame supplying the keys; only its ``participant`` column is used,
        so none of its other columns end up in the result.
    participant : name of the key column present in both frames.

    Returns
    -------
    DataFrame
        Inner merge of ``data1`` with ``data2[participant]`` on that column.
    """
    member_keys = data2[participant]
    return data1.merge(member_keys, on=participant)

# One filtered copy of the neurodivergent table per diagnosis participant list,
# produced in the same order as the lists are named.
(
    correctness_d1,
    correctness_d2,
    correctness_d3,
    correctness_d4,
    correctness_d5,
    correctness_d6,
    correctness_d7_d8,
) = (
    diagnosis(correctness_neurodivergent, diagnosis_members, "Participant")
    for diagnosis_members in (df_d1, df_d2, df_d3, df_d4, df_d5, df_d6, df_d7_d8)
)


In [46]:
# Count number of participants and instances after filtering for neurodivergent diagnosis data

# (label, frame) pairs for the per-diagnosis tables summarised below.
dataframes = list(zip(
    ("D1", "D2", "D3", "D4", "D5", "D6", "D7_D8"),
    (
        correctness_d1,
        correctness_d2,
        correctness_d3,
        correctness_d4,
        correctness_d5,
        correctness_d6,
        correctness_d7_d8,
    ),
))
def calculate_statistics(df):
    """Summarise one correctness table.

    Same helper as the one defined in the neurotypical/neurodivergent summary
    cell; repeated here so this cell can run on its own.

    Parameters
    ----------
    df : DataFrame with a 'Participant' column and a 'Correct' column.

    Returns
    -------
    tuple
        (number of distinct 'Participant' values, number of rows,
        mean of the 'Correct' column).
    """
    return df['Participant'].nunique(), len(df), df['Correct'].mean()

# Print the summary of every (label, frame) pair. Plain iteration is enough:
# the counter that enumerate() produced here was never used.
for name, df in dataframes:
    num_participants, num_instances, base_rate = calculate_statistics(df)
    print(f"{name}:")
    print(f"  Number of participants = {num_participants}")
    print(f"  Number of instances = {num_instances}")
    print(f"  Base Rate: {base_rate:.2f}")

D1:
  Number of participants = 70
  Number of instances = 1386
  Base Rate: 0.79
D2:
  Number of participants = 61
  Number of instances = 1206
  Base Rate: 0.80
D3:
  Number of participants = 13
  Number of instances = 260
  Base Rate: 0.85
D4:
  Number of participants = 13
  Number of instances = 260
  Base Rate: 0.74
D5:
  Number of participants = 84
  Number of instances = 1666
  Base Rate: 0.82
D6:
  Number of participants = 37
  Number of instances = 726
  Base Rate: 0.81
D7_D8:
  Number of participants = 33
  Number of instances = 660
  Base Rate: 0.75


In [52]:
# # save file
# correctness_d1.to_csv('correctness_d1_data.csv', index=False)
# correctness_d2.to_csv('correctness_d2_data.csv', index=False)
# correctness_d3.to_csv('correctness_d3_data.csv', index=False)
# correctness_d4.to_csv('correctness_d4_data.csv', index=False)
# correctness_d5.to_csv('correctness_d5_data.csv', index=False)
# correctness_d6.to_csv('correctness_d6_data.csv', index=False)
# correctness_d7_d8.to_csv('correctness_d7_d8_data.csv', index=False)