# Correctness, difficulty and visits

Is there a statistically significant difference in the correctness and rated difficulty of answers depending on:
- the number of visits (on all elements of the graphic, only on the content, only on the command)?
- the total and average visit duration (on all elements of the graphic, only on the content, only on the command)?

### Preparing sets for testing
- visit metrics (number, total length, average length, average pupil size, last visited AOI)

versus

- correctness
- difficulty

In [1]:
import pandas as pd
import re

from functions.set_preparations import prepare_row

In [2]:
# Answer sheet indexed by the `ID` column. It carries the per-graphic `*_corr` /
# `*_trud` columns plus the `Corr_all` / `Trud_all` and `SumCorr_*` / `SumTrud_*`
# totals that the later cells look up.
answers_csv = '/Users/martasolarz/Studies/Thesis/Master_thesis/prepare_datasets/sets/correctness.csv'
df_answers = pd.read_csv(answers_csv, index_col='ID')
df_answers.head()

Unnamed: 0_level_0,1a_trud,1b_trud,2a_trud,2b_trud,3a_trud,3b_trud,Trud_all,Corr_all,1a_corr,1b_corr,...,SumCorr_a,SumCorr_b,SumCorr_1,SumCorr_2,SumCorr_3,SumTrud_a,SumTrud_b,SumTrud_1,SumTrud_2,SumTrud_3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,1,4,3,4,4,19,4,1,1,...,2,2,2,0,2,11,8,4,7,8
2,1,2,3,1,2,4,13,5,1,1,...,2,3,2,2,1,6,7,3,4,6
3,2,1,3,1,1,1,9,4,1,1,...,2,2,2,2,0,6,3,3,4,2
4,3,2,2,1,3,5,16,5,1,1,...,3,2,2,2,1,8,8,5,3,8
5,4,2,3,2,3,3,17,6,1,1,...,3,3,2,2,2,10,7,6,5,6


In [3]:
# All AOI-visit exports live in one directory and are indexed by the `ID` column.
AOI_VISIT_DIR = '/Users/martasolarz/Studies/Thesis/data/metrics/AOI_visit'


def _read_aoi_visits(stem):
    """Read `<AOI_VISIT_DIR>/<stem>.csv`, using the `ID` column as the index."""
    return pd.read_csv(f'{AOI_VISIT_DIR}/{stem}.csv', index_col='ID')


# Per-AOI metrics for the "a" graphics.
df_1a, df_2a, df_3a = (_read_aoi_visits(stem) for stem in ('1a', '2a', '3a'))

# Pre-aggregated exports for the "b" graphics.
df_1b_aggr, df_2b_aggr, df_3b_aggr = (
    _read_aoi_visits(stem) for stem in ('1b-aggregate', '2b-aggregate', '3b-aggregate')
)

# Per-AOI metrics for the "b" graphics.
df_1b, df_2b, df_3b = (_read_aoi_visits(stem) for stem in ('1b', '2b', '3b'))

In [4]:
# Side-by-side (column-wise) combinations of the raw per-graphic frames.
df_a = pd.concat([df_1a, df_2a, df_3a], axis='columns')
df_b = pd.concat([df_1b, df_2b, df_3b], axis='columns')
df_1 = pd.concat([df_1a, df_1b], axis='columns')
df_2 = pd.concat([df_2a, df_2b], axis='columns')
df_3 = pd.concat([df_3a, df_3b], axis='columns')
df_all = pd.concat([df_a, df_b], axis='columns')

# "a" graphics without the command: keep every column that does not end in
# `_com` and strip the `_map` marker from the remaining column names.
df_1aaggr, df_2aaggr, df_3aaggr = (
    frame.filter(regex='^(?!.*_com$).*$').rename(columns=lambda col: col.replace('_map', ''))
    for frame in (df_1a, df_2a, df_3a)
)
df_a_aggr = pd.concat([df_1aaggr, df_2aaggr, df_3aaggr], axis='columns')

# "b" graphics: tag the pre-aggregated columns with their graphic id so the
# names stay distinct once the three frames are placed next to each other.
df_1baggr, df_2baggr, df_3baggr = (
    frame.add_suffix(f'_{tag}')
    for frame, tag in ((df_1b_aggr, '1b'), (df_2b_aggr, '2b'), (df_3b_aggr, '3b'))
)
df_b_aggr = pd.concat([df_1baggr, df_2baggr, df_3baggr], axis='columns')

In [5]:
# All six graphics without the command columns, placed side by side.
df_all_aggr = pd.concat([df_a_aggr, df_b_aggr], axis='columns')

In [6]:
# For each "a" graphic, sum row-wise every column whose name contains 'Total',
# 'Avg' or 'Num', then tag the three resulting columns with the graphic id.
# NOTE(review): 'Avg' is the sum of the matching average columns, not a
# re-computed mean -- confirm this is the intended metric.
df_1A_aggr, df_2A_aggr, df_3A_aggr = (
    pd.DataFrame(
        data={
            metric: frame.filter(regex=metric).sum(axis=1)
            for metric in ('Total', 'Avg', 'Num')
        }
    ).add_suffix(f'_{tag}')
    for frame, tag in ((df_1a, '1a'), (df_2a, '2a'), (df_3a, '3a'))
)

df_A_aggr = pd.concat([df_1A_aggr, df_2A_aggr, df_3A_aggr], axis='columns')

In [7]:
# For each "b" graphic, sum row-wise every column whose name contains 'Total',
# 'Avg' or 'Num', then tag the three resulting columns with the graphic id.
# NOTE(review): 'Avg' is the sum of the matching average columns, not a
# re-computed mean -- confirm this is the intended metric.
df_1B_aggr, df_2B_aggr, df_3B_aggr = (
    pd.DataFrame(
        data={
            metric: frame.filter(regex=metric).sum(axis=1)
            for metric in ('Total', 'Avg', 'Num')
        }
    ).add_suffix(f'_{tag}')
    for frame, tag in ((df_1b, '1b'), (df_2b, '2b'), (df_3b, '3b'))
)

df_B_aggr = pd.concat([df_1B_aggr, df_2B_aggr, df_3B_aggr], axis='columns')

In [8]:
# Summed Total/Avg/Num columns of all six graphics, placed side by side.
df_ALL_aggr = pd.concat([df_A_aggr, df_B_aggr], axis='columns')

In [9]:
# Per graphic number (1, 2, 3): add up the matching columns of `df_ALL_aggr`,
# i.e. every column whose name contains e.g. 'Total_1', 'Avg_1', 'Num_1'.
df_1_AGGR, df_2_AGGR, df_3_AGGR = (
    pd.DataFrame(
        data={
            metric: df_ALL_aggr.filter(regex=f'{metric}_{graphic}').sum(axis=1)
            for metric in ('Total', 'Avg', 'Num')
        }
    )
    for graphic in (1, 2, 3)
)

In [10]:
# Per graphic number (1, 2, 3): add up the matching columns of `df_all_aggr`,
# i.e. every column whose name contains e.g. 'Total_1', 'Avg_1', 'Num_1'.
df_1_aggr, df_2_aggr, df_3_aggr = (
    pd.DataFrame(
        data={
            metric: df_all_aggr.filter(regex=f'{metric}_{graphic}').sum(axis=1)
            for metric in ('Total', 'Avg', 'Num')
        }
    )
    for graphic in (1, 2, 3)
)

In [11]:
# Attach the per-graphic correctness (`*_corr`) and difficulty (`*_trud`)
# answer columns to the frame holding every AOI column.
corr = df_answers.filter(regex='.+_corr$')
trud = df_answers.filter(regex='.+_trud$')
for answer_block in (corr, trud):
    df_all[answer_block.columns] = answer_block

In [12]:
corr = df_answers.filter(regex='.+_corr$')
trud = df_answers.filter(regex='.+_trud$')

# Answer columns first, followed by the visit aggregates.
answer_parts = [df_answers[corr.columns], df_answers[trud.columns]]

# Aggregates built from all columns of each graphic (command included).
df_all_with_com = pd.concat(answer_parts + [df_A_aggr, df_B_aggr], axis='columns')

# Aggregates built without the `_com` columns.
df_all_without_com = pd.concat(answer_parts + [df_a_aggr, df_b_aggr], axis='columns')

In [13]:
def _summed_visit_metrics(df, answers, corr, trud, patterns=('Total', 'Avg', 'Num')):
    """Row-wise sum the Total/Avg/Num column groups of `df` next to two answer columns."""
    total_rx, avg_rx, num_rx = patterns
    return pd.DataFrame(
        data={
            'Corr': answers[corr],
            'Diff': answers[trud],
            'Total': df.filter(regex=total_rx).sum(axis=1),
            'Avg': df.filter(regex=avg_rx).sum(axis=1),
            'Num': df.filter(regex=num_rx).sum(axis=1),
        })


def create_sum_sets(dfWithCom, dfWithoutCom, dfAllElements, corr, trud, answers=None):
    """Build the three summed visit sets (with command, without command, command only).

    Each returned frame has the columns ``Corr``, ``Diff``, ``Total``, ``Avg``
    and ``Num``. The last three are row-wise sums over every column of the
    corresponding input frame whose name matches the metric.

    NOTE(review): ``Avg`` is the sum of the matching average columns, not a
    re-computed mean -- confirm this is the intended metric.

    Parameters
    ----------
    dfWithCom : pd.DataFrame
        Columns containing 'Total', 'Avg' or 'Num' are summed.
    dfWithoutCom : pd.DataFrame
        Columns containing 'Total', 'Avg' or 'Num' are summed.
    dfAllElements : pd.DataFrame
        Only columns named ``Total_*_com``, ``Avg_*_com`` and ``Num_*_com``
        are summed.
    corr, trud : str
        Names of the answer columns copied into ``Corr`` and ``Diff``.
    answers : pd.DataFrame, optional
        Frame providing the `corr` and `trud` columns. Defaults to the
        notebook-level ``df_answers``, which is what the original code used.

    Returns
    -------
    tuple of pd.DataFrame
        ``(df_sumWithCom, df_sumWithoutCom, df_sumOnlyCom)``.
    """
    if answers is None:
        # Fall back to the notebook-level answer sheet.
        answers = df_answers

    df_sumWithCom = _summed_visit_metrics(dfWithCom, answers, corr, trud)
    df_sumWithoutCom = _summed_visit_metrics(dfWithoutCom, answers, corr, trud)
    df_sumOnlyCom = _summed_visit_metrics(
        dfAllElements, answers, corr, trud,
        patterns=('^Total_.*_com$', '^Avg_.*_com$', '^Num_.*_com$'))

    return df_sumWithCom, df_sumWithoutCom, df_sumOnlyCom

In [14]:
# One triple (with command, without command, command only) per grouping:
# all graphics, the "a" graphics, the "b" graphics, and graphics 1, 2 and 3.
df_sumAllWithCom, df_sumAllWithoutCom, df_sumAllOnlyCom = create_sum_sets(
    dfWithCom=df_all_with_com, dfWithoutCom=df_all_without_com, dfAllElements=df_all,
    corr='Corr_all', trud='Trud_all')

df_sumAWithCom, df_sumAWithoutCom, df_sumAOnlyCom = create_sum_sets(
    dfWithCom=df_A_aggr, dfWithoutCom=df_a_aggr, dfAllElements=df_a,
    corr='SumCorr_a', trud='SumTrud_a')

df_sumBWithCom, df_sumBWithoutCom, df_sumBOnlyCom = create_sum_sets(
    dfWithCom=df_B_aggr, dfWithoutCom=df_b_aggr, dfAllElements=df_b,
    corr='SumCorr_b', trud='SumTrud_b')

df_sum1WithCom, df_sum1WithoutCom, df_sum1OnlyCom = create_sum_sets(
    dfWithCom=df_1_AGGR, dfWithoutCom=df_1_aggr, dfAllElements=df_1,
    corr='SumCorr_1', trud='SumTrud_1')

df_sum2WithCom, df_sum2WithoutCom, df_sum2OnlyCom = create_sum_sets(
    dfWithCom=df_2_AGGR, dfWithoutCom=df_2_aggr, dfAllElements=df_2,
    corr='SumCorr_2', trud='SumTrud_2')

df_sum3WithCom, df_sum3WithoutCom, df_sum3OnlyCom = create_sum_sets(
    dfWithCom=df_3_AGGR, dfWithoutCom=df_3_aggr, dfAllElements=df_3,
    corr='SumCorr_3', trud='SumTrud_3')

In [15]:
def prepare_graphic_set(df, pattern, answers=None):
    """Build the per-graphic sample set without modifying the input frame.

    Columns whose name contains 'Min', 'Max' or 'TTV' are removed, the
    ``_<pattern>`` marker is stripped from the remaining column names, and the
    answer columns ``<pattern>_corr`` / ``<pattern>_trud`` are appended as
    ``Corr`` / ``Diff``.

    The original implementation dropped and renamed columns on `df` itself, so
    the caller's frame (e.g. ``df_1a``) was changed as a side effect and earlier
    cells produced different results when re-run. This version works on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        AOI-visit metrics of one graphic; left untouched.
    pattern : str
        Graphic id such as ``"1a"``; used both as the column-name marker to
        strip and as the prefix of the answer columns.
    answers : pd.DataFrame, optional
        Frame providing ``<pattern>_corr`` and ``<pattern>_trud``. Defaults to
        the notebook-level ``df_answers``, which is what the original code used.

    Returns
    -------
    pd.DataFrame
        A new frame with the cleaned columns plus ``Corr`` and ``Diff``.
    """
    if answers is None:
        # Fall back to the notebook-level answer sheet.
        answers = df_answers

    # 'TTV' also matches 'DTTV', so a separate 'DTTV' pattern is not needed.
    unwanted_markers = ('Min', 'Max', 'TTV')
    columns_to_drop = [
        col for col in df.columns if any(marker in col for marker in unwanted_markers)
    ]

    # drop() without inplace returns a new frame, so the caller's data stays intact.
    result = df.drop(columns=columns_to_drop)
    result.columns = [col.replace(f'_{pattern}', '') for col in result.columns]
    result['Corr'] = answers[f'{pattern}_corr']
    result['Diff'] = answers[f'{pattern}_trud']
    return result

In [16]:
# Build the per-graphic sample sets (same call order as before: a-graphics, then b-graphics).
(
    df_1a_samples,
    df_2a_samples,
    df_3a_samples,
    df_1b_samples,
    df_2b_samples,
    df_3b_samples,
) = (
    prepare_graphic_set(frame, tag)
    for frame, tag in (
        (df_1a, "1a"), (df_2a, "2a"), (df_3a, "3a"),
        (df_1b, "1b"), (df_2b, "2b"), (df_3b, "3b"),
    )
)

In [17]:
# Summed sets, three per grouping (with command, without command, command only).
# The order must match `names_sum` below, which is looked up by 1-based position.
dfs_sum = [
    df_sumAllWithCom, df_sumAllWithoutCom, df_sumAllOnlyCom,
    df_sumAWithCom, df_sumAWithoutCom, df_sumAOnlyCom,
    df_sumBWithCom, df_sumBWithoutCom, df_sumBOnlyCom,
    df_sum1WithCom, df_sum1WithoutCom, df_sum1OnlyCom,
    df_sum2WithCom, df_sum2WithoutCom, df_sum2OnlyCom,
    df_sum3WithCom, df_sum3WithoutCom, df_sum3OnlyCom,
]

# Per-graphic sample sets; the order must match `names_samples` below.
dfs_samples = [
    df_1a_samples, df_1b_samples,
    df_2a_samples, df_2b_samples,
    df_3a_samples, df_3b_samples,
]

# 1-based position -> file-name stem of the summed sets.
names_sum = dict(enumerate([
    'Zsumowane_wszystkie_grafiki_z_poleceniem',
    'Zsumowane_wszystkie_grafiki_bez_polecenia',
    'Zsumowane_wszystkie_grafiki_tylko_polecenie',
    'Zsumowane_grafiki_A_z_poleceniem',
    'Zsumowane_grafiki_A_bez_polecenia',
    'Zsumowane_grafiki_A_tylko_polecenie',
    'Zsumowane_grafiki_B_z_poleceniem',
    'Zsumowane_grafiki_B_bez_polecenia',
    'Zsumowane_grafiki_B_tylko_polecenie',
    'Zsumowane_grafiki_1_z_poleceniem',
    'Zsumowane_grafiki_1_bez_polecenia',
    'Zsumowane_grafiki_1_tylko_polecenie',
    'Zsumowane_grafiki_2_z_poleceniem',
    'Zsumowane_grafiki_2_bez_polecenia',
    'Zsumowane_grafiki_2_tylko_polecenie',
    'Zsumowane_grafiki_3_z_poleceniem',
    'Zsumowane_grafiki_3_bez_polecenia',
    'Zsumowane_grafiki_3_tylko_polecenie',
], start=1))

# 1-based position -> file-name stem of the per-graphic sample sets.
names_samples = dict(enumerate(['1a', '1b', '2a', '2b', '3a', '3b'], start=1))

In [18]:
path = '/Users/martasolarz/Studies/Thesis/Master_thesis/prepare_datasets/sets/visit'

# The name dictionaries are keyed from 1, so enumerate from 1 as well.
for position, summed_set in enumerate(dfs_sum, start=1):
    summed_set.to_csv(f'{path}/dfs_sum_{names_sum[position]}.csv', index=True)

for position, sample_set in enumerate(dfs_samples, start=1):
    sample_set.to_csv(f'{path}/dfs_samples_{names_samples[position]}.csv', index=True)