# Correlations with Simpson et al. (2012)

The table in `csv/other-studies/Simpsonetal2012_subset-renamed.csv` is based on the original data available from https://link.springer.com/article/10.3758/s13428-012-0271-4#SecESM1. It has been reduced to a subset, and its labels have been renamed to make it comparable to this study.

In [1]:
import ast

import numpy as np
import pandas as pd
from scipy import stats

In [5]:
# Compare the similarity matrices of the present study with the matrix by Simpson et al.
# For every font, the off-diagonal cells of both matrices are flattened and
# correlated with Spearman's rank correlation coefficient.

import glob
import os


def _erase_diagonal(matrix):
    """
    Return a float copy of `matrix` in which every (label, label) cell is NaN.

    Each column label is used as the row label as well, so the cell where a
    character meets itself is blanked out. A single `.loc[row, col]` assignment
    is used because a chained `df.loc[row][col] = ...` may write to a temporary
    copy and leave the frame unchanged.

    Raises KeyError if a column label is missing from the index (a plain
    `.loc` assignment would otherwise silently append a new row).
    """
    erased = matrix.astype(float)
    for label in erased.columns:
        if label not in erased.index:
            raise KeyError(label)
        erased.loc[label, label] = np.nan
    return erased


# prepare matrices from other studies
Simpson = _erase_diagonal(
    pd.read_csv("csv/other-studies/Simpsonetal2012_subset-renamed.csv", header=0, index_col=0, sep=";")
)
# explicit lookup table instead of eval(studyname)
reference_studies = {"Simpson": Simpson}

#display(Simpson)

# prepare data from the present study
# and statistical coefficients

paths = sorted(glob.glob("csv/similarity-matrices/latin/*.csv"))
if not paths:
    raise FileNotFoundError("No similarity matrices found in csv/similarity-matrices/latin/")
# alpha after Bonferroni correction (one hypothesis per similarity matrix)
alpha = 0.05 / len(paths)
print("Corrected alpha is:", round(alpha, 5), ", for the number of hypotheses:", len(paths))

# compile results
studyname = "Simpson"
reference = reference_studies[studyname]
# map a display name derived from the file name to the file path
paths = {
    os.path.basename(path).replace(".csv", "").replace("_latin", "").replace("-", " ").title().replace("Pt", "PT"): path
    for path in paths
}
e = pd.DataFrame(index=list(paths), columns=["Size", "%s (coef)" % studyname, "%s (p-val)" % studyname])

for font in e.index:
    e.loc[font, "Size"] = "26 x 26" if "Sparse" in font else "8 x 8"

    # the second file line and the second file column are skipped on purpose
    sm = _erase_diagonal(
        pd.read_csv(paths[font], header=0, index_col=0, skiprows=[1,], usecols=[0,2,3,4,5,6,7,8,9])
    )
    s1 = sm.to_numpy().ravel()

    # calculate Spearman's rank correlation coefficient
    # subset the reference matrix to the same rows/columns, then flatten it the same way
    s2 = reference.reindex(index=sm.index, columns=sm.columns).to_numpy().ravel()
    # NaN cells (the erased diagonals and any missing pairs) are left out of the correlation
    coef, p_val = stats.spearmanr(s1, s2, nan_policy="omit")
    e.loc[font, studyname + " (coef)"] = round(coef, 2)
    e.loc[font, studyname + " (p-val)"] = round(p_val, 3)
display(e)

Corrected alpha is: 0.00417 , for the number of hypotheses: 12


Unnamed: 0,Size,Simpson (coef),Simpson (p-val)
Arial,8 x 8,0.88,0.0
Calibri,8 x 8,0.77,0.0
Cambria,8 x 8,0.79,0.0
Candara,8 x 8,0.62,0.0
Century Schoolbook,8 x 8,0.81,0.0
Courier New,8 x 8,0.81,0.0
Futura,8 x 8,0.83,0.0
Georgia,8 x 8,0.74,0.0
PT Sans,8 x 8,0.89,0.0
PT Serif,8 x 8,0.82,0.0


## Response counts for Arial

In [4]:
# Read the response counts (three header rows, first column as the index),
# keep the columns filed under ('latin', 'arial') and drop rows that are entirely empty.
f = (
    pd.read_csv("csv/counts/counts_all.csv", header=[0, 1, 2], index_col=[0])
    [('latin', 'arial')]
    .dropna(how="all")
)

display(f)

Unnamed: 0,"('latn.a', 'latn.e', 'latn.l')","('latn.a', 'latn.e', 'latn.p')","('latn.a', 'latn.e', 'latn.s')","('latn.a', 'latn.e', 'latn.t')","('latn.a', 'latn.e', 'latn.y')","('latn.a', 'latn.e', 'latn.z')","('latn.a', 'latn.l', 'latn.p')","('latn.a', 'latn.l', 'latn.s')","('latn.a', 'latn.l', 'latn.t')","('latn.a', 'latn.l', 'latn.y')",...,"('latn.p', 'latn.s', 'latn.t')","('latn.p', 'latn.s', 'latn.y')","('latn.p', 'latn.s', 'latn.z')","('latn.p', 'latn.t', 'latn.y')","('latn.p', 'latn.t', 'latn.z')","('latn.p', 'latn.y', 'latn.z')","('latn.s', 'latn.t', 'latn.y')","('latn.s', 'latn.t', 'latn.z')","('latn.s', 'latn.y', 'latn.z')","('latn.t', 'latn.y', 'latn.z')"
latn.a,1.0,12.0,7.0,1.0,3.0,2.0,14.0,7.0,56.0,21.0,...,,,,,,,,,,
latn.e,1.0,3.0,12.0,1.0,2.0,1.0,,,,,...,,,,,,,,,,
latn.l,65.0,,,,,,51.0,60.0,8.0,39.0,...,,,,,,,,,,
latn.p,,52.0,,,,,2.0,,,,...,8.0,9.0,48.0,18.0,20.0,23.0,,,,
latn.s,,,48.0,,,,,,,,...,22.0,43.0,4.0,,,,38.0,10.0,7.0,
latn.t,,,,65.0,,,,,3.0,,...,37.0,,,43.0,25.0,,21.0,53.0,,36.0
latn.y,,,,,62.0,,,,,7.0,...,,15.0,,6.0,,7.0,8.0,,53.0,8.0
latn.z,,,,,,64.0,,,,,...,,,15.0,,22.0,37.0,,4.0,7.0,23.0


## Predictions for Arial trials based on Simpson et al.

In [5]:
# Form count predictions regarding triplet trials.
# The predicted share of responses for a character of a triplet is proportional
# to the similarity (from Simpson et al.) of the remaining pair of characters.
# The shares are scaled so that each trial's predicted counts add up to the
# number of responses observed for that trial in `f`.

predictions = {}

for triplet_ in f.columns:
    # column labels are the text form of a tuple of three characters;
    # parse them as a literal instead of evaluating arbitrary code
    triplet = ast.literal_eval(triplet_)
    # for the character at position i, look up the similarity of the other two
    # (row = previous character, column = next character, cyclically)
    pair_sims = pd.Series(
        [Simpson.loc[triplet[(i - 1) % 3], triplet[(i + 1) % 3]] for i in range(len(triplet))],
        index=list(triplet),
        dtype=float,
    )
    # skipna=False keeps a missing similarity visible as NaN predictions
    predictions[triplet_] = pair_sims / pair_sims.sum(skipna=False) * f[triplet_].sum()

# same layout as `f`; characters that are not part of a triplet stay NaN
sim = pd.DataFrame(predictions, index=f.index, columns=f.columns)

display(sim)

Unnamed: 0,"('latn.a', 'latn.e', 'latn.l')","('latn.a', 'latn.e', 'latn.p')","('latn.a', 'latn.e', 'latn.s')","('latn.a', 'latn.e', 'latn.t')","('latn.a', 'latn.e', 'latn.y')","('latn.a', 'latn.e', 'latn.z')","('latn.a', 'latn.l', 'latn.p')","('latn.a', 'latn.l', 'latn.s')","('latn.a', 'latn.l', 'latn.t')","('latn.a', 'latn.l', 'latn.y')",...,"('latn.p', 'latn.s', 'latn.t')","('latn.p', 'latn.s', 'latn.y')","('latn.p', 'latn.s', 'latn.z')","('latn.p', 'latn.t', 'latn.y')","('latn.p', 'latn.t', 'latn.z')","('latn.p', 'latn.y', 'latn.z')","('latn.s', 'latn.t', 'latn.y')","('latn.s', 'latn.t', 'latn.z')","('latn.s', 'latn.y', 'latn.z')","('latn.t', 'latn.y', 'latn.z')"
latn.a,12.870736,20.536398,19.068564,14.179894,15.717466,14.608197,24.462475,17.020785,45.202399,30.469849,...,,,,,,,,,,
latn.e,13.231598,17.37037,18.461837,12.643739,12.275685,15.047541,,,,,...,,,,,,,,,,
latn.l,40.897666,,,,,,27.588235,32.95843,10.748126,18.012563,...,,,,,,,,,,
latn.p,,29.093231,,,,,14.94929,,,,...,21.21372,19.541667,31.606522,20.9375,22.570292,25.504931,,,,
latn.s,,,29.469599,,,,,17.020785,,,...,23.511873,28.945175,17.041304,,,,24.937965,18.338362,23.813996,
latn.t,,,,40.176367,,,,,11.049475,,...,22.274406,,,27.497917,20.793103,,22.111663,31.334052,,27.512766
latn.y,,,,,39.006849,,,,,18.517588,...,,18.513158,,18.564583,,15.461538,19.950372,,26.775322,18.104255
latn.z,,,,,,37.344262,,,,,...,,,18.352174,,23.636605,26.033531,,17.327586,16.410681,21.382979


In [5]:
# ---------------------------------------------------
# Fisher’s Exact test via R
# Setup for calling R from Python through rpy2; scikit-learn's metrics are imported for the scores below.

from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from sklearn import metrics
# activate the Pandas conversion of rpy2
# NOTE(review): this switches the conversion on globally; newer rpy2 releases may prefer a local converter context - confirm against the installed version
pandas2ri.activate()
# import R's "stats" package; its functions are reached through `rstats`
rstats = importr("stats")

def fisher_exact(d1, d2):
    """
    (Using R via rpy2. See the imports above.)

    Performs Fisher’s exact test for testing the null of independence
    of rows and columns in a contingency table with fixed marginals.

    Input:  d1 - iterable of counts forming the first column ("T1") of the table
            d2 - iterable of counts forming the second column ("T2"), same length as d1
    Output: pval - p-value of the test

    NOTE(review): Fisher's test is defined on integer counts; confirm how R
    treats non-integer values (e.g. scaled predictions) before relying on the p-value.
    """

    # build the table directly from the two columns; rows are numbered 0..n-1
    counts = pd.DataFrame({"T1": list(d1), "T2": list(d2)})
    # perform Fisher’s test
    res = rstats.fisher_test(counts)
    # return p-value
    return res.rx("p.value")[0][0]

# Requires `f` (observed counts) and `sim` (predicted counts) from the cells above;
# run those cells first, otherwise this cell fails with a NameError.
alpha = 0.05

# Use Fisher’s exact test to compare the observed responses with predictions
# if the p-value is smaller than 0.05 (alpha) then there is an effect (i.e. the results are dependent on the study)
# Also count percentage and R2 (Cohen’s kappa can’t be used here)

oooo_count = 0
# per trial: the three counts in the order in which the characters appear in the triplet label
f_ = pd.DataFrame(0, columns=f.columns, index=[0, 1, 2])
sim_ = pd.DataFrame(0, columns=f.columns, index=[0, 1, 2])
for triplet_ in f.columns:
    # column labels are the text form of a tuple; parse them as a literal instead of evaluating code
    triplet = list(ast.literal_eval(triplet_))

    f_[triplet_] = list(f[triplet_][triplet].fillna(0))
    sim_[triplet_] = list(sim[triplet_][triplet].fillna(0))
    p_val = fisher_exact(f_[triplet_], sim_[triplet_])
    print(triplet_)
    if p_val < alpha:
        print("Fisher’s exact test: effect exists")
    else:
        print("Fisher’s exact test: no effect")
    # same character with the highest count in observation and prediction?
    if f[triplet_].idxmax() == sim[triplet_].idxmax():
        oooo_count += 1
        print("Identical OOOO")
    else:
        print("Different OOOO")

# position (0, 1 or 2) of the highest count per trial, computed once for all scores
observed_top = f_.idxmax()
predicted_top = sim_.idxmax()

print()
print("Number of trials", len(f.columns))
print("OOOO agreement:", round(100 * oooo_count / len(f.columns), 2), "%")
print("Accuracy score:", round(metrics.accuracy_score(observed_top, predicted_top), 2))
print("Precision score:", round(metrics.precision_score(observed_top, predicted_top, average="macro"), 2))
print("Recall score:", round(metrics.recall_score(observed_top, predicted_top, average="macro"), 2))
print("R2 score:", round(metrics.r2_score(f_.values, sim_.values), 2))




NameError: name 'sim' is not defined