# Effect of typeface design

- typeface comparisons, i.e. comparisons of responses for the same triplet in different typefaces (Fisher’s exact test)
- overall report on shared triplets and on which of them show a typeface design effect
- correlation of similarity matrices that share the same characters in different typefaces

In [7]:
# intro, defaults (shared across notebooks)

import ast
import os

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from chardict import chardict

%matplotlib inline


# ---------------------------------------------------
# Fisher’s Exact test via R

from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
# activate the Pandas conversion of rpy2
pandas2ri.activate()
# import stats package
rstats = importr("stats")

def fisher_exact(counts, columns=("T1", "T2")):
    """
    (Using R via rpy2. See the imports above.)

    Performs Fisher’s exact test for testing the null of independence
    of rows and columns in a contingency table with fixed marginals.

    In addition, a copy of the table is made in which the column with
    the smaller total is rescaled so that both columns add up to the
    same total. That rescaled copy is returned next to the p-value.

    Input:  counts  - DataFrame of counts (the callers in this notebook
                      pass 3 rows x 2 columns)
            columns - names of the two columns whose totals are compared
    Output: (scaled_counts, p_val)
            scaled_counts - copy of `counts` with the smaller column rescaled
            p_val         - p-value of the test
    """

    def scale(counts, total):
        """
        Scale the counts to fit the total provided.
        Avoid rounding errors.
        """

        counts = counts / counts.sum() * total
        counts = counts.round()
        # the last row absorbs whatever rounding left over,
        # so that the column adds up to `total` exactly
        counts.iloc[-1] = total - counts.iloc[:-1].sum()
        return counts

    first, second = columns[0], columns[1]

    # scale counts with the smaller total to the larger total
    nc = counts.copy()
    total1 = nc[first].sum()
    total2 = nc[second].sum()
    # a column that sums to zero cannot be scaled (0 / 0), leave it as it is
    if 0 < total1 < total2:
        nc[first] = scale(nc[first], total2)
    elif 0 < total2 < total1:
        nc[second] = scale(nc[second], total1)

    # perform Fisher’s test
    # NOTE(review): the test is run on the original, unscaled `counts`;
    # the scaled copy is only returned. Confirm that this is intended.
    res = rstats.fisher_test(counts)
    p_val = res.rx("p.value")[0][0]
    return nc, p_val


# ---------------------------------------------------
# helpers

def fix_columns(cols):
    """
    Yield the column keys with the last item converted to a tuple.

    Keys whose first item is "control" are passed through unchanged.
    For all other keys the last item is expected to be the text of a
    Python literal (e.g. "('cyrl.ef', 'cyrl.er', 'cyrl.yu')"); it is
    parsed and replaced by the corresponding tuple.

    Input:  cols - iterable of column keys (tuples)
    Output: generator of tuples
    Raises: ValueError / SyntaxError if the last item is not a literal
    """
    for col in cols:
        if col[0] == "control":
            yield col
        else:
            col = list(col)
            # the text comes from a CSV header: parse it as a literal
            # only, never execute it (this used to be a plain eval)
            col[-1] = tuple(ast.literal_eval(col[-1]))
            yield tuple(col)

def make_compact(d):
    """
    Return a compact version of the DataFrame: rows are labelled
    [0, 1, 2] instead of carrying all characters on the index.

    For every column, the last item of the column key is taken as the
    triplet; the non-missing values of the column are put in the order
    of that triplet and stored by position. Values still missing
    afterwards are replaced by 0.
    """

    compact = pd.DataFrame(index=[0, 1, 2], columns=d.columns)
    for column in d.columns:
        characters = column[-1]  # the triplet closes the column key
        present = d[column].dropna()
        in_triplet_order = pd.Series(present, index=characters)
        # store by position: the character labels are dropped on purpose
        compact[column] = list(in_triplet_order)
    return compact.fillna(0)


# ---------------------------------------------------
# set global properties for plotting

# one dict of settings per matplotlib rc group
font = dict(family="Adapter Mono PE", size="10", weight="medium")
figure = dict(titlesize="10", titleweight="medium")
axes = dict(
    titlesize="10",
    titleweight="medium",
    labelsize="10",
    labelweight="medium",
)
plt.rc("font", **font)
plt.rc("figure", **figure)
plt.rc("axes", **axes)
# colours keyed by box plot part; not used in this cell
# (presumably handed to box plots elsewhere - verify against the callers)
box_colors = {
    "boxes": "Black",
    "whiskers": "0.5",
    "medians": "Black",
    "caps": "0.5",
}

In [8]:
# Get data
# This cell reads three CSV files and creates the notebook-wide variables
# `d`, `demo`, `global_frequencies` and `global_counts`.

# raw data
# every value is read as text (dtype="unicode");
# the first three rows of the file form the three-level column header

d = pd.read_csv("csv/raw-data-preprocessed.csv", index_col=0, header=[0,1,2], dtype="unicode")
d.sort_index(axis=1, inplace=True)

# fix type
# the column was read as text, convert it to integers
# (going through float first, presumably because the text looks like "3.0" - TODO confirm)
d["control", "control", "order"] = d["control", "control", "order"].astype("float").astype("int")

# fix triplet columns (convert them to tuples)
d.columns = pd.MultiIndex.from_tuples(list(fix_columns(d.columns)))

# data frame just for the demographics
# (everything stored under the "control" / "control" column keys)
demo = d["control", "control"]

print("Imported %d rows, %d columns." % d.shape)

# frequencies
# both files share the layout of the raw data: index in the first column, three header rows

global_frequencies = pd.read_csv("csv/frequencies/frequencies_all.csv", index_col=0, header=[0, 1, 2])
global_counts = pd.read_csv("csv/counts/counts_all.csv", index_col=0, header=[0, 1, 2])

# Warning: unlike in `d`, the third level (2) of these columns stays a string:
# "('cyrl.ef', 'cyrl.er', 'cyrl.yu')", not a tuple of strings: ('cyrl.ef', 'cyrl.er', 'cyrl.yu'),
# because fix_columns is not applied to these two frames

print("Imported %d frequencies and %d counts." % (len(global_frequencies), len(global_counts)))

# Get nice names of typefaces
# The typeface key (second column level) is title-cased, then the
# replacements below are applied one after another, in this order.
replacements = {
  "-": " ",
  "Mt": "MT",
  "Pt": "PT",
  "Ui": "UI",
  "Itf": "ITF",
}
all_typefaces = {}
for _, t, _ in global_frequencies.columns:
  t_ = t.title()
  for old, new in replacements.items():
    t_ = t_.replace(old, new)
  # a typeface appears once per triplet: keep the first entry only
  all_typefaces.setdefault(t, t_)

Imported 1787 rows, 1579 columns.
Imported 96 frequencies and 96 counts.


In [9]:
# Find triplets that occurred in more than one typeface
#
# typefaces_triplets[script][(t1, t2)] lists the triplets shared by the
# typefaces t1 and t2. all_triplets gets one row per such pair and one
# column per shared triplet: False where the pair shares the triplet,
# missing otherwise.

typefaces_triplets = {}
index_cols = ["script", "T1", "T2"]
all_triplets = pd.DataFrame(index=[], columns=index_cols)
for script in ["cyrillic", "devanagari", "latin"]:
    typefaces_triplets[script] = {}
    f = global_frequencies[script]
    f.columns = f.columns.remove_unused_levels()
    typefaces = sorted(f.columns.levels[0])
    # go through every unordered pair of typefaces once
    for i, t1 in enumerate(typefaces):
        triplets1 = set(f[t1].columns)
        for t2 in typefaces[i + 1:]:
            triplets2 = set(f[t2].columns)
            # sorted: the order of a set is not stable between runs
            overlap = sorted(triplets1 & triplets2)
            if overlap:
                typefaces_triplets[script][(t1, t2)] = overlap
                # append a row; its own name keeps the loop index `i` intact
                row = len(all_triplets)
                all_triplets.loc[row, index_cols] = (script, t1, t2)
                all_triplets.loc[row, overlap] = False
all_triplets = all_triplets.set_index(index_cols)

In [16]:
# Comparison of trial responses for different typefaces

# For every pair of typefaces that shares triplets (see typefaces_triplets),
# the response counts for each shared triplet are compared and one CSV
# report per triplet is written to csv/typeface-comparisons/<script>/.
#
# NOTE(review): earlier notes in this cell said that only sets with 5 or
# more shared triplets (found only in Latin) are used. No such filter is
# applied here: every shared triplet in all three scripts is tested.
# Confirm which of the two is intended.

# using Fisher’s exact test to compare results between two different groups
# if the p-value is smaller than 0.05 then there is an effect (i.e. the results are dependent on the typeface)
# requires imports and activation of rpy2 (see the first cell)

alpha = 0.05  # significance threshold

# make dirs (once; exist_ok avoids the separate existence check)
os.makedirs("csv/typeface-comparisons/", exist_ok=True)

for script in ["cyrillic", "devanagari", "latin"]:

    # report comparisons
    for t1, t2 in typefaces_triplets[script]:
        # make MultiIndex: triplet | typefaces
        for triplet in typefaces_triplets[script][(t1, t2)]:
            # the triplet is the text of a tuple: parse it as a literal
            # only, never execute it (this used to be a plain eval)
            triplet_ = sorted(ast.literal_eval(triplet))
            cols = pd.MultiIndex.from_tuples([(t1, "original"), (t1, "scaled"), (t2, "original"), (t2, "scaled")])
            report = pd.DataFrame(columns=cols, index=[*triplet_, "p-value"])
            # get response counts for both typefaces for a triplet
            # simplify the column names so R does not have a problem
            counts = pd.DataFrame(columns=["T1", "T2"], index=triplet_)
            # get only the values for the three characters from the triplet
            # (the assignment aligns on the index of `counts`)
            counts["T1"] = global_counts[script, t1, triplet]
            counts["T2"] = global_counts[script, t2, triplet]
            # fill NAs with 0 so that the test can be calculated
            counts = counts.fillna(0)
            # run FE
            new_counts, p_val = fisher_exact(counts, columns=["T1", "T2"])
            report.loc[triplet_, (t1, "original")] = counts["T1"]
            report.loc[triplet_, (t1, "scaled")] = new_counts["T1"]
            report.loc[triplet_, (t2, "original")] = counts["T2"]
            report.loc[triplet_, (t2, "scaled")] = new_counts["T2"]
            report.loc["p-value", [t1, t2]] = round(p_val, 3)

            # remember whether this triplet showed an effect for this pair
            all_triplets.loc[(script, t1, t2), triplet] = (p_val < alpha)

            path = os.path.join("csv", "typeface-comparisons", script, "%s__%s__%s.csv" % (t1, t2, "_".join(triplet_)))
            dirs = os.path.dirname(path)
            os.makedirs(dirs, exist_ok=True)
            report.to_csv(path)

In [11]:
# Overall report of all triplets compared and those that had an effect

# per row (script, T1, T2) of all_triplets:
# count() gives the number of non-missing cells, sum() adds the cells up
shared = all_triplets.count(axis=1)
with_effect = all_triplets.sum(axis=1)
percentage = 100 * with_effect / shared

overall_report = pd.DataFrame(index=all_triplets.index, columns=[])
overall_report["Shared triplets"] = shared
overall_report["FET (n)"] = with_effect
# the percentage is stored as text with one decimal place
overall_report["FET (%)"] = percentage.apply(lambda x: f"{x:.1f} %")
display(overall_report)
overall_report.to_csv("csv/typeface-comparisons_overall.csv")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Shared triplets,FET (n),FET (%)
script,T1,T2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cyrillic,century-schoolbook,courier-new,1,1,100.0 %
cyrillic,pt-sans,pt-serif,1,0,0.0 %
devanagari,devanagari-mt,nirmala-ui,4,0,0.0 %
devanagari,lohit-devanagari,murty-hindi,1,1,100.0 %
latin,arial,times-new-roman,1,0,0.0 %
latin,calibri,candara,4,0,0.0 %
latin,calibri,century-schoolbook,56,5,8.9 %
latin,calibri,georgia,4,1,25.0 %
latin,calibri,verdana,1,0,0.0 %
latin,cambria,futura,4,0,0.0 %


In [12]:
# Correlation of similarity matrices where the selections of characters studied are the same
# Candara vs Georgia
# Futura vs PT Serif
# Each name is used further down in this cell to build the path
# csv/similarity-matrices/latin/<name>.csv.
# NOTE(review): `typefaces` is also assigned in the shared-triplets cell,
# where it is a sorted list of names; here it is a list of pairs.
typefaces = [("candara", "georgia"), ("futura", "pt-serif")]

def flatten_sm(data):
  """
  Flatten similarity matrix
  before correlation test

  Collects the cells above the main diagonal, row by row, so that
  each pair of characters is taken once and the diagonal is left out.
  The size is taken from the matrix itself (it used to be fixed at 8).

  Input:  data - DataFrame holding the (square) similarity matrix
  Output: list of the n * (n - 1) / 2 values above the diagonal
  """

  # for a non-square frame only the leading square part is used
  size = min(data.shape)
  return [data.iloc[i, j] for i in range(size) for j in range(i + 1, size)]

def load_similarity_matrix(typeface):
  """
  Read the similarity matrix of a Latin typeface from its CSV file.
  The first two columns form the index, the first two rows the
  header; all values are converted to float.
  """
  return pd.read_csv(f"csv/similarity-matrices/latin/{typeface}.csv",
                     index_col=[0, 1],
                     header=[0, 1]
                     ).astype(float)

for t1, t2 in typefaces:
  td1 = load_similarity_matrix(t1)
  td2 = load_similarity_matrix(t2)
  # equals() also copes with labels of different length,
  # where an element-wise comparison would raise
  same_labels = td1.index.equals(td2.index) and td1.columns.equals(td2.columns)
  if same_labels:
    print(f"Shared characters for '{t1}' and '{t2}':", [c for _, c in td1.columns])
  else:
    print("Error: indexes do not match!")
  display(td1)
  display(td2)
  if not same_labels:
    # the cells would belong to different character pairs,
    # a correlation of them would be meaningless
    continue
  flat1 = flatten_sm(td1)
  flat2 = flatten_sm(td2)
  correlation = st.spearmanr(flat1, flat2, alternative="two-sided")
  print(f"Correlation of similarity matrices for '{t1}' and '{t2}'")
  print("- spearman r:", correlation.statistic.round(3))
  print("- spearman p-val:", correlation.pvalue.round(3))

Shared characters for 'candara' and 'georgia': ['b', 'd', 'i', 'j', 'k', 'l', 'v', 'y']


Unnamed: 0_level_0,Unnamed: 1_level_0,latn.b,latn.d,latn.i,latn.j,latn.k,latn.l,latn.v,latn.y
Unnamed: 0_level_1,Unnamed: 1_level_1,b,d,i,j,k,l,v,y
latn.b,b,,0.97,0.13,0.1,0.58,0.28,0.19,0.19
latn.d,d,0.97,,0.13,0.13,0.53,0.25,0.18,0.2
latn.i,i,0.13,0.13,,0.92,0.12,0.75,0.16,0.1
latn.j,j,0.1,0.13,0.92,,0.1,0.63,0.14,0.34
latn.k,k,0.58,0.53,0.12,0.1,,0.23,0.48,0.4
latn.l,l,0.28,0.25,0.75,0.63,0.23,,0.16,0.08
latn.v,v,0.19,0.18,0.16,0.14,0.48,0.16,,0.87
latn.y,y,0.19,0.2,0.1,0.34,0.4,0.08,0.87,


Unnamed: 0_level_0,Unnamed: 1_level_0,latn.b,latn.d,latn.i,latn.j,latn.k,latn.l,latn.v,latn.y
Unnamed: 0_level_1,Unnamed: 1_level_1,b,d,i,j,k,l,v,y
latn.b,b,,0.98,0.13,0.15,0.6,0.47,0.13,0.1
latn.d,d,0.98,,0.2,0.13,0.58,0.48,0.13,0.1
latn.i,i,0.13,0.2,,0.83,0.13,0.61,0.2,0.09
latn.j,j,0.15,0.13,0.83,,0.08,0.36,0.09,0.53
latn.k,k,0.6,0.58,0.13,0.08,,0.46,0.4,0.32
latn.l,l,0.47,0.48,0.61,0.36,0.46,,0.12,0.08
latn.v,v,0.13,0.13,0.2,0.09,0.4,0.12,,0.85
latn.y,y,0.1,0.1,0.09,0.53,0.32,0.08,0.85,


Correlation of similarity matrices for 'candara' and 'georgia'
- spearman r: 0.85
- spearman p-val: 0.0
Shared characters for 'futura' and 'pt-serif': ['g', 'h', 'm', 'n', 'q', 'r', 'y', 'z']


Unnamed: 0_level_0,Unnamed: 1_level_0,latn.g,latn.h,latn.m,latn.n,latn.q,latn.r,latn.y,latn.z
Unnamed: 0_level_1,Unnamed: 1_level_1,g,h,m,n,q,r,y,z
latn.g,g,,0.3,0.2,0.17,0.95,0.19,0.44,0.03
latn.h,h,0.3,,0.58,0.69,0.39,0.5,0.16,0.08
latn.m,m,0.2,0.58,,0.91,0.18,0.68,0.04,0.23
latn.n,n,0.17,0.69,0.91,,0.18,0.74,0.02,0.25
latn.q,q,0.95,0.39,0.18,0.18,,0.18,0.46,0.03
latn.r,r,0.19,0.5,0.68,0.74,0.18,,0.06,0.24
latn.y,y,0.44,0.16,0.04,0.02,0.46,0.06,,0.47
latn.z,z,0.03,0.08,0.23,0.25,0.03,0.24,0.47,


Unnamed: 0_level_0,Unnamed: 1_level_0,latn.g,latn.h,latn.m,latn.n,latn.q,latn.r,latn.y,latn.z
Unnamed: 0_level_1,Unnamed: 1_level_1,g,h,m,n,q,r,y,z
latn.g,g,,0.16,0.14,0.11,0.54,0.12,0.34,0.06
latn.h,h,0.16,,0.7,0.76,0.4,0.46,0.23,0.15
latn.m,m,0.14,0.7,,0.93,0.27,0.58,0.13,0.21
latn.n,n,0.11,0.76,0.93,,0.29,0.6,0.17,0.26
latn.q,q,0.54,0.4,0.27,0.29,,0.23,0.39,0.06
latn.r,r,0.12,0.46,0.58,0.6,0.23,,0.24,0.33
latn.y,y,0.34,0.23,0.13,0.17,0.39,0.24,,0.48
latn.z,z,0.06,0.15,0.21,0.26,0.06,0.33,0.48,


Correlation of similarity matrices for 'futura' and 'pt-serif'
- spearman r: 0.861
- spearman p-val: 0.0
