# Statistical comparisons

- comparison of responses for different typefaces (Fisher’s Exact test)
- comparison between different groups of participants depending on their design skills and nativity/fluency (Cohen kappa)
- comparison to other studies (Simpson et al.)

In [1]:
# intro, defaults (shared across notebooks)

import os
import sys

import pandas as pd
from pandas import DataFrame, Series, MultiIndex
import numpy as np
import scipy.stats as st
from sklearn import metrics
import matplotlib.pyplot as plt
import matplotlib
from chardict import chardict
import drawBot as db
from IPython.display import Image
%matplotlib inline


# ---------------------------------------------------
# Fisher’s Exact test via R

from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
# activate the Pandas conversion of rpy2
pandas2ri.activate()
# import stats package
rstats = importr("stats")

def fisher_exact(df):
    """
    (Using R via rpy2. See the imports above.)
    
    Performs Fisher’s exact test for testing the null of independence
    of rows and columns in a contingency table with fixed marginals.
    
    Input:  df - DataFrame 2x3
    Output: pval - p-value of the test
    """
    
    # convert DataFrame to R DataFrame
    #rdf = pandas2ri.py2ri(df)
    # perform Fisher’s test
    res = rstats.fisher_test(df)
    # return p-value
    return res.rx("p.value")[0][0]


# ---------------------------------------------------
# helpers

def fix_columns(cols):
    for col in cols:
        if col[0] == "control":
            yield col
        else:
            col = list(col)
            col[-1] = tuple(eval(col[-1]))
            yield tuple(col)

def make_compact(d):
    """
    Make DataFrame compact = use [0, 1, 2]
    instead of all characters on the index
    """

    d_compact = pd.DataFrame(columns=d.columns, index=[0, 1, 2])
    for col in d.columns:
        triplet = col[-1] # last item in the index is a triplet
        col_data = pd.Series(d[col].dropna(), index=triplet)
        d_compact[col] = list(col_data) # past regardless the index
    d = d_compact.fillna(0)
    return d


# ---------------------------------------------------
# set global properties for plotting

font = {"family":"Adapter Mono PE", "size":"10", "weight":"medium"}
figure = {"titlesize":"10","titleweight":"medium"}
axes = {"titlesize":"10", "titleweight":"medium", "labelsize":"10", "labelweight":"medium"}
plt.rc("font", **font)
plt.rc("figure", **figure)
plt.rc("axes", **axes)
box_colors = dict(boxes="Black", whiskers="0.5", medians="Black", caps="0.5")

In [2]:
# Get data

# raw data

d = pd.read_csv("csv/raw-data-preprocessed.csv", index_col=0, header=[0,1,2], dtype="unicode")
non_control_columns = [col for col in d.columns if col[0] != "control"]

# fix type
d["control", "control", "order"] = d["control", "control", "order"].astype("float").astype("int")

# fix triplet columns (convert them to tuples)
d.columns = pd.MultiIndex.from_tuples(list(fix_columns(d.columns)))

# data frame just for the demographics
demo = d["control", "control"]

print("Imported %d rows, %d columns." % d.shape)

# frequencies

global_frequencies = pd.read_csv("csv/frequencies/frequencies_all.csv", index_col=0, header=[0, 1, 2])
global_counts = pd.read_csv("csv/counts/counts_all.csv", index_col=0, header=[0, 1, 2])

# Warning: the third level (2) of the columns is now strings: '('cyrl.ef', 'cyrl.er', 'cyrl.yu')', not tuples of strings: ('cyrl.ef', 'cyrl.er', 'cyrl.yu')

print("Imported %d frequencies and %d counts." % (len(global_frequencies), len(global_counts)))


Imported 1788 rows, 1579 columns.
Imported 96 frequencies and 96 counts.


  demo = d["control", "control"]


In [8]:
# Comparison of responses for different typefaces

# get a list of triplets shared by at least two typefaces
# using only sets with 5 or more triplets
# those exist only in Latin (4 typefaces)
# some smaller sets are in Devanagari and Cyrillic, not using those

# using Fisher’s exact test to compare results between two different groups
# if the p-value is smaller than 0.05 then there is an effect (i.e. the results are dependent on the typeface)
# requires imports and activation of rpy2 (see the first cell)

def scale_counts(counts, total):
    """
    Scale the counts to fit the total provided.
    Avoid rounding errors.
    """
    
    counts = counts / counts.sum() * total
    counts = counts.round()
    counts.iloc[2] = total - (counts.iloc[0] + counts.iloc[1])
    return counts


for script in ["cyrillic", "devanagari", "latin"]:
    # find typefaces with shared triplets
    typefaces_triplets = {}
    f = global_frequencies[script]
    f.columns = f.columns.remove_unused_levels()
    typefaces = sorted(f.columns.levels[0])
    for i, t1 in enumerate(typefaces):
        triplets1 = set(f[t1].columns)
        for j in range(i+1, len(typefaces)):
            t2 = typefaces[j]
            triplets2 = set(f[t2].columns)
            overlap = list(triplets1 & triplets2)
            if len(overlap) > 0:
                typefaces_triplets[(t1, t2)] = overlap

    # make dirs
    if not os.path.exists("csv/typeface-comparisons/"):
        os.makedirs("csv/typeface-comparisons/")

    # report comparisons
    for t1, t2 in typefaces_triplets:
        # make MultiIndex: triplet | typefaces
        for triplet in typefaces_triplets[(t1, t2)]:
            triplet_ = eval(triplet)
            report = pd.DataFrame(columns=[t1, t2], index=[*triplet_, "p-value", "effect"])
            # get response counts for both typefaces for a triplet
            # simplify the column names so R does not have a problem
            counts = pd.DataFrame(columns=["T1","T2"], index=triplet_)
            # fill NAs with 0 so we FE test can be calculated
            # get only the values for the three characters from the triplet
            counts["T1"] = global_counts[script, t1, triplet]
            counts["T2"] = global_counts[script, t2, triplet]
            counts = counts.fillna(0)
            # scale counts with the smaller total to the larger total
            total1 = counts["T1"].sum()
            total2 = counts["T2"].sum()
            if total1 < total2:
                counts["T1"] = scale_counts(counts["T1"], total2)
            elif total1 > total2:
                counts["T2"] = scale_counts(counts["T2"], total1)
            # run FE
            p_val = fisher_exact(counts)
            report[t1][0:3] = counts["T1"] 
            report[t2][0:3] = counts["T2"] 
            report[t1]["p-value"] = round(p_val, 3)
            report[t2]["p-value"] = round(p_val, 3)
            report[t1]["effect"] = (p_val < 0.05) # alpha
            report[t2]["effect"] = (p_val < 0.05) # alpha

            print("Saving report for comparison", t1, "vs", t2, triplet)
            path = os.path.join("csv", "typeface-comparisons", script, "%s__%s__%s.csv" % (t1, t2, "_".join(triplet_)))
            dirs = os.path.dirname(path)
            if not os.path.exists(dirs):
                os.makedirs(dirs)
            report.to_csv(path)

ValueError: Length mismatch: Expected axis has 5 elements, new values have 3 elements

In [4]:
# frequencies & distribution of the most popular response (todo refactor)

# functions

def get_frequencies_typeface(d, index=None, mode="relative", sort=False):
    """
    Find relative frequencies for characters in each triplet in DF.
    
    Parameters:
        index: is used to pass a shared index across multiple typefaces
        mode: indicates whether the counts should be normalized (i.e. relative frequencies)
        sort: whether to return a sorted Series instead of matching responses to characters
    Return:
        DF with frequencies.
    """
    
    # the index need to be built first
    # so value_counts below fall to the right slot
    if sort:
        index = [0, 1, 2]
    elif index is None:
        index = []
        for triplet in d.columns:
            index += list(triplet)
        index = sorted(list(set(index)))
    # count frequencies
    frequencies = DataFrame(index=index, columns=d.columns)
    if mode == "relative":
        for triplet in d.columns:
            # get relative frequencies for each character in the triplet
            if sort:
                value_counts = d[triplet].value_counts(dropna=True, normalize=True, ascending=True)
                value_counts.index = index[:len(value_counts)]
            else:
                value_counts = d[triplet].value_counts(dropna=True, normalize=True)
            if not value_counts.empty:
                frequencies[triplet] = value_counts
    elif mode == "absolute":
        for triplet in d.columns:
            # get frequencies for each character in the triplet, do not normalize
            if sort:
                value_counts = d[triplet].value_counts(dropna=True, ascending=True)
                value_counts.index = index[:len(value_counts)]
            else:
                value_counts = d[triplet].value_counts(dropna=True)
            if not value_counts.empty:
                frequencies[triplet] = value_counts
        
    return frequencies

def get_frequencies_script(d, mode="relative", sort=False):
    """
    Find relative frequencies for each typeface in DF.
    
    Parameters:
        mode: indicates whether the counts should be normalized (i.e. relative frequencies)
    Return:
        DF with frequencies.
    """
    
    # the index need to be built first
    # so value_counts below fall to the right slot
    if sort:
        index = [0, 1, 2]
    else:
        index = []
        for triplet in d.columns.levels[1]:
            index += list(triplet)
        index = sorted(list(set(index)))
    frequencies = DataFrame(index=index, columns=d.columns)
    for typeface in d.columns.levels[0]:
        frequencies[typeface] = get_frequencies_typeface(d[typeface], index=index, mode=mode, sort=sort)
        
    return frequencies

def get_frequencies(d, mode="relative", sort=False):
    """
    Get relative frequencies for all scripts, everything.
    
    Parameters:
        mode: indicates whether the counts should be normalized (i.e. relative frequencies)
    """
    
    d = d[["cyrillic", "devanagari", "latin"]]
    d.columns = d.columns.remove_unused_levels()
    # in case the DF contains multiple scripts, flatten it
    if len(d.columns.levels) > 2:
        backup_cols = d.columns
        dt = d
        dt.columns = dt.columns.droplevel(level=0).unique()
        frequencies = get_frequencies_script(dt, mode=mode, sort=sort)
        frequencies.columns = backup_cols
    else:
        frequencies = get_frequencies_script(d, mode=mode, sort=sort)
    return frequencies

def as_classifier(d):
    frequencies_counts = get_frequencies(d, mode="absolute").reindex(sorted_chars)
    majority = get_majority(frequencies_counts)
    majority = make_compact(majority)
    length = np.dot(*majority.values.shape)
    majority = majority.T.values.reshape(length).astype(float).astype(int)
    return majority

def get_condition(column, value, script=None):
    """
    Compile a condition to filter out dataset based on particular condition/value.
    """
    
    if value == "*":
        # show everything
        condition = True
    if value == "Designer":
        # special case in design skills
        condition = (d["control", "control", column] != "Non-designer") & (d["control", "control", column] != "Letter designer")
    elif value.startswith("!"):
        # negative case
        condition = (d[("control", "control", column)] != value[1:])
    else:
        # positive case
        condition = (d[("control", "control", column)] == value)
    # filter script
    if script is not None:
        condition &= (d[("control", "control", "script")] == script)
    
    return condition

In [5]:


results = pd.DataFrame(columns=["Script", "Design categories", "Trials", "Percentage", "Kappa"])
sorted_scripts = ["cyrillic", "devanagari", "latin"]

def compare_constrasting_groups_kappa(labels, column, values):
    """
    Take two distributions of two contrasting groups from
    the data selected based on the (control, control, column) columns,
    convert them to classifications, and use Cohen’s kappa to compare them.
    """

    print()
    print("##", " vs. ".join(labels))
    if len(values) != 2:
        print("Error.Need precisely two groups to compare")
        return
    for script in sorted_scripts:
        shared_cols = d.columns
        dt = []
        for i, (label, value) in enumerate(zip(labels, values)):
            condition = get_condition(column, value, script)
            dtt = d[condition]#.dropna(axis="columns", how="all")
            dt.append(dtt)
            dtt_counts = get_frequencies(dtt, mode="absolute")
            # find trials shared by all datasets
            # with at least 5 participants
            shared_cols2 = []
            for col in shared_cols:
                if col in dtt.columns and col in dtt_counts and (dtt_counts[col].sum() > 5):
                    shared_cols2.append(col)
            shared_cols = shared_cols2
        ct = []
        counts = []
        if len(shared_cols) > 0:
            for dtt in dt:
                counts.append(len(dtt.index))
                x = pd.DataFrame(dtt, columns=shared_cols)
                ct.append(as_classifier(x))
            categories = "%s (%d) vs. %s (%d)" % (labels[0], counts[0], labels[1], counts[1])
            percentage = 100 * (ct[0] & ct[1]).sum() / (len(ct[0]) / 3)
            percentage = "%.2f %%" % percentage
            kappa = round(metrics.cohen_kappa_score(*ct), 3)
            print(script, categories, len(shared_cols), "->", percentage, kappa)
            row = pd.DataFrame([], columns=results.columns, index=[0])
            row["Script"] = script
            row["Design categories"] = categories
            row["Trials"] = len(shared_cols)
            row["Percentage"] = percentage
            row["Kappa"] = kappa
            results.append(row, ignore_index=True)
        else:
            print(script, "insufficient number of trials with more than 5 participants in each group.")


# ---------------------------------------------------
compare_constrasting_groups_kappa(["Non-designers", "Designers"], "design skills", ["Non-designer", "Designer"])
compare_constrasting_groups_kappa(["Non-designers", "Letter designers"], "design skills", ["Non-designer", "Letter designer"])
compare_constrasting_groups_kappa(["Designers", "Letter designers"], "design skills", ["Designer", "Letter designer"])

compare_constrasting_groups_kappa(["Native", "Non-native"], "native in script", ["True", "False"])
compare_constrasting_groups_kappa(["Fluent", "Non-fluent"], "fluent in script", ["True", "False"])


## Non-designers vs. Designers


KeyError: "['cyrillic' 'devanagari' 'latin'] not in index"

In [None]:
display(results)

Unnamed: 0,Script,Design categories,Trials,Percentage,Kappa
