In [5]:
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import statsmodels.stats.multicomp
from itertools import combinations

# Processed data file; per its name and the message printed below, outliers
# have already been replaced in it.
datafilename = "data_processed-outliers-replaced.csv"
d = pd.read_csv(datafilename)
print("Loading", len(d), "lines of data (outliers replaced).")

# Stats table read from a second CSV; `st` is the frame handed to anova()
# in the cells further down.
resultsfilename = "data_stats-outliers-replaced.csv"
st = pd.read_csv(resultsfilename)
print("Loading", len(st), "lines of stats.")

# ------------------------------------------------------
# functions to add eta squared and omega squared
# to the ANOVA summary table

def eta_squared(aov):
    """Add an ``eta_sq`` (eta squared) column to an ANOVA summary table.

    Every row except the last receives ``sum_sq / sum(all sum_sq)``.  The
    last row (assumed to be the residual row) is left as NaN.

    Parameters
    ----------
    aov : pandas.DataFrame
        ANOVA table with a ``sum_sq`` column.  Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``aov`` object, now carrying the ``eta_sq`` column.
    """
    sum_sq = aov['sum_sq']
    # skipna=False propagates NaN exactly like the builtin sum() would.
    total_ss = sum_sq.sum(skipna=False)
    # The right-hand side lacks the last row; index alignment on assignment
    # fills that row with NaN, so no placeholder column is needed.
    aov['eta_sq'] = sum_sq.iloc[:-1] / total_ss
    return aov

def omega_squared(aov):
    """Add an ``omega_sq`` (omega squared) column to an ANOVA summary table.

    With ``mse = sum_sq[last] / df[last]`` (the last row is assumed to be the
    residual row), every other row receives
    ``(sum_sq - df * mse) / (sum(all sum_sq) + mse)``.  The last row is left
    as NaN.

    Parameters
    ----------
    aov : pandas.DataFrame
        ANOVA table with ``sum_sq`` and ``df`` columns.  Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``aov`` object, now carrying the ``omega_sq`` column.
    """
    sum_sq = aov['sum_sq']
    dof = aov['df']
    # Use .iloc for the positional lookup: `series[-1]` on a label-indexed
    # Series is deprecated in pandas 2.x and is a label lookup (KeyError)
    # in pandas 3.
    mse = sum_sq.iloc[-1] / dof.iloc[-1]
    # skipna=False propagates NaN exactly like the builtin sum() would.
    total_ss = sum_sq.sum(skipna=False)
    # The right-hand side lacks the last row; index alignment on assignment
    # fills that row with NaN.
    aov['omega_sq'] = (sum_sq.iloc[:-1] - dof.iloc[:-1] * mse) / (total_ss + mse)
    return aov

Loading 15768 lines of data (outliers replaced).
Loading 876 lines of stats.


In [7]:
# Column names used as categorical factors (each is wrapped in C(...) by anova()).
factors = ["StudyID", "Training", "Font", "Firstfont"]
# Dependent-variable column(s); one model is fitted per metric.
metrics = ["RTnorm"]

def anova(st, factors, metrics, no):
    """Run and print a factorial ANOVA for every ``no``-sized factor combination.

    For each combination of ``no`` names from ``factors`` and each metric, an
    OLS model ``metric ~ C(f1) * C(f2) * ...`` is fitted on ``st``, a Type II
    ANOVA table is computed with ``anova_lm``, effect-size columns are added
    by ``eta_squared`` and ``omega_squared``, and the table plus the overall
    model F statistic are printed/displayed.

    Parameters
    ----------
    st : pandas.DataFrame
        Data containing the factor and metric columns.
    factors : sequence of str
        Candidate factor column names.
    metrics : sequence of str
        Dependent-variable column names.
    no : int
        Number of factors per model (1 or more).

    Returns
    -------
    None
        Results are only printed/displayed.  ``display`` is expected to be
        provided by the IPython/Jupyter session.
    """
    for combo in combinations(factors, no):
        # combinations() only repeats a name when `factors` itself contains
        # duplicates; skip such combinations.  Checking the whole tuple (not
        # just the first two entries) also makes no == 1 work.
        if len(set(combo)) != len(combo):
            continue
        # The right-hand side does not depend on the metric, so build it once.
        rhs = " * ".join("C(%s)" % name for name in combo)
        for metric in metrics:
            formula = metric + " ~ " + rhs
            model = ols(formula, st).fit()
            aov_table = anova_lm(model, typ=2)
            # Both helpers add their column to aov_table in place.
            eta_squared(aov_table)
            omega_squared(aov_table)
            print()
            print("# %s (metric: %s)" % (combo, metric))
            print()
            display(aov_table)
            # overall model significance
            print("Overall model F(%d, %d) = %.3f, p = %.4f"
                  % (model.df_model, model.df_resid, model.fvalue, model.f_pvalue))
            print()

# Run every 2-factor combination, print a separator line, then every
# 3-factor combination.
# NOTE(review): the saved output of this cell is a bare AssertionError with no
# traceback; the cause is not visible here -- TODO re-run and confirm.
anova(st, factors, metrics, 2)
print(30 * "_")
anova(st, factors, metrics, 3)

AssertionError: 

In [9]:
# Two-factor ANOVA (StudyID x isDesigner) on AUC and Correct, restricted to
# the rows of `st` whose Type is "lexical".
anova(st[st["Type"] == "lexical"], ["StudyID", "isDesigner"], ["AUC", "Correct"], 2)


# ('StudyID', 'isDesigner') (metric: AUC)



Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq,omega_sq
C(StudyID),0.06501,1.0,11.372588,0.000818,0.026854,0.024435
C(isDesigner),0.032483,1.0,5.682478,0.017601,0.013418,0.011031
C(StudyID):C(isDesigner),0.036852,1.0,6.446693,0.011494,0.015222,0.012831
Residual,2.28654,400.0,,,,


Overall model F(3, 400) = 7.919, p = 0.0000


# ('StudyID', 'isDesigner') (metric: Correct)



Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq,omega_sq
C(StudyID),0.081044,1.0,7.082991,0.008095,0.016296,0.013963
C(isDesigner),0.267816,1.0,23.406214,2e-06,0.053852,0.051433
C(StudyID):C(isDesigner),0.047471,1.0,4.148769,0.042323,0.009545,0.007228
Residual,4.57683,400.0,,,,


Overall model F(3, 400) = 11.682, p = 0.0000



In [4]:
from numpy.random import normal
import pyvttbl as pt
from collections import namedtuple

# Build a simulated data set: for each of N subjects, one normally
# distributed `rt` draw per condition, stored in a pyvttbl DataFrame.
N = 40                    # number of simulated subjects
P = ["noise","quiet"]     # condition labels
rts = [998,511]           # mean of the normal draw for each condition, in the order of P
mus = rts*N               # rts repeated N times; only mus[0] and mus[1] are read below

Sub = namedtuple('Sub', ['Sub_id', 'rt','condition'])
df = pt.DataFrame()
# range() instead of xrange(): xrange does not exist on Python 3, which the
# rest of this notebook uses.
for subid in range(0,N):
    for i,condition in enumerate(P):
        # One row per (subject, condition); subject ids start at 1.
        rt_sample = normal(mus[i], scale=112., size=1)[0]
        df.insert(Sub(subid+1, rt_sample, condition)._asdict())

ModuleNotFoundError: No module named 'base'

In [None]:
# Unexecuted cell: only prints an empty line.
print()