# 1-Way ANOVA by hand (from scratch)

In [1]:
import pandas as pd

In [2]:
# Raw scores for the three companies; the groups are deliberately of
# unequal size (7, 6 and 8 observations respectively).
A, B, C = (
    [12.6, 12, 11.8, 11.9, 13, 12.5, 14],
    [10, 10.2, 10, 12, 14, 13],
    [10.1, 13, 13.4, 12.9, 8.9, 10.7, 13.6, 12],
)

In [3]:
# Flatten the three groups into one list of scores, and build a parallel
# list holding the company label of every score (same order, same length).
all_scores = [*A, *B, *C]
company_names = [
    label
    for label, scores in zip('ABC', (A, B, C))
    for _ in scores
]

In [4]:
# Long-format frame: one row per observation with its company label and score.
data = pd.DataFrame(dict(company=company_names, score=all_scores))

In [5]:
# Show the assembled frame (one row per observation: company label and score).
data

Unnamed: 0,company,score
0,A,12.6
1,A,12.0
2,A,11.8
3,A,11.9
4,A,13.0
5,A,12.5
6,A,14.0
7,B,10.0
8,B,10.2
9,B,10.0


In [6]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [8]:
# Reference result: fit score as a function of the categorical company factor
# with statsmodels, then print its ANOVA table (type 1 is the default) so the
# by-hand computation in the following cells can be checked against it.
lm = ols(formula='score ~ company', data=data).fit()
table = sm.stats.anova_lm(lm, typ=1)
print(table)

            df     sum_sq   mean_sq         F    PR(>F)
company    2.0   3.606905  1.803452  0.821297  0.455683
Residual  18.0  39.525476  2.195860       NaN       NaN


In [None]:
# compute overall mean
# Grand mean of every score, ignoring which company it belongs to.
overall_mean = data.loc[:, 'score'].mean()
overall_mean

In [None]:
# compute Sum of Squares Total
# Keep the grand mean as a constant column so it lines up row-by-row with the
# scores; a later cell reads this column again.
data['overall_mean'] = overall_mean
# Total variation: squared distance of every score from the grand mean.
ss_total = sum(data['score'].sub(data['overall_mean']).pow(2))
ss_total

In [None]:
# compute group means
# Aggregate ONLY the 'score' column. By this point `data` also carries the
# constant 'overall_mean' column; averaging the whole frame would put an
# 'overall_mean' column into `group_means` as well, and the later merge would
# then rename the clashing columns to 'overall_mean_x' / 'overall_mean_y',
# breaking every subsequent data['overall_mean'] lookup.
group_means = (
    data.groupby('company')[['score']]
    .mean()
    .rename(columns={'score': 'group_mean'})
)
group_means

In [None]:
# add group means to the original data frame
# (the overall mean is already present as the 'overall_mean' column)
# - Merge only the 'group_mean' column so no other column of `group_means`
#   can collide with a column of `data` and get an _x/_y suffix.
# - Drop any 'group_mean' left over from a previous run first, so re-running
#   this cell is idempotent instead of producing group_mean_x / group_mean_y.
data = (
    data.drop(columns=['group_mean'], errors='ignore')
    .merge(group_means[['group_mean']], left_on='company', right_index=True)
)

In [None]:
# compute Sum of Squares Residual
# Within-group variation: squared distance of every score from the mean of
# its own company.
ss_residual = sum(data['score'].sub(data['group_mean']).pow(2))
ss_residual

In [None]:
# compute Sum of Squares Model
# Between-group variation: squared distance of each row's group mean from the
# grand mean. Summing over rows (not over groups) weights every group by its
# number of observations.
ss_explained = sum(data['overall_mean'].sub(data['group_mean']).pow(2))
ss_explained

In [None]:
# compute Mean Square Residual
n_obs = len(data)
n_groups = len(data['company'].unique())
# Residual degrees of freedom: observations minus one estimated mean per group.
df_residual = n_obs - n_groups
ms_residual = ss_residual / df_residual
ms_residual

In [None]:
# compute Mean Square Explained
# Degrees of freedom of the explained (between-group) part: number of groups minus one.
df_explained = n_groups - 1
# Explained sum of squares divided by its degrees of freedom.
ms_explained = ss_explained / df_explained
ms_explained

In [None]:
# compute F-Value
# Ratio of the explained (between-group) mean square to the residual
# (within-group) mean square.
f = ms_explained / ms_residual
f

In [None]:
# compute p-value
import scipy.stats
# Upper-tail probability P(F >= f) of the F distribution with
# (df_explained, df_residual) degrees of freedom. The survival function is
# used instead of `1 - cdf(...)`: the two are mathematically equal, but the
# subtraction loses precision through cancellation when the p-value is tiny.
p_value = scipy.stats.f.sf(f, df_explained, df_residual)
p_value