In [1]:
import anova_compute as anv

In [2]:
import os
import pandas as pd
import numpy as np
from scipy.stats import f
import matplotlib.pyplot as plt
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

# Problem 1: Treatment Effectiveness Score

## Given Data

In [3]:
treatment_1 = [8,9,6,8,5]
treatment_2 = [5,4,7,6,6]
treatment_3 = [9,3,2,4]
treatment_number = np.array([1,1,1,1,1,2,2,2,2,2,3,3,3,3])
scores = np.array([*treatment_1, *treatment_2, *treatment_3])

Q1) <br>
There are 3 different treatment groups and 14 individuals who are randomly assigned into the groups. The Table shows the effectiveness score of the treatment for each individual. Conduct a one--way ANOVA to test whether these 3 treatments are equally effective; i.e. test the hypothesis , where the  are the means for the three groups.

(1) Calculate SSB

In [4]:
SSB = anv.sum_squares_between(x = treatment_number, y = scores)
print(SSB)

16.714285714285715


(2) Calculate SSW

In [5]:
SSW = anv.sum_squares_within_one_way(x = treatment_number, y = scores)
print(SSW)

45.0


(3) Compute the F-test statistic.

In [6]:
k = 3
n = len(scores)
test_stat_F = (np.mean(SSB)/(k-1)) / (np.mean(SSW)/(n-k))
print('F_' + str(k-1) + '_' + str(n-k) + ' statistic: ',test_stat_F)

F_2_11 statistic:  2.042857142857143


(4) Find the p-value and state the conclusion of the test with alpha = 0.1 as the level of significance.

In [7]:
p_value = f.sf(test_stat_F, k-1, n-k)
print('p_value: ', p_value)

p_value:  0.17601409150114575


In [8]:
F_reject = f.isf(0.1, 1, 48, loc=0, scale=1)
print('F statistic to reject (@ alpha=0.1): ', F_reject)

F statistic to reject (@ alpha=0.1):  2.8130810040649394


In [9]:
if test_stat_F > F_reject:
    print('We reject Ho')
else:
    print('We do not reject Ho')

We do not reject Ho


# Example 2: Two-way ANOVA Table

We want to determine what impacts the variation in test scores
- is it gender
- is it the age group
- Or both?
We need a Two-way ANOVA test because we have two independent variables **age** and **gender**.

In [10]:
gender_vals = np.array([0,0,0,1,1,1,0,0,0,1,1,1,0,0,0,1,1,1])
score_vals = np.array([4,6,8,4,8,9,6,6,9,7,10,13,8,9,13,12,14,16])
age_vals = np.array([10,10,10,10,10,10,11,11,11,11,11,11,12,12,12,12,12,12])

# Create Dataframe
df = pd.DataFrame({'gender': gender_vals, 'score': score_vals, 'age': age_vals})
df.replace({0:'boys', 1:'girls'}, inplace =True)

# Display main dataframe and the different averages
display(df.head(6))
display(df.groupby(['gender','age']).mean())
display(df.groupby(['gender'])[['score']].mean())
display(df.groupby(['age'])[['score']].mean())

Unnamed: 0,gender,score,age
0,boys,4,10
1,boys,6,10
2,boys,8,10
3,girls,4,10
4,girls,8,10
5,girls,9,10


Unnamed: 0_level_0,Unnamed: 1_level_0,score
gender,age,Unnamed: 2_level_1
boys,10,6.0
boys,11,7.0
boys,12,10.0
girls,10,7.0
girls,11,10.0
girls,12,14.0


Unnamed: 0_level_0,score
gender,Unnamed: 1_level_1
boys,7.666667
girls,10.333333


Unnamed: 0_level_0,score
age,Unnamed: 1_level_1
10,6.5
11,8.5
12,12.0


       All Data                       Gender & Age                     Gender-only                Age-only

**Inputs**

In [11]:
y = score_vals
x1 = gender_vals
x2 = age_vals
factor_1 = 'gender'
factor_2 = 'age'
alpha = 0.05

**Anova Table**

In [12]:
df = anv.get_anova_table(x1 = age_vals, x2 = gender_vals,  y = score_vals, alpha =0.05, 
                         names = ['age', 'gender'])
df

Unnamed: 0_level_0,SS,df,Mean Square,F_statistic,F_critical,p_value,Decision
Source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
age,93.0,2,46.5,8.205882,3.885294,0.005677,Reject Ho
gender,32.0,1,32.0,5.647059,4.747225,0.034994,Reject Ho
Interaction,7.0,2,3.5,0.617647,3.885294,0.555502,Fail Reject Ho
Error,68.0,12,5.666667,,,,
Total,200.0,17,,,,,
