In [None]:
# %pip install statsmodels

In [1]:
import numpy as np
import pandas as pd

from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd


## Water Treatment Experiment

A company that manufactures chemicals for residential swimming pools was testing 4 different formulas for their popular Pool Clarifier product. For the experiment, a researcher tested 2500 water samples for each of the 4 formulations and measured the time it took each sample to become clear. The results are included in the `data_experiments_WaterTreatment.csv` file.


In [2]:
# Load the experiment results and key each row by its sample identifier.
# NOTE(review): the displayed output lists SampID 39995 twice, so this index
# may not be unique -- confirm before relying on label-based lookups.
data = pd.read_csv('data_experiments_WaterTreatment.csv').set_index('SampID')
data

Unnamed: 0_level_0,Treatment,TimeToClear
SampID,Unnamed: 1_level_1,Unnamed: 2_level_1
10000,C,1556.776937
10003,D,825.006227
10008,A,2674.875595
10017,C,2053.963141
10018,B,2355.740818
...,...,...
39991,C,2405.901726
39992,A,1909.103429
39995,B,1934.744664
39995,D,1968.393357


In [3]:
# Average TimeToClear for each Treatment label.
mean_time_to_clear = (
    data
    .groupby("Treatment")["TimeToClear"]
    .agg("mean")
)
mean_time_to_clear

Treatment
A    2054.882696
B    2012.227302
C    2061.473356
D    2080.903534
Name: TimeToClear, dtype: float64

## One-Way ANOVA Test

Null Hypothesis: the population mean time to clear is the same for all treatment groups.

Alternative Hypothesis: at least one treatment group's population mean differs from the others.


In [4]:
# One-way ANOVA on TimeToClear across every treatment present in the data.
# The groups are derived from the Treatment column rather than written out
# label by label, so an added or renamed formulation is not silently left out
# of the test.
time_by_treatment = [
    times for _, times in data.groupby("Treatment")["TimeToClear"]
]
fvalue, pvalue = stats.f_oneway(*time_by_treatment)



In [5]:
# Report the ANOVA statistics, one per line.
print(
    "Results of ANOVA test:",
    f" The F-statistic is: {fvalue}",
    f" The p-value is: {pvalue}",
    sep="\n",
)

Results of ANOVA test:
 The F-statistic is: 8.239940268552948
 The p-value is: 1.7920500776780752e-05


## Pairwise Tukey Test




In [6]:
# Tukey HSD post-hoc comparison of every pair of treatments, holding the
# family-wise error rate at 0.05.
m_comp = pairwise_tukeyhsd(
    endog=data['TimeToClear'],
    groups=data['Treatment'],
    alpha=0.05,
)
print(m_comp)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
group1 group2 meandiff p-adj   lower    upper   reject
------------------------------------------------------
     A      B -42.6554 0.0148 -79.3063  -6.0045   True
     A      C   6.5907 0.9673 -30.0603  43.2416  False
     A      D  26.0208 0.2619 -10.6301  62.6718  False
     B      C  49.2461 0.0031  12.5951   85.897   True
     B      D  68.6762    0.0  32.0253 105.3272   True
     C      D  19.4302 0.5232 -17.2208  56.0811  False
------------------------------------------------------
