# 一元配置分散分析 (One-way ANOVA analysis)
### パッケージ・モジュールをインポートする　Import packages

In [1]:
import pandas as pd
import scipy.stats as ss

import statsmodels.api as sm
from statsmodels.formula.api import ols
# dataframeを綺麗に表示する　rich display for dataframes
from IPython.core.display import display

### Rのデータセットから植物成長データ　This data is from Rdatasets

Pandasでファイルを読み込む Load data into a pandas's dataframe

In [2]:
# Path of the plant-growth CSV, relative to the notebook's working directory.
datafile = "data/plant_growth.csv"
# comment="#" makes pandas stop parsing a line at the first '#', so anything
# after that character in the file is not read as data.
data_frame = pd.read_csv(filepath_or_buffer=datafile, comment="#")
# Show the loaded frame with the notebook's rich display.
display(data_frame)

Unnamed: 0,No,weight,group
0,1,4.17,ctrl
1,2,5.58,ctrl
2,3,5.18,ctrl
3,4,6.11,ctrl
4,5,4.5,ctrl
5,6,4.61,ctrl
6,7,5.17,ctrl
7,8,4.53,ctrl
8,9,5.33,ctrl
9,10,5.14,ctrl


### グループの種類やデータの分割

"group"ヘッドの値を分割し、pythonの辞書型変数に代入する  split data into a python dict  
pd.unique()：ユニークのグループ名を抽出する。 Find unique group names

In [3]:
# Distinct labels found in the "group" column, in order of first appearance.
groups = data_frame['group'].unique()
# Map each group label to the Series of "weight" values whose row carries
# that label (a boolean row mask and the column are selected in one .loc call).
data = {
    label: data_frame.loc[data_frame['group'] == label, 'weight']
    for label in groups
}

### scipyのf_oneway()関数 scipy's f_oneway() function

scipy's f_oneway():  
**定義：**  
```python  
  f_val, p_value = scipy.stats.f_oneway(*args)  

    sample1, sample2, ... : 各グループの標本データ　The sample measurements for each group.
    f_val: F値   F-statistic value
    p_value: p値 p value
```

### scipyのf_oneway()関数 scipy's f_oneway() function
使用条件：Assumption  
 1. 各標本は互いに独立である。The samples are independent.  
 2. 各標本は正規分布に従う母集団からのものです。Each sample is from a normally distributed population.  
 3. 各グループの母集団の標準偏差はすべて等しい。いわゆる等分散性です。  
    The population standard deviations of the groups are all equal. This property is known as homoscedasticity.

In [4]:
# One-way ANOVA across every group stored in `data` (one "weight" Series per
# group label). Unpacking the dict's values instead of naming 'ctrl', 'trt1'
# and 'trt2' keeps the test in step with whatever groups the CSV contains:
# no KeyError if a label differs, and no group is silently left out.
# f_oneway returns the F statistic and its p-value.
f, p = ss.f_oneway(*data.values())
print("Using f_oneway of scipy")
print("F-value: {} /p-value:{:.5f}".format(f, p))

Using f_oneway of scipy
F-value: 3.3483875849684397 /p-value:0.05293


### Using statsmodels

Step 1. statsmodelsのformulaを用いて、回帰分析を行う。  
Step 2. 回帰分析の結果に基づいて、一元配置分散分析を行う。  

<span style="color:red">aov_table</span> = **[anova_lm](https://www.statsmodels.org/dev/generated/statsmodels.stats.anova.anova_lm.html)**(fitted_result, scale=None, test="F", typ=1, robust=None)  

**入力(input)は**  

<span style="color:blue">**fitted_result**</span>: statsmodelsのformulaの回帰結果を持つオブジェクト.  
One or more fitted linear models  

<span style="color:blue">**scale**</span>: 分散の評価です。"None"の場合、最大のモデルから評価する。  
               Estimate of variance, If None, will be estimated from the largest model.  

<span style="color:blue">**test**</span>: 統計量です。"F", "Chisq", "Cp", 或いは None.  
              Test statistics to provide.  "F", "Chisq", "Cp", or None  

<span style="color:blue">**typ**</span>: 平方和のタイプ。The type of Sum of squares to perform  
             unbalancedデータに対して, 平方和の計算方法は少なくとも３つがある。
             一般的にType I, II and IIIと呼びます。  
             For unbalanced data, there are at least 3 approaches to calculate the sums  
             of squares for ANOVA, commonly called Type I, II and III.  

<span style="color:blue">**robust**</span>: 不等分散性（heteroscedasticity）を補正した係数の共分散行列を使う。  
                 Use heteroscedasticity-corrected coefficient covariance matrix.  

 **出力(output)は**  

<span style="color:red">**aov_table**</span>: pandasのDataFrameです。列は以下です。  
         "sum_sq": Sum of squares for model terms.  
         "df"    : Degrees of freedom for model terms.  
         "F"     : F statistic value for significance of adding model terms.  
         "PR(>F)": P-value for significance of adding model terms.  


In [5]:
# Step 1: describe the one-way model with a formula -- "weight" explained by
# the factor "group" -- fit it by ordinary least squares and print the report.
model2 = ols(formula='weight ~ group', data=data_frame)
res = model2.fit()
print(res.summary())

# The same regression through the array interface: the response vector and
# the design matrix are taken from the fitted formula model and handed to
# sm.OLS unchanged (no sm.add_constant call; the formula-built design matrix
# is expected to already carry the intercept column).
# To inspect the design matrix, display res.model.exog and
# res.model.exog_names.
Y = res.model.endog
x = res.model.exog
model = sm.OLS(endog=Y, exog=x)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.226
Model:                            OLS   Adj. R-squared:                  0.158
Method:                 Least Squares   F-statistic:                     3.348
Date:                Mon, 29 Oct 2018   Prob (F-statistic):             0.0529
Time:                        18:21:24   Log-Likelihood:                -23.926
No. Observations:                  26   AIC:                             53.85
Df Residuals:                      23   BIC:                             57.63
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         5.0320      0.204     24.644

### 一元配置分散分析: Statsmodels' anova_lm()

In [6]:
divider = "----------------"
print(divider)
print("ANOVA table:")

# One-way ANOVA (type-II sums of squares) on the fitted formula model.
# anova_lm returns a pandas DataFrame indexed by model term.
aov_table = sm.stats.anova_lm(res, typ=2)

print(divider)
print(aov_table)
print(divider)

# Pull the individual statistics out of the row for the "group" term.
group_row = aov_table.loc['group']
# F statistic
print("F値：", group_row['F'])
# p-value
print("p値：", group_row['PR(>F)'])
# between-group degrees of freedom
print("グループ間の自由度:", group_row['df'])

----------------
ANOVA table:
----------------
            sum_sq    df         F    PR(>F)
group     2.792128   2.0  3.348388  0.052931
Residual  9.589533  23.0       NaN       NaN
----------------
F値： 3.3483875849684273
p値： 0.05293138619485971
グループ間の自由度: 2.0


### Pure python

<table width="70%" style="font-size: 15pt;" border="1">
<thead>
<tr><th align="center">Group</th> <th align="center">Num</th> <th align="center"> Mean</th> <th align="center"> Std </th> </tr>
</thead>
<tbody>
<tr><td align="center">   ctrl     </td> <td align="center">   10     </td> <td align="center">   5.032    </td> <td align="center">   0.583    </td> </tr>
<tr><td align="center">   trt1     </td> <td align="center">   10     </td> <td align="center">   4.661    </td> <td align="center">   0.794    </td> </tr>
<tr><td align="center">   trt2     </td> <td align="center">   10     </td> <td align="center">   5.526    </td> <td align="center">   0.443    </td> </tr>
</tbody>
</table>

分散分析表
<table border="" cellpadding="5" width="100%">
<tbody>
<tr valign="top">
<td align="left" width="8%"><strong>Source</strong></td>
<td align="center" width="22%"><strong>SS</strong></td>
<td align="center" width="19%"><strong>df</strong></td>
<td align="center" width="23%"><strong>MS</strong></td>
<td align="center" width="25%"><strong>F</strong></td>
</tr>
<tr valign="top">
<td align="center"><strong>Between</strong></td>
<td align="left">$SS_{between} = \sum_{j=1}^{k}n_j(\bar{x}_{j} - \bar{x})^2$</td>
<td align="left">$df_{between} = k - 1$</td>
<td align="left">$MSG = \frac{SS_{between}}{df_{between}}$</td>
<td align="left">$ F = \frac{MSG}{MSE} $</td>
</tr>
<tr valign="top">
<td align="center"><strong>Within</strong></td>
<td align="left">$SS_{within} = \sum_{j=1}^{k}\sum_{i=1}^{n_j}(x_{ij} - \bar{x}_{j})^2$</td>
<td align="left">$df_{within} = N - k$</td>
<td align="left">$MSE = \frac{SS_{within}}{df_{within}}$</td>
<td align="center"></td>
</tr>
<tr valign="top">
<td align="center"><strong>Total</strong></td>
<td align="left">$SS_{total} = \sum_{j=1}^{k}\sum_{i=1}^{n_j}(x_{ij} - \bar{x})^2$</td>
<td align="left">$df_{total} = N - 1$</td>
<td align="center"></td>
<td align="center"></td>
</tr>
</tbody></table>


### 標本からの情報 Samples info

pandasのdata frameの２つ属性(group, values)を利用する。  
"group"列にあるグループの種類　types in "group" column

In [7]:
# groupは元のファイルのヘッドです。"group" is the header in the file
# N: total number of observations (rows of the frame).
N = data_frame.shape[0]
# k: number of distinct labels in the "group" column of the file.
k = len(data_frame['group'].unique())

# Degrees of freedom: between groups, within groups, and in total.
df_between, df_within, df_total = k - 1, N - k, N - 1

### 水準間の平方和 Sum of squares between levels

In [8]:
# Grand mean of every weight, regardless of group (x-bar in the ANOVA table).
ave_all = data_frame['weight'].mean()

# Sample size and mean of each group.
n1, n2, n3 = data['ctrl'].size, data['trt1'].size, data['trt2'].size
ctrl_ave, trt1_ave, trt2_ave = (
    data['ctrl'].mean(),
    data['trt1'].mean(),
    data['trt2'].mean(),
)

# Each group contributes n_j * (group mean - grand mean)^2 to SS_between.
ctrl_offset = ctrl_ave - ave_all
trt1_offset = trt1_ave - ave_all
trt2_offset = trt2_ave - ave_all
ctrl_ssb = n1 * ctrl_offset**2
trt1_ssb = n2 * trt1_offset**2
trt2_ssb = n3 * trt2_offset**2

ss_between = ctrl_ssb + trt1_ssb + trt2_ssb
print("ss_between: ", ss_between)

ss_between:  2.792128205128212


### 水準内の平方和 Sum of squares within levels

In [9]:
# sum of sum of squares within levels: ss_within
# SS_within:
# SS_within: squared deviations of every observation from the mean of its
# own group, accumulated per group and then added together.
ctrl_resid = data['ctrl'] - ctrl_ave
trt1_resid = data['trt1'] - trt1_ave
trt2_resid = data['trt2'] - trt2_ave

ctrl_ssw = sum(ctrl_resid**2)
trt1_ssw = sum(trt1_resid**2)
trt2_ssw = sum(trt2_resid**2)

ss_within = ctrl_ssw + trt1_ssw + trt2_ssw
print("ss_within: ", ss_within)

ss_within:  9.589533333333335


### 総平方和  Sum of squares for total

In [10]:
# sum of sum of squares for total: ss_total
# SS_total: squared deviations of every observation from the grand mean,
# accumulated per group and then added together.
ctrl_centered = data['ctrl'] - ave_all
trt1_centered = data['trt1'] - ave_all
trt2_centered = data['trt2'] - ave_all

ctrl_sst = sum(ctrl_centered**2)
trt1_sst = sum(trt1_centered**2)
trt2_sst = sum(trt2_centered**2)

ss_total = ctrl_sst + trt1_sst + trt2_sst
print("ss_total: ", ss_total)

ss_total:  12.381661538461538


### F検定  F-statistic

In [11]:
# Mean squares: each sum of squares divided by its degrees of freedom.
mean_squared_between = ss_between / df_between
mean_squared_within = ss_within / df_within

# F statistic = MS_between / MS_within.
f_ratio = mean_squared_between / mean_squared_within
# p-value: survival function (upper tail) of the F distribution with
# (df_between, df_within) degrees of freedom, evaluated at the F statistic.
p_val = ss.f.sf(f_ratio, dfn=df_between, dfd=df_within)

print("Calculate directly:")
print("F-value: {}".format(f_ratio))
print("p-value: {:.5f}".format(p_val))

Calculate directly:
F-value: 3.348387584968448
p-value: 0.05293


### 効果量 (Effect size)

実験的研究では,効果量は独立変数（IV）が従属変数（DV）に与えた影響の程度を表す数値です。  
 Effect size is a value which allows you to see how much  
 your independent variable (IV) has affected the dependent variable (DV) in  
 an experimental study.  

 ここでは、カテゴリ独立変数に対して、より直感的な解釈であり、計算しやすい2つの尺度$\eta^2$と$\omega^2$を紹介します。  
 Here we introduce two measures, $\eta^2$ and $\omega^2$, which have a more intuitive interpretation for categorical independent variables and are easier to evaluate.  
1. 標本数が多くなると、$\eta^2$のバイアスは小さくなります。  
2. 標本数が少ない場合は、$\omega^2$の方が適しています。  

基本的には要因による変動(各条件の平均値差)が大きいほど、 または誤差による変動(各条件の標準偏差)が小さいほど、効果量は大きくなる。  
 The bias of $\eta^2$ gets very small as sample size increases, but for small samples an unbiased effect size measure is Omega Squared.  Basically, the greater the fluctuation due to factors (difference in average value of each condition), or the smaller the fluctuation due to error (standard deviation of each condition), the larger the effect size.

In [12]:
# eta_squared (somewhat biased)
eta_squared = ss_between / ss_total
# omega_squared
omega_squared = (ss_between - (df_between * mean_squared_within)) / (ss_total + mean_squared_within)
print("eta squared: {:.5f}".format(eta_squared))
print("omega squared: {:.5f}".format(omega_squared))

eta squared: 0.22551
omega squared: 0.15301


### 結果　Result

In [13]:
# Decision at the 5% significance level: a p-value above the threshold keeps
# the null hypothesis, anything else rejects it.
alpha = 0.05
verdict = "Retain the null hypothesis" if p_val > alpha else "Reject the null hypothesis"
print(verdict)

Retain the null hypothesis


### 解釈 Interpretation
5%有意水準では、ctrl、trt1、trt2の母集団の平均値が全て等しいという帰無仮説を棄却できません（p ≈ 0.053 > 0.05）。  
At the 5% level of significance, we cannot reject the null hypothesis that the population means of ctrl, trt1 and trt2 are all equal (p ≈ 0.053 > 0.05).