# 一元配置分散分析(One-way ANOVA analysis)
## パッケージ・モジュールをインポートする

In [2]:
import pandas as pd
import scipy.stats as ss

import statsmodels.api as sm
from statsmodels.formula.api import ols
#dataframeをキレに表示する rich display dataframe
from IPython.core.display import display

# Rのデータセットから植物成長データ
pandasでファイルを読み込む

In [19]:
datafile = "data/ssri_data.csv"
data_frame = pd.read_csv(datafile, comment="#")
display(data_frame)

Unnamed: 0,SSRI,Bound_Receptor,SSRIN
0,SSRI1,253.9,1
1,SSRI1,285.8,1
2,SSRI1,253.5,1
3,SSRI1,256.0,1
4,SSRI1,330.3,1
5,SSRI1,259.5,1
6,SSRI1,273.9,1
7,SSRI1,230.7,1
8,SSRI1,249.0,1
9,SSRI1,175.5,1


# グループの種類やデータの分割
"group"ヘッドの値を分割し、pythonの辞書型変数に代入する
pd.unique(): ユニークのグループ名を抽出する。

In [23]:
groups = pd.unique(data_frame.SSRI.values)
#グループに対応するデータを分割し、pythonの辞書型変数に代入する
data = {group: data_frame["Bound_Receptor"][data_frame.SSRI==group] for group in groups}

# scipyのf_oneway()関数 scipy's f_oneway() function
使用条件: Assumption

1. 各標本は独立している
2. 各標本は正規分布に従う母集団からのものです。
3. 各グループの母集団の標準偏差は全て等しい。いわゆる等分散性です。

In [24]:
f, p = ss.f_oneway(data["SSRI1"], data["SSRI2"], data["SSRI3"], data["SSRI4"])
print("Using f_oneway of scipy")
print("F_values: {} /p_value:{:.5f}".format(f,p))

Using f_oneway of scipy
F_values: 2.3956371170027633 /p_value:0.07825


In [40]:
model2 = ols(formula='Bound_Receptor ~ SSRI', data=data_frame)# ols formulaのモデルを構築
res = model2.fit()
# see designed matrix
#display(res.model.endog)
#display(res.model.exog)
#display(re.model.exog_names)
print(res.summary()) #結果を出力する

# in a numerical way
Y = res.model.endog
x = res.model.exog
#X = sm.add_constant(x)
model = sm.OLS(Y, x)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:         Bound_Receptor   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.068
Method:                 Least Squares   F-statistic:                     2.396
Date:                Mon, 29 Oct 2018   Prob (F-statistic):             0.0783
Time:                        17:53:16   Log-Likelihood:                -310.26
No. Observations:                  58   AIC:                             628.5
Df Residuals:                      54   BIC:                             636.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept       264.2800     13.628     19.393

# 一元配置分散分析: Statsmodels' anova_lm()

In [27]:
print("-------------")
print("ANOVA table:")

#one-way anova by anova_lm()
aov_table = sm.stats.anova_lm(res, typ=2)

#aov_tableはpandasのdataframeオブジェクトです
print("------------")
print(aov_table)
print("-------------")
#F値の取得
print("F値:", aov_table.at["SSRI", "F"])
#P値の取得
print("p値:", aov_table.at["SSRI", "PR(>F)"])
#グループ間の自由度
print("グループ間の自由度", aov_table.at["SSRI", "df"])

-------------
ANOVA table:
------------
                 sum_sq    df         F    PR(>F)
SSRI       20020.429233   3.0  2.395637  0.078255
Residual  150426.675077  54.0       NaN       NaN
-------------
F値: 2.395637117002766
p値: 0.07825490306745381
グループ間の自由度 3.0


# 標本からの情報
pandasのdata frameの２つのクラス(group, values)を利用する。  
"group"列にあるグループの種類

In [28]:
#groupは元のファイルのヘッド
k = len(pd.unique(data_frame.SSRI))
#データの総数
N = len(data_frame.values)
# degree of freedom between
df_between = k - 1
# degree of freedom  within
df_within = N - k
# degree of freedom in total
df_total = N - 1

# 水準間の平方和

In [30]:
ave_all = data_frame["Bound_Receptor"].mean()# x_bar:全てのweightの平均

n1 = data["SSRI1"].size
n2 = data["SSRI2"].size
n3 = data["SSRI3"].size
n4 = data["SSRI4"].size
ssri1_ave = data["SSRI1"].mean()
ssri2_ave = data["SSRI2"].mean()
ssri3_ave = data["SSRI3"].mean()
ssri4_ave = data["SSRI4"].mean()

# Sum of sum of squares groups: ss_between
ssri1_ssb = n1 * (ssri1_ave - ave_all)**2
ssri2_ssb = n2 * (ssri2_ave - ave_all)**2
ssri3_ssb = n3 * (ssri3_ave - ave_all)**2
ssri4_ssb = n4 * (ssri4_ave - ave_all)**2
ss_between = ssri1_ssb + ssri2_ssb + ssri3_ssb + ssri4_ssb
print("ss_between:", ss_between)

ss_between: 20020.429233421753


# 水準内の平方和

In [32]:
# sum of sum of squares within levels: ss_within
# SS_within
ssri1_ssw = sum((data["SSRI1"] - ssri1_ave)**2)
ssri2_ssw = sum((data["SSRI2"] - ssri2_ave)**2)
ssri3_ssw = sum((data["SSRI3"] - ssri3_ave)**2)
ssri4_ssw = sum((data["SSRI4"] - ssri4_ave)**2)
ss_within = ssri1_ssw + ssri2_ssw + ssri3_ssw + ssri4_ssw
print("ss_within:", ss_within)

ss_within: 150426.67507692307


# 総平方和

In [33]:
# sum of sum of squares for total: ss_total
ssri1_sst = sum((data["SSRI1"] - ave_all)**2)
ssri2_sst = sum((data["SSRI2"] - ave_all)**2)
ssri3_sst = sum((data["SSRI3"] - ave_all)**2)
ssri4_sst = sum((data['SSRI4'] - ave_all)**2)
ss_total = ssri1_sst + ssri2_sst + ssri3_sst + ssri4_sst
print("ss_total:", ss_total)

ss_total: 170447.10431034482


# F検定

In [34]:
mean_squared_between = ss_between / df_between
mean_squared_within = ss_within / df_within

# F-ratio
f_ratio = mean_squared_between / mean_squared_within
# p-value
p_val = ss.f.sf(f_ratio, df_between, df_within)

print("Calculate directly:")
print("F-value: {}".format(f_ratio))
print("p-value: {:.5f}".format(p_val))

Calculate directly:
F-value: 2.3956371170027646
p-value: 0.07825


In [35]:
# eta_squared (somewhat biased)
eta_squared = ss_between / ss_total
# omega_squared
omega_squared = (ss_between - (df_between * mean_squared_within)) / (ss_total + mean_squared_within)
print("eta squared:{:.5f}".format(eta_squared))
print("omega squared: {:.5f}".format(omega_squared))

eta squared:0.11746
omega squared: 0.06733


# 結果

In [38]:
if p_val > 0.05:
    print("Retain the null hypothesis")
else:
    print("Reject the null hypothesis")

Retain the null hypothesis


# 解釈
5%の有意水準でSSRI1、SSRI2、SSRI3、SSRI4の母集団の平均値が全て等しい。