### 等分散検定

In [7]:
# F-test for equality of variances between two independent groups.

import numpy as np
from scipy import stats

# The sample data is defined in this cell so that it runs on a fresh kernel
# instead of depending on A / B left over from some other cell.
A = np.array([6.3, 8.1, 9.4, 10.4, 8.6, 10.5, 10.2, 10.5, 10.0, 8.8])
B = np.array([4.8, 2.1, 5.1, 2.0, 4.0, 1.0, 3.4, 2.7, 5.1, 1.4, 1.6])

A_var = np.var(A, ddof=1)  # unbiased (ddof=1) sample variance of A
B_var = np.var(B, ddof=1)  # unbiased (ddof=1) sample variance of B
A_df = len(A) - 1  # degrees of freedom of A
B_df = len(B) - 1  # degrees of freedom of B
f = A_var / B_var  # F ratio
one_sided_pval1 = stats.f.cdf(f, A_df, B_df)  # one-sided p-value, lower tail
one_sided_pval2 = stats.f.sf(f, A_df, B_df)   # one-sided p-value, upper tail
# Two-sided p-value: twice the smaller of the two tail probabilities.
two_sided_pval = min(one_sided_pval1, one_sided_pval2) * 2

print('F:       ', round(f, 3))
print('p-value: ', round(two_sided_pval, 3))

F:        0.779
p-value:  0.718


### 平均の差の検定

In [6]:
# Student's t-test
# Compares the means of two groups.
# Use when the two groups are independent (unpaired data) and
# equal variances can be assumed for both groups.

import numpy as np
from scipy import stats

# The sample data is defined in this cell so that it runs on a fresh kernel
# instead of depending on A / B left over from some other cell.
A = np.array([6.3, 8.1, 9.4, 10.4, 8.6, 10.5, 10.2, 10.5, 10.0, 8.8])
B = np.array([4.8, 2.1, 5.1, 2.0, 4.0, 1.0, 3.4, 2.7, 5.1, 1.4, 1.6])

# ttest_ind is called with its defaults here (no equal_var override).
result = stats.ttest_ind(A, B)

# result[0] is the t statistic, result[1] the p-value.
print(f"t-stats:{result[0]}")
print(f"p-value:{result[1]}")



t-stats:9.851086859836649
p-value:6.698194360479442e-09


In [8]:
# Welch's t-test
# Compares the means of two groups.
# Use when the two groups are independent (unpaired data) and
# equal variances can NOT be assumed.

import numpy as np
from scipy import stats

# The sample data is defined in this cell so that it runs on a fresh kernel
# instead of depending on A / B left over from some other cell.
A = np.array([13.8, 10.2, 4.6, 10.0, 4.2, 16.1, 14.4, 4.9, 7.7, 11.4])
B = np.array([3.3, 2.6, 4.0, 4.7, 1.9, 2.9, 4.7, 5.3, 4.3, 3.0, 2.0])

# equal_var=False switches ttest_ind from Student's test to Welch's test.
result = stats.ttest_ind(A, B, equal_var=False)

# result[0] is the t statistic, result[1] the p-value.
print(f"t-stats:{result[0]}")
print(f"p-value:{result[1]}")

t-stats:4.426442804187721
p-value:0.0012285738375064346


In [9]:
# Mann-Whitney U test (also known as the Wilcoxon rank-sum test)
# - for two independent (unpaired) groups when normality cannot be assumed (non-parametric)
# - applicable to ordinal-scale data
# - tests for a difference in a representative value between two independent groups
# - the two groups do not need to have the same number of samples

import numpy as np
from scipy import stats

A = np.array([
    1.83, 1.50, 1.62, 2.48, 1.68, 1.88,
    1.55, 3.06, 1.30, 2.01, 3.11,
])
B = np.array([
    0.88, 0.65, 0.60, 1.05, 1.06,
    1.29, 1.06, 2.14, 1.29,
])

result = stats.mannwhitneyu(x=A, y=B, alternative='two-sided')

# NOTE: the printed label says "t-stats", but the value is the U statistic
# returned by mannwhitneyu, not a t statistic.
print(f"t-stats:{result.statistic}")
print(f"p-value:{result.pvalue}")

t-stats:91.0
p-value:0.0018253610099931035


### 割合の差の検定

In [None]:
# フィッシャーの正確確率検定


In [13]:
# Two-proportion z-test
import numpy as np  # z_stats uses np.sqrt; imported here so the cell is self-contained
from scipy import stats

def z_stats(p1, n1, p2, n2):
    """Two-sided z-test for the difference between two proportions.

    Parameters
    ----------
    p1, p2 : observed proportions of group 1 and group 2.
    n1, n2 : sample sizes of group 1 and group 2.

    Returns
    -------
    (z_scores, p_values)
        z_scores is the absolute difference |p1 - p2| divided by the
        standard error built from the pooled proportion, so it is never
        negative and carries no direction. p_values is the two-sided
        p-value from the standard normal distribution.
    """
    # Pooled proportion over both groups.
    p_all = ((n1 * p1) + (n2 * p2)) / (n1 + n2)
    # Standard error of the difference under the pooled proportion.
    std_err = np.sqrt(p_all * (1 - p_all) * (1 / n1 + 1 / n2))
    z_scores = abs(p1 - p2) / std_err
    p_values = stats.norm.sf(abs(z_scores)) * 2  # two-sided
    # For a one-sided test drop the factor 2: stats.norm.sf(abs(z_scores))

    return z_scores, p_values

# Example: proportion 0.2 in a sample of 5000 versus 0.18 in a sample of 3000.
z_score, p_value = z_stats(p1=0.2, n1=5000, p2=0.18, n2=3000)

print(f"z_score:{z_score}")
print(f"p-value:{p_value}")




z_score:2.196564696466586
p-value:0.028051549880530297


In [19]:
# Chi-square test on a 2x2 contingency table
import pandas as pd
from scipy import stats

# Rows are groups A / B; columns are the counts labelled 'no' and 'cv'.
df_cross = pd.DataFrame(
    {'no': [372, 501], 'cv': [49, 78]},
    index=['A', 'B'],
)

# correction=False is passed explicitly; the fourth return value (ef) is not used below.
X2_stats, p_value, dof, ef = stats.chi2_contingency(df_cross, correction=False)

print(df_cross, "\n")
print(f"X2_stats:{X2_stats}")
print(f"p-value:{p_value}")


    no  cv
A  372  49
B  501  78 

X2_stats:0.7383347016905837
p-value:0.39019485527762676


### 重回帰分析

In [None]:
# Regression (ordinary least squares via statsmodels)
from sklearn.linear_model import LinearRegression  # imported but not used in this cell
import statsmodels.api as sm

# Explanatory variables: label-based column slice of df_arrange from
# "n_tokens_title" through "abs_title_sentiment_polarity".
# NOTE(review): df_arrange is not defined in this cell; it must already exist in the kernel.
x_input = df_arrange.loc[:, slice("n_tokens_title", "abs_title_sentiment_polarity")]
# Response variable, kept as a one-column DataFrame.
y = df_arrange[["target"]]

# Add the constant (intercept) column, then fit and report.
x_add_const = sm.add_constant(x_input)
model_sm = sm.OLS(endog=y, exog=x_add_const).fit()
print(model_sm.summary())

### ロジスティック回帰分析

In [21]:
# Logistic regression using sm.Logit

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Spector and Mazzeo (1980) dataset shipped with statsmodels.
spector_data = sm.datasets.spector.load()
# Replace the explanatory variables with a version that includes a constant column.
spector_data.exog = sm.add_constant(spector_data.exog)

# Follows the statsmodels IPython notebook example.
logit_mod = sm.Logit(endog=spector_data.endog, exog=spector_data.exog)
logit_res = logit_mod.fit(disp=0)

print('Parameters: ', logit_res.params)
print(logit_res.summary())

# Deviance residuals of the Logit fit, if needed:
# resid1 = logit_res.resid_dev

Parameters:  [-13.02134686   2.82611259   0.09515766   2.37868766]
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                   32
Model:                          Logit   Df Residuals:                       28
Method:                           MLE   Df Model:                            3
Date:                Wed, 28 Jul 2021   Pseudo R-squ.:                  0.3740
Time:                        23:00:19   Log-Likelihood:                -12.890
converged:                       True   LL-Null:                       -20.592
Covariance Type:            nonrobust   LLR p-value:                  0.001502
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -13.0213      4.931     -2.641      0.008     -22.687      -3.356
x1             2.8261      1.263      2.238      0.025       0.3

In [22]:
# Logistic regression using sm.GLM with a binomial family
# Reuses `sm` and `spector_data` (constant column already added) from the sm.Logit cell.

binomial_family = sm.families.Binomial()
glm_model = sm.GLM(spector_data.endog, spector_data.exog, family=binomial_family)
glm_reslt = glm_model.fit()

print(glm_reslt.summary())

# Deviance residuals of the GLM fit, if needed:
# resid2 = glm_reslt.resid_deviance


                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                   32
Model:                            GLM   Df Residuals:                       28
Model Family:                Binomial   Df Model:                            3
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -12.890
Date:                Wed, 28 Jul 2021   Deviance:                       25.779
Time:                        23:04:20   Pearson chi2:                     27.3
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -13.0213      4.931     -2.641      0.0