In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss
import statsmodels.api as sm
import statsmodels.formula.api as smf

### データの読み込み

In [7]:
# Load the hours/result table from the local CSV file.
# The statsmodels Spector and Mazzeo (1980) dataset
# (https://www.statsmodels.org/dev/datasets/generated/spector.html) could be
# loaded with sm.datasets.spector.load_pandas() instead, but this notebook
# reads its data from the CSV below.
dat = pd.read_csv("data/2-16-1.csv")
print(dat.head(5))  # preview the first five rows

# Split the frame into the response and the single explanatory variable.
y, X = dat["result"], dat["hours"]  # y: dependent variable, X: independent variable

   hours  result
0      0       0
1      0       0
2      0       0
3      0       0
4      0       0


In [8]:
# Build the design matrix: the predictor plus an intercept (constant) column.
exog_with_const = sm.add_constant(X)
# Construct the logistic regression model of y on that design matrix.
model = sm.Logit(endog=y, exog=exog_with_const)
# NOTE(review): a formula-based binomial GLM, e.g.
#   smf.glm(formula="result ~ hours", data=dat, family=sm.families.Binomial())
# should be an alternative way to specify the same model -- confirm before use.

# Run the regression (the captured output reports Method: MLE).
results = model.fit()
# Print the fit summary, followed by a blank line.
print(results.summary())
print()

Optimization terminated successfully.
         Current function value: 0.340139
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 result   No. Observations:                  100
Model:                          Logit   Df Residuals:                       98
Method:                           MLE   Df Model:                            1
Date:                Mon, 28 Jan 2019   Pseudo R-squ.:                  0.5070
Time:                        16:29:30   Log-Likelihood:                -34.014
converged:                       True   LL-Null:                       -68.994
                                        LLR p-value:                 6.049e-17
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.5587      0.901     -5.061      0.000      -6.324      -2.793
hours          0.9289      0.

### 包括的検定
LLR p-value:                 6.049e-17  
有意水準5%とした場合、LLR p-valueは0.05以下であるため、帰無仮説(定数項以外の係数がすべて0である)を棄却。  
したがって、このモデルは定数項のみのモデルと比べて有意であると判断できる。

$$\log\frac{p}{1-p} = 0.9289 \times \mathrm{hours} - 4.5587 \qquad \bigl(p = P(\mathrm{result} = 1)\bigr)$$

### オッズ比

In [14]:
# Odds ratio of hours: exp() of the fitted "hours" coefficient.
# The coefficient is selected by label because results.params is indexed by
# the names "const" and "hours"; an integer key such as params[1] relies on
# the deprecated positional fallback of Series.__getitem__ in recent pandas.
print(np.exp(results.params["hours"]))

2.531697877875778


Hoursのオッズ比:約2.53(2.531697877875778)  
勉強時間が1時間増えるごとに、合格のオッズ $p/(1-p)$ は約2.53倍になる(合格確率そのものが2.53倍になるわけではない)。

### 予測

In [16]:
# Predicted probability of result == 1 when hours = 6.
# The exog row is [constant, hours], matching the coefficient order
# (const, hours) shown in the fit summary.
pred = results.predict([1, 6])
print(pred)
# Or: evaluate the logistic function of the linear predictor by hand.
# Coefficients are selected by label ("const", "hours") rather than by integer
# key, because positional integer access on a label-indexed Series is
# deprecated in recent pandas.
xx = results.params["const"] + results.params["hours"] * 6
pred = 1.0 / (1.0 + np.exp(-xx))
print(pred)

# Classify with a threshold of 0.5: below 0.5 -> 0, otherwise 1.
print("Predicted value is:", 0 if pred < 0.5 else 1)

[0.73392911]
0.7339291124216423
Predicted value is: 1
