In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss
import statsmodels.api as sm
import statsmodels.formula.api as smf

### データの読み込み

In [25]:
# Load the used-car data, drop the row-number column, one-hot encode the
# categorical columns and put the columns in a fixed order.
# `.ix` was deprecated and later removed from pandas, so label-based `.loc`
# is used instead. `dtype=int` keeps the dummy columns as 0/1 integers
# (newer pandas defaults to bool for dummy columns).
dat = (
    pd.read_csv("data/car.csv")
    .drop(columns=["No"])
    .pipe(pd.get_dummies, drop_first=True, dtype=int)
    .loc[:, ['fix', 'dealer', 'modelyear', 'run', 'displ', 'AT', 'audio', 'navi', 'damage',
             'precord', 'price', 'area_kyoto', 'area_osaka', 'grade_GX',
             'grade_S', 'grade_T', 'color_brack', 'color_red', 'color_silver',
             'color_white']]
)
print(dat.head())

   fix  dealer  modelyear    run  displ  AT  audio  navi  damage  precord  \
0    5       0       2001  79046   1400   1      1     0       1        0   
1    1       0       2001   2556   1600   1      1     0       0        0   
2    4       1       2006  14172   1400   1      1     1       3        0   
3    2       1       2008   4971   1400   1      1     0       0        0   
4    2       1       2006  44282   1600   1      1     1       0        0   

   price  area_kyoto  area_osaka  grade_GX  grade_S  grade_T  color_brack  \
0     95           0           1         0        0        1            0   
1    146           0           0         1        0        0            0   
2    130           0           0         0        0        0            0   
3    135           1           0         0        1        0            0   
4    125           0           0         0        1        0            0   

   color_red  color_silver  color_white  
0          0             1      

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


### stepAICを使った独立変数の選定

In [5]:
def step_aic(model, exog, endog, **kwargs):
    """
    Select exogenous variables by forward stepwise selection on AIC.

    Starting from the intercept-only model, each step fits one model per
    remaining candidate and adopts the candidate giving the smallest AIC.
    Selection stops when every candidate has been adopted or when no
    candidate strictly lowers the AIC. Every fitted formula is printed
    together with its AIC as a progress log.

    Both exog and endog values can be either str or list.
    (Endog list is for the Binomial family.)

    Note: Only "forward" selection is implemented.

    Args:
        model: model constructor from statsmodels.formula.api; it is called
            as model(formula=..., **kwargs)
        exog (str or list): candidate exogenous variables
        endog (str or list): endogenous variables
        kwargs: extra keyword arguments for model (e.g., data, family)

    Returns:
        The fitted result of the selected formula, i.e. the value of
        model(formula=..., **kwargs).fit(). When no candidate lowers the
        AIC, this is the intercept-only fit.
    """

    # Force exog and endog into flat lists of names
    exog = [str(name) for name in np.atleast_1d(exog).ravel()]
    endog = [str(name) for name in np.atleast_1d(endog).ravel()]

    # De-duplicate while keeping the caller's order so the log is reproducible
    remaining = list(dict.fromkeys(exog))
    selected = []  # candidates that have been adopted so far

    # AIC of the intercept-only model is the starting score
    formula_head = ' + '.join(endog) + ' ~ '
    formula = formula_head + '1'
    current_score = model(formula=formula, **kwargs).fit().aic
    print('AIC: {}, formula: {}'.format(round(current_score, 3), formula))

    # Stop when everything is adopted or no candidate lowers the AIC
    while remaining:
        scores_with_candidates = []
        for candidate in remaining:

            # AIC when this one candidate is added to the adopted terms
            formula_tail = ' + '.join(selected + [candidate])
            formula = formula_head + formula_tail
            aic = model(formula=formula, **kwargs).fit().aic
            print('AIC: {}, formula: {}'.format(round(aic, 3), formula))

            scores_with_candidates.append((aic, candidate))

        # Smallest AIC wins; ties are broken by candidate name
        best_new_score, best_candidate = min(scores_with_candidates)

        # A tie or an increase ends the search. Breaking here (instead of
        # re-testing score equality in the loop condition) guarantees
        # termination when the best candidate exactly ties the current AIC.
        if not best_new_score < current_score:
            break

        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    # An empty right-hand side is not a valid formula; use the intercept
    formula = formula_head + (' + '.join(selected) if selected else '1')
    print('The best formula: {}'.format(formula))
    return model(formula=formula, **kwargs).fit()

### 変数増加法により最も良いモデルを選択する

In [33]:
# Run forward selection over the candidate columns with `fix` as the
# response of a Poisson GLM; the returned fit is bound to `model`.
model = step_aic(
    smf.glm,
    exog=[
        'dealer', 'modelyear', 'run', 'displ', 'AT', 'audio', 'navi',
        'damage', 'precord', 'price',
        'area_kyoto', 'area_osaka',
        'grade_GX', 'grade_S', 'grade_T',
        'color_brack', 'color_red', 'color_silver', 'color_white',
    ],
    endog=['fix'],
    data=dat,
    family=sm.families.Poisson(),
)

AIC: 3811.247, formula: fix ~ 1
AIC: 3610.701, formula: fix ~ run
AIC: 3812.729, formula: fix ~ navi
AIC: 3810.571, formula: fix ~ damage
AIC: 3812.512, formula: fix ~ grade_GX
AIC: 3813.196, formula: fix ~ grade_S
AIC: 3810.457, formula: fix ~ color_brack
AIC: 3812.838, formula: fix ~ audio
AIC: 3806.084, formula: fix ~ price
AIC: 3812.553, formula: fix ~ color_silver
AIC: 3812.002, formula: fix ~ area_kyoto
AIC: 3813.096, formula: fix ~ grade_T
AIC: 3805.42, formula: fix ~ color_white
AIC: 3776.417, formula: fix ~ modelyear
AIC: 3813.138, formula: fix ~ dealer
AIC: 3807.724, formula: fix ~ precord
AIC: 3812.245, formula: fix ~ displ
AIC: 3813.207, formula: fix ~ color_red
AIC: 3810.917, formula: fix ~ AT
AIC: 3813.227, formula: fix ~ area_osaka
AIC: 3612.361, formula: fix ~ run + navi
AIC: 3609.773, formula: fix ~ run + damage
AIC: 3606.196, formula: fix ~ run + grade_GX
AIC: 3611.792, formula: fix ~ run + grade_S
AIC: 3609.609, formula: fix ~ run + color_brack
AIC: 3612.682, formula

AIC: 3559.588, formula: fix ~ run + modelyear + precord + color_white + grade_GX + damage + AT + dealer
AIC: 3559.242, formula: fix ~ run + modelyear + precord + color_white + grade_GX + damage + AT + displ
AIC: 3559.331, formula: fix ~ run + modelyear + precord + color_white + grade_GX + damage + AT + color_red
AIC: 3559.584, formula: fix ~ run + modelyear + precord + color_white + grade_GX + damage + AT + area_osaka
The best formula: fix ~ run + modelyear + precord + color_white + grade_GX + damage + AT


### ポアソン回帰の実行 Execution of Poisson regression

In [34]:
# Construct a Poisson generalized linear model for `fix`
model = smf.glm(
    formula="fix ~ run + modelyear + precord + color_white + grade_GX + damage + AT",
    data=dat,
    family=sm.families.Poisson(),
)
# Run the Poisson regression
results = model.fit()
# Print the regression summary
print(results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    fix   No. Observations:                 1000
Model:                            GLM   Df Residuals:                      992
Model Family:                 Poisson   Df Model:                            7
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1770.8
Date:                Mon, 21 Jan 2019   Deviance:                       1098.0
Time:                        17:10:39   Pearson chi2:                     980.
No. Iterations:                     5   Covariance Type:             nonrobust
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept     -97.5966     15.919     -6.131      0.000    -128.797     -66.396
run          1.322e-05   9.13e-07     14.486     

In [36]:
# Predict `fix` for one car described by the model's explanatory variables
print(
    results.predict(
        pd.DataFrame(
            {
                "run": [5000],
                "modelyear": [2007],
                "precord": [0],
                "color_white": [1],
                "grade_GX": [0],
                "damage": [0],
                "AT": [1],
            }
        )
    )
)

0    1.714332
dtype: float64
