In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor # VIFを求めるため

###  データの読み込み Loading data

In [2]:
# Load the apartment data (text after "#" in the CSV is skipped as a comment)
# and drop every row that contains a missing value (NaN).
dat = (
    pd.read_csv("data/apartments.csv", sep=",", comment="#")
    .dropna()
)

### 変数増加法の関数step_aicを定義 Define the function step_aic for the forward selection method

In [3]:
def step_aic(model, exog, endog, **kwargs):
    """
    Select explanatory variables by forward stepwise selection on AIC.

    Starting from the intercept-only model, every round fits one model per
    remaining candidate (the already selected terms plus that candidate),
    prints its AIC, and adopts the candidate with the smallest AIC, provided
    that AIC is strictly lower than the current one. The search stops as soon
    as no candidate lowers the AIC or every candidate has been selected.

    Note: only "forward" selection is performed; a term that has been
    selected is never removed again.

    Args:
        model: model constructor that accepts a ``formula`` argument and
            returns an object whose ``fit()`` result exposes ``aic``
            (e.g. a function from statsmodels.formula.api).
        exog (str or list): candidate explanatory variables.
        endog (str or list): response variable(s). Several names are joined
            with " + " on the left-hand side (used for the Binomial family).
        **kwargs: extra keyword arguments forwarded to ``model``
            (e.g., data, family).

    Returns:
        The fitted result of the selected formula. When no candidate lowers
        the AIC (or ``exog`` is empty) this is the intercept-only model
        ``endog ~ 1``.
    """

    # Normalise exog and endog to flat lists of plain strings.
    exog = [str(name) for name in np.atleast_1d(exog).ravel()]
    endog = [str(name) for name in np.atleast_1d(endog).ravel()]
    remaining = set(exog)
    selected = []  # terms whose adoption is final

    # AIC of the intercept-only model is the baseline to beat.
    formula_head = ' + '.join(endog) + ' ~ '
    formula = formula_head + '1'
    current_score = model(formula=formula, **kwargs).fit().aic
    print('AIC: {}, formula: {}'.format(round(current_score, 3), formula))

    # Stop when every candidate is selected or no candidate lowers the AIC.
    while remaining:
        scores_with_candidates = []
        # Sorted iteration keeps the fitting/printing order reproducible
        # (set order of strings changes between interpreter sessions).
        for candidate in sorted(remaining):
            # AIC obtained by adding this single candidate to the selection
            formula = formula_head + ' + '.join(selected + [candidate])
            aic = model(formula=formula, **kwargs).fit().aic
            print('AIC: {}, formula: {}'.format(round(aic, 3), formula))
            scores_with_candidates.append((aic, candidate))

        # Smallest AIC wins; equal AICs are resolved by candidate name.
        best_new_score, best_candidate = min(scores_with_candidates)

        # A tie or a worse score means no improvement: stop explicitly.
        # (Comparing the two scores for equality in the loop condition would
        # spin forever on an exact tie.)
        if not best_new_score < current_score:
            break

        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    # With nothing selected, fall back to the intercept-only formula instead
    # of building an invalid formula with an empty right-hand side.
    formula = formula_head + (' + '.join(selected) if selected else '1')
    print('The best formula: {}'.format(formula))
    return model(formula, **kwargs).fit()

### 変数増加法により最も良いモデルを選択する Select the best model by forward selection

In [4]:
# Forward selection over the four candidate predictors of price.
# Note: step_aic returns the fitted result of the selected formula.
model = step_aic(
    smf.ols,
    exog=["area", "time", "year", "distance"],
    endog=["price"],
    data=dat,
)

AIC: 556.316, formula: price ~ 1
AIC: 513.392, formula: price ~ area
AIC: 554.695, formula: price ~ distance
AIC: 553.029, formula: price ~ time
AIC: 550.018, formula: price ~ year
AIC: 494.151, formula: price ~ area + distance
AIC: 490.349, formula: price ~ area + time
AIC: 509.306, formula: price ~ area + year
AIC: 492.126, formula: price ~ area + time + distance
AIC: 398.079, formula: price ~ area + time + year
AIC: 399.45, formula: price ~ area + time + year + distance
The best formula: price ~ area + time + year


### 回帰分析の実行 Run the regression analysis

In [5]:
# Construct the linear regression model with the selected predictors
model = smf.ols("price ~ area + time + year", data=dat)

# Fit the model
results = model.fit()

# Print the regression summary
print(results.summary())

# Calculate one VIF per column of the design matrix; the range covers every
# column listed in model.exog_names
col = model.exog.shape[1]  # number of columns in the design matrix
vifs = [
    variance_inflation_factor(model.exog, column_index)
    for column_index in range(col)
]
print(pd.DataFrame({"VIF": vifs}, index=model.exog_names))

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.996
Model:                            OLS   Adj. R-squared:                  0.995
Method:                 Least Squares   F-statistic:                     2059.
Date:                Mon, 24 Dec 2018   Prob (F-statistic):           5.15e-31
Time:                        17:20:59   Log-Likelihood:                -195.04
No. Observations:                  30   AIC:                             398.1
Df Residuals:                      26   BIC:                             403.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -1.083e+05   4544.816    -23.831      0.0