In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor # VIFを求めるため

###  データの読み込み Loading data

In [17]:
# Read the CSV and swap the State labels for integer codes in one chained
# expression; any label missing from the mapping becomes NaN (Series.map).
dat = pd.read_csv("data/data2.csv", delimiter=",").assign(
    State=lambda frame: frame["State"].map(
        {"NewYork": 0, "California": 1, "Florida": 2}
    )
)
dat

Unnamed: 0,RDSpend,Administration,MarketingSpend,State,Profit
0,165349.2,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,1,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,0,182901.99
4,142107.34,91391.77,366168.42,2,166187.94
5,131876.9,99814.71,362861.36,0,156991.12
6,134615.46,147198.87,127716.82,1,156122.51
7,130298.13,145530.06,323876.68,2,155752.6
8,120542.52,148718.95,311613.29,0,152211.77
9,123334.88,108679.17,304981.62,1,149759.96


In [18]:
# Full model: regress Profit on every explanatory variable in the data.
# NOTE(review): State holds the integer codes 0/1/2 assigned at load time, so
# it enters the formula as a single numeric regressor - confirm this is
# intended (C(State) would treat it as categorical instead).
model = smf.ols(
    formula="Profit ~ RDSpend + Administration + MarketingSpend + State",
    data=dat,
)
# Fit by ordinary least squares and show the summary table.
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.922
Model:                            OLS   Adj. R-squared:                  0.859
Method:                 Least Squares   F-statistic:                     14.74
Date:                Mon, 07 Jan 2019   Prob (F-statistic):            0.00564
Time:                        18:06:21   Log-Likelihood:                -98.842
No. Observations:                  10   AIC:                             207.7
Df Residuals:                       5   BIC:                             209.2
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       2.414e+04   2.35e+04      1.

  "anyway, n=%i" % int(n))


### 変数増加法の関数step_aicを定義 Define the function step_aic for the forward selection method

In [5]:
def step_aic(model, exog, endog, **kwargs):
    """
    Select explanatory variables by forward stepwise selection on AIC.

    Starting from the intercept-only model, every remaining candidate is
    tried in turn and the one giving the lowest AIC is added, as long as
    that AIC is strictly lower than the AIC of the current model.
    Each fitted formula and its AIC are printed as a log.

    Both exog and endog can be either a str or a list of str.
    (An endog list is for the Binomial family.)

    Note: only "forward" selection is implemented; a variable that has been
    selected is never removed again.

    Args:
        model: model constructor from statsmodels.formula.api (e.g. smf.ols).
            It is called with a formula plus **kwargs, and the object it
            returns must offer fit() whose result has an ``aic`` attribute.
        exog (str or list): candidate exogenous variables
        endog (str or list): endogenous variables
        kwargs: extra keyword arguments for model (e.g., data, family)

    Returns:
        The fitted result for the formula with the smallest AIC found.
        When no candidate lowers the AIC (or exog is empty), this is the
        intercept-only model.
    """

    # Force exog and endog into flat lists of names so that a single str and
    # a list are handled the same way.
    endog = [str(name) for name in np.atleast_1d(endog).ravel()]
    # dict.fromkeys drops duplicates while keeping the caller's order, so the
    # search order (and therefore the printed log) is reproducible; iterating
    # over a set of strings is not.
    remaining = list(
        dict.fromkeys(str(name) for name in np.atleast_1d(exog).ravel())
    )
    selected = []  # variables whose inclusion has been confirmed

    formula_head = ' + '.join(endog) + ' ~ '

    def fit_aic(formula_tail):
        """Fit ``formula_head + formula_tail``, log it and return its AIC."""
        formula = formula_head + formula_tail
        aic = model(formula=formula, **kwargs).fit().aic
        print('AIC: {}, formula: {}'.format(round(aic, 3), formula))
        return aic

    # Baseline: AIC of the intercept-only model.
    current_score = fit_aic('1')

    # Stop when every variable has been selected or no addition lowers the AIC.
    while remaining:
        scores_with_candidates = [
            (fit_aic(' + '.join(selected + [candidate])), candidate)
            for candidate in remaining
        ]

        # Lowest AIC wins; ties are broken by candidate name (tuple order).
        best_new_score, best_candidate = min(scores_with_candidates)

        # Require a strict improvement. Breaking here (instead of re-testing a
        # loop condition) also guarantees termination when the best candidate
        # merely ties the current AIC.
        if not best_new_score < current_score:
            break

        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    # An empty right-hand side is not a valid formula, so fall back to the
    # intercept-only model when nothing was selected.
    formula = formula_head + (' + '.join(selected) if selected else '1')
    print('The best formula: {}'.format(formula))
    return model(formula, **kwargs).fit()

### 変数増加法により最も良いモデルを選択する Select the best model by forward selection

In [19]:
# Forward selection over all four candidate regressors.
# Note: step_aic returns an already fitted result, so `model` holds a
# results object here rather than an unfitted model.
model = step_aic(
    smf.ols,
    exog=["RDSpend", "Administration", "MarketingSpend", "State"],
    endog=["Profit"],
    data=dat,
)


AIC: 225.174, formula: Profit ~ 1
AIC: 219.569, formula: Profit ~ MarketingSpend
AIC: 203.561, formula: Profit ~ RDSpend
AIC: 227.172, formula: Profit ~ State
AIC: 227.174, formula: Profit ~ Administration
AIC: 203.803, formula: Profit ~ RDSpend + MarketingSpend
AIC: 205.308, formula: Profit ~ RDSpend + State
AIC: 205.338, formula: Profit ~ RDSpend + Administration
The best formula: Profit ~ RDSpend


### 回帰分析の実行 Run the regression analysis

In [20]:
# Refit using the formula reported as best by the forward selection step.
model = smf.ols(formula="Profit ~ RDSpend", data=dat)
results = model.fit()
print(results.summary())

# Variance inflation factors: one value per column of the design matrix
# (every column in model.exog is included, the Intercept column too).
col = model.exog.shape[1]  # number of columns in the design matrix
vifs = [variance_inflation_factor(model.exog, idx) for idx in range(col)]
print(pd.DataFrame({"VIF": vifs}, index=model.exog_names))

                            OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.894
Method:                 Least Squares   F-statistic:                     76.84
Date:                Tue, 08 Jan 2019   Prob (F-statistic):           2.25e-05
Time:                        11:34:50   Log-Likelihood:                -99.781
No. Observations:                  10   AIC:                             203.6
Df Residuals:                       8   BIC:                             204.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    1.59e+04   1.76e+04      0.902      0.3

  "anyway, n=%i" % int(n))
