In [29]:
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [4]:
# Read the Boston data set from its CSV file into a DataFrame
boston_csv_path = '../../DataSets/Boston.csv'
boston = pd.read_csv(boston_csv_path)

In [6]:
# Ordinary least squares: regress medv on the two predictors lstat and age,
# then print the text summary of the fitted model
fit_res = smf.ols(formula='medv ~ lstat + age', data=boston).fit()
print(fit_res.summary())

                            OLS Regression Results                            
Dep. Variable:                   medv   R-squared:                       0.551
Model:                            OLS   Adj. R-squared:                  0.549
Method:                 Least Squares   F-statistic:                     309.0
Date:                Tue, 25 Apr 2017   Prob (F-statistic):           2.98e-88
Time:                        16:12:23   Log-Likelihood:                -1637.5
No. Observations:                 506   AIC:                             3281.
Df Residuals:                     503   BIC:                             3294.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     33.2228      0.731     45.458      0.0

In [26]:
# Multiple regression of medv on every other column of the data set
# (the response "medv" itself is excluded from the predictors)

all_columns = "+".join(
    boston.columns.drop("medv")
)
my_string = "medv ~ " + all_columns

fit_res = smf.ols(formula=my_string, data=boston).fit()

print(fit_res.summary())
      

                            OLS Regression Results                            
Dep. Variable:                   medv   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Tue, 25 Apr 2017   Prob (F-statistic):          6.72e-135
Time:                        16:41:40   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     36.4595      5.103      7.144      0.0

In [33]:
# Refit the model without age, which is omitted because of its large p-value

all_columns = "+".join(
    boston.columns.drop(["medv", "age"])
)
my_formula = "medv ~ " + all_columns

fit_res = smf.ols(formula=my_formula, data=boston).fit()

print(fit_res.summary())

                            OLS Regression Results                            
Dep. Variable:                   medv   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     117.3
Date:                Tue, 25 Apr 2017   Prob (F-statistic):          6.08e-136
Time:                        17:28:08   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3024.
Df Residuals:                     493   BIC:                             3079.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     36.4369      5.080      7.172      0.0

In [94]:
# Refit once more, this time leaving out indus in addition to age

all_columns = "+".join(
    boston.columns.drop(["medv", "age", "indus"])
)
my_formula = "medv ~ " + all_columns

fit_res = smf.ols(formula=my_formula, data=boston).fit()

print(fit_res.summary())

                            OLS Regression Results                            
Dep. Variable:                   medv   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.735
Method:                 Least Squares   F-statistic:                     128.2
Date:                Tue, 25 Apr 2017   Prob (F-statistic):          5.54e-137
Time:                        18:08:22   Log-Likelihood:                -1498.9
No. Observations:                 506   AIC:                             3022.
Df Residuals:                     494   BIC:                             3072.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     36.3411      5.067      7.171      0.0

In [98]:

# Build the two design matrices for the full model: y holds the left-hand
# side (medv) and X the right-hand side (intercept plus every other column)
all_columns = "+".join(boston.columns.drop("medv"))
my_formula = "medv ~ " + all_columns

y, X = dmatrices(my_formula, data=boston, return_type="dataframe")

# Check for collinearity: compute one variance inflation factor per column
# of the design matrix X, then shape the values into a single row
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif = np.array(vif).reshape(1, X.shape[1])

# Take the column labels from the design matrix itself rather than rebuilding
# them from the raw data frame, so every label is guaranteed to sit above the
# VIF value computed for that exact column. The intercept label is lowercased
# to keep the table header as "intercept".
names = ["intercept" if col == "Intercept" else col for col in X.columns]

# One-row table: a VIF value under each design-matrix column name
vif_df = pd.DataFrame(vif, columns=names)
vif_df

Unnamed: 0,intercept,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,585.265238,1.792192,2.298758,3.991596,1.073995,4.39372,1.933744,3.100826,3.955945,7.484496,9.008554,1.799084,1.348521,2.941491
