# 3.6.3 Multiple Linear Regression
## Import and load data

In [81]:
from scipy import stats
import pandas as pd
import seaborn as sns
import scipy as sp
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name when it is
# available and fall back to the legacy one on older installs.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')


# Read the Boston data set from the repository-relative Data/ folder
# and preview its first ten rows.
boston = pd.read_csv('Data/Boston.csv')
boston.head(10)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [82]:
# Regress medv on every other column of `boston`; the right-hand side of the
# formula is assembled by joining the remaining column names with '+'.
res = smf.ols(
    formula='medv ~ ' + '+'.join(boston.columns.difference(['medv'])),
    data=boston,
).fit()
# Table 1 of the summary is the coefficient table (coef, std err, t, p-value, CI).
res.summary().tables[1]


0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,36.4595,5.103,7.144,0.000,26.432 46.487
age,0.0007,0.013,0.052,0.958,-0.025 0.027
black,0.0093,0.003,3.467,0.001,0.004 0.015
chas,2.6867,0.862,3.118,0.002,0.994 4.380
crim,-0.1080,0.033,-3.287,0.001,-0.173 -0.043
dis,-1.4756,0.199,-7.398,0.000,-1.867 -1.084
indus,0.0206,0.061,0.334,0.738,-0.100 0.141
lstat,-0.5248,0.051,-10.347,0.000,-0.624 -0.425
nox,-17.7666,3.820,-4.651,0.000,-25.272 -10.262


In the above regression output, ${\tt age}$ and ${\tt indus}$ have high p-values, so we may wish to run a regression excluding these predictors:

In [74]:
# Fit the model without `age` and `indus` under its own name instead of
# rebinding `res`. The R^2 / RSE cells further down read `res`, and their
# recorded outputs appear to come from the full fit (this cell was executed as
# In[74], before In[82]); overwriting `res` here would make a fresh
# Restart & Run All report different numbers than the ones shown.
res_reduced = smf.ols('medv ~ ' + '+'.join(boston.columns.difference(['medv', 'age', 'indus'])), boston).fit()
res_reduced.summary().tables[1];  # trailing semicolon suppresses the rendered coefficient table


###### $R^2$ statistic

In [83]:
# Report the coefficient of determination of the fit currently bound to `res`.
print("R^2: %f" % (res.rsquared,))

R^2: 0.740643


###### Residual standard error

In [86]:
# Residual standard error: square root of the residual mean square of `res`.
print("RSE: %f" % (np.sqrt(res.mse_resid),))

RSE: 4.745298


###### VIF (variance inflation factors)

In [115]:
from patsy import dmatrices
from statsmodels.stats import outliers_influence

# Build the design matrix for medv regressed on every other column; patsy
# returns the response (y) and the predictor matrix (X) as DataFrames.
y, X = dmatrices(
    'medv ~ ' + '+'.join(boston.columns.difference(['medv'])),
    boston,
    return_type='dataframe',
)

# One variance inflation factor per column of X, paired with the column name.
vif = pd.DataFrame({
    "vif": [
        outliers_influence.variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
    "features": X.columns,
})
vif

Unnamed: 0,vif,features
0,585.265238,Intercept
1,3.100826,age
2,1.348521,black
3,1.073995,chas
4,1.792192,crim
5,3.955945,dis
6,3.991596,indus
7,2.941491,lstat
8,4.39372,nox
9,1.799084,ptratio
