# Multiple Linear Regression

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from random import gauss
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats as stats

%matplotlib inline

### Running the Regression

First, we'll separate the data into our predictors (X) and target (y)

In [3]:
# Read the wine dataset from disk and preview its first five rows.
wine = pd.read_csv(filepath_or_buffer='data/wine.csv')

wine.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red_wine
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


In [4]:
# Split the frame: 'quality' alone is the target (a Series); every other
# column stays in the predictor DataFrame.
wine_target = wine['quality']
wine_preds = wine.drop(columns='quality')
wine_preds.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,red_wine
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1


In [5]:
# statsmodels' OLS does not add an intercept on its own, so prepend a column
# of 1s named 'const'; its fitted coefficient is the y-intercept.
predictors = sm.add_constant(wine_preds, prepend=True)
predictors

Unnamed: 0,const,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,red_wine
0,1.0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,1
1,1.0,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,1
2,1.0,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,1
3,1.0,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,1
4,1.0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,1.0,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,0
6493,1.0,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0
6494,1.0,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,0
6495,1.0,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,0


In [6]:
# Pairwise correlations between the predictors. Large magnitudes (e.g. free vs.
# total sulfur dioxide at about 0.72, density vs. alcohol at about -0.69) flag
# predictors that carry overlapping information.
wine_preds.corr()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,red_wine
fixed acidity,1.0,0.219008,0.324436,-0.111981,0.298195,-0.282735,-0.329054,0.45891,-0.2527,0.299568,-0.095452,0.48674
volatile acidity,0.219008,1.0,-0.377981,-0.196011,0.377124,-0.352557,-0.414476,0.271296,0.261454,0.225984,-0.03764,0.653036
citric acid,0.324436,-0.377981,1.0,0.142451,0.038998,0.133126,0.195242,0.096154,-0.329808,0.056197,-0.010493,-0.187397
residual sugar,-0.111981,-0.196011,0.142451,1.0,-0.12894,0.402871,0.495482,0.552517,-0.26732,-0.185927,-0.359415,-0.348821
chlorides,0.298195,0.377124,0.038998,-0.12894,1.0,-0.195045,-0.27963,0.362615,0.044708,0.395593,-0.256916,0.512678
free sulfur dioxide,-0.282735,-0.352557,0.133126,0.402871,-0.195045,1.0,0.720934,0.025717,-0.145854,-0.188457,-0.179838,-0.471644
total sulfur dioxide,-0.329054,-0.414476,0.195242,0.495482,-0.27963,0.720934,1.0,0.032395,-0.238413,-0.275727,-0.26574,-0.700357
density,0.45891,0.271296,0.096154,0.552517,0.362615,0.025717,0.032395,1.0,0.011686,0.259478,-0.686745,0.390645
pH,-0.2527,0.261454,-0.329808,-0.26732,0.044708,-0.145854,-0.238413,0.011686,1.0,0.192123,0.121248,0.329129
sulphates,0.299568,0.225984,0.056197,-0.185927,0.395593,-0.188457,-0.275727,0.259478,0.192123,1.0,-0.003029,0.487218


In [7]:
# Fit ordinary least squares with 'quality' as the response and the
# constant-augmented predictors as the design matrix, then show the summary.
ols_spec = sm.OLS(wine_target, predictors)
model = ols_spec.fit()
model.summary()

0,1,2,3
Dep. Variable:,quality,R-squared:,0.297
Model:,OLS,Adj. R-squared:,0.295
Method:,Least Squares,F-statistic:,227.8
Date:,"Wed, 03 Mar 2021",Prob (F-statistic):,0.0
Time:,10:27:23,Log-Likelihood:,-7195.2
No. Observations:,6497,AIC:,14420.0
Df Residuals:,6484,BIC:,14500.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,104.3904,14.105,7.401,0.000,76.741,132.040
fixed acidity,0.0851,0.016,5.396,0.000,0.054,0.116
volatile acidity,-1.4924,0.081,-18.345,0.000,-1.652,-1.333
citric acid,-0.0626,0.080,-0.786,0.432,-0.219,0.094
residual sugar,0.0624,0.006,10.522,0.000,0.051,0.074
chlorides,-0.7573,0.334,-2.264,0.024,-1.413,-0.102
free sulfur dioxide,0.0049,0.001,6.443,0.000,0.003,0.006
total sulfur dioxide,-0.0014,0.000,-4.333,0.000,-0.002,-0.001
density,-103.9096,14.336,-7.248,0.000,-132.013,-75.806

0,1,2,3
Omnibus:,140.992,Durbin-Watson:,1.648
Prob(Omnibus):,0.0,Jarque-Bera (JB):,313.985
Skew:,0.016,Prob(JB):,6.59e-69
Kurtosis:,4.077,Cond. No.,296000.0


## Scaling

Before we fit our next linear regression, let's *scale* our columns as z-scores. Why?

In short, it's useful to have all of our variables on the same scale, so that the resulting coefficients are easier to interpret and compare. If the scales of the variables are very different from one another, then some of the coefficients may end up very large or very tiny.

A z-score is computed as $z = (x - \mu) / \sigma$, so after scaling each column has a mean of 0 and a standard deviation of 1.

For more on this, see [this post](https://stats.stackexchange.com/questions/32649/some-of-my-predictors-are-on-very-different-scales-do-i-need-to-transform-them).

Let's try a model with our wine dataset now.

In [13]:
# We'll include all the columns for now.
# Z-score calculation: z = (x - column mean) / column standard deviation.
# Use the DataFrame's own reductions so the statistics are explicitly per-column:
# np.mean(df) / np.std(df) delegate to DataFrame.mean/std with axis=None, which
# newer pandas versions treat (or will treat) as a reduction over the whole
# frame, yielding a single scalar instead of one value per column.
# ddof=0 is the population standard deviation, the same definition np.std and
# sklearn's StandardScaler use.
wine_preds_scaled = (wine_preds - wine_preds.mean(axis=0)) / wine_preds.std(axis=0, ddof=0)

In [14]:
# Every column now has mean ~0 and std ~1. Notice that the binary red_wine
# indicator was z-scored too: it no longer takes the values 0/1 but roughly
# -0.57 and 1.75, which makes it harder to read as a simple flag.
wine_preds_scaled.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,red_wine
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,-3.849639e-16,1.049902e-16,2.187295e-17,3.499672e-17,3.499672e-17,-8.749179e-17,-6.999344e-17,-3.534668e-15,2.729744e-15,-5.424491e-16,9.361622e-16,0.0
std,1.000077,1.000077,1.000077,1.000077,1.000077,1.000077,1.000077,1.000077,1.000077,1.000077,1.000077,1.000077
min,-2.634589,-1.57733,-2.192833,-1.018034,-1.342639,-1.663583,-1.94178,-2.530192,-3.100615,-2.091935,-2.08935,-0.571367
25%,-0.6289329,-0.6661613,-0.4723335,-0.7657978,-0.5147986,-0.7620742,-0.6855323,-0.7859527,-0.6748622,-0.6805919,-0.8316152,-0.571367
50%,-0.1660892,-0.3016939,-0.05941375,-0.5135612,-0.2578826,-0.08594301,0.03990667,0.06448888,-0.05287424,-0.1429373,-0.1608231,-0.571367
75%,0.3738951,0.3664962,0.4911459,0.5584445,0.2559494,0.5901882,0.7122647,0.7648525,0.6313125,0.4619241,0.677667,-0.571367
max,6.699425,7.534354,9.231281,12.68682,15.84219,14.56357,5.737257,14.76879,4.923029,9.870879,3.696231,1.75019


In [15]:
# After z-scoring, each coefficient is the expected change in quality for a
# one-standard-deviation increase in that predictor (instead of a one-unit
# increase), so coefficient magnitudes can be compared across variables.
# The fit itself is unchanged: R^2 and the predictors' t-statistics and
# p-values match the unscaled model; only coefficients and the intercept move.
predictors = sm.add_constant(wine_preds_scaled)
model = sm.OLS(endog=wine_target, exog=predictors).fit()
model.summary()

0,1,2,3
Dep. Variable:,quality,R-squared:,0.297
Model:,OLS,Adj. R-squared:,0.295
Method:,Least Squares,F-statistic:,227.8
Date:,"Wed, 03 Mar 2021",Prob (F-statistic):,0.0
Time:,10:31:25,Log-Likelihood:,-7195.2
No. Observations:,6497,AIC:,14420.0
Df Residuals:,6484,BIC:,14500.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.8184,0.009,639.726,0.000,5.801,5.836
fixed acidity,0.1103,0.020,5.396,0.000,0.070,0.150
volatile acidity,-0.2457,0.013,-18.345,0.000,-0.272,-0.219
citric acid,-0.0091,0.012,-0.786,0.432,-0.032,0.014
residual sugar,0.2970,0.028,10.522,0.000,0.242,0.352
chlorides,-0.0265,0.012,-2.264,0.024,-0.049,-0.004
free sulfur dioxide,0.0876,0.014,6.443,0.000,0.061,0.114
total sulfur dioxide,-0.0793,0.018,-4.333,0.000,-0.115,-0.043
density,-0.3116,0.043,-7.248,0.000,-0.396,-0.227

0,1,2,3
Omnibus:,140.992,Durbin-Watson:,1.648
Prob(Omnibus):,0.0,Jarque-Bera (JB):,313.985
Skew:,0.016,Prob(JB):,6.59e-69
Kurtosis:,4.077,Cond. No.,12.6


## Multiple Regression in Scikit-Learn

In [16]:
# Build a StandardScaler and let .fit() learn each column's mean and standard
# deviation (.fit() returns the scaler itself, so the two steps can be chained).
ss = StandardScaler().fit(wine_preds)

# .transform() then applies the learned z-score scaling to the same data.
wine_preds_st_scaled = ss.transform(wine_preds)

In [17]:
# Sanity check: the StandardScaler output should agree (within floating-point
# tolerance) with the z-scores we computed by hand.
np.allclose(wine_preds_st_scaled, wine_preds_scaled.to_numpy())

True

In [18]:
# Preview the hand-scaled predictors (a DataFrame, so column labels are kept).
wine_preds_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,red_wine
0,0.142473,2.188833,-2.192833,-0.744778,0.569958,-1.10014,-1.446359,1.034993,1.81309,0.193097,-0.915464,1.75019
1,0.451036,3.282235,-2.192833,-0.59764,1.197975,-0.31132,-0.862469,0.701486,-0.115073,0.999579,-0.580068,1.75019
2,0.451036,2.5533,-1.917553,-0.660699,1.026697,-0.874763,-1.092486,0.768188,0.25812,0.797958,-0.580068,1.75019
3,3.073817,-0.362438,1.661085,-0.744778,0.541412,-0.762074,-0.986324,1.101694,-0.363868,0.32751,-0.580068,1.75019
4,0.142473,2.188833,-2.192833,-0.744778,0.569958,-1.10014,-1.446359,1.034993,1.81309,0.193097,-0.915464,1.75019


In [19]:
# First five rows of the StandardScaler output, to compare value-by-value with
# the hand-scaled DataFrame; this is a plain array, so there are no column labels.
wine_preds_st_scaled[:5, :]

array([[ 0.14247327,  2.18883292, -2.19283252, -0.7447781 ,  0.56995782,
        -1.10013986, -1.44635852,  1.03499282,  1.81308951,  0.19309677,
        -0.91546416,  1.75018984],
       [ 0.45103572,  3.28223494, -2.19283252, -0.59764007,  1.1979747 ,
        -0.31132009, -0.86246863,  0.70148631, -0.11507303,  0.99957862,
        -0.58006813,  1.75018984],
       [ 0.45103572,  2.55330026, -1.91755268, -0.66069923,  1.02669737,
        -0.87476278, -1.09248586,  0.76818761,  0.25811972,  0.79795816,
        -0.58006813,  1.75018984],
       [ 3.07381662, -0.36243847,  1.66108525, -0.7447781 ,  0.54141159,
        -0.76207424, -0.98632406,  1.10169412, -0.3638682 ,  0.32751041,
        -0.58006813,  1.75018984],
       [ 0.14247327,  2.18883292, -2.19283252, -0.7447781 ,  0.56995782,
        -1.10013986, -1.44635852,  1.03499282,  1.81308951,  0.19309677,
        -0.91546416,  1.75018984]])

In [20]:
# Fit scikit-learn's LinearRegression on the scaled predictors, with wine
# quality as the target. The estimator adds the intercept itself, so no
# constant column is needed here.

lr = LinearRegression()
lr.fit(X=wine_preds_st_scaled, y=wine_target)

LinearRegression()

In [21]:
# We can use the .coef_ attribute to recover the fitted slopes, one per
# predictor column. The leading values match the coefficients in the
# statsmodels summary of the scaled model (0.1103, -0.2457, -0.0091, ...).

lr.coef_

array([ 0.11027401, -0.24568548, -0.00909927,  0.29704168, -0.02652718,
        0.08762284, -0.07927578, -0.311567  ,  0.08018737,  0.10739154,
        0.26556038,  0.155642  ])

In [22]:
# The fitted intercept; it matches the 'const' coefficient (5.8184) from the
# statsmodels fit on the scaled predictors.
lr.intercept_

5.818377712790517

In [23]:
# .score() returns R^2, here evaluated on the same data the model was fit on;
# it agrees with the statsmodels R-squared of 0.297.
lr.score(wine_preds_st_scaled, wine_target)

0.29653465192890527

In [24]:
# In-sample predictions: the model's predicted quality for each row it was fit on.
lr.predict(wine_preds_st_scaled)

array([4.9711381 , 4.91138099, 5.03013256, ..., 5.3914881 , 6.45904385,
       6.24475934])