# Linear Regression

## Import packages

In [16]:
import pandas as pd
import numpy as np
import math
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

## Import dataset

In [3]:
# Load the Boston housing data set that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably needs an older scikit-learn release -- confirm the pinned version.
dataset = load_boston()

# Feature matrix as a DataFrame, with one named column per feature.
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)

# Regression target, exactly as returned by the loader.
y = dataset.target

# Preview the first rows, followed by per-column summary statistics.
display(X.head(), X.describe())

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


## Estimate a linear regression model

In [37]:
# Hold out a test set so both models are scored on data they were not fitted on.
# Fixing random_state makes the split (and every number below) reproducible.
# NOTE(review): no split cell exists in the notebook; test_size=0.3 / random_state=42
# is an assumption -- confirm it reproduces the recorded outputs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the model on the training set. The intercept is optional, so both variants
# are estimated to compare them side by side.
model_with_intercept = LinearRegression(fit_intercept=True).fit(X_train, y_train)
model_without_intercept = LinearRegression(fit_intercept=False).fit(X_train, y_train)

# Predict the held-out targets with each variant.
prediction_with_intercept = model_with_intercept.predict(X_test)
prediction_without_intercept = model_without_intercept.predict(X_test)

# Collect the estimated coefficients in one-row tables. Columns are labelled with the
# feature names because position 0 would otherwise mean "intercept" in one table and
# the first feature in the other.
feature_labels = list(X.columns)
coefficients_w = pd.DataFrame(
    np.hstack((model_with_intercept.intercept_, model_with_intercept.coef_)).reshape(1, -1),
    index=['value'],
    columns=['intercept'] + feature_labels,
)
coefficients_wo = pd.DataFrame(
    model_without_intercept.coef_.reshape(1, -1),
    index=['value'],
    columns=feature_labels,
)

print("coefficients of model with intercept:")
display(coefficients_w)
print("\ncoefficients of model without intercept:")
display(coefficients_wo)

coefficients of model with intercept:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
value,31.631084,-0.13347,0.035809,0.049523,3.119835,-15.417061,4.057199,-0.010821,-1.385998,0.242727,-0.008702,-0.910685,0.011794,-0.547113



coefficients of model without intercept:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
value,-0.119845,0.038353,0.038048,3.059651,-2.095502,5.843406,-0.015081,-0.919043,0.116974,-0.006085,-0.428269,0.01664,-0.466386


## Performance metrics

Since linear regression models make continuous predictions, neither a ROC curve nor a confusion matrix makes much sense. Commonly used metrics look at prediction errors, such as MAE (mean absolute error), MSE (mean squared error), and RMSE (root mean squared error). Another approach is to look at how well the model explains the variance: the R-squared value. Note that the latter has a pitfall: adding more variables to your model never lowers the R-squared on the training data, even when those variables carry no real signal, so it is sometimes worth looking at an adjusted variant of R-squared that introduces a penalty for the number of variables.

In [17]:
# Score both models on the held-out test set.

# MAE: mean absolute error
MAE_w = metrics.mean_absolute_error(y_test, prediction_with_intercept)
MAE_wo = metrics.mean_absolute_error(y_test, prediction_without_intercept)

# MSE: mean squared error
MSE_w = metrics.mean_squared_error(y_test, prediction_with_intercept)
MSE_wo = metrics.mean_squared_error(y_test, prediction_without_intercept)

# RMSE: square root of the MSE
RMSE_w = math.sqrt(MSE_w)
RMSE_wo = math.sqrt(MSE_wo)

# R^2: coefficient of determination
R_sq_w = metrics.r2_score(y_test, prediction_with_intercept)
R_sq_wo = metrics.r2_score(y_test, prediction_without_intercept)

# The summary table gets its own name. Binding it to `metrics` would shadow the
# imported sklearn.metrics module, so re-running this cell (or any later
# `metrics.<function>` call) would fail with an AttributeError.
metrics_summary = pd.DataFrame(
    [[MAE_w, MSE_w, RMSE_w, R_sq_w], [MAE_wo, MSE_wo, RMSE_wo, R_sq_wo]],
    columns=['MAE', 'MSE', 'RMSE', 'R squared'],
    index=['with intercept', 'without intercept'],
)
display(metrics_summary)

Unnamed: 0,MAE,MSE,RMSE,R squared
with intercept,3.16271,21.517444,4.63869,0.711226
without intercept,3.289227,24.868111,4.986794,0.666259
