# Comparison of Regression Methods using Boston Housing Data

In [1]:
# import all the required libraries and put matplotlib in inline mode to plot on the notebook
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import model_selection
%matplotlib inline

We load the data, split it into a train set (67%) and a test set (33%), and display summary statistics for every column.

In [2]:
# Read the housing data and hold out 33% of the rows as a test set.
# The fixed random_state makes the split the same on every run.
dataset = pd.read_csv('housing.csv')
train, test = model_selection.train_test_split(
    dataset,
    test_size=0.33,
    random_state=1234,
)

# Last expression of the cell: summary statistics of the full dataset.
dataset.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [3]:
# Column to predict; every other column of the dataset is used as an input.
target = 'MEDV'
is_input_column = dataset.columns != target
variables = dataset.columns[is_input_column]

Let's select the input variables (x) and the target variable (y) both in the train and the test sets.

In [4]:
# Inputs: selecting several columns already yields a 2-D
# (rows, len(variables)) array, so no reshape is needed.
# Targets: a single column is 1-D, so it is reshaped into a one-column matrix.
train_x = train[variables].values
train_y = train[target].values.reshape(-1, 1)

test_x = test[variables].values
test_y = test[target].values.reshape(-1, 1)

## Linear Regression
To evaluate linear regression we generate a model from the train set and evaluate it on the test set.

In [5]:
# Ordinary least squares: fit on the train split, then score the fitted
# model on both the train and the test split.
simple = linear_model.LinearRegression()
simple_fit = simple.fit(train_x, train_y)
r2_simple_train, r2_simple_test = (
    simple.score(train_x, train_y),
    simple.score(test_x, test_y),
)
print("Simple Model \t%3.3f\t%3.3f" % (r2_simple_train, r2_simple_test))

# Row 0 is the intercept; rows 1.. are the coefficients in the column order
# of `variables`. The "Degree" header is therefore a positional index, not a
# polynomial degree.
print("\nDegree\tCoefficient")
simple_weights = np.append(simple_fit.intercept_, simple_fit.coef_)
for position, weight in enumerate(simple_weights):
    print("%d\t%3.3f" % (position, weight))

Simple Model 	0.728	0.737

Degree	Coefficient
0	54.018
1	-0.102
2	0.063
3	-0.023
4	2.744
5	-22.747
6	2.436
7	0.005
8	-1.898
9	0.389
10	-0.014
11	-1.120
12	0.007
13	-0.586


## Ridge Regression
We use the <code>RidgeCV</code> estimator to select the best alpha for Ridge regression by 10-fold cross-validation on the train set, then we evaluate the model with the selected alpha on the test set.

In [6]:
# Ridge regression with alpha chosen by 10-fold cross-validation on the
# train split; the fitted model is then scored on both splits.
# NOTE(review): no `alphas` grid is passed, so the library default is used.
# The stored output reports alpha = 0.100; if that is the smallest value in
# the default grid, a wider grid may be worth trying - confirm against the
# RidgeCV documentation.
ridge = linear_model.RidgeCV(cv=10)
ridge_fit = ridge.fit(train_x, train_y)
r2_ridge_train, r2_ridge_test = (
    ridge.score(train_x, train_y),
    ridge.score(test_x, test_y),
)
print("Ridge(%3.3f) \t%3.3f\t%3.3f" % (ridge_fit.alpha_, r2_ridge_train, r2_ridge_test))

# Row 0 is the intercept; rows 1.. are the coefficients in the column order
# of `variables` (the "Degree" header is a positional index).
print("\nDegree\tCoefficient")
ridge_weights = np.append(ridge_fit.intercept_, ridge_fit.coef_)
for position, weight in enumerate(ridge_weights):
    print("%d\t%3.3f" % (position, weight))

Ridge(0.100) 	0.728	0.739

Degree	Coefficient
0	52.604
1	-0.101
2	0.063
3	-0.032
4	2.710
5	-20.823
6	2.448
7	0.004
8	-1.863
9	0.385
10	-0.014
11	-1.097
12	0.007
13	-0.588


## Lasso Regression
We use the <code>LassoCV</code> estimator to select the best alpha for Lasso regression by 10-fold cross-validation on the train set, then we evaluate the model with the selected alpha on the test set.

In [7]:
# Lasso regression with alpha chosen by 10-fold cross-validation on the
# train split; the fitted model is then scored on both splits.
#
# The model is fitted on the same NumPy arrays that are used for scoring
# (and for the other models in this notebook). Fitting on DataFrames and
# scoring on plain arrays makes recent scikit-learn versions warn that the
# scored X has no feature names. The numeric data is the same either way.
# `ravel()` turns the (rows, 1) target matrix into the 1-D vector that a
# single-output LassoCV expects.
lasso = linear_model.LassoCV(cv=10)
lasso_fit = lasso.fit(train_x, train_y.ravel())
r2_lasso_train = lasso.score(train_x, train_y)
r2_lasso_test = lasso.score(test_x, test_y)
print("Lasso(%3.3f) \t%3.3f\t%3.3f" % (lasso_fit.alpha_, r2_lasso_train, r2_lasso_test))

# Row 0 is the intercept; rows 1.. are the coefficients in the column order
# of `variables`. Coefficients printed as 0.000 were shrunk to zero by the
# L1 penalty.
for i, c in enumerate(np.append(lasso_fit.intercept_, lasso_fit.coef_)):
    print("%d\t%3.3f" % (i, c))
    

Lasso(0.668) 	0.682	0.684
0	47.995
1	-0.064
2	0.076
3	-0.060
4	0.000
5	-0.000
6	0.604
7	0.011
8	-1.086
9	0.355
10	-0.018
11	-0.824
12	0.008
13	-0.741


## Summary
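The table below summarizes each model's R² score on the train set (first column) and on the test set (second column). For Ridge and Lasso, the alpha selected by cross-validation is shown in parentheses.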

In [8]:
# One row per model: (format string, values). The last two values of each
# row are the train and test scores computed in the cells above.
summary_rows = [
    ("Simple Model \t%3.3f\t%3.3f", (r2_simple_train, r2_simple_test)),
    ("Ridge(%3.3f) \t%3.3f\t%3.3f", (ridge_fit.alpha_, r2_ridge_train, r2_ridge_test)),
    ("Lasso(%3.3f) \t%3.3f\t%3.3f", (lasso_fit.alpha_, r2_lasso_train, r2_lasso_test)),
]
for row_format, row_values in summary_rows:
    print(row_format % row_values)

Simple Model 	0.728	0.737
Ridge(0.100) 	0.728	0.739
Lasso(0.668) 	0.682	0.684
