In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Location of the Boston-housing training CSV.
# NOTE: absolute, machine-specific path -- adjust it when running elsewhere.
TRAIN_CSV_PATH = '/home/johan/Desktop/boston-housing/train.csv'

# Load the training set, using the 'ID' column as the row index.
train_df = pd.read_csv(TRAIN_CSV_PATH, index_col='ID')

# Preview the first rows (train_df.info() lists the columns and dtypes instead).
train_df.head()

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9


In [2]:
# Separate the target column 'medv' from the remaining predictor columns.
y = train_df['medv']
X = train_df.drop(columns=['medv'])

for caption, data in (("X shape:", X), ("y shape:", y)):
    print(caption, data.shape)

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

for caption, data in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, data.shape)

X shape: (333, 13)
y shape: (333,)
X_train shape: (233, 13)
y_train shape: (233,)
X_test shape: (100, 13)
y_test shape: (100,)


In [3]:
# Baseline: ordinary least squares fitted on the untransformed features.
lr_model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on both splits before reporting.
train_score = lr_model.score(X_train, y_train)
test_score = lr_model.score(X_test, y_test)
print('Training score: {}'.format(train_score))
print('Test score: {}'.format(test_score))

# Root-mean-squared error of the predictions on the held-out split.
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

print('RMSE: {}'.format(rmse))

Training score: 0.7268827869293253
Test score: 0.725468795925456
RMSE: 4.587100299689435


In [4]:
# Standardize the features, expand them to degree-2 polynomial terms, then fit
# an unregularized linear regression on the expanded design matrix.
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps).fit(X_train, y_train)

train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)
print('Training score: {}'.format(train_score))
print('Test score: {}'.format(test_score))

Training score: 0.9475767600691033
Test score: 0.4676268497187529


For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].

The standard score of a sample x is calculated as:

    z = (x - u) / s

where u is the mean of the training samples and s is their standard deviation.

In [5]:
# Toy 3x2 input showing how PolynomialFeatures(2) expands two columns [a, b]
# into [1, a, b, a^2, ab, b^2].
# It is stored under its own name so that the feature matrix `X` built from
# train_df is not silently overwritten by this demonstration.
X_demo = np.arange(6).reshape(3, 2)
print('X:', X_demo)
poly = PolynomialFeatures(2)
# Last expression of the cell: the expanded matrix is shown as the cell output.
poly.fit_transform(X_demo)


X: [[0 1]
 [2 3]
 [4 5]]


array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [6]:
# PolynomialFeatures: generate polynomial and interaction features.
# StandardScaler: standardize features by removing the mean and scaling to
# unit variance.

# Expand the raw training features once, only to report how many columns the
# degree-2 expansion produces (the original ran this transform twice and
# discarded the first result).
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
print('PolynomialFeatures:', X_train_poly.shape)
print('-------------------')

# Same scale -> degree-2 expansion as the plain polynomial model, but with a
# Ridge (alpha=10) regressor as the final step.
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=10, fit_intercept=True)),
]

ridge_pipe = Pipeline(steps)
ridge_pipe.fit(X_train, y_train)
print('Training Score: {}'.format(ridge_pipe.score(X_train, y_train)))
print('Test Score: {}'.format(ridge_pipe.score(X_test, y_test)))

PolynomialFeatures: (233, 105)
-------------------
Training Score: 0.9181086448043112
Test Score: 0.8287450913722809


In [7]:
# Scale -> degree-2 polynomial expansion -> Lasso (alpha=0.3) regressor.
scaler_step = ('scalar', StandardScaler())
poly_step = ('poly', PolynomialFeatures(degree=2))
lasso_step = ('model', Lasso(alpha=0.3, fit_intercept=True))
steps = [scaler_step, poly_step, lasso_step]

lasso_pipe = Pipeline(steps).fit(X_train, y_train)

train_score = lasso_pipe.score(X_train, y_train)
test_score = lasso_pipe.score(X_test, y_test)
print('Training score: {}'.format(train_score))
print('Test score: {}'.format(test_score))

Training score: 0.8483818643379695
Test score: 0.8307977758339726


Elastic Net combines both penalties: L1 + L2

https://medium.com/coinmonks/regularization-of-linear-models-with-sklearn-f88633a93a2