In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

Rozważmy zbiór Boston

In [3]:
boston = datasets.load_boston()
# print description
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
# get the data
boston_X = boston.data
boston_Y = boston.target

In [5]:
# Split the data into training/testing sets
boston_X_train = boston_X[:-50]
boston_X_test = boston_X[-50:]
 
# Split the targets into training/testing sets
boston_y_train = boston_Y[:-50]
boston_y_test = boston_Y[-50:]

In [6]:
X = boston_X_train
y = boston_y_train

# Zadanie
Znajdż najleprzy model za pomocą podwujną cross-validation (nested cross-validation, double cross-validation) .


In [35]:
seed = 123
kfold = model_selection.KFold(n_splits=5, random_state=seed)

In [36]:
mean_r2 = []
var_r2 = []

In [37]:
grid_1 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), ElasticNet(alpha=1, random_state=seed)),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3, 4, 5],
                    'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10]},
                    cv=kfold,
                    refit=True,
                    n_jobs=-1)
grid_1.fit(X, y)
grid_1.best_params_

{'elasticnet__alpha': 1, 'polynomialfeatures__degree': 1}

In [38]:
scores_1 = cross_val_score(grid_1, X, y, scoring='r2', cv=kfold)
mean_r2.append(np.mean(scores_1))
var_r2.append(np.std(scores_1))
print('CV ElasticNet R2: %.3f +/- %.3f' % (np.mean(scores_1), np.std(scores_1)))

CV ElasticNet R2: 0.250 +/- 0.333


In [39]:
grid_2 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), Lasso(alpha=0.1)),
                   param_grid={'polynomialfeatures__degree': [1, 2, 3, 4, 5],
                               'lasso__alpha': [0.001, 0.01, 0.1, 1, 10]},
                   cv=kfold,
                   refit=True,
                   n_jobs=-1)
grid_2.fit(X, y)
grid_2.best_params_

{'lasso__alpha': 1, 'polynomialfeatures__degree': 1}

In [40]:
scores_2 = cross_val_score(grid_2, X, y, scoring='r2', cv=kfold)
mean_r2.append(np.mean(scores_2))
var_r2.append(np.std(scores_2))
print('CV Lasso R2: %.3f +/- %.3f' % (np.mean(scores_2), np.std(scores_2)))

CV Lasso R2: 0.196 +/- 0.385


In [41]:
grid_3 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), Ridge(alpha=0.1)),
                   param_grid={'polynomialfeatures__degree': [1, 2, 3, 4, 5, 6, 7],
                               'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
                   cv=kfold,
                   refit=True,
                   n_jobs=-1)
grid_3.fit(X, y)
grid_3.best_params_

{'polynomialfeatures__degree': 1, 'ridge__alpha': 100}

In [42]:
scores_3 = cross_val_score(grid_3, X, y, scoring='r2', cv=kfold)
mean_r2.append(np.mean(scores_3))
var_r2.append(np.std(scores_3))
print('CV Ridge R2: %.3f +/- %.3f' % (np.mean(scores_3), np.std(scores_3)))

CV Ridge R2: 0.136 +/- 0.633


In [43]:
grid_4 = GridSearchCV(make_pipeline(PolynomialFeatures(degree=2), linear_model.LinearRegression()),
                    param_grid={'polynomialfeatures__degree': [1, 2, 3, 4, 5, 6, 7]},
                    cv=kfold,
                    refit=True,
                    n_jobs=-1)
grid_4.fit(X, y)
grid_4.best_params_

{'polynomialfeatures__degree': 1}

In [44]:
scores_4 = cross_val_score(grid_4, X, y, scoring='r2', cv=kfold)
mean_r2.append(np.mean(scores_4))
var_r2.append(np.std(scores_4))
print('CV Linear R2: %.3f +/- %.3f' % (np.mean(scores_4), np.std(scores_4)))

CV Linear R2: -0.201 +/- 1.515


In [45]:
d = {'mean r2': mean_r2, 
     'var r2': var_r2
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['ElasticNet', 'Lasso', 'Ridge', 'LR'])
df

Unnamed: 0,Method,mean r2,var r2
0,ElasticNet,0.249927,0.333442
1,Lasso,0.195784,0.384993
2,Ridge,0.136481,0.632705
3,LR,-0.200892,1.515316
