In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

def load_housing(path):
    """Load ``housing.csv`` from ``path`` and split it into stratified sets.

    Rows are stratified on five income bands derived from the
    ``median_income`` column so that the 80/20 train/test split keeps the
    same band proportions in both parts.

    Parameters
    ----------
    path : str
        Directory that contains ``housing.csv``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(strat_train_set, strat_test_set)`` with exactly the columns read
        from the CSV file (the income band is never added to the frames).
    """
    housing = pd.read_csv(os.path.join(path, 'housing.csv'))

    # Keep the strata in a standalone Series instead of adding and later
    # dropping a temporary column on the frame.
    income_cat = pd.cut(housing['median_income'],
                        bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                        labels=[1, 2, 3, 4, 5])

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(split.split(housing, income_cat))

    # The splitter returns positional indices, so select by position (.iloc)
    # rather than by label (.loc).
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    return strat_train_set, strat_test_set

In [2]:
# Directory holding housing.csv, given as a relative path one level up.
HOUSING_PATH = os.path.join(os.pardir, "datasets", "housing")

# Stratified train/test frames returned by load_housing.
strat_train_set, strat_test_set = load_housing(HOUSING_PATH)

In [3]:
# Split the training frame into the target column and the predictor columns;
# both results are new objects, so strat_train_set itself is left untouched.
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop('median_house_value', axis=1)

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positions of the columns that CombinedAttributesAdder reads, looked up in
# the predictor frame's column order.
col_names = 'total_rooms', 'total_bedrooms', 'population', 'households'
rooms_ix, bedrooms_ix, population_ix, households_ix = map(
    housing.columns.get_loc, col_names)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    The source columns are located through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a bedrooms / rooms column is appended after the two
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must support 2-D
        positional indexing.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        # Order matters: rooms/household, population/household, then the
        # optional bedrooms/room column.
        extra_columns = [rooms / households,
                         X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)

        return np.c_[(X, *extra_columns)]

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Column groups: the single categorical column, and every remaining column.
cat_attribs = ['ocean_proximity']
housing_num = housing.drop(columns=cat_attribs)
num_attribs = housing_num.columns.tolist()

# Numeric branch: median imputation -> ratio columns -> standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Route each column group through its own transformer.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

Exercise 1.

In [6]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

svr = SVR()

# Two sub-grids: the linear kernel is tuned over C only, the RBF kernel
# over every (C, gamma) pair.
svr_params = [
    dict(kernel=['linear'],
         C=[0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]),
    dict(kernel=['rbf'],
         C=[0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0],
         gamma=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]),
]

svr_grid_search = GridSearchCV(
    estimator=svr,
    param_grid=svr_params,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
svr_grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=10, estimator=SVR(), n_jobs=-1,
             param_grid=[{'C': [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0],
                          'kernel': ['linear']},
                         {'C': [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0],
                          'gamma': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0],
                          'kernel': ['rbf']}],
             scoring='neg_mean_squared_error')

In [7]:
# Display the parameter combination the grid search selected as best.
svr_grid_search.best_params_

{'C': 100.0, 'kernel': 'linear'}

In [8]:
# The search was scored with 'neg_mean_squared_error', so the best score is a
# negated MSE: flip the sign and take the square root to display an RMSE.
negative_mse = svr_grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

71455.22149618293

In [9]:
# Print one line per candidate: the RMSE derived from its mean (negated-MSE)
# test score, followed by the parameter dict that produced it.
cvres = svr_grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

118170.47290155212 {'C': 0.1, 'kernel': 'linear'}
115205.27692525135 {'C': 0.5, 'kernel': 'linear'}
111843.95898327122 {'C': 1.0, 'kernel': 'linear'}
93254.29843200356 {'C': 5.0, 'kernel': 'linear'}
83096.70918124914 {'C': 10.0, 'kernel': 'linear'}
72875.41711346977 {'C': 50.0, 'kernel': 'linear'}
71455.22149618293 {'C': 100.0, 'kernel': 'linear'}
118922.2631615807 {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
118904.40195043657 {'C': 0.1, 'gamma': 0.05, 'kernel': 'rbf'}
118903.19987700762 {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
118924.17671157836 {'C': 0.1, 'gamma': 0.5, 'kernel': 'rbf'}
118930.32965634474 {'C': 0.1, 'gamma': 1.0, 'kernel': 'rbf'}
118933.76218613968 {'C': 0.1, 'gamma': 5.0, 'kernel': 'rbf'}
118933.92251189632 {'C': 0.1, 'gamma': 10.0, 'kernel': 'rbf'}
118874.23903224821 {'C': 0.5, 'gamma': 0.01, 'kernel': 'rbf'}
118779.23026038973 {'C': 0.5, 'gamma': 0.05, 'kernel': 'rbf'}
118773.44537989466 {'C': 0.5, 'gamma': 0.1, 'kernel': 'rbf'}
118885.2603050095 {'C': 0.5, 'gamma'

In [10]:
svr = SVR()

# Second pass over larger C values (100 to 100000) and, for the RBF kernel,
# gamma values from 0.1 to 100.
svr_params = [
    dict(kernel=['linear'],
         C=[100.0, 500.0, 1000.0, 5000.0, 10000.0, 50000.0, 100000.0]),
    dict(kernel=['rbf'],
         C=[100.0, 500.0, 1000.0, 5000.0, 10000.0, 50000.0, 100000.0],
         gamma=[0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]),
]

svr_grid_search = GridSearchCV(
    estimator=svr,
    param_grid=svr_params,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
svr_grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=10, estimator=SVR(), n_jobs=-1,
             param_grid=[{'C': [100.0, 500.0, 1000.0, 5000.0, 10000.0, 50000.0,
                                100000.0],
                          'kernel': ['linear']},
                         {'C': [100.0, 500.0, 1000.0, 5000.0, 10000.0, 50000.0,
                                100000.0],
                          'gamma': [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0],
                          'kernel': ['rbf']}],
             scoring='neg_mean_squared_error')

In [11]:
# Report the selected parameters, then display the RMSE; the score is a
# negated MSE (scoring='neg_mean_squared_error'), hence the sign flip.
print('best_params:', svr_grid_search.best_params_)
negative_mse = svr_grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

best_params: {'C': 100000.0, 'gamma': 0.5, 'kernel': 'rbf'}


55363.602802280926

Exercise 2.

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

svr = SVR()

# Sampling space: kernel is drawn from a list, C from a reciprocal
# distribution on [100000, 200000], gamma from an exponential with scale 1.
svr_param_distribs = dict(
    kernel=['linear', 'rbf'],
    C=reciprocal(100000, 200000),
    gamma=expon(scale=1.0),
)

randomized_search = RandomizedSearchCV(
    estimator=svr,
    param_distributions=svr_param_distribs,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    random_state=42,
)
randomized_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=10, estimator=SVR(), n_iter=50, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A80C179700>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A80BCDB5E0>,
                                        'kernel': ['linear', 'rbf']},
                   random_state=42, scoring='neg_mean_squared_error')

In [13]:
# Display the sampled parameter combination the randomized search selected.
randomized_search.best_params_

{'C': 196394.6349177281, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}

In [14]:
# The randomized search was scored with 'neg_mean_squared_error'; negate the
# best score and take the square root to display it as an RMSE.
negative_mse = randomized_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

54220.978595011875