# Hyperparameter optimization

### Imports 

In [8]:
import os
import numpy as np
import sklearn as skl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

### Data

In [2]:
data_path = '/home/lorenzo/skl-repo/0_data/california_housing.csv'
df = pd.read_csv(data_path)
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

In [3]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state=3542)
print(f'Train set length: {len(train_X)}')
print(f'Test set length: {len(test_X)}')

Train set length: 16512
Test set length: 4128


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, beds_ix, pop_ix, hh_ix = 3,4,5,6

class CombineAttributes(BaseEstimator, TransformerMixin):
    
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        rooms_per_hh = X[:, rooms_ix] / X[:, hh_ix]
        avg_hh_size = X[:, pop_ix] / X[:, hh_ix]
        if self.add_bedrooms_per_room:
            beds_per_room = X[:, beds_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_hh, avg_hh_size, beds_per_room]
        else:
            return np.c_[X, rooms_per_hh, avg_hh_size]

In [5]:
trainX_cat = train_X.select_dtypes(exclude=np.number)
trainX_num = train_X.select_dtypes(include=np.number)
testX_cat = test_X.select_dtypes(exclude=np.number)
testX_num = test_X.select_dtypes(include=np.number)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_columns = list(trainX_num.columns)
cat_columns = list(trainX_cat.columns)

# pipeline for numerical columns
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('cstm_attribs', CombineAttributes()),
    ('std_scaler', StandardScaler()),
])

# pipeline for categorical columns
cat_pipeline = Pipeline([
    ('onehot_enc', OneHotEncoder()),
])

# full pipeline
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_columns),
    ('cat', cat_pipeline, cat_columns),
])

In [7]:
# Apply pipeline transformation to features (X)
train_X = full_pipeline.fit_transform(train_X)
test_X = full_pipeline.transform(test_X)

# Convert target to numpy array (y)
train_y, test_y = train_y.values, test_y.values

### Baseline model (untuned)

In [18]:
rf = RandomForestRegressor(n_jobs=-1)
rf.fit(train_X, train_y)

rf_pred = rf.predict(test_X)
rf_mse = mean_squared_error(test_y, rf_pred)
rf_rmse = np.sqrt(rf_mse)
print(f'Baseline random forest RMSE: {rf_rmse}')

Baseline random forest RMSE: 50845.35324493719


### Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV

parameters_grid = [
    {'n_estimators': [10, 50], 'min_samples_leaf':[1, 3, 5], 'max_depth':[None, 5, 10]},
    # alternative combinations can be added with more dictionaries
    # {}
]

rf = RandomForestRegressor()
grid_search = GridSearchCV(rf, parameters_grid, cv=5, scoring="neg_mean_squared_error", return_train_score=True, n_jobs=-1)

grid_search.fit(train_X, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [11]:
grid_search.best_params_

{'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 50}

In [12]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

52468.70323642938 {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 10}
50365.4226809366 {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 50}
51991.026592453534 {'max_depth': None, 'min_samples_leaf': 3, 'n_estimators': 10}
50372.311833637665 {'max_depth': None, 'min_samples_leaf': 3, 'n_estimators': 50}
52345.9189791181 {'max_depth': None, 'min_samples_leaf': 5, 'n_estimators': 10}
50831.28973792548 {'max_depth': None, 'min_samples_leaf': 5, 'n_estimators': 50}
64358.18050700863 {'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 10}
63778.28788496295 {'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 50}
63837.118788949425 {'max_depth': 5, 'min_samples_leaf': 3, 'n_estimators': 10}
63714.60116814434 {'max_depth': 5, 'min_samples_leaf': 3, 'n_estimators': 50}
63937.15285023483 {'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 10}
63761.69979003445 {'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 50}
54008.88904458058 {'max_depth': 10, 'min_samp

In [16]:
tuned_rf = grid_search.best_estimator_

rf_pred = tuned_rf.predict(test_X)
rf_mse = mean_squared_error(test_y, rf_pred)
rf_rmse = np.sqrt(rf_mse)
print(f'Tuned random forest RMSE: {rf_rmse}')

Tuned random forest RMSE: 50830.86914545845


### Random Search

In [13]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, norm, randint

distributions = [
    {'n_estimators': randint(10, 100), 'min_samples_leaf': randint(1, 10), 'max_depth': [None, 10]},
    # alternative combinations can be added with more dictionaries
    # {}
]

rf = RandomForestRegressor()
rndm_seach = RandomizedSearchCV(rf, distributions, n_iter = 20, cv=5, scoring="neg_mean_squared_error", 
                           return_train_score=True, n_jobs=-1, )

rndm_seach.fit(train_X, train_y)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [14]:
rndm_seach.best_params_

{'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 90}

In [15]:
cvres = rndm_seach.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

51107.97019675681 {'max_depth': None, 'min_samples_leaf': 7, 'n_estimators': 57}
50601.581484899376 {'max_depth': None, 'min_samples_leaf': 5, 'n_estimators': 95}
51042.50763934293 {'max_depth': None, 'min_samples_leaf': 6, 'n_estimators': 58}
51172.69593087151 {'max_depth': None, 'min_samples_leaf': 7, 'n_estimators': 56}
52881.71751248242 {'max_depth': 10, 'min_samples_leaf': 7, 'n_estimators': 80}
53067.4811102635 {'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 49}
51460.74174465332 {'max_depth': None, 'min_samples_leaf': 6, 'n_estimators': 20}
51007.42778904997 {'max_depth': None, 'min_samples_leaf': 7, 'n_estimators': 99}
51277.48777974723 {'max_depth': None, 'min_samples_leaf': 8, 'n_estimators': 82}
51659.3707531114 {'max_depth': None, 'min_samples_leaf': 9, 'n_estimators': 42}
51627.60504936137 {'max_depth': None, 'min_samples_leaf': 9, 'n_estimators': 67}
50081.92806421921 {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 90}
52948.202116099295 {'max_depth': 

In [17]:
tuned_rf = rndm_seach.best_estimator_

rf_pred = tuned_rf.predict(test_X)
rf_mse = mean_squared_error(test_y, rf_pred)
rf_rmse = np.sqrt(rf_mse)
print(f'Tuned random forest RMSE: {rf_rmse}')

Tuned random forest RMSE: 50706.591774714696
