## Hyperparameter tuning for selected models

- Previously it was observed that RandomForestRegressor, ExtraTreesRegressor and XGBRegressor were the top performing models
- The best final model and the optimal hyperparameters would be decided here post hyperparameter tuning

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("../data/gurgaon_properties_post_feature_selection_top_12.csv")
data.head(5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,study room,servant room,store room,pooja room,furnishing_type
0,flat,sector 36,0.82,3,2,2,New Property,850,0,0,0,0,2
1,flat,sector 89,0.95,2,2,2,New Property,1226,1,1,0,0,2
2,flat,sohna road,0.32,2,2,1,New Property,1000,0,0,0,0,2
3,flat,sector 92,1.6,3,4,3+,Relatively New,1615,0,1,0,0,0
4,flat,sector 102,0.48,2,2,1,Relatively New,582,0,0,1,0,2


In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import category_encoders as ce

numerical_cols = ['bedRoom', 'bathroom', 'built_up_area', 'study room', 'servant room', 'store room', 'pooja room']
categorical_cols = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type']

transform_label = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols),
    ('cat2', OneHotEncoder(drop='first', handle_unknown='ignore'), ['property_type', 'agePossession']),
    ('cat3', ce.TargetEncoder(), ['sector'])
], remainder='passthrough')

X = data.drop(columns=['price'])
y = np.log1p(data['price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model_list = {'RandomForestRegressor' : RandomForestRegressor(n_jobs=-1), 
              'ExtraTreesRegressor' : ExtraTreesRegressor(n_jobs=-1), 
              'XGBRegressor' : XGBRegressor(n_jobs=-1)}

params_list = {'RandomForestRegressor' : {'regressor__n_estimators':[100, 150, 200, 250, 300, 350, 400], 'regressor__max_depth': [None, 10, 20, 30], 'regressor__max_samples':[0.1, 0.25, 0.5, 1], 'regressor__max_features' : ['sqrt', 'log2', None]}, 
              'ExtraTreesRegressor' : {'regressor__n_estimators':[100, 150, 200, 250, 300, 350, 400], 'regressor__max_depth': [None, 10, 20, 30], 'regressor__max_samples':[0.1, 0.25, 0.5, 1], 'regressor__max_features' : ['sqrt', 'log2', None]}, 
              'XGBRegressor' : {'regressor__n_estimators': [100, 150, 200, 250, 300, 350, 400], 'regressor__max_depth' : [None, 10, 20, 30], 'regressor__learning_rate' : [0.01, 0.1], 'regressor__reg_alpha':[0, 0.001, 0.005, 0.01, 0.05], 'regressor__gamma' :  [i/10.0 for i in range(0,5)]}}

In [8]:
def compute_score(preprocessor, model_in, params):

    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('regressor', model_in)
    ])

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    #score = cross_val_score(pipeline, X ,y, cv=kfold, scoring='r2')

    #search = GridSearchCV(pipeline, params, scoring = 'r2', cv=kfold, n_jobs = -1, verbose=4)
    search = GridSearchCV(pipeline, params, scoring = 'neg_mean_absolute_error', cv=kfold, n_jobs = -1, verbose=4)
    search.fit(X, y)

    final_pipe = search.best_estimator_

    final_pipe.fit(X_train, y_train)
    preds = final_pipe.predict(X_test)
    preds = np.expm1(preds)
    mae = mean_absolute_error(np.expm1(y_test), preds)
    r2 = r2_score(np.expm1(y_test), preds)

    return search.best_params_, search.best_score_,  mae, r2

In [5]:

best_params_rfregressor, r2_score_rfregressor, mae_score_rfregressor = compute_score(transform_label, model_list['RandomForestRegressor'], params_list['RandomForestRegressor'])

Fitting 10 folds for each of 336 candidates, totalling 3360 fits


[CV 1/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=0.891 total time=   1.4s
[CV 2/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=0.887 total time=   1.7s
[CV 4/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=0.857 total time=   1.8s
[CV 3/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=0.874 total time=   1.7s
[CV 5/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=0.873 total time=   1.3s
[CV 7/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=0.876 total time=   1.1s
[CV 8/10] END regressor__max_depth=None,

In [6]:
print("Scores and best parameters for Random Forest Regressor : ")
print(f"Best params : {best_params_rfregressor}")
print(f"R2 score : {r2_score_rfregressor}")
print(f"MAE score : {mae_score_rfregressor}")

Scores and best parameters for Random Forest Regressor : 
Best params : {'regressor__max_depth': 20, 'regressor__max_features': None, 'regressor__max_samples': 0.5, 'regressor__n_estimators': 350}
R2 score : 0.9023358759069635
MAE score : 0.45553338270926896


In [9]:

best_params_rfregressor, grid_score_rfregressor, mae_score_rfregressor, r2_score_rfregressor = compute_score(transform_label, model_list['RandomForestRegressor'], params_list['RandomForestRegressor'])

Fitting 10 folds for each of 336 candidates, totalling 3360 fits
[CV 4/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=-0.147 total time=   0.8s
[CV 2/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=-0.135 total time=   0.8s
[CV 1/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=-0.127 total time=   0.9s
[CV 3/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=-0.142 total time=   0.9s
[CV 6/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, score=-0.133 total time=   0.9s
[CV 8/10] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__max_samples=0.1, regressor__n_estimators=100;, s

NameError: name 'r2_score' is not defined

In [None]:
print("Scores and best parameters for Random Forest Regressor : ")
print(f"Best params : {best_params_rfregressor}")
print(f"Best gridsearch score : {grid_score_rfregressor}")
print(f"R2 score : {r2_score_rfregressor}")
print(f"MAE score : {mae_score_rfregressor}")