In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
train=pd.read_csv('train_full.csv',index_col=[0])
train.shape

(1460, 174)

In [3]:
train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,LotShape,LandContour,LandSlope,BldgType,HouseStyle,OverallQual,OverallCond,...,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd
0,10,65.0,8450,3,3,0,0,5,7,5,...,0,0,0,0,0,1,0,0,0,0
1,5,80.0,9600,3,3,0,0,2,6,8,...,0,0,0,0,0,1,0,0,0,0
2,10,68.0,11250,0,3,0,0,5,7,5,...,0,0,0,0,0,1,0,0,0,0
3,11,60.0,9550,0,3,0,0,5,7,5,...,0,0,0,0,0,0,0,0,0,1
4,10,84.0,14260,0,3,0,0,5,8,5,...,0,0,0,0,0,1,0,0,0,0


In [4]:
def root_mean_squared_log_error(y_valid, y_preds):
    """Calculate root mean squared error of log(y_true) and log(y_pred)"""
    if len(y_preds)!=len(y_valid): return 'error_mismatch'
    y_preds_new = [math.log(x) for x in y_preds]
    y_valid_new = [math.log(x) for x in y_valid]
    return mean_squared_error(y_valid_new, y_preds_new, squared=False)

In [5]:
y=train['SalePrice']
x=train.drop(['SalePrice'],axis=1)
X_train, X_valid, y_train, y_valid=train_test_split(x,y,random_state=73)

In [6]:
from sklearn.ensemble import RandomForestRegressor
RF=RandomForestRegressor(n_estimators=1000,random_state=73)
RF.fit(X_train,y_train)
y_pred_RF=RF.predict(X_valid)
print('RMSLE:', root_mean_squared_log_error(y_valid, y_pred_RF))

RMSLE: 0.144563283965543


In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [8]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train,y_train)
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 16.3min finished


{'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [10]:
rf_random.best_estimator_

RandomForestRegressor(bootstrap=False, max_depth=70, max_features='sqrt',
                      min_samples_split=5, n_estimators=1600)

In [11]:
param_grid = {
    'bootstrap': [False],
    'max_depth': [60, 70, 80, 90],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [1500, 1600, 1700]
}

In [12]:
rf = RandomForestRegressor()

In [13]:
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [15]:
grid_search.fit(X_train,y_train)
grid_search.best_params_

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:  7.0min finished


{'bootstrap': False,
 'max_depth': 60,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 1700}

In [16]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, max_depth=60, max_features='sqrt',
                      min_samples_split=4, n_estimators=1700)

In [17]:
RF_f=RandomForestRegressor(bootstrap=False, max_depth=60, max_features='sqrt',
                      min_samples_split=4, n_estimators=1700)
RF_f.fit(X_train,y_train)
y_pred_RF_f=RF_f.predict(X_valid)
print('RMSLE:', root_mean_squared_log_error(y_valid, y_pred_RF_f))

RMSLE: 0.1408329214047232


In [9]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

grid = GridSearchCV(RF, param_grid=params, n_jobs=4, cv=5, verbose=3 )
grid.fit(X_train,y_train)
print('\n All results:')
print(grid.cv_results_)
print('\n Best estimator:')
print(grid.best_estimator_)
print('\n Best score:')
print(grid.best_score_ * 2 - 1)
print('\n Best parameters:')
print(grid.best_params_)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter colsample_bytree for estimator RandomForestRegressor(n_estimators=1000, random_state=73). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
hp_model = RF(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.5, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=4,
             min_child_weight=1, monotone_constraints='()',
             n_estimators=1200, n_jobs=0, num_parallel_tree=1, random_state=73,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
hp_model.fit(X_train,y_train)
y_pred_hp_model=hp_model.predict(X_valid)
print('RMSLE:', root_mean_squared_log_error(y_valid, y_pred_hp_model))