# Made using Google Colab

# Going to check a baseline RandomForestRegression model

In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

from feature_select import Train

In [2]:
train = Train(pd.read_csv('train_cleaned.csv'))

X = train.cleaned.drop(columns='SalePrice')
y = train.cleaned['SalePrice']

X_train, y_train, X_val, y_val, X_test, y_test = train.train_val_test_split(
    X=X, y=y, random_state=42
)

print(X_train.shape, y_train.shape, X_val.shape,
      y_val.shape, X_test.shape, y_test.shape)

(751, 25) (751,) (251, 25) (251,) (251, 25) (251,)


In [4]:
# Take the baseline of the model

model = RandomForestRegressor()

model.fit(X_train, y_train)

y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
rmsle_test = np.sqrt(mean_squared_log_error(y_test, y_pred_test))

print(f'Validation Root Mean Squared Log Error: {rmsle_val}')
print(f'Test Root Mean Squared Log Error: {rmsle_test}')

Validation Root Mean Squared Log Error: 0.16813343047531687
Test Root Mean Squared Log Error: 0.16537159314117167


### So far, the best score acheived was with a GridSearchCV and GradientBoostingRegressor

GridSearchCV - GBR Scores:

Val RMSLE: 0.15681644042610832

Test RMSLE: 0.15277363846826836


### Let's see if we can get a better score with a Grid Search!

In [6]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV


# First, we need to make a scorer for GridSearch based on the Kaggle competition
# metric (Root Mean Squared Logarithmic Error)

def rmsle(predicted, actual, size):
  return np.sqrt(np.nansum(np.square(np.log(predicted + 1) - np.log(actual + 1))) / float(size))


scorer = make_scorer(rmsle, greater_is_better=False, size=10)


# Now, we make a function to create our model with a Grid Search

def make_model(param_grid, scoring=scorer, cv=5):
  rfr = RandomForestRegressor(random_state=42)
  grid_search = GridSearchCV(rfr, scoring=scorer, cv=cv, param_grid=param_grid,
                             n_jobs=-1, verbose=10)
  return grid_search

In [7]:
# Next, let's make a grid of parameters we would like to test.
param_grid = {
    'n_estimators': [_ for _ in range(10, 110, 10)],
    'max_depth': [None] + [_ for _ in range(1, 6)],
    'min_samples_leaf': [_ for _ in range(1, 6)],
    'min_weight_fraction_leaf': [0.0, 0.2, 0.5, 0.75],
    'max_features': ['auto', 'sqrt', 'log2'] + [_ for _ in range(2, 7)],
    'max_leaf_nodes': [None] + [_ for _ in range(1, 4)],
    'bootstrap': [True, False]    
}

In [9]:
# Create the model

model = make_model(param_grid)
model

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, rand...
                         'max_depth': [None, 1, 2, 3, 4, 5],
                         'max_features': ['auto', 'sqrt', 'l

In [10]:
# Finally, we'll fit the model - this could take a while!

model.fit(X_train, y_train)

Fitting 5 folds for each of 76800 candidates, totalling 384000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1963s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 136 tas

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, rand...
                         'max_depth': [None, 1, 2, 3, 4, 5],
                         'max_features': ['auto', 'sqrt', 'l

In [11]:
# Check the model's R^2 scores and RMSLEs for Val and Test
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

print('Val Score: ', model.score(X_val, y_val))
print('Test Score: ', model.score(X_test, y_test))

rmsle_val = np.sqrt(mean_squared_log_error(y_pred_val, y_val))
rmsle_test = np.sqrt(mean_squared_log_error(y_pred_test, y_test))

print(f'Val RMSLE: {rmsle_val}')
print(f'Test RMSLE: {rmsle_test}')

Val Score:  -0.8358665191144757
Test Score:  -0.79732846531855
Val RMSLE: 0.16683995692326634
Test RMSLE: 0.15914771529354965


In [12]:
# Check the best parameters for this run
model.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 6,
 'max_leaf_nodes': None,
 'min_samples_leaf': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100}

In [13]:
test = Train(pd.read_csv('test.csv'))
cleaned_df = test.clean_df()

In [14]:
IDs = cleaned_df['Id']
feat = cleaned_df[cleaned_df.columns[1:]]

In [15]:
y_pred = model.predict(feat)
submission_gs_rfr = pd.DataFrame({'Id': IDs, 'SalePrice': y_pred})
submission_gs_rfr.head()

Unnamed: 0,Id,SalePrice
0,1461,119752.433333
1,1462,152268.435
2,1463,176756.425667
3,1464,180207.882857
4,1465,190946.6125


In [16]:
submission_gs_rfr.to_csv('grid_search_rfr0.csv', index=False)