# Made Using Google Colab

In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.model_selection import GridSearchCV

from feature_select import Train


def rmsle(predicted, actual, size):
  return np.sqrt(np.nansum(np.square(np.log(predicted + 1) - np.log(actual + 1))) / float(size))


train = Train(pd.read_csv('train_cleaned.csv'))

scorer = make_scorer(rmsle, greater_is_better=False, size=10)

In [None]:
# Split the data
X = train.cleaned.drop(columns='SalePrice')
y = train.cleaned['SalePrice']

X_train, y_train, X_val, y_val, X_test, y_test = train.train_val_test_split(
    X=X, y=y, random_state=42
)

print(X_train.shape, y_train.shape, X_val.shape,
      y_val.shape, X_test.shape, y_test.shape)

(751, 25) (751,) (251, 25) (251,) (251, 25) (251,)


In [None]:
# Take the baseline of the model

model = AdaBoostRegressor()

model.fit(X_train, y_train)

y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
rmsle_test = np.sqrt(mean_squared_log_error(y_test, y_pred_test))

print(f'Validation Root Mean Squared Log Error: {rmsle_val}')
print(f'Test Root Mean Squared Log Error: {rmsle_test}')

Validation Root Mean Squared Log Error: 0.1839092126322143
Test Root Mean Squared Log Error: 0.1689916719185605


### Scores are No Bueno!

In [None]:
# Let's see if we can make it better with a GridSearch

def make_model(param_grid, scoring=scorer, cv=5):
  rfr = AdaBoostRegressor(random_state=42)
  grid_search = GridSearchCV(rfr, scoring=scorer, cv=cv, param_grid=param_grid,
                             n_jobs=-1, verbose=10)
  return grid_search


param_grid = {
    'base_estimator': [None, 1, 2],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [1, 0.1, 0.01, 0.001],
    'loss': ['linear', 'square', 'exponential']
}

model = make_model(param_grid, cv=5)
model

GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostRegressor(base_estimator=None, learning_rate=1.0,
                                         loss='linear', n_estimators=50,
                                         random_state=42),
             iid='deprecated', n_jobs=-1,
             param_grid={'base_estimator': [None, 1, 2],
                         'learning_rate': [1, 0.1, 0.01, 0.001],
                         'loss': ['linear', 'square', 'exponential'],
                         'n_estimators': [50, 100, 150, 200]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(rmsle, greater_is_better=False, size=10),
             verbose=10)

In [None]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   

GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostRegressor(base_estimator=None, learning_rate=1.0,
                                         loss='linear', n_estimators=50,
                                         random_state=42),
             iid='deprecated', n_jobs=-1,
             param_grid={'base_estimator': [None, 1, 2],
                         'learning_rate': [1, 0.1, 0.01, 0.001],
                         'loss': ['linear', 'square', 'exponential'],
                         'n_estimators': [50, 100, 150, 200]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(rmsle, greater_is_better=False, size=10),
             verbose=10)

In [None]:
model.best_params_

{'base_estimator': None,
 'learning_rate': 1,
 'loss': 'square',
 'n_estimators': 50}

In [None]:
model.best_score_

-0.6170465148907293

In [None]:
# Check the model's R^2 scores and RMSLEs for Val and Test
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

print('Val Score: ', model.score(X_val, y_val))
print('Test Score: ', model.score(X_test, y_test))

rmsle_val = np.sqrt(mean_squared_log_error(y_pred_val, y_val))
rmsle_test = np.sqrt(mean_squared_log_error(y_pred_test, y_test))

print(f'Val RMSLE: {rmsle_val}')
print(f'Test RMSLE: {rmsle_test}')

Val Score:  -0.9026594446230958
Test Score:  -0.8534474496691925
Val RMSLE: 0.18017190474005756
Test RMSLE: 0.17034913168902605


In [None]:
test = Train(pd.read_csv('test.csv'))
cleaned_df = test.clean_df()

IDs = cleaned_df['Id']
feat = cleaned_df[cleaned_df.columns[1:]]

y_pred = model.predict(feat)
submission_gs_abr = pd.DataFrame({'Id': IDs, 'SalePrice': y_pred})
submission_gs_abr.head()

Unnamed: 0,Id,SalePrice
0,1461,125343.256281
1,1462,158627.307692
2,1463,188105.179724
3,1464,177953.013699
4,1465,194449.847162


In [None]:
# submission_gs_abr.to_csv('grid_search_abr0.csv', index=False)