# Notebook was made using Google Colab

[Link to Notebook](https://colab.research.google.com/drive/1z8X7JRmKwWuaiC9fLI9nSUkFLDItYNhg?usp=sharing)

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from feature_select import Train

In [2]:
train = Train(pd.read_csv('train_cleaned.csv'))

In [3]:
features = list(train.cleaned.columns)
features.remove('SalePrice')
target = 'SalePrice'

X_train, y_train, X_val, y_val, X_test, y_test = train.train_val_test_split(
    X=train.cleaned[features],
    y=train.cleaned[target],
    random_state=42)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(751, 25) (751,) (251, 25) (251,) (251, 25) (251,)


In [4]:
from sklearn.metrics import make_scorer, mean_squared_log_error

def rmsle(predicted, actual, size):
    return np.sqrt(np.nansum(np.square(np.log(predicted + 1) - np.log(actual + 1)))/float(size))

scorer = make_scorer(rmsle, greater_is_better=False, size=10)

In [5]:
def make_model(param_grid, scoring=scorer, cv=5):
  gbr = GradientBoostingRegressor(random_state=42)
  grid_search = GridSearchCV(gbr, scoring=scorer, cv=cv, param_grid=param_grid,
                             n_jobs=-1, verbose=10)
  return grid_search

grid = {
    'learning_rate': [1, 0.1, 0.01, 0.001, 0.0001],
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [x for x in range(1,6)],
    'min_samples_leaf': [x for x in range(1,6)],
    'max_features': [0.25, 0.75, 'auto', 'sqrt', 'log2']
}

model = make_model(grid)

In [6]:
model

GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_n...
             iid='deprecated

In [7]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 3125 candidates, totalling 15625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   

GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_n...
             iid='deprecated

In [8]:
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

print('Val Score: ', model.score(X_val, y_val))
print('Test Score: ', model.score(X_test, y_test))

rmsle_val = np.sqrt(mean_squared_log_error(y_pred_val, y_val))
rmsle_test = np.sqrt(mean_squared_log_error(y_pred_test, y_test))

print(f'Val RMSLE: {rmsle_val}')
print(f'Test RMSLE: {rmsle_test}')

Val Score:  -0.7856488014989083
Test Score:  -0.7653944040374953
Val RMSLE: 0.15681644042610832
Test RMSLE: 0.15277363846826836


In [9]:
model.best_params_

{'learning_rate': 0.1,
 'max_depth': 5,
 'max_features': 0.25,
 'min_samples_leaf': 5,
 'n_estimators': 100}

In [10]:
model.best_score_

-0.5600546836101257

In [13]:
test.cleaned.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


In [14]:
test = Train(pd.read_csv('test.csv'))
cleaned_df = test.clean_df()

In [15]:
cleaned_df.head()

Unnamed: 0,Id,LotArea,TotalBsmtSF,1stFlrSF,GrLivArea,RegularLotShape,LandIsLvl,LotConfigCL,LotAdjacencyType,HouseCondition,YrBuilt,WasRemodeled,VeneerType,HeatingQuality,EleSystem,BsmtHasBath,HasHalfBath,Bedrooms,RemainingRooms,AdditionalRooms,NumFireplaces,GarageAreaByCar,HasDeck,HasPool,MonthSold,YearSold
0,1461,11622,882.0,896,896,1,1,0,1,1,1,0,0,1,0,0,0,1,3,1,0,1,1,0,5,4
1,1462,14267,1329.0,1329,1329,0,1,1,0,1,1,0,1,1,0,0,1,2,3,1,0,1,1,0,5,4
2,1463,13830,928.0,928,1629,0,1,0,0,1,1,1,0,2,0,0,1,2,3,1,1,2,1,0,2,4
3,1464,9978,926.0,926,1604,0,1,0,0,1,1,0,1,0,0,0,1,2,4,2,1,2,1,0,5,4
4,1465,5005,1280.0,1280,1280,0,0,0,0,2,1,0,0,0,0,0,0,1,3,1,0,2,0,0,0,4


In [17]:
IDs = cleaned_df['Id']
feat = cleaned_df[cleaned_df.columns[1:]]

In [18]:
y_pred = model.predict(feat)
submission_gs_gbr = pd.DataFrame({'Id': IDs, 'SalePrice': y_pred})
submission_gs_gbr.head()

Unnamed: 0,Id,SalePrice
0,1461,122661.099857
1,1462,157634.141995
2,1463,184338.662334
3,1464,181461.02606
4,1465,184500.783411


In [19]:
# Submission received 0.18596 which is ~0.005 better than the previous

# submission_gs_gbr.to_csv('grid_search_gbr0.csv', index=False)