In [11]:
import xgboost
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

### Simple Case: Use simple xgb-regressor on entire training data

In [3]:
train_set = pd.read_csv(r'../data/train-preprocessed.csv')
test_set = pd.read_csv(r'../data/test-preprocessed.csv')

In [4]:
X_train = train_set.drop(['SalePrice'], axis = 1).values
y_train = train_set['SalePrice'].values

In [None]:
reg = xgboost.XGBRegressor()
reg.fit(X_train, y_train)

In [6]:
y_pred = reg.predict(test_set)

In [7]:
y_pred

array([124207.586, 141063.67 , 185765.48 , ..., 154401.88 , 106872.75 ,
       207102.53 ], dtype=float32)

In [8]:
pred = pd.DataFrame(y_pred)
sub_df = pd.read_csv('../data/sample_submission_master.csv')
datasets = pd.concat([sub_df['Id'], pred], axis = 1)
datasets.columns = ['Id', 'SalePrice']
# datasets.to_csv('sample_submission.csv',index = False)

### Use validation set to evaluate performance before submission:

In [10]:
train_set = pd.read_csv(r'../data/train-preprocessed.csv')
test_set = pd.read_csv(r'../data/test-preprocessed.csv')

In [15]:
X_train = train_set.drop(['SalePrice'], axis = 1).values
y_train = train_set['SalePrice'].values

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [13]:
reg = xgboost.XGBRegressor()
reg.fit(X_train, y_train)

In [15]:
y_pred = reg.predict(X_val)
y_pred

array([153611.92 , 345751.47 , 109269.49 , 159112.56 , 329905.3  ,
        82253.51 , 215146.48 , 146728.53 ,  86224.89 , 132071.53 ,
       172813.16 , 123547.164, 118669.625, 211036.44 , 169960.33 ,
       126702.375, 205460.28 , 139001.06 , 126723.47 , 196679.8  ,
       151210.84 , 228225.45 , 171587.58 , 134412.   , 197245.73 ,
       160856.11 , 192447.27 , 103517.41 , 173304.28 , 213185.69 ,
       112225.82 , 240633.17 , 155268.27 , 115707.14 , 245915.28 ,
       150378.3  , 126189.14 , 204009.17 , 305101.5  , 125876.42 ,
       135943.73 , 241698.4  , 115234.836, 378550.53 , 132641.03 ,
       136344.92 , 111652.47 , 119040.84 , 246609.89 , 145918.56 ,
       114211.92 , 200929.1  , 115157.69 , 364562.9  , 139749.16 ,
       255543.62 , 207874.38 , 139344.97 , 149075.1  , 116787.66 ,
        64701.977, 161199.16 , 330078.3  , 325828.75 , 275660.66 ,
       226308.36 , 114198.76 , 309453.78 ,  90732.62 , 171359.72 ,
       121193.016, 120220.336, 116102.87 ,  76742.   , 493606.

In [19]:
def log_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))

In [20]:
log_rmse(y_val, y_pred)

np.float64(0.14331872456114564)

### Hyperparameter tuning

In [32]:
reg = xgboost.XGBRegressor()

In [33]:
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
learning_rate = [0.05, 0.10, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]
booster = ['gbtree', 'gblinear']
base_score = [0.25, 0.5, 0.75, 1]

In [34]:
hyperparam_grid = {
    'n_estimators' : n_estimators,
    'max_depth' : max_depth,
    'learning_rate' : learning_rate,
    'min_child_weight' : min_child_weight,
    'booster' : booster,
    'base_score' : base_score
}

In [35]:
random_cv = RandomizedSearchCV(estimator = reg,
                               param_distributions = hyperparam_grid,
                               cv = 5, n_iter = 50,
                               scoring = 'neg_root_mean_squared_log_error', n_jobs = 4,
                               return_train_score = True,
                               random_state = 42)

gridsearch_cv = GridSearchCV(estimator = reg, 
                             param_grid = hyperparam_grid,
                             scoring = 'neg_root_mean_squared_log_error', n_jobs = 4,
                             cv = 5, return_train_score = True)

In [36]:
gridsearch_cv.fit(X_train, y_train)

KeyboardInterrupt: 

In [34]:
# random_cv.best_estimator_.fit(X_train, y_train)
gridsearch_cv.best_estimator_.fit(X_train, y_train)

result for gridsearchCV:
XGBRegressor(base_score=0.25, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.05, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
             max_leaves=None, min_child_weight=2, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=900,
             n_jobs=None, num_parallel_tree=None, ...)

In [35]:
# y_pred = random_cv.best_estimator_.predict(test_set)
y_pred = gridsearch_cv.best_estimator_.predict(test_set)

In [36]:
y_pred

array([115705.555, 163391.89 , 179194.23 , ..., 152175.2  , 118642.414,
       231193.2  ], dtype=float32)

In [38]:
pred = pd.DataFrame(y_pred)
sub_df = pd.read_csv('../data/sample_submission_master.csv')
datasets = pd.concat([sub_df['Id'], pred], axis = 1)
datasets.columns = ['Id', 'SalePrice']
# datasets.to_csv('sample_submission.csv',index = False)

### Nested cross validation

In [37]:
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
learning_rate = [0.05, 0.10, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]
booster = ['gbtree', 'gblinear']
base_score = [0.25, 0.5, 0.75, 1]

In [39]:
reg = xgboost.XGBRegressor()
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('reg', reg)])
p_grid = {
    'reg__n_estimators' : n_estimators,
    'reg__max_depth' : max_depth,
    'reg__learning_rate' : learning_rate,
    'reg__min_child_weight' : min_child_weight,
    'reg__booster' : booster,
    'reg__base_score' : base_score
}

inner_cv = RepeatedKFold(n_splits=10, n_repeats = 10, random_state=42)

random_cv = RandomizedSearchCV(estimator = pipeline,
                               param_distributions = p_grid,
                               cv = inner_cv, n_iter = 50,
                               scoring = 'neg_root_mean_squared_log_error', n_jobs = 4,
                               return_train_score = True,
                               random_state = 42)

random_cv.fit(X_train, y_train)


 -0.12815463 -0.13408242 -0.12307997 -0.13546421         nan -0.12731121
         nan         nan -0.13756509 -0.12564021 -0.12967726 -0.13440699
         nan -0.1371536          nan         nan         nan         nan
         nan -0.12898505 -0.13693135         nan         nan         nan
 -0.1255535          nan -0.13715362         nan         nan -0.13018797
         nan         nan -0.1304244  -0.13862813         nan         nan
         nan -0.12800359         nan         nan         nan         nan
         nan         nan]


In [45]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(test_set.values)

In [46]:
random_cv.best_estimator_.fit(X_train_scaled, y_train)

In [47]:
y_pred = random_cv.best_estimator_.predict(X_test_scaled)

In [48]:
y_pred

array([118836.57, 166246.4 , 185205.12, ..., 149052.39, 118045.27,
       227025.19], dtype=float32)

In [50]:
pred = pd.DataFrame(y_pred)
sub_df = pd.read_csv('../data/sample_submission_master.csv')
datasets = pd.concat([sub_df['Id'], pred], axis = 1)
datasets.columns = ['Id', 'SalePrice']
# datasets.to_csv('sample_submission.csv',index = False)