In [567]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor, DMatrix, cv, train, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse, mean_absolute_percentage_error as mape, r2_score
import matplotlib.pylab as plt
import graphviz

# Hyperparameter tuning
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval

### Read in data

In [553]:
# Load the GLS feature table and order the rows by launch year, launch month,
# then launch date.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
gls_csv_path = r'G:\REA\Working files\land-bidding\land_sales_full_data\feature eng\gls_1207_new_feat.csv'
gls = pd.read_csv(gls_csv_path).sort_values(by=['year_launch', 'month_launch', 'date_launch'])

In [554]:
# Display the column index of the loaded table.
gls.columns

Index(['sg_gls_id', 'land_parcel_id', 'project_dwid', 'address_dwid',
       'date_launch', 'date_close', 'date_award', 'land_parcel',
       'land_parcel_std', 'street', 'zone', 'region', 'join_by', 'error_check',
       'site_area_sqm', 'devt_type', 'devt_class', 'lease_term', 'gpr',
       'gfa_sqm', 'num_bidders', 'successful_tenderer_name',
       'successful_tender_price', 'successful_price_psm_gfa', 'proj_name_raw',
       'proj_name_res', 'proj_name_non_res', 'in_db', 'source',
       'timediff_launch_to_close', 'timediff_close_to_award',
       'timediff_launch_to_award', 'launch_month_index', 'year_launch',
       'month_launch', 'day_launch', 'close_month_index', 'year_close',
       'month_close', 'day_close', 'award_month_index', 'year_award',
       'month_award', 'day_award', 'merge_key', 'tender_details',
       'tenderer_name_1st', 'tender_price_1st', 'price_psm_gfa_1st',
       'tenderer_name_2nd', 'tender_price_2nd', 'price_psm_gfa_2nd',
       'price_premium_total',

In [555]:
# Feature selection: column names taken from `gls` as model inputs.
# `categories` holds the categorical columns, `numeric` the numeric ones.
categories = [
    'region',
    'zone',
    'devt_class',
    'source',
]
numeric = [
    'site_area_sqm',
    'lease_term',
    'gpr',
    'num_bidders',
    'timediff_launch_to_close',
    'avg_dist_cbd',
    'avg_dist_mrt',
    'avg_num_bus',
    'avg_num_good_sch',
    'proj_num_of_units',
    'proj_max_floor',
    'num_of_nearby_completed_proj_200m',
    'num_of_schools',
    'year_launch',
]
# Full feature list: categorical columns first, then numeric ones.
cols = [*categories, *numeric]

### Preprocessing

In [556]:
# Design matrix: pd.get_dummies one-hot encodes any object/category-typed columns
# among the selected features (presumably the `categories` list; confirm dtypes)
# and passes the remaining columns through unchanged.
x = pd.get_dummies(gls.loc[:, cols])
# Regression target.
y = gls['price_psm_real']

In [557]:
# DMatrix over the full sample (all rows of x and y, before any train/test split).
dmat = DMatrix(x, label=y)

### Split train-test

In [558]:
# Hold out the last 20% of rows as the test set. shuffle=False keeps the row order,
# so with `gls` sorted by launch date the split is chronological.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)
# Wrap each split in a DMatrix for the native xgboost API.
train_data, test_data = DMatrix(x_train, label=y_train), DMatrix(x_test, label=y_test)

### Set model and regress

In [580]:
# Baseline fits on the training split.

# Settings for the native xgboost API (`train` here, `cv` further down).
params = dict(max_depth=6, eta=0.02, objective='reg:squarederror')

# scikit-learn wrapper fitted with library defaults apart from the objective;
# this is the estimator object handed to the randomized search later on.
xgb = XGBRegressor(objective='reg:squarederror')
xgb.fit(x_train, y_train)

# Native-API booster: 100 boosting rounds with `params`.
# All metrics reported below come from this model, not from `xgb`.
xgb_reg = train(params=params, dtrain=train_data, num_boost_round=100)

pred_train, pred_test = xgb_reg.predict(train_data), xgb_reg.predict(test_data)
mape_train, mape_test = mape(y_train, pred_train), mape(y_test, pred_test)

# Gap between test and train error as a quick overfitting indicator.
mape_gap = mape_test - mape_train
print(
    "MAPE train: %f" % mape_train,
    "MAPE test: %f" % mape_test,
    "MAPE test-train: %f" % mape_gap,
    sep='\n',
)

MAPE train: 0.161727
MAPE test: 0.178458
MAPE test-train: 0.016731


In [560]:
# R^2 of the native-API booster on each split; the test value is printed first, then train.
r2_test = r2_score(y_test, pred_test)
r2_train = r2_score(y_train, pred_train)
print(r2_test, r2_train, sep='\n')

0.593963160350227
0.840179796477937


In [561]:
# Actual vs. predicted values side by side, one frame per split, for inspecting individual errors.
test_check = pd.DataFrame(dict(y_actual=y_test, y_predict=pred_test))
train_check = pd.DataFrame(dict(actual=y_train, predict=pred_train))

In [577]:
# Show the full parameter set of the scikit-learn wrapper `xgb`
# (the explicitly passed objective plus library defaults).
xgb.get_params()

{'objective': 'reg:squarederror',
 'base_score': 0.5,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 0,
 'gpu_id': -1,
 'grow_policy': 'depthwise',
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.300000012,
 'max_bin': 256,
 'max_cat_threshold': 64,
 'max_cat_to_onehot': 4,
 'max_delta_step': 0,
 'max_depth': 6,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 100,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'sampling_method': 'uniform',
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

### Param tuning and random search

In [569]:
# Round 1
# Randomized search over a small discrete grid: 3^4 = 81 combinations, 50 of them sampled.
param_space = {'max_depth': [4, 5, 6],
               'learning_rate': [0.01, 0.02, 0.05],
               'gamma': [0, 0.25, 1.0],
               'reg_lambda': [0, 1.0, 5.0]
               }
# Negated MAPE: scikit-learn maximizes scores, so values closer to 0 are better.
scoring = ['neg_mean_absolute_percentage_error']
# 10-fold CV repeated 3 times = 30 fits per sampled candidate.
# NOTE(review): RepeatedKFold shuffles rows, whereas the hold-out split above is
# chronological (shuffle=False) — confirm that shuffled folds are intended here.
kfold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
random_search = RandomizedSearchCV(estimator=xgb,
                                   param_distributions=param_space,
                                   n_iter=50,
                                   scoring=scoring,
                                   refit=scoring[0],
                                   n_jobs=-1,
                                   cv=kfold,
                                   # Seed the candidate sampling too; without it the 50 sampled
                                   # combinations (and the reported best params) change on every run.
                                   random_state=42,
                                   verbose=1)

In [572]:
# Run the search on the training split only; the refit best estimator lives on the result.
random_search_res = random_search.fit(x_train, y_train)

# Report the winning candidate: mean CV score (negated MAPE) and its hyperparameters.
best_score = random_search_res.best_score_
best_params = random_search_res.best_params_
print(f'The best score is {best_score:.4f}')
# Optional: fold-to-fold spread of the best candidate's score (key matches the scorer name).
# print('The best score standard deviation is', round(random_search_res.cv_results_['std_test_neg_mean_absolute_percentage_error'][random_search_res.best_index_], 4))
print(f'The best hyperparameters are {best_params}')

The best score is -646.2954
The best hyperparameters are {'reg_lambda': 0, 'max_depth': 5, 'learning_rate': 0.02, 'gamma': 0.25}


In [None]:
# Round 2
# NOTE(review): this search space is identical to Round 1 — presumably it was meant
# to be narrowed around Round 1's best parameters; TODO confirm and adjust.
param_space = {'max_depth': [4, 5, 6],
               'learning_rate': [0.01, 0.02, 0.05],
               'gamma': [0, 0.25, 1.0],
               'reg_lambda': [0, 1.0, 5.0]
               }
# Negated MAPE: scikit-learn maximizes scores, so values closer to 0 are better.
scoring = ['neg_mean_absolute_percentage_error']
# 10-fold CV repeated 3 times = 30 fits per sampled candidate.
kfold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
random_search = RandomizedSearchCV(estimator=xgb,
                                   param_distributions=param_space,
                                   n_iter=50,
                                   scoring=scoring,
                                   refit=scoring[0],
                                   n_jobs=-1,
                                   cv=kfold,
                                   # Seed the candidate sampling so the sampled combinations
                                   # (and the reported best params) are reproducible across runs.
                                   random_state=42,
                                   verbose=1)

In [None]:
# Run the search on the training split only; the refit best estimator lives on the result.
random_search_res = random_search.fit(x_train, y_train)

# Report the winning candidate: mean CV score (negated MAPE) and its hyperparameters.
best_score = random_search_res.best_score_
best_params = random_search_res.best_params_
print(f'The best score is {best_score:.4f}')
# Optional: fold-to-fold spread of the best candidate's score (key matches the scorer name).
# print('The best score standard deviation is', round(random_search_res.cv_results_['std_test_neg_mean_absolute_percentage_error'][random_search_res.best_index_], 4))
print(f'The best hyperparameters are {best_params}')

In [None]:
# 3-fold cross-validation of the native-API settings in `params` on the training DMatrix.
# RMSE is tracked per round, with early stopping after 10 rounds without improvement.
validation_res = cv(
    params=params,
    dtrain=train_data,
    num_boost_round=100,
    nfold=3,
    metrics='rmse',
    early_stopping_rounds=10,
    as_pandas=True,
    seed=42,
)

In [None]:
# Mean test-fold RMSE in the last row of the cross-validation history.
validation_res['test-rmse-mean'].tail(1)

### Visualize

In [None]:
# Disabled: train a booster on the full-sample DMatrix and plot its first tree.
# NOTE(review): if re-enabled, the figure size should presumably be set before
# calling plot_tree, since rcParams only affect figures created afterwards — confirm.
# xg_reg = train(params=params, dtrain=dmat, num_boost_round=100)
# plot_tree(xg_reg, num_trees=0)
# plt.rcParams['figure.figsize'] = [50, 10]
# plt.show()