In [1]:
import pandas as pd

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations that still ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

import warnings

# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation and data-conversion warnings raised by the estimators.
warnings.filterwarnings("ignore")



In [2]:
# Load the reduced training feature matrix from the `featured/` directory.
X_train_reduced = pd.read_csv(filepath_or_buffer='featured/X_train_reduced.csv')
# Display (n_rows, n_columns) as a quick sanity check.
X_train_reduced.shape

(1460, 9)

In [3]:
# Load the reduced test feature matrix from the `featured/` directory.
X_test_reduced = pd.read_csv(filepath_or_buffer='featured/X_test_reduced.csv')
# Display (n_rows, n_columns) as a quick sanity check.
X_test_reduced.shape

(1459, 9)

In [4]:
# Load the training target from the `featured/` directory (read as a DataFrame).
y_train = pd.read_csv(filepath_or_buffer='featured/y_train.csv')
# Display (n_rows, n_columns) as a quick sanity check.
y_train.shape

(1460, 1)

## Model selection

In [5]:
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost import XGBRegressor as XGBR

In [6]:
# Ordinary least squares baseline: mean of the 5-fold negated-MSE scores.
lr = LR()
lr_result = cross_val_score(
    estimator=lr,
    X=X_train_reduced,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).mean()

In [7]:
# Ridge regression with default settings: mean of the 5-fold negated-MSE scores.
ridge = Ridge()
ridge_result = cross_val_score(
    estimator=ridge,
    X=X_train_reduced,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).mean()

In [8]:
# Lasso regression with default settings: mean of the 5-fold negated-MSE scores.
lasso = Lasso()
lasso_result = cross_val_score(
    estimator=lasso,
    X=X_train_reduced,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).mean()

In [9]:
# Elastic net with default settings: mean of the 5-fold negated-MSE scores.
en = ElasticNet()
en_result = cross_val_score(
    estimator=en,
    X=X_train_reduced,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).mean()

In [10]:
# Support vector regression with default settings: mean of the 5-fold
# negated-MSE scores.
svr = SVR()
svr_result = cross_val_score(
    estimator=svr,
    X=X_train_reduced,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).mean()

In [11]:
# k-nearest-neighbours regression with default settings: mean of the 5-fold
# negated-MSE scores.
knr = KNR()
knr_result = cross_val_score(
    estimator=knr,
    X=X_train_reduced,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).mean()

In [12]:
# Random forest baseline: mean of the 5-fold negated-MSE scores.
# A fixed seed makes the bootstrap sampling and feature selection repeatable,
# so the reported score is the same on every Restart & Run All.
rfr = RFR(random_state=0)
rfr_result = cross_val_score(
    rfr,
    X_train_reduced,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
).mean()

In [13]:
# Gradient boosting baseline: mean of the 5-fold negated-MSE scores.
# A fixed seed pins the estimator's internal randomness so the reported score
# is repeatable across runs.
gbr = GBR(random_state=0)
gbr_result = cross_val_score(
    gbr,
    X_train_reduced,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
).mean()

In [14]:
# XGBoost regressor with default settings: mean of the 5-fold negated-MSE scores.
xgbr = XGBR()
xgbr_result = cross_val_score(
    estimator=xgbr,
    X=X_train_reduced,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).mean()

In [19]:
# The stored results are means of *negated* MSE scores, so the sign is flipped
# before printing. One line per model, in the same order as the cells above.
print("\n".join(
    prefix + str(-neg_mse)
    for prefix, neg_mse in (
        ("MSE of LR is: ", lr_result),
        ("MSE of Ridge is: ", ridge_result),
        ("MSE of Lasso is: ", lasso_result),
        ("MSE of EN is: ", en_result),
        ("MSE of SVR is: ", svr_result),
        ("MSE of KNR is: ", knr_result),
        ("MSE of RFR is: ", rfr_result),
        ("MSE of GBR is: ", gbr_result),
        ("MSE of XGBR is: ", xgbr_result),
    )
))

MSE of LR is: 2168371869.950872
MSE of Ridge is: 2168369110.2263203
MSE of Lasso is: 2172221073.0788074
MSE of EN is: 2172221273.9125924
MSE of SVR is: 6624638773.308963
MSE of KNR is: 2461198496.322137
MSE of RFR is: 1917874530.0148284
MSE of GBR is: 1939859399.0198486
MSE of XGBR is: 1743767256.8602238


## Train the best XGBR model

In [20]:
# Candidate hyper-parameter values for the randomized search. Single-element
# lists pin that setting for every sampled candidate.
params_XGBR = {
    # NOTE(review): newer xgboost releases name this objective
    # 'reg:squarederror' and use `verbosity` instead of `silent` — confirm
    # against the installed xgboost version before upgrading.
    'objective': ['reg:linear'],
    'learning_rate': [0.045, 0.05, 0.06],
    'max_depth': [3, 4, 5],
    'min_child_weight': [2, 3, 4],
    'silent': [1],
    'subsample': [0.5, 0.55, 0.6],
    'colsample_bytree': [0.7, 0.8, 0.85],
    'n_estimators': [650, 750, 800],
}
xgbr = XGBR()
# Sample 15 candidates and evaluate each with 3-fold CV.
# * scoring: use the same metric as the cross_val_score model comparison
#   (negated MSE) instead of the estimator's default score.
# * random_state: fix which candidates are drawn so `best_params_` is
#   reproducible on a fresh kernel.
clf = RandomizedSearchCV(
    xgbr,
    param_distributions=params_XGBR,
    n_iter=15,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=0,
)
clf.fit(X_train_reduced, y_train)
clf.best_params_

{'subsample': 0.5,
 'silent': 1,
 'objective': 'reg:linear',
 'n_estimators': 650,
 'min_child_weight': 2,
 'max_depth': 3,
 'learning_rate': 0.045,
 'colsample_bytree': 0.85}

In [24]:
# Display the estimator exposed by the fitted search object `clf`, including
# every constructor argument (searched and default).
clf.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.85, gamma=0,
             importance_type='gain', learning_rate=0.045, max_delta_step=0,
             max_depth=3, min_child_weight=2, missing=None, n_estimators=650,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=1,
             subsample=0.5, verbosity=1)

In [25]:
# Build a fresh regressor from the search winner's full parameter set instead
# of hand-copying values from a printed repr: the search is randomized, so a
# pasted configuration silently goes stale whenever the search cell is re-run.
xgbr_best = XGBR(**clf.best_estimator_.get_params())
# use all training set to fit the model
xgbr_best.fit(X_train_reduced, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.85, gamma=0,
             importance_type='gain', learning_rate=0.045, max_delta_step=0,
             max_depth=3, min_child_weight=2, missing=None, n_estimators=650,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=1,
             subsample=0.5, verbosity=1)

In [26]:
# Persist the fitted model to disk; the call returns the list of written file names.
joblib.dump(value=xgbr_best, filename='xgbr_best.m')

['xgbr_best.m']