In [23]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor as ABR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBMR
from catboost import CatBoostRegressor as CBR

from sklearn.ensemble import VotingRegressor as VR

import joblib

In [6]:
X_train_origin = pd.read_csv('../data/featured_data/X_train.csv')
y_train_origin = pd.read_csv('../data/featured_data/y_train.csv')

In [7]:
X_train_origin = X_train_origin.to_numpy()
y_train_origin = y_train_origin.to_numpy()
y_train_origin = y_train_origin.ravel()

In [8]:
X_train_origin.shape,y_train_origin.shape

((1460, 74), (1460,))

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1) 

In [10]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1168, 74), (292, 74), (1168,), (292,))

### Baseline

In [11]:
def modeling(model_name):
    model=model_name()
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    y_pred_no_log = np.expm1(y_pred)
    y_val_no_log = np.expm1(y_val)
    RMSE = math.sqrt(mean_squared_error(y_val_no_log,y_pred_no_log))
    print("RMSE: "+str(RMSE))
    MAE = mean_absolute_error(y_val_no_log,y_pred_no_log)
    print("MAE: "+str(MAE))
    R2=r2_score(y_val_no_log,y_pred_no_log)
    print("R_square: "+str(R2))

In [12]:
modeling(LR) # reject

RMSE: 38734.25269123501
MAE: 18336.66210405226
R_square: 0.78963064450471


In [13]:
modeling(Lasso) # reject

RMSE: 81705.2949377076
MAE: 53921.62067363013
R_square: 0.06396407424779604


In [14]:
modeling(Ridge) # reject

RMSE: 38696.78076455677
MAE: 18305.391475471533
R_square: 0.7900374747128446


In [15]:
modeling(ElasticNet) # reject

RMSE: 62403.81622904341
MAE: 36014.43096782633
R_square: 0.45397263069049765


In [16]:
modeling(KNR) # reject

RMSE: 41567.55930701231
MAE: 21716.195234870647
R_square: 0.757729149613089


In [17]:
modeling(SVR) # reject

RMSE: 39141.75796415196
MAE: 19456.730576283855
R_square: 0.7851809618165013


In [18]:
modeling(RFR) # reject

RMSE: 38524.34845351734
MAE: 20538.248420550655
R_square: 0.7919044858582183


In [19]:
modeling(ABR) # reject

RMSE: 40293.25206322671
MAE: 25045.500551268342
R_square: 0.7723557134266243


In [20]:
modeling(GBR) # accept

RMSE: 32214.938945161346
MAE: 18033.893925880122
R_square: 0.8544853434119452


In [21]:
modeling(XGBR) # accept

RMSE: 33033.135911442005
MAE: 18926.783684717477
R_square: 0.8469998974283094


In [22]:
modeling(LGBMR) # accept

RMSE: 34676.329838811806
MAE: 18596.55068242809
R_square: 0.8313996918360319


In [24]:
modeling(CBR) # accept, best MAE

Learning rate set to 0.040124
0:	learn: 0.3855145	total: 81.5ms	remaining: 1m 21s
1:	learn: 0.3769883	total: 93.9ms	remaining: 46.9s
2:	learn: 0.3699629	total: 106ms	remaining: 35.1s
3:	learn: 0.3623353	total: 112ms	remaining: 27.8s
4:	learn: 0.3545791	total: 117ms	remaining: 23.4s
5:	learn: 0.3462391	total: 122ms	remaining: 20.3s
6:	learn: 0.3392832	total: 128ms	remaining: 18.2s
7:	learn: 0.3325403	total: 133ms	remaining: 16.5s
8:	learn: 0.3257890	total: 138ms	remaining: 15.2s
9:	learn: 0.3199255	total: 143ms	remaining: 14.2s
10:	learn: 0.3128725	total: 148ms	remaining: 13.3s
11:	learn: 0.3061000	total: 154ms	remaining: 12.6s
12:	learn: 0.3005820	total: 159ms	remaining: 12s
13:	learn: 0.2949150	total: 164ms	remaining: 11.5s
14:	learn: 0.2899846	total: 169ms	remaining: 11.1s
15:	learn: 0.2845032	total: 174ms	remaining: 10.7s
16:	learn: 0.2792911	total: 180ms	remaining: 10.4s
17:	learn: 0.2747364	total: 186ms	remaining: 10.2s
18:	learn: 0.2705875	total: 193ms	remaining: 9.94s
19:	learn:

172:	learn: 0.1109257	total: 1.05s	remaining: 5.03s
173:	learn: 0.1105880	total: 1.06s	remaining: 5.03s
174:	learn: 0.1102295	total: 1.06s	remaining: 5.02s
175:	learn: 0.1099862	total: 1.07s	remaining: 5.01s
176:	learn: 0.1096838	total: 1.08s	remaining: 5.01s
177:	learn: 0.1093887	total: 1.08s	remaining: 5s
178:	learn: 0.1090892	total: 1.09s	remaining: 4.99s
179:	learn: 0.1086616	total: 1.09s	remaining: 4.99s
180:	learn: 0.1084142	total: 1.1s	remaining: 4.98s
181:	learn: 0.1081127	total: 1.11s	remaining: 4.97s
182:	learn: 0.1077380	total: 1.11s	remaining: 4.96s
183:	learn: 0.1073894	total: 1.12s	remaining: 4.95s
184:	learn: 0.1068630	total: 1.12s	remaining: 4.95s
185:	learn: 0.1064171	total: 1.13s	remaining: 4.94s
186:	learn: 0.1061055	total: 1.14s	remaining: 4.93s
187:	learn: 0.1057917	total: 1.14s	remaining: 4.93s
188:	learn: 0.1056129	total: 1.15s	remaining: 4.92s
189:	learn: 0.1053065	total: 1.15s	remaining: 4.91s
190:	learn: 0.1050558	total: 1.16s	remaining: 4.9s
191:	learn: 0.104

357:	learn: 0.0651120	total: 2.1s	remaining: 3.77s
358:	learn: 0.0649713	total: 2.11s	remaining: 3.77s
359:	learn: 0.0647678	total: 2.12s	remaining: 3.76s
360:	learn: 0.0646190	total: 2.12s	remaining: 3.76s
361:	learn: 0.0645456	total: 2.13s	remaining: 3.75s
362:	learn: 0.0643514	total: 2.13s	remaining: 3.75s
363:	learn: 0.0643071	total: 2.14s	remaining: 3.74s
364:	learn: 0.0641868	total: 2.15s	remaining: 3.73s
365:	learn: 0.0639710	total: 2.15s	remaining: 3.73s
366:	learn: 0.0637230	total: 2.16s	remaining: 3.72s
367:	learn: 0.0635344	total: 2.16s	remaining: 3.72s
368:	learn: 0.0633633	total: 2.17s	remaining: 3.71s
369:	learn: 0.0632050	total: 2.17s	remaining: 3.7s
370:	learn: 0.0630905	total: 2.18s	remaining: 3.7s
371:	learn: 0.0629561	total: 2.19s	remaining: 3.69s
372:	learn: 0.0627547	total: 2.19s	remaining: 3.69s
373:	learn: 0.0626130	total: 2.2s	remaining: 3.68s
374:	learn: 0.0625349	total: 2.21s	remaining: 3.67s
375:	learn: 0.0623687	total: 2.21s	remaining: 3.67s
376:	learn: 0.06

537:	learn: 0.0430321	total: 3.16s	remaining: 2.71s
538:	learn: 0.0429149	total: 3.16s	remaining: 2.71s
539:	learn: 0.0428136	total: 3.17s	remaining: 2.7s
540:	learn: 0.0427022	total: 3.17s	remaining: 2.69s
541:	learn: 0.0425894	total: 3.18s	remaining: 2.69s
542:	learn: 0.0424979	total: 3.19s	remaining: 2.68s
543:	learn: 0.0424266	total: 3.2s	remaining: 2.68s
544:	learn: 0.0423446	total: 3.2s	remaining: 2.67s
545:	learn: 0.0423205	total: 3.21s	remaining: 2.67s
546:	learn: 0.0422379	total: 3.21s	remaining: 2.66s
547:	learn: 0.0421599	total: 3.22s	remaining: 2.65s
548:	learn: 0.0420882	total: 3.22s	remaining: 2.65s
549:	learn: 0.0419888	total: 3.23s	remaining: 2.64s
550:	learn: 0.0418873	total: 3.23s	remaining: 2.64s
551:	learn: 0.0417928	total: 3.24s	remaining: 2.63s
552:	learn: 0.0417441	total: 3.25s	remaining: 2.62s
553:	learn: 0.0417291	total: 3.25s	remaining: 2.62s
554:	learn: 0.0417209	total: 3.25s	remaining: 2.61s
555:	learn: 0.0416542	total: 3.26s	remaining: 2.6s
556:	learn: 0.04

697:	learn: 0.0307668	total: 4.04s	remaining: 1.75s
698:	learn: 0.0306857	total: 4.05s	remaining: 1.74s
699:	learn: 0.0306113	total: 4.05s	remaining: 1.74s
700:	learn: 0.0305575	total: 4.06s	remaining: 1.73s
701:	learn: 0.0305049	total: 4.07s	remaining: 1.73s
702:	learn: 0.0304385	total: 4.07s	remaining: 1.72s
703:	learn: 0.0303618	total: 4.08s	remaining: 1.71s
704:	learn: 0.0303170	total: 4.08s	remaining: 1.71s
705:	learn: 0.0302377	total: 4.09s	remaining: 1.7s
706:	learn: 0.0301737	total: 4.09s	remaining: 1.7s
707:	learn: 0.0301588	total: 4.1s	remaining: 1.69s
708:	learn: 0.0301010	total: 4.11s	remaining: 1.68s
709:	learn: 0.0300482	total: 4.11s	remaining: 1.68s
710:	learn: 0.0299721	total: 4.12s	remaining: 1.67s
711:	learn: 0.0299111	total: 4.12s	remaining: 1.67s
712:	learn: 0.0298635	total: 4.13s	remaining: 1.66s
713:	learn: 0.0298128	total: 4.13s	remaining: 1.66s
714:	learn: 0.0297383	total: 4.14s	remaining: 1.65s
715:	learn: 0.0296661	total: 4.14s	remaining: 1.64s
716:	learn: 0.0

876:	learn: 0.0220609	total: 5.1s	remaining: 715ms
877:	learn: 0.0220284	total: 5.11s	remaining: 709ms
878:	learn: 0.0219711	total: 5.11s	remaining: 704ms
879:	learn: 0.0219311	total: 5.12s	remaining: 698ms
880:	learn: 0.0218769	total: 5.12s	remaining: 692ms
881:	learn: 0.0218268	total: 5.13s	remaining: 686ms
882:	learn: 0.0218194	total: 5.13s	remaining: 680ms
883:	learn: 0.0217710	total: 5.14s	remaining: 674ms
884:	learn: 0.0217501	total: 5.14s	remaining: 669ms
885:	learn: 0.0216935	total: 5.15s	remaining: 663ms
886:	learn: 0.0216841	total: 5.16s	remaining: 657ms
887:	learn: 0.0216338	total: 5.16s	remaining: 651ms
888:	learn: 0.0215912	total: 5.17s	remaining: 645ms
889:	learn: 0.0215377	total: 5.17s	remaining: 639ms
890:	learn: 0.0215068	total: 5.18s	remaining: 634ms
891:	learn: 0.0214580	total: 5.19s	remaining: 628ms
892:	learn: 0.0214197	total: 5.19s	remaining: 622ms
893:	learn: 0.0213909	total: 5.2s	remaining: 616ms
894:	learn: 0.0213884	total: 5.21s	remaining: 611ms
895:	learn: 0.

### Final tuning

In [20]:
params_XGBR = {
    'max_depth': [5, 10, 15, 20, 25],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'min_child_weight': [0, 2, 5, 10, 20],
    'max_delta_step': [0, 0.2, 0.6, 1, 2],
    'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}
xgbr = XGBR()
xgbr_clf = RandomizedSearchCV(xgbr, param_distributions=params_XGBR, cv=3,scoring='r2',n_jobs=-1,n_iter=20)
xgbr_clf.fit(X_train_origin,y_train_origin)
xgbr_clf.best_params_



{'subsample': 0.8,
 'scale_pos_weight': 0.2,
 'reg_lambda': 0.2,
 'reg_alpha': 0.5,
 'n_estimators': 1000,
 'min_child_weight': 20,
 'max_depth': 15,
 'max_delta_step': 1,
 'learning_rate': 0.05,
 'colsample_bytree': 0.8}

In [21]:
params_LGBMR = {'n_estimators': [1000, 1500, 2000, 2500],
               'max_depth':  [4, 5, 8, -1],
               'num_leaves': [15, 31, 63, 127],
               'subsample': [0.6, 0.7, 0.8, 1.0],
               'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
               'learning_rate' : [0.01,0.02,0.03]
              }
lgbmr = LGBMR()
lgbmr_clf = RandomizedSearchCV(lgbmr, param_distributions=params_LGBMR, cv=3,scoring='r2',n_jobs=-1,n_iter=20)
lgbmr_clf.fit(X_train_origin,y_train_origin)
lgbmr_clf.best_params_

{'subsample': 0.6,
 'num_leaves': 63,
 'n_estimators': 2000,
 'max_depth': 4,
 'learning_rate': 0.02,
 'colsample_bytree': 1.0}

In [22]:
params_GBR = {'n_estimators':range(20,81,10),
              'learning_rate': [0.2,0.1, 0.05, 0.02, 0.01 ],
              'max_depth': [4, 6,8],
              'min_samples_leaf': [3, 5, 9, 14],
              'max_features': [0.8,0.5,0.3, 0.1]}
gbr = GBR()
gbr_clf = RandomizedSearchCV(gbr, param_distributions=params_GBR, cv=3,scoring='r2',n_jobs=-1,n_iter=20)
gbr_clf.fit(X_train_origin,y_train_origin)
gbr_clf.best_params_

{'n_estimators': 80,
 'min_samples_leaf': 14,
 'max_features': 0.3,
 'max_depth': 6,
 'learning_rate': 0.05}

### Train the best model

In [25]:
xgbr_best = XGBR(subsample=0.8,scale_pos_weight=0.2,reg_alpha=0.5,reg_lambda=0.2,n_estimators=1000,min_child_weight=20,max_depth=15,
                max_delta_step=1,learning_rate=0.05,colsample_bytree=0.8)
xgbr_best.fit(X_train_origin,y_train_origin)
joblib.dump(xgbr_best,'../models/xgbr.m')



['../models/xgbr.m']

In [26]:
lgbmr_best = LGBMR(subsample=0.6,num_leaves=63,n_estimators=2000,max_depth=4,learning_rate=0.02,colsample_bytree=1.0)
lgbmr_best.fit(X_train_origin,y_train_origin)
joblib.dump(lgbmr_best,'../models/lgbmr.m')

['../models/lgbmr.m']

In [27]:
gbr_best = GBR(n_estimators=80,min_samples_leaf=14,max_features=0.3,max_depth=6,learning_rate=0.05)
gbr_best.fit(X_train_origin,y_train_origin)
joblib.dump(gbr_best,'../models/gbr.m')

['../models/gbr.m']

In [29]:
voter = VR(
    estimators=[('xgbr', xgbr_best), ('lgbmr', lgbmr_best), ('gbr', gbr_best)])
voter.fit(X_train_origin, y_train_origin)
joblib.dump(voter, '../models/voter.m')



['../models/voter.m']