In [2]:
import pandas as pd 
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor as ABR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBMR

from sklearn.ensemble import VotingRegressor as VR

import joblib

In [2]:
X_train_origin = pd.read_csv('../data/featured_data/X_train.csv')
y_train_origin = pd.read_csv('../data/featured_data/y_train.csv')

In [3]:
X_train_origin = X_train_origin.to_numpy()
y_train_origin = y_train_origin.to_numpy()
y_train_origin = y_train_origin.ravel()

In [4]:
X_train_origin.shape,y_train_origin.shape

((1460, 74), (1460,))

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_train_origin, y_train_origin, test_size=0.2, random_state=1) 

In [6]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1168, 74), (292, 74), (1168,), (292,))

### Baseline

In [7]:
def modeling(model_name):
    model=model_name()
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    y_pred_no_log = np.expm1(y_pred)
    y_val_no_log = np.expm1(y_val)
    RMSE = math.sqrt(mean_squared_error(y_val_no_log,y_pred_no_log))
    print("RMSE: "+str(RMSE))
    R2=r2_score(y_val_no_log,y_pred_no_log)
    print("R_square: "+str(R2))

In [8]:
modeling(LR) # reject

RMSE: 38734.25269123501
R_square: 0.78963064450471


In [9]:
modeling(Lasso) # reject

RMSE: 81705.2949377076
R_square: 0.06396407424779604


In [10]:
modeling(Ridge) # reject

RMSE: 38696.78076455675
R_square: 0.7900374747128447


In [11]:
modeling(ElasticNet) # reject

RMSE: 62403.81622904341
R_square: 0.45397263069049765


In [12]:
modeling(KNR) # reject

RMSE: 41567.55930701231
R_square: 0.757729149613089


In [13]:
modeling(SVR) # reject

RMSE: 39141.757964151904
R_square: 0.785180961816502


In [14]:
modeling(RFR) # reject

RMSE: 38497.08579516456
R_square: 0.7921989089820298


In [15]:
modeling(ABR) # reject

RMSE: 40126.75965339593
R_square: 0.7742330869337376


In [16]:
modeling(GBR) # accept

RMSE: 32145.9725920116
R_square: 0.8551077175840008


In [17]:
modeling(XGBR) # accept

RMSE: 33033.135911442005
R_square: 0.8469998974283094


In [18]:
modeling(LGBMR) # accept

RMSE: 34676.329838811806
R_square: 0.8313996918360319


### Final tuning

In [20]:
params_XGBR = {
    'max_depth': [5, 10, 15, 20, 25],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'min_child_weight': [0, 2, 5, 10, 20],
    'max_delta_step': [0, 0.2, 0.6, 1, 2],
    'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}
xgbr = XGBR()
xgbr_clf = RandomizedSearchCV(xgbr, param_distributions=params_XGBR, cv=3,scoring='r2',n_jobs=-1,n_iter=20)
xgbr_clf.fit(X_train_origin,y_train_origin)
xgbr_clf.best_params_



{'subsample': 0.8,
 'scale_pos_weight': 0.2,
 'reg_lambda': 0.2,
 'reg_alpha': 0.5,
 'n_estimators': 1000,
 'min_child_weight': 20,
 'max_depth': 15,
 'max_delta_step': 1,
 'learning_rate': 0.05,
 'colsample_bytree': 0.8}

In [21]:
params_LGBMR = {'n_estimators': [1000, 1500, 2000, 2500],
               'max_depth':  [4, 5, 8, -1],
               'num_leaves': [15, 31, 63, 127],
               'subsample': [0.6, 0.7, 0.8, 1.0],
               'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
               'learning_rate' : [0.01,0.02,0.03]
              }
lgbmr = LGBMR()
lgbmr_clf = RandomizedSearchCV(lgbmr, param_distributions=params_LGBMR, cv=3,scoring='r2',n_jobs=-1,n_iter=20)
lgbmr_clf.fit(X_train_origin,y_train_origin)
lgbmr_clf.best_params_

{'subsample': 0.6,
 'num_leaves': 63,
 'n_estimators': 2000,
 'max_depth': 4,
 'learning_rate': 0.02,
 'colsample_bytree': 1.0}

In [22]:
params_GBR = {'n_estimators':range(20,81,10),
              'learning_rate': [0.2,0.1, 0.05, 0.02, 0.01 ],
              'max_depth': [4, 6,8],
              'min_samples_leaf': [3, 5, 9, 14],
              'max_features': [0.8,0.5,0.3, 0.1]}
gbr = GBR()
gbr_clf = RandomizedSearchCV(gbr, param_distributions=params_GBR, cv=3,scoring='r2',n_jobs=-1,n_iter=20)
gbr_clf.fit(X_train_origin,y_train_origin)
gbr_clf.best_params_

{'n_estimators': 80,
 'min_samples_leaf': 14,
 'max_features': 0.3,
 'max_depth': 6,
 'learning_rate': 0.05}

### Train the best model

In [25]:
xgbr_best = XGBR(subsample=0.8,scale_pos_weight=0.2,reg_alpha=0.5,reg_lambda=0.2,n_estimators=1000,min_child_weight=20,max_depth=15,
                max_delta_step=1,learning_rate=0.05,colsample_bytree=0.8)
xgbr_best.fit(X_train_origin,y_train_origin)
joblib.dump(xgbr_best,'../models/xgbr.m')



['../models/xgbr.m']

In [26]:
lgbmr_best = LGBMR(subsample=0.6,num_leaves=63,n_estimators=2000,max_depth=4,learning_rate=0.02,colsample_bytree=1.0)
lgbmr_best.fit(X_train_origin,y_train_origin)
joblib.dump(lgbmr_best,'../models/lgbmr.m')

['../models/lgbmr.m']

In [27]:
gbr_best = GBR(n_estimators=80,min_samples_leaf=14,max_features=0.3,max_depth=6,learning_rate=0.05)
gbr_best.fit(X_train_origin,y_train_origin)
joblib.dump(gbr_best,'../models/gbr.m')

['../models/gbr.m']

In [29]:
voter = VR(
    estimators=[('xgbr', xgbr_best), ('lgbmr', lgbmr_best), ('gbr', gbr_best)])
voter.fit(X_train_origin, y_train_origin)
joblib.dump(voter, '../models/voter.m')



['../models/voter.m']