In [1]:
import pandas as pd 
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor as ABR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBMR

import joblib

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the engineered feature table and its target column from disk.
X_train_origin, y_train_origin = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/featured_data/X_train.csv',
        '../data/featured_data/y_train.csv',
    )
)

In [3]:
# Convert the loaded frames to NumPy arrays for the estimators below.
# np.asarray is used instead of .to_numpy() so the cell can be re-run safely:
# after the first execution these names already hold ndarrays, which have no
# .to_numpy() method, whereas np.asarray passes an existing ndarray through.
X_train_origin = np.asarray(X_train_origin)
# Flatten the target to 1-D so it has shape (n_samples,).
y_train_origin = np.asarray(y_train_origin).ravel()

In [4]:
# Sanity check: features and target must have the same number of rows.
(X_train_origin.shape, y_train_origin.shape)

((1460, 74), (1460,))

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_origin,
    y_train_origin,
    test_size=0.2,
    random_state=1,
)

In [6]:
# Sanity check: shapes of the train/validation partitions.
(
    X_train.shape,
    X_val.shape,
    y_train.shape,
    y_val.shape,
)

((1168, 74), (292, 74), (1168,), (292,))

### Baseline

In [7]:
def modeling(model_name):
    """Fit a default-configured estimator and print its validation metrics.

    Parameters
    ----------
    model_name : callable
        An estimator class (not an instance); it is called with no arguments
        to build the model.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    Both predictions and targets are passed through ``np.expm1`` before
    scoring, which assumes the targets were ``log1p``-transformed upstream
    (TODO confirm against the feature-engineering notebook). Prints RMSE and
    R-squared; returns nothing.
    """
    estimator = model_name()
    estimator.fit(X_train, y_train)

    # Map targets and predictions back to the original scale before scoring.
    actual = np.expm1(y_val)
    predicted = np.expm1(estimator.predict(X_val))

    rmse = math.sqrt(mean_squared_error(actual, predicted))
    print(f"RMSE: {rmse}")

    r_squared = r2_score(actual, predicted)
    print(f"R_square: {r_squared}")

In [8]:
# Linear regression baseline. In the recorded run it scores R^2 ~ 0.79, behind the boosted models.
modeling(LR) # reject

RMSE: 38734.25269123499
R_square: 0.7896306445047101


In [9]:
# Lasso with default settings. The recorded run shows R^2 ~ 0.06; presumably the
# default penalty is far too strong for this target scale (TODO confirm by tuning alpha).
modeling(Lasso) # reject

RMSE: 81705.2949377076
R_square: 0.06396407424779604


In [10]:
# Ridge with default settings. The recorded run is nearly identical to plain linear regression (R^2 ~ 0.79).
modeling(Ridge) # reject

RMSE: 38696.78076455676
R_square: 0.7900374747128447


In [11]:
# ElasticNet with default settings. The recorded run shows R^2 ~ 0.45; presumably
# over-regularised like the default Lasso (TODO confirm).
modeling(ElasticNet) # reject

RMSE: 62403.81622904341
R_square: 0.45397263069049765


In [12]:
# k-nearest-neighbours regressor with default settings. Recorded run: R^2 ~ 0.76.
modeling(KNR) # reject

RMSE: 41567.55930701231
R_square: 0.757729149613089


In [13]:
# Support vector regressor with default settings. Recorded run: R^2 ~ 0.79.
modeling(SVR) # reject

RMSE: 39141.757964151904
R_square: 0.785180961816502


In [14]:
# Random forest with default settings. Recorded run: R^2 ~ 0.77. No seed is passed,
# so the figures may vary between runs.
modeling(RFR) # reject

RMSE: 40089.742098375165
R_square: 0.7746494417209817


In [15]:
# AdaBoost regressor with default settings. Recorded run: R^2 ~ 0.76.
modeling(ABR) # reject

RMSE: 41486.66669532709
R_square: 0.7586711752151062


In [16]:
# Gradient boosting with default settings. Best baseline in the recorded run
# (R^2 ~ 0.85), so it is kept for tuning.
modeling(GBR) # accept

RMSE: 32240.74014162161
R_square: 0.8542521624188695


In [17]:
# XGBoost regressor with default settings. Recorded run: R^2 ~ 0.85, so it is kept for tuning.
modeling(XGBR) # accept

RMSE: 33033.135911442005
R_square: 0.8469998974283094


In [18]:
# LightGBM regressor with default settings. Recorded run: R^2 ~ 0.83, so it is kept for tuning.
modeling(LGBMR) # accept

RMSE: 34676.329838811806
R_square: 0.8313996918360319


In [19]:
# NOTE(review): `sss` is not defined anywhere in the visible notebook, so this
# cell raises NameError. Presumably it is a deliberate stop so that "Run All"
# halts before the slow searches under "Final tuning" - confirm, and prefer an
# explicit guard (e.g. a RUN_TUNING flag) over an undefined name.
sss

NameError: name 'sss' is not defined

### Final tuning

In [None]:
# Search space for the XGBoost regressor: 10 parameters with 5 candidate values
# each (5**10 combinations), so it is sampled rather than enumerated.
params_XGBR = {
    'max_depth': [5, 10, 15, 20, 25],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'min_child_weight': [0, 2, 5, 10, 20],
    'max_delta_step': [0, 0.2, 0.6, 1, 2],
    'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    # NOTE(review): XGBoost documents scale_pos_weight as a class-imbalance
    # control; presumably it adds little for regression - confirm before
    # spending search budget on it.
    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}
xgbr = XGBR()
# `n_iter` belongs to RandomizedSearchCV (GridSearchCV does not accept it), and
# the space must be passed by keyword as `param_distributions`. The search
# draws 45 parameter settings and scores each with 5-fold CV on R^2.
xgbr_clf = RandomizedSearchCV(
    xgbr,
    param_distributions=params_XGBR,
    n_iter=45,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=1,  # fixed seed so the sampled candidates are repeatable
)
xgbr_clf.fit(X_train_origin, y_train_origin)
xgbr_clf.best_params_

In [None]:
# Search space for the LightGBM regressor (4*4*4*4*4*3 = 3072 combinations).
params_LGBMR = {'n_estimators': [1000, 1500, 2000, 2500],
               'max_depth':  [4, 5, 8, -1],
               'num_leaves': [15, 31, 63, 127],
               # NOTE(review): LightGBM documents that row subsampling only
               # takes effect when subsample_freq (bagging_freq) is non-zero;
               # it is left at its default here, so these values may be
               # no-ops - confirm.
               'subsample': [0.6, 0.7, 0.8, 1.0],
               'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
               'learning_rate' : [0.01,0.02,0.03]
              }
lgbmr = LGBMR()
# `n_iter` is a RandomizedSearchCV argument; GridSearchCV rejects it with a
# TypeError. The search draws 45 settings and scores each with 5-fold CV on R^2.
lgbmr_clf = RandomizedSearchCV(
    lgbmr,
    param_distributions=params_LGBMR,
    n_iter=45,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=1,  # fixed seed so the sampled candidates are repeatable
)
lgbmr_clf.fit(X_train_origin, y_train_origin)
lgbmr_clf.best_params_

In [None]:
# Search space for scikit-learn's GradientBoostingRegressor
# (7*5*3*4*4 = 1680 combinations).
params_GBR = {'n_estimators': list(range(20, 81, 10)),
              'learning_rate': [0.2, 0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6, 8],
              'min_samples_leaf': [3, 5, 9, 14],
              'max_features': [0.8, 0.5, 0.3, 0.1]}
gbr = GBR()
# Tune `gbr` itself: the search was previously built around the LightGBM
# estimator, so the gradient-boosting model was never searched. `n_iter` is a
# RandomizedSearchCV argument (GridSearchCV rejects it); 45 settings are drawn
# and each is scored with 5-fold CV on R^2.
gbr_clf = RandomizedSearchCV(
    gbr,
    param_distributions=params_GBR,
    n_iter=45,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=1,  # fixed seed so the sampled candidates are repeatable
)
gbr_clf.fit(X_train_origin, y_train_origin)
gbr_clf.best_params_