In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.model_selection import KFold, cross_val_score
import lightgbm as lgb

In [2]:
# Build both CSV paths under ./data, then load the train and test sets.
train_data_path, test_data_path = (
    join('./data', csv_name) for csv_name in ('train.csv', 'test.csv')
)

train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

In [3]:
# Keep only the first six characters of the sale date (YYYYMM, e.g. 201410)
# and store them as an integer feature.
# Casting to str first and slicing with the vectorized .str accessor keeps
# this cell re-runnable: on a second run the column already holds ints, which
# the original per-row `i[:6]` could not slice.
train['date'] = train['date'].astype(str).str[:6].astype(int)
# Drop the 'id' column (presumably just a row identifier, not a feature).
# errors='ignore' avoids a KeyError when the cell is executed again after the
# column has already been removed.
train = train.drop(columns='id', errors='ignore')
train.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201410,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,201502,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,201502,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,201406,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,201501,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


In [4]:
# Split the target off the feature frame: pop() returns the 'price' column
# and removes it from `train` in a single step.
y = train.pop('price')

y.head(), y.shape

(0    221900.0
 1    180000.0
 2    510000.0
 3    257500.0
 4    291850.0
 Name: price, dtype: float64,
 (15035,))

In [5]:
# Apply the same date handling as for the training frame: keep the first six
# characters (YYYYMM) as an integer.
# Casting to str first and slicing with the vectorized .str accessor keeps
# this cell re-runnable: on a second run the column already holds ints, which
# the original per-row `i[:6]` could not slice.
test['date'] = test['date'].astype(str).str[:6].astype(int)
# Drop the 'id' column (presumably just a row identifier, not a feature).
# errors='ignore' avoids a KeyError when the cell is executed again after the
# column has already been removed.
test = test.drop(columns='id', errors='ignore')
test.head(3)

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201412,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
1,201412,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
2,201405,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930


In [6]:
# Train on log1p(price); rmse() maps predictions back with expm1.
# NOTE: this cell rebinds `y` from itself, so executing it a second time
# applies log1p again — run it exactly once per kernel session.
y= np.log1p(y)

___

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

In [8]:
# Fixed seed, used for the LightGBM models and for the hold-out split.
random_state = 2020
# `model` and `lightgbm` are two names for the same default regressor.
model = lightgbm = LGBMRegressor(random_state=random_state)

In [9]:
def rmse(y_test, y_pred):
    """Return the root mean squared error on the original (un-logged) scale.

    Both arguments are passed through ``np.expm1`` before the error is
    computed, i.e. they are expected to be log1p-transformed values.

    Args:
        y_test: true targets in log1p space.
        y_pred: predicted targets in log1p space.

    Returns:
        The square root of the mean squared error between the two
        back-transformed inputs.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)

In [10]:
# Initial search space: two tree counts crossed with two depth limits.
param_grid = dict(n_estimators=[50, 100], max_depth=[1, 10])
model = LGBMRegressor(random_state=random_state)

In [11]:
from sklearn.model_selection import GridSearchCV
# 5-fold grid search over `param_grid`, scored by negative MSE.
# NOTE(review): this object is never fitted in the cells visible here;
# my_GridSearch_LGBM constructs its own GridSearchCV — confirm it is needed.
grid_model = GridSearchCV(
    model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=5,
)

In [12]:
def my_GridSearch_LGBM(model, train, y, param_grid, verbose=2, n_jobs=5):
    """Run a 5-fold grid search and tabulate the result per parameter set.

    Args:
        model: estimator handed to ``GridSearchCV``.
        train: feature matrix to fit on.
        y: target values (log1p-transformed in this notebook, which is why
            the derived error column is labelled RMSLE).
        param_grid: mapping of parameter name to the list of values to try.
        verbose: verbosity forwarded to ``GridSearchCV``.
        n_jobs: number of parallel jobs forwarded to ``GridSearchCV``.

    Returns:
        DataFrame with one row per parameter combination: the parameter
        columns, ``score`` (mean cross-validated negative MSE) and ``RMSLE``
        (square root of the negated score), sorted by ``RMSLE`` ascending.
    """
    search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    search.fit(train, y)

    cv_results = search.cv_results_
    results = pd.DataFrame(cv_results['params'])
    results['score'] = cv_results['mean_test_score']
    # The score is a negated MSE, so flip the sign before taking the root.
    results['RMSLE'] = np.sqrt(-results['score'])

    return results.sort_values('RMSLE')

In [13]:
# Baseline: search the current `param_grid` with the default-configured model.
my_GridSearch_LGBM(model, train, y , param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
3,10,100,-0.027027,0.164399
2,10,50,-0.029177,0.170814
1,1,100,-0.05502,0.234564
0,1,50,-0.073394,0.270914


In [14]:
def get_scores(model, train, y):
    """Fit `model` on an 80/20 hold-out split and report its RMSE.

    The split uses the notebook-level ``random_state``; the error is
    computed by ``rmse`` on the held-out 20%. Note that `model` is fitted
    in place.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        train: feature matrix.
        y: target values.

    Returns:
        One-row DataFrame indexed by the model's class name with a single
        ``RMSE`` column.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=0.2, random_state=random_state
    )
    model.fit(X_train, y_train)
    holdout_rmse = rmse(y_test, model.predict(X_test))

    scores = {model.__class__.__name__: holdout_rmse}
    score_df = pd.DataFrame(scores, index=['RMSE']).T
    return score_df.sort_values('RMSE', ascending=False)


get_scores(model, train, y)

Unnamed: 0,RMSE
LGBMRegressor,111920.367359


## 하이퍼 파라미터 튜닝 및 RMSLE 변화 관찰

__(test1) max_depth 깊을수록, n_estimators가 많을수록 RMSLE 값이 작았음. max_depth값에 제한을 두지 않고 무한대로 늘려보자(-1)__

In [15]:
# test1: remove the depth limit (max_depth=-1) and compare against depth 10.
param_grid = dict(n_estimators=[50, 100], max_depth=[-1])
model = LGBMRegressor(random_state=random_state)
my_GridSearch_LGBM(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
1,-1,100,-0.027051,0.164472
0,-1,50,-0.029198,0.170875


0.164399 -> 0.164472로 0.000073 상승함. max_depth가 무작정 깊다고 좋은 건 아님

__(test2) max_depth를 그대로(10) 두고 n_estimators를 늘려보자__

In [16]:
# test2: keep max_depth at 10 and try much larger tree counts.
param_grid = dict(n_estimators=[200, 500, 1000], max_depth=[10])
model = LGBMRegressor(random_state=random_state)
my_GridSearch_LGBM(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
0,10,200,-0.026188,0.161828
1,10,500,-0.026245,0.162004
2,10,1000,-0.026739,0.163521


n_estimators가 200일때는 RMSLE 값이 0.002571 하락했으나, n_estimators값이 늘어날수록 에러값이 상승함. max_depth가 10일때 n_estimators의 최적값은 100-500 사이에 있을 것으로 예상할 수 있음.

In [17]:
# test2 (refined): finer sweep of n_estimators between 150 and 450 at depth 10.
param_grid = dict(
    n_estimators=[150, 180, 200, 250, 300, 350, 400, 450],
    max_depth=[10],
)
model = LGBMRegressor(random_state=random_state)
my_GridSearch_LGBM(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
5,10,350,-0.02611,0.161587
6,10,400,-0.026121,0.161619
4,10,300,-0.026127,0.16164
3,10,250,-0.026136,0.161667
7,10,450,-0.026176,0.161791
2,10,200,-0.026188,0.161828
1,10,180,-0.026267,0.162072
0,10,150,-0.026458,0.16266


n_estimators 값이 350 일때 가장 낮은 loss 값을 보임

__(test3) n_estimators값을 350으로 고정하고 max_depth값을 수정해보자__

In [18]:
# test3: fix n_estimators at 350 and sweep max_depth.
param_grid = dict(n_estimators=[350], max_depth=[5, 8, 10, 12])
model = LGBMRegressor(random_state=random_state)
my_GridSearch_LGBM(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
2,10,350,-0.02611,0.161587
3,12,350,-0.026203,0.161874
1,8,350,-0.026323,0.162243
0,5,350,-0.026548,0.162936


하이퍼파라미터 max_depth와 n_estimators의 최적의 조합은 10, 350

__(test4) 다른 hyper parameter추가 해보자__

In [19]:
# test4: add learning_rate, boosting_type, subsample and feature_fraction
# to the grid while keeping n_estimators=350 and max_depth=10.
# NOTE(review): LightGBM documents subsample (bagging_fraction) as a ratio in
# (0, 1.0], so 10.0 is not a valid value; boosting_type='rf' also appears to
# require bagging to be enabled (subsample < 1.0 with subsample_freq > 0),
# which this grid never provides. Those combinations likely fail to fit and
# show up as NaN scores — confirm by rerunning with error_score='raise'.
param_grid ={'n_estimators': [350], 'max_depth': [10], 'learning_rate' : [0.05, 0.1],
             'boosting_type' : ['gbdt', 'rf'], 'subsample' : [1.0, 10.0], 
             'feature_fraction' : [0.7, 0.9]}
model = LGBMRegressor(random_state=random_state)
my_GridSearch_LGBM(model, train, y , param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


Unnamed: 0,boosting_type,feature_fraction,learning_rate,max_depth,n_estimators,subsample,score,RMSLE
0,gbdt,0.7,0.05,10,350,1.0,-0.025758,0.160493
4,gbdt,0.9,0.05,10,350,1.0,-0.025944,0.161073
6,gbdt,0.9,0.1,10,350,1.0,-0.026,0.161245
2,gbdt,0.7,0.1,10,350,1.0,-0.026061,0.161433
1,gbdt,0.7,0.05,10,350,10.0,,
3,gbdt,0.7,0.1,10,350,10.0,,
5,gbdt,0.9,0.05,10,350,10.0,,
7,gbdt,0.9,0.1,10,350,10.0,,
8,rf,0.7,0.05,10,350,1.0,,
9,rf,0.7,0.05,10,350,10.0,,


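"subsample"은 학습 데이터의 일부 행만 발췌(샘플링)해서 다양성을 높이는 옵션인데, 민감한 옵션이라 column sampling(feature_fraction)과 섞어서 쓴다고 함. subsample을 10.0으로 준 경우와 boosting_type으로 rf(랜덤 포레스트)를 쓴 경우 score가 모두 NaN으로 나옴. 처음에는 fitting 중 예측치가 발산하면서 overflow가 일어난 것으로 추측했으나, 그보다는 해당 조합에서 fit 자체가 실패했을 가능성이 높음: (1) LightGBM 문서상 subsample(bagging_fraction)은 0 초과 1.0 이하의 비율이어야 하므로 10.0은 유효하지 않은 값이고, (2) rf 모드는 bagging 설정(subsample < 1.0, subsample_freq > 0)을 요구하는데 여기서는 subsample_freq를 지정하지 않았음. GridSearchCV는 기본 설정(error_score=np.nan)에서 fit이 실패한 조합의 점수를 NaN으로 기록하고, 첫 셀의 warnings.filterwarnings("ignore") 때문에 실패 경고가 출력되지 않아 원인을 바로 알 수 없었던 것으로 보임. (확인 필요: error_score='raise'로 다시 실행하면 실제 오류 메시지를 볼 수 있음)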

__위와 같은 과정을 통해 어느정도 최적화된 하이퍼 파라미터는 아래와 같음__

In [20]:
# Best combination found by the experiments above, re-evaluated on its own.
param_grid = dict(
    n_estimators=[350],
    max_depth=[10],
    learning_rate=[0.05],
    boosting_type=['gbdt'],
    subsample=[1.0],
    feature_fraction=[0.7],
)
model = LGBMRegressor(random_state=random_state)
my_GridSearch_LGBM(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,boosting_type,feature_fraction,learning_rate,max_depth,n_estimators,subsample,score,RMSLE
0,gbdt,0.7,0.05,10,350,1.0,-0.025758,0.160493


__RMSLE : 0.160493__