In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score


In [2]:
train_data_path = join('./data', 'train.csv')
test_data_path = join('./data', 'test.csv')

train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

In [3]:
train['date'] = train['date'].apply(lambda i: i[:6]).astype(int)
del train['id']
train.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201410,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,201502,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,201502,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,201406,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,201501,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


In [4]:
y = train['price']
del train['price']

y.head(), y.shape

(0    221900.0
 1    180000.0
 2    510000.0
 3    257500.0
 4    291850.0
 Name: price, dtype: float64,
 (15035,))

In [5]:
test['date'] = test['date'].apply(lambda i: i[:6]).astype(int)
del test['id']
test.head(3)

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201412,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
1,201412,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
2,201405,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930


In [6]:
y= np.log1p(y)

---

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [8]:
random_state=2020
gboost = GradientBoostingRegressor(random_state=random_state)
model = gboost

In [9]:
def rmse(y_test, y_pred):
    return np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred)))

In [11]:
param_grid = {'n_estimators': [50, 100], 'max_depth' : [1, 10],}
model = GradientBoostingRegressor(random_state=random_state)

In [12]:
from sklearn.model_selection import GridSearchCV
grid_model = GridSearchCV(model, param_grid=param_grid,
                         scoring='neg_mean_squared_error',
                         cv=5, verbose=2, n_jobs=5)

In [14]:
def my_GridSearch_gboost(model, train, y, param_grid, verbose=2, n_jobs=5):
    grid_model = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=verbose, n_jobs=n_jobs)
    
    grid_model.fit(train, y)

    params = grid_model.cv_results_['params']
    score = grid_model.cv_results_['mean_test_score']

    hyper = pd.DataFrame(params)
    hyper['score'] = score

    hyper['RMSLE'] = np.sqrt(-1 * hyper['score'])
    hyper = hyper.sort_values('RMSLE')


    return hyper

In [15]:
my_GridSearch_gboost(model, train, y , param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
3,10,100,-0.029481,0.171699
2,10,50,-0.029896,0.172905
1,1,100,-0.055036,0.234597
0,1,50,-0.073253,0.270653


## 하이퍼 파라미터 튜닝 및 RMSLE 변화 관찰

__(test1) max_depth 깊을수록, n_estimators가 많을수록 RMSLE 값이 작음. max_depth값에 제한을 두지 않고 무한대로 늘려보자(-1)__

In [18]:
param_grid ={'n_estimators': [50, 100], 'max_depth': [8, 10, 30, 50],}
model = GradientBoostingRegressor(random_state=random_state)
my_GridSearch_gboost(model, train, y , param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
1,8,100,-0.027912,0.16707
0,8,50,-0.028832,0.169799
3,10,100,-0.029481,0.171699
2,10,50,-0.029896,0.172905
4,30,50,-0.059956,0.24486
6,50,50,-0.06,0.244948
5,30,100,-0.060232,0.245421
7,50,100,-0.06028,0.24552


__(test2) max_depth가 8일때 RMSLE가 가장 낮음. max_depth8로 고정하고 n_estimators 값 올려보기__

In [19]:
param_grid ={'n_estimators': [30, 50, 100, 200], 'max_depth': [8],}
model = GradientBoostingRegressor(random_state=random_state)
my_GridSearch_gboost(model, train, y , param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
3,8,200,-0.027682,0.166379
2,8,100,-0.027912,0.16707
1,8,50,-0.028832,0.169799
0,8,30,-0.031323,0.176984


200보다 더 높여보자

In [20]:
param_grid ={'n_estimators': [200, 300, 400, 500], 'max_depth': [8],}
model = GradientBoostingRegressor(random_state=random_state)
my_GridSearch_gboost(model, train, y , param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
0,8,200,-0.027682,0.166379
1,8,300,-0.027816,0.166781
2,8,400,-0.027913,0.167071
3,8,500,-0.027961,0.167214


n_estimators 200으로 고정

__(test3) max_depth 8, n_estimators 200으로 고정 후 learning_rate 파라미터 추가하기__

In [24]:
param_grid ={'n_estimators': [200], 'max_depth': [8], 
            'learning_rate' : [0.0001, 0.001, 0.01, 0.1],}
model = GradientBoostingRegressor(random_state=random_state)
my_GridSearch_gboost(model, train, y , param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,learning_rate,max_depth,n_estimators,score,RMSLE
3,0.1,8,200,-0.027682,0.166379
2,0.01,8,200,-0.03796,0.194834
1,0.001,8,200,-0.201728,0.449142
0,0.0001,8,200,-0.269463,0.519099


learning_rate 0.1로 고정

__(test4) max_depth 8, n_estimators 200, learning_rate 0.1로 고정 후  subsample 파라미터 추가__

In [25]:
param_grid ={'n_estimators': [200], 'max_depth': [8], 
            'learning_rate' : [0.1], 'subsample' : [0, 0.5, 1.0]}
model = GradientBoostingRegressor(random_state=random_state)
my_GridSearch_gboost(model, train, y , param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


Unnamed: 0,learning_rate,max_depth,n_estimators,subsample,score,RMSLE
2,0.1,8,200,1.0,-0.027682,0.166379
1,0.1,8,200,0.5,-0.028292,0.168202
0,0.1,8,200,0.0,,


subsample = 1.0으로 고정

__위와 같은 과정을 통해 어느정도 최적화된 하이퍼 파라미터는 아래와 같음__

In [26]:
param_grid ={'n_estimators': [200], 'max_depth': [8], 
            'learning_rate' : [0.1], 'subsample' : [1.0]}
model = GradientBoostingRegressor(random_state=random_state)
my_GridSearch_gboost(model, train, y , param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,learning_rate,max_depth,n_estimators,subsample,score,RMSLE
0,0.1,8,200,1.0,-0.027682,0.166379


__RMSLE : 0.166379__