In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb

In [2]:
# Build the paths of the two CSV files under ./data, then read each one
# into its own DataFrame.
train_data_path, test_data_path = (
    join('./data', file_name) for file_name in ('train.csv', 'test.csv')
)

train, test = pd.read_csv(train_data_path), pd.read_csv(test_data_path)

In [3]:
# Reduce `date` to its first six characters (e.g. 201410) stored as an int,
# and drop the `id` column.
# The cell is written so that it can be re-run safely: `.astype(str)` lets the
# slice work even when `date` has already been converted to int, and
# `errors='ignore'` tolerates `id` having been removed by an earlier run.
train = (
    train
    .assign(date=lambda frame: frame['date'].astype(str).str[:6].astype(int))
    .drop(columns=['id'], errors='ignore')
)
train.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201410,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,201502,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,201502,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,201406,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,201501,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


In [4]:
# Move the target column out of the feature table: `pop` returns the
# `price` Series and removes it from `train` in one step.
y = train.pop('price')

y.head(), y.shape

(0    221900.0
 1    180000.0
 2    510000.0
 3    257500.0
 4    291850.0
 Name: price, dtype: float64,
 (15035,))

In [5]:
# Apply the same preparation as for the training frame: keep the first six
# characters of `date` as an int and drop the `id` column.
# `.astype(str)` and `errors='ignore'` make the cell safe to re-run after
# `date` is already an int and `id` is already gone.
test = (
    test
    .assign(date=lambda frame: frame['date'].astype(str).str[:6].astype(int))
    .drop(columns=['id'], errors='ignore')
)
test.head(3)

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201412,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
1,201412,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
2,201405,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930


In [6]:
# Model the target on the log1p scale; `rmse` later inverts this with expm1.
# NOTE: this rebinding is not idempotent - running the cell twice applies
# log1p twice, so re-run the target-extraction cell first.
y = np.log1p(y)

___

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [8]:
# One seed used both for the regressor and for the hold-out split in
# `get_scores`.
random_state = 2020

# `xgboost` and `model` are two names bound to the same regressor instance.
model = xgboost = xgb.XGBRegressor(random_state=random_state)

In [9]:
def rmse(y_test, y_pred):
    """Return the root mean squared error after undoing a log1p transform.

    Both arguments are passed through ``np.expm1`` before the error is
    measured, so they are expected on the log1p scale.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

In [10]:
# Initial coarse grid: two ensemble sizes crossed with two tree depths.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [1, 10],
}

In [11]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over `param_grid`, scored by negative MSE.
# The search object is only constructed here; it is not fitted in this cell.
grid_model = GridSearchCV(
    model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=5,
)

In [12]:
def my_GridSearch_XGBoost(model, train, y, param_grid, verbose=2, n_jobs=5):
    """Run a 5-fold grid search and tabulate the score of every candidate.

    Parameters
    ----------
    model : estimator handed to ``GridSearchCV``.
    train : feature table used for fitting.
    y : target values used for fitting.
    param_grid : dict mapping parameter names to the values to try.
    verbose, n_jobs : forwarded unchanged to ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination with the parameter values, the
        mean negative-MSE ``score`` and ``RMSLE`` = sqrt(-score), sorted by
        ascending ``RMSLE``. The column is a true RMSLE only when ``y`` is
        already on the log1p scale.
    """
    search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    search.fit(train, y)

    cv_results = search.cv_results_
    results = pd.DataFrame(cv_results['params'])
    results['score'] = cv_results['mean_test_score']
    # The scorer reports negated MSE, so flip the sign before the square root.
    results['RMSLE'] = np.sqrt(-1 * results['score'])

    return results.sort_values('RMSLE')

In [13]:
# Evaluate the initial coarse grid and display the ranked results.
my_GridSearch_XGBoost(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
2,10,50,-0.030355,0.174227
3,10,100,-0.0306,0.17493
1,1,100,-0.043012,0.207394
0,1,50,-0.048726,0.220739


In [14]:
def get_scores(model, train, y, test_size=0.2, seed=None):
    """Fit ``model`` on a random training split and report hold-out RMSE.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on the training portion of the split.
    train : feature table passed to ``train_test_split``.
    y : target aligned with ``train``. It is scored through ``rmse``, which
        applies ``expm1`` to both truth and prediction before measuring.
    test_size : default 0.2
        Forwarded to ``train_test_split`` as the hold-out size.
    seed : int or None, default None
        Seed for the split. ``None`` falls back to the notebook-level
        ``random_state`` so three-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by the model's class name with one ``'RMSE'``
        column.
    """
    if seed is None:
        # Preserve the original behaviour of reading the notebook-wide seed.
        seed = random_state

    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=test_size, random_state=seed
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    model_name = model.__class__.__name__
    scores = {model_name: rmse(y_test, y_pred)}

    return pd.DataFrame(scores, index=['RMSE']).T.sort_values('RMSE', ascending=False)


get_scores(model, train, y)

Unnamed: 0,RMSE
XGBRegressor,112334.521178


## 하이퍼 파라미터 튜닝 및 RMSLE 변화 관찰

__(test1) max_depth가 깊을수록 RMSLE 값이 작았음. n_estimators는 max_depth=10에서는 적을수록, max_depth=1에서는 많을수록 RMSLE 값이 작았음. max_depth 값에 제한을 두지 않고 무한대로 늘려보고자 했으나(-1), xgboost는 -1을 입력하면 error가 발생함__

In [15]:
# Test 1: same ensemble sizes, but compare depths 5, 10 and 50.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, 50],
}
model = xgboost
my_GridSearch_XGBoost(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
1,5,100,-0.028401,0.168527
0,5,50,-0.029647,0.172184
2,10,50,-0.030355,0.174227
3,10,100,-0.0306,0.17493
5,50,100,-0.033143,0.182053
4,50,50,-0.033143,0.182053


max_depth가 5 일때 RMSLE값이 가장 낮음. 과소적합이 일어난건 아닐까 의심되지만 일단 넘어감

__(test2) max_depth 5 고정, n_estimators 바꿔보자__

In [16]:
# Test 2: hold max_depth at 5 and vary only the number of trees.
param_grid = {
    'n_estimators': [80, 100, 150, 300],
    'max_depth': [5],
}
model = xgboost
my_GridSearch_XGBoost(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
2,5,150,-0.028249,0.168075
1,5,100,-0.028401,0.168527
3,5,300,-0.028595,0.1691
0,5,80,-0.028706,0.169427


max_depth가 5, n_estimators 150이 최적값인 것으로 보임.

__(test3) 다른 하이퍼 파라미터 추가해보자__

1. gamma [default=0] : 노드 분할에 필요한 최소 loss 감소를 지정함. 값이 클수록 알고리즘을 보수적으로 만듦
2. colsample_bytree [default=1] : 각 tree를 생성할 때 subsampling 할 columns의 비율로 0~1 사이의 값을 가짐. 트리를 생성할 때마다 subsampling이 발생함.
3. min_child_weight [default=1] : overfitting을 방지하기 위한 파라미터로 최소 가중치 합을 정의함. 값이 높을수록 알고리즘이 보수적이 되어 과소적합이 발생할 수 있음. 범위는 0~양의 무한대

In [17]:
# Test 3: keep n_estimators=150, max_depth=5 and gamma=1.9 fixed while
# sweeping colsample_bytree and min_child_weight.
param_grid = {
    'n_estimators': [150],
    'max_depth': [5],
    'gamma': [1.9],
    'colsample_bytree': [0.2, 0.5, 1],
    'min_child_weight': [0, 1, 10],
}
model = xgboost
my_GridSearch_XGBoost(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


Unnamed: 0,colsample_bytree,gamma,max_depth,min_child_weight,n_estimators,score,RMSLE
3,0.5,1.9,5,0,150,-0.034288,0.185171
4,0.5,1.9,5,1,150,-0.034288,0.185171
5,0.5,1.9,5,10,150,-0.035079,0.187295
2,0.2,1.9,5,10,150,-0.035648,0.188807
0,0.2,1.9,5,0,150,-0.035701,0.188947
1,0.2,1.9,5,1,150,-0.035701,0.188947
8,1.0,1.9,5,10,150,-0.037046,0.192473
6,1.0,1.9,5,0,150,-0.037604,0.193917
7,1.0,1.9,5,1,150,-0.037604,0.193917


하이퍼파라미터를 더 추가했더니(gamma=1.9 고정) 오히려 RMSLE 값이 높아짐. 그중 가장 낮은 RMSLE 값을 갖는 파라미터들로 고정하고 gamma 값을 바꿔봄

In [18]:
# Keep the other settings fixed (colsample_bytree=0.5, min_child_weight=0)
# and compare gamma values 0, 1.9 and 3.
param_grid = {
    'n_estimators': [150],
    'max_depth': [5],
    'gamma': [0, 1.9, 3],
    'colsample_bytree': [0.5],
    'min_child_weight': [0],
}
model = xgboost
my_GridSearch_XGBoost(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


Unnamed: 0,colsample_bytree,gamma,max_depth,min_child_weight,n_estimators,score,RMSLE
0,0.5,0.0,5,0,150,-0.02829,0.168196
1,0.5,1.9,5,0,150,-0.034288,0.185171
2,0.5,3.0,5,0,150,-0.03735,0.193262


XGBoost 모델은 max_depth와 n_estimators를 조정하는 것이 가장 효과적인 것으로 보임.

__즉, 위와 같은 과정을 통해 어느정도 최적화된 하이퍼 파라미터는 아래와 같음__

In [19]:
# Final check: score the single selected combination.
param_grid = {
    'n_estimators': [150],
    'max_depth': [5],
}
model = xgboost
my_GridSearch_XGBoost(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
0,5,150,-0.028249,0.168075


__RMSLE : 0.168075__