In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.model_selection import KFold, cross_val_score

In [2]:
# Resolve both CSV locations under ./data, then read them in train -> test order.
train_data_path, test_data_path = (
    join('./data', file_name) for file_name in ('train.csv', 'test.csv')
)

train, test = map(pd.read_csv, (train_data_path, test_data_path))

In [3]:
# Keep only the first six characters of the date string (e.g. 201410) as an
# integer feature, using the vectorised .str accessor instead of apply().
# Casting to str first and dropping `id` with errors='ignore' makes this cell
# safe to re-run on the already-processed frame instead of raising.
train['date'] = train['date'].astype(str).str[:6].astype(int)
train = train.drop(columns='id', errors='ignore')
train.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201410,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,201502,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,201502,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,201406,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,201501,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


In [4]:
# Separate the target from the features: pop() hands back the `price` column
# and removes it from `train` in the same step.
y = train.pop('price')

y.head(), y.shape

(0    221900.0
 1    180000.0
 2    510000.0
 3    257500.0
 4    291850.0
 Name: price, dtype: float64,
 (15035,))

In [5]:
# Same preprocessing as the training frame: keep the first six characters of
# the date string as an integer and drop the `id` column.
# Casting to str first and using errors='ignore' makes this cell safe to
# re-run on the already-processed frame instead of raising.
test['date'] = test['date'].astype(str).str[:6].astype(int)
test = test.drop(columns='id', errors='ignore')
test.head(3)

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201412,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
1,201412,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
2,201405,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930


---

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [7]:
# One fixed seed for the forest; `rdforest` and `model` are two names bound
# to the same estimator object.
random_state = 2020
model = rdforest = RandomForestRegressor(random_state=random_state)

In [8]:
def rmse(y_test, y_pred):
    """Root mean squared error after undoing a log1p transform.

    Both arguments are passed through ``np.expm1`` before the error is
    computed, so the result is only meaningful when ``y_test`` and ``y_pred``
    are on the log1p scale.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

In [9]:
# First coarse grid: two forest sizes crossed with a very shallow and a moderate depth.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[1, 10],
)

In [10]:
from sklearn.model_selection import GridSearchCV

# Stand-alone 5-fold search over `param_grid`, scored by negative MSE.
# It is only constructed here; this cell does not fit it.
grid_model = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=5,
    verbose=2,
)

In [14]:
def my_GridSearch_rdforest(model, train, y, param_grid, verbose=2, n_jobs=5, cv=5):
    """Grid-search ``model`` over ``param_grid`` and rank the candidates.

    Fits a ``GridSearchCV`` scored with ``neg_mean_squared_error`` and returns
    one row per parameter combination.

    Parameters
    ----------
    model : estimator handed to ``GridSearchCV``.
    train, y : features and target passed to ``GridSearchCV.fit``.
    param_grid : dict mapping parameter names to the values to try.
    verbose, n_jobs : forwarded unchanged to ``GridSearchCV``.
    cv : forwarded to ``GridSearchCV``; the default of 5 matches the value
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One column per searched parameter, plus ``score`` (mean cross-validated
        negative MSE) and ``RMSLE`` (``sqrt(-score)``), sorted ascending by
        ``RMSLE`` so the best candidate is the first row.

    Notes
    -----
    The ``RMSLE`` column is a root mean squared *log* error only when ``y`` is
    on a log scale (e.g. ``np.log1p(price)``). With a raw target it is a plain
    RMSE expressed in the target's own units.
    """
    grid_model = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    grid_model.fit(train, y)

    results = grid_model.cv_results_
    hyper = pd.DataFrame(results['params'])
    hyper['score'] = results['mean_test_score']

    # Scores are negated MSE values, so flip the sign before taking the root.
    hyper['RMSLE'] = np.sqrt(-1 * hyper['score'])

    return hyper.sort_values('RMSLE')

In [15]:
# Run the first coarse sweep; the returned ranking table is the cell output.
my_GridSearch_rdforest(
    model=model,
    train=train,
    y=y,
    param_grid=param_grid,
    verbose=2,
    n_jobs=5,
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
2,10,50,-19641400000.0,140147.778612
3,10,100,-19705000000.0,140374.511226
1,1,100,-84789730000.0,291186.754734
0,1,50,-84800560000.0,291205.363331


## 하이퍼 파라미터 튜닝 및 RMSLE 변화 관찰

__(test1) max_depth가 깊을수록, n_estimators가 적을수록 RMSLE 값이 작음. max_depth 값에 제한을 두지 않고 무한대로 늘려보자 (scikit-learn의 RandomForest에서는 -1이 아니라 `max_depth=None`이 깊이 무제한을 의미함)__

In [16]:
# Second sweep: same two forest sizes, wider range of tree depths.
model = rdforest
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5, 10, 50],
)
my_GridSearch_rdforest(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
7,50,100,-18247070000.0,135081.711998
6,50,50,-18462710000.0,135877.542137
4,10,50,-19641400000.0,140147.778612
5,10,100,-19705000000.0,140374.511226
2,5,50,-31362000000.0,177093.200631
3,5,100,-31583700000.0,177718.041942
0,3,50,-48124600000.0,219373.207904
1,3,100,-48437390000.0,220084.95476


In [21]:
# Third sweep: a single, very large depth limit with the same two forest sizes.
model = rdforest
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[500],
)
my_GridSearch_rdforest(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
1,500,100,-18247070000.0,135081.711998
0,500,50,-18462710000.0,135877.542137


In [26]:
# Fourth sweep: vary bootstrap, feature sampling and the leaf/split size
# limits at a fixed depth of 50 with 10 trees per forest
# (2 * 1 * 2 * 3 * 3 * 1 = 36 candidates).
# The 'n_estimators' list was previously closed with '}' instead of ']',
# which is a syntax error.
# NOTE(review): max_features='auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3; on newer versions use 1.0 (all features), which is what
# 'auto' meant for regressors -- confirm against the installed version.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [50],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [10],
}
model = rdforest
my_GridSearch_rdforest(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


Unnamed: 0,bootstrap,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,score,RMSLE
3,True,50,auto,2,2,10,-20305820000.0,142498.49596
4,True,50,auto,2,5,10,-20726200000.0,143965.951116
29,False,50,sqrt,1,10,10,-20800790000.0,144224.803575
31,False,50,sqrt,2,5,10,-20960190000.0,144776.354722
32,False,50,sqrt,2,10,10,-21027910000.0,145010.03323
0,True,50,auto,1,2,10,-21086310000.0,145211.275325
5,True,50,auto,2,10,10,-21355460000.0,146135.074169
27,False,50,sqrt,1,2,10,-21543640000.0,146777.531431
2,True,50,auto,1,10,10,-21601910000.0,146975.883093
1,True,50,auto,1,5,10,-21602110000.0,146976.569185


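randomforest regressor 모델도 max_depth와 n_estimators만 조합했을 때, RMSLE가 가장 낮다. 그런데 왜 이렇게 수치가 높은지는 의문이다. (참고: 위에서 `y`는 로그 변환을 하지 않은 원래 `price` 값이므로, 표의 'RMSLE' 열은 실제로는 가격 단위의 RMSE이다. 타깃을 `np.log1p`로 변환한 뒤 학습해야 RMSLE가 된다.)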

__RMSLE = 142498.495960__