### rfr_Extra & Gradient

- df_rfr_opt사용하여 ExtraTree/Gradient model optimizer 시행

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold ,StratifiedKFold

In [3]:
etr = ExtraTreesRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

In [4]:
df = pd.read_csv('./data/df_rfr_opt.csv', index_col=0)
df['기준_분기_코드'] = df['기준_분기_코드'].apply(lambda x : str(x)+'분기')
df = pd.get_dummies(df)
y = df['확진자수']
x = df.drop('확진자수', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=42)

In [5]:
scaler = StandardScaler()
scaler.fit(x_train)

x_train_stan = scaler.transform(x_train)
x_test_stan = scaler.transform(x_test)

### ExtraTreesRegressor

In [10]:
# 0.18 -> 0.28
etr = ExtraTreesRegressor(random_state=42)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

1.0
0.28084014928741996


In [13]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=700)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

1.0
0.2909200556925191


In [22]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=30)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

1.0
0.31179285693422953


In [33]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=30,min_samples_split=10)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.8553767984320616
0.25127530983422075


In [43]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=30,min_samples_split=2,max_depth=9)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.8242329701343399
0.29387986191055204


In [56]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=50,min_samples_split=2,max_depth=9)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.8187238603078689
0.2821854575487346


In [9]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=700,min_samples_split=3,max_depth=15)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.8963654320472862
0.307740488140682


In [58]:
rfr = ExtraTreesRegressor(random_state=42,n_estimators=700,min_samples_split=5,max_depth=15)
rfr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.8187238603078689
0.2821854575487346


In [None]:
rfr = ExtraTreesRegressor(random_state=42,n_estimators=700,min_samples_split=6,max_depth=15)
rfr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

In [11]:
cv = KFold(5, shuffle=True, random_state=42)
params = { 
    'n_estimators': [20,40,60,80,100],
    'max_depth' : [None, 4,8,10,20,30],
    'min_samples_split': [2,4,6]}

In [12]:
grid_e=GridSearchCV(ExtraTreesRegressor(random_state=42),param_grid=params,cv=cv, refit=True, verbose=True, n_jobs=-1) 

In [13]:
grid_e.fit(x_train_stan,y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=ExtraTreesRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 4, 8, 10, 20, 30],
                         'min_samples_split': [2, 4, 6],
                         'n_estimators': [20, 40, 60, 80, 100]},
             verbose=True)

In [14]:
grid_e.best_estimator_ 

ExtraTreesRegressor(max_depth=4, n_estimators=20, random_state=42)

In [15]:
grid_e.best_score_ 

-0.01320083464230688

In [16]:
grid_e.score(x_train_stan,y_train)

0.5160894289816809

In [17]:
# 0.23 -> 0.28 상승
grid_e.score(x_test_stan,y_test)

0.2826602654096114

In [18]:
cv = KFold(5, shuffle=True, random_state=42)
params = { 
    'n_estimators': [350,500,600,700,750],
    'max_depth' : [10,12,15,17,19],
    'min_samples_split': [2,4,6,8]}
grid_e=GridSearchCV(ExtraTreesRegressor(random_state=42),param_grid=params,cv=cv, refit=True, verbose=True, n_jobs=-1) 

In [19]:
grid_e.fit(x_train_stan,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=ExtraTreesRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [10, 12, 15, 17, 19],
                         'min_samples_split': [2, 4, 6, 8],
                         'n_estimators': [350, 500, 600, 700, 750]},
             verbose=True)

In [20]:
grid_e.best_estimator_ 

ExtraTreesRegressor(max_depth=10, min_samples_split=8, n_estimators=500,
                    random_state=42)

In [21]:
grid_e.best_score_ 

-0.06120611049558544

In [22]:
grid_e.score(x_train_stan,y_train)

0.7577326053001262

In [23]:
# 0.29 -> 0.25 하락
grid_e.score(x_test_stan,y_test)

0.25333264458847826

### feature_importances_

In [61]:
etc_model = ExtraTreesRegressor(random_state=42,n_estimators=30)
etc_model.fit(x_train_stan, y_train)

feature_list = pd.concat([pd.Series(x.columns), pd.Series(etc_model.feature_importances_)], axis=1)
feature_list.columns = ['features_name', 'importance']
feature_list.sort_values("importance", ascending =False)[:30]

Unnamed: 0,features_name,importance
774,matching_상권_코드_명_용산구청,0.055377
397,여성연령대_30_일요일시간대_4_생활인구_수,0.0339
687,matching_상권_코드_명_공항대로59다길,0.027786
145,남성연령대_30_일요일시간대_4_생활인구_수,0.020567
13,시간대_5_생활인구_수,0.018844
92,남성연령대_20_금요일시간대_5_생활인구_수,0.018439
535,아파트_가격_3_억_세대_수,0.018209
71,남성연령대_20_화요일시간대_2_생활인구_수,0.014721
139,남성연령대_30_토요일시간대_4_생활인구_수,0.012828
348,여성연령대_20_토요일시간대_3_생활인구_수,0.012575


## GradientBoostingRegressor

### test_param_tuning

In [36]:
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.9510361154531684
0.13882241575736753


In [37]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.1,subsample=0.7)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.9120958547713851
0.22629728668970372


In [65]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.1,subsample=0.7,n_estimators=46)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.7862355373554485
0.2540660872505872


In [38]:
# gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.1,subsample=0.3)
# gbr.fit(x_train_stan,y_train)
# print(gbr.score(x_train_stan, y_train))
# print(gbr.score(x_test_stan, y_test))

In [39]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.25,subsample=0.7)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.9920659858389741
0.12040839317818053


In [40]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.25,subsample=0.7,n_estimators=30)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))


0.8524013978520993
0.13327027990256946


In [41]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.25,subsample=0.7,n_estimators=21)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.7828310234729493
0.16097554243712286


### Gridsearch

In [42]:
cv = KFold(5, shuffle=True, random_state=42)
params = { 
    'learning_rate': np.arange(0.01,0.25,0.01),
    'subsample' : np.arange(0.1,1,0.1),
    'n_estimators': np.arange(5,100,5)}
grid_b=GridSearchCV(GradientBoostingRegressor(random_state=42),param_grid=params,cv=cv, refit=True, verbose=True, n_jobs=-1) 

In [43]:
grid_b.fit(x_train_stan,y_train)

Fitting 5 folds for each of 4104 candidates, totalling 20520 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=GradientBoostingRegressor(random_state=42), n_jobs=-1,
             param_grid={'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24]),
                         'n_estimators': array([ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85,
       90, 95]),
                         'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
             verbose=True)

In [44]:
grid_b.best_estimator_ 

GradientBoostingRegressor(learning_rate=0.03, n_estimators=35, random_state=42,
                          subsample=0.1)

In [45]:
grid_b.best_score_ 

0.1285349081576704

In [46]:
grid_b.score(x_train_stan,y_train)

0.2793835647750842

In [47]:
grid_b.score(x_test_stan,y_test)

0.20920553807531916

### 결과적으로 ExtraTreesRegressor이  raw_df 사용 시 보다 조금 개선.
### overfitted 해소시 더 높은 score 기대