In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold ,StratifiedKFold

In [2]:
etr = ExtraTreesRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

In [3]:
df = pd.read_csv('./data/df_ada_opt.csv', index_col=0)
df['기준_분기_코드'] = df['기준_분기_코드'].apply(lambda x : str(x)+'분기')
df = pd.get_dummies(df)
y = df['확진자수']
x = df.drop('확진자수', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=42)

In [4]:
scaler = StandardScaler()
scaler.fit(x_train)

x_train_stan = scaler.transform(x_train)
x_test_stan = scaler.transform(x_test)

### ExtraTreesRegressor

In [5]:
# 0.18 -> 0.31 상승
etr = ExtraTreesRegressor(random_state=42)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

1.0
0.3146167889645214


In [9]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=700,min_samples_split=3,max_depth=15)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.9576126859988109
0.3023589750426976


In [10]:
rfr = ExtraTreesRegressor(random_state=42,n_estimators=700,min_samples_split=10,max_depth=15)
rfr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.9576126859988109
0.3023589750426976


In [10]:
rfr = ExtraTreesRegressor(random_state=42,n_estimators=700,min_samples_split=10,max_depth=15)
rfr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.9576126859988109
0.3023589750426976


In [16]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=30, max_depth=30)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.9999997912565589
0.3112289928915565


In [18]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=100, max_depth=30)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.9999964008539024
0.3263571577800346


In [43]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=94, max_depth=30)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.9999959446905871
0.32962969569547274


In [56]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=94, max_depth=29,min_samples_split=2)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.9999943574028314
0.3311159705686815


In [11]:
cv = KFold(5, shuffle=True, random_state=42)
params = { 
    'n_estimators': [20,40,60,80,100],
    'max_depth' : [None, 4,8,10,20,30],
    'min_samples_split': [2,4,6]}

In [12]:
grid_e=GridSearchCV(ExtraTreesRegressor(random_state=42),param_grid=params,cv=cv, refit=True, verbose=True, n_jobs=-1) 

In [13]:
grid_e.fit(x_train_stan,y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=ExtraTreesRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 4, 8, 10, 20, 30],
                         'min_samples_split': [2, 4, 6],
                         'n_estimators': [20, 40, 60, 80, 100]},
             verbose=True)

In [14]:
grid_e.best_estimator_ 

ExtraTreesRegressor(max_depth=4, n_estimators=40, random_state=42)

In [15]:
grid_e.best_score_ 

-0.01179162399662812

In [16]:
grid_e.score(x_train_stan,y_train)

0.5369838113004057

In [17]:
# 0.23 -> 0.28 상승
grid_e.score(x_test_stan,y_test)

0.28250079431073005

In [18]:
cv = KFold(5, shuffle=True, random_state=42)
params = { 
    'n_estimators': [350,500,600,700,750],
    'max_depth' : [10,12,15,17,19],
    'min_samples_split': [2,4,6,8]}
grid_e=GridSearchCV(ExtraTreesRegressor(random_state=42),param_grid=params,cv=cv, refit=True, verbose=True, n_jobs=-1) 

In [19]:
grid_e.fit(x_train_stan,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=ExtraTreesRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [10, 12, 15, 17, 19],
                         'min_samples_split': [2, 4, 6, 8],
                         'n_estimators': [350, 500, 600, 700, 750]},
             verbose=True)

In [20]:
grid_e.best_estimator_ 

ExtraTreesRegressor(max_depth=10, min_samples_split=8, n_estimators=500,
                    random_state=42)

In [21]:
grid_e.best_score_ 

-0.06524618089146612

In [22]:
grid_e.score(x_train_stan,y_train)

0.7821845092199221

In [23]:
grid_e.score(x_test_stan,y_test)

0.269554609335506

In [218]:
etc_model = ExtraTreesRegressor(max_depth=15, min_samples_split=6, n_estimators=700,
                    random_state=42)
etc_model.fit(x_train_stan, y_train)

print(etc_model.feature_importances_)
feature_list = pd.concat([pd.Series(x.columns), pd.Series(etc_model.feature_importances_)], axis=1)
feature_list.columns = ['features_name', 'importance']
feature_list.sort_values("importance", ascending =False)[:15]

[1.72463698e-02 3.93863840e-03 6.95663865e-03 2.59117871e-03
 3.43437636e-03 3.05555607e-03 1.92734383e-03 1.29481585e-02
 9.56440574e-03 1.28577175e-02 1.08782798e-02 1.38983322e-02
 1.91172152e-03 5.31600858e-03 1.24650884e-02 1.54995031e-02
 1.12019736e-02 1.28884342e-02 4.94439122e-03 1.34244373e-03
 1.62434092e-03 2.70136225e-03 2.15510652e-03 3.92434460e-03
 6.21756925e-03 1.83610152e-03 3.30579547e-02 2.77510466e-03
 2.46898908e-02 1.48232672e-03 1.01870292e-03 1.74015653e-03
 1.82826389e-03 2.35369177e-03 1.42918953e-03 1.96663053e-03
 2.16919768e-03 1.24710693e-03 1.85350710e-03 1.80255389e-03
 1.81000426e-03 1.69249621e-03 1.62259876e-03 4.10993460e-03
 1.90134058e-03 1.32558171e-03 1.76922322e-03 2.53248569e-03
 2.79600801e-03 2.38736350e-03 1.82959212e-03 2.71807124e-03
 4.42362026e-03 2.92994846e-03 3.57531163e-03 2.36710275e-03
 2.16829594e-03 3.19790798e-03 2.63005600e-03 1.33479989e-03
 1.76783046e-03 3.02543262e-03 1.13180565e-02 1.86562278e-03
 1.79020380e-02 1.470080

Unnamed: 0,features_name,importance
26,남성연령대_30_토요일시간대_5_생활인구_수,0.033058
109,아파트_가격_4_억_세대_수,0.024877
28,남성연령대_30_일요일시간대_4_생활인구_수,0.02469
67,여성연령대_20_토요일시간대_3_생활인구_수,0.023798
68,여성연령대_20_토요일시간대_4_생활인구_수,0.022645
190,버스정류장 수,0.018196
64,여성연령대_20_수요일시간대_5_생활인구_수,0.017902
105,아파트_면적_132_제곱미터_세대_수,0.017258
0,시간대_5_생활인구_수,0.017246
15,남성연령대_20_토요일시간대_5_생활인구_수,0.0155


### GradientBoostingRegressor

In [92]:
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.9407835182101775
0.09202296663912457


In [86]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.02)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.6077913395342838
0.2146240672775045


In [109]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.12,subsample=0.8, n_estimators=45)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.8168630147509401
0.20170792759496947


In [71]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.02,subsample=0.9, n_estimators=100)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.6279217424103798
0.23255988282127005


In [96]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.1,subsample=0.7,n_estimators=60)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))


0.8263390199841927
0.21999243435155968


In [213]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.1,subsample=0.69 ,n_estimators=23)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.6389027613881709
0.30079069800417646


In [97]:
cv = KFold(5, shuffle=True, random_state=42)
params = { 
    'learning_rate': np.arange(0.01,0.25,0.01),
    'subsample' : np.arange(0.1,1,0.1),
    'n_estimators': np.arange(5,100,5)}
grid_b=GridSearchCV(GradientBoostingRegressor(random_state=42),param_grid=params,cv=cv, refit=True, verbose=True, n_jobs=-1) 

In [98]:
grid_b.fit(x_train_stan,y_train)

Fitting 5 folds for each of 4104 candidates, totalling 20520 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=GradientBoostingRegressor(random_state=42), n_jobs=-1,
             param_grid={'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24]),
                         'n_estimators': array([ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85,
       90, 95]),
                         'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
             verbose=True)

In [99]:
grid_b.best_estimator_ 

GradientBoostingRegressor(learning_rate=0.08, n_estimators=10, random_state=42,
                          subsample=0.8)

In [110]:
grid_b.best_score_ 

0.10215465207928398

In [111]:
grid_b.score(x_train_stan,y_train)

0.38350688839991975

In [112]:
grid_b.score(x_test_stan,y_test)

0.1882366663909122

In [216]:
gbr_model = GradientBoostingRegressor(random_state=42,learning_rate=0.1,subsample=0.69,n_estimators=23)
gbr_model.fit(x_train_stan, y_train)

print(gbr_model.feature_importances_)
feature_list = pd.concat([pd.Series(x.columns), pd.Series(gbr_model.feature_importances_)], axis=1)
feature_list.columns = ['features_name', 'importance']
feature_list.sort_values("importance", ascending =False)[:15]

[6.65725706e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.10074016e-02 7.01406582e-03
 8.14691232e-03 0.00000000e+00 7.37017661e-03 2.58900517e-02
 0.00000000e+00 7.96097974e-03 0.00000000e+00 3.90605401e-02
 0.00000000e+00 1.88747546e-02 3.21705890e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 2.91119429e-04
 1.85077073e-03 3.95841543e-04 8.50167893e-05 0.00000000e+00
 7.69034880e-02 4.26840162e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.13454661e-03
 1.59864323e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
 9.61234209e-04 6.15175719e-03 5.70081337e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.02864885e-03 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.48372252e-02 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.000000

Unnamed: 0,features_name,importance
28,남성연령대_30_일요일시간대_4_생활인구_수,0.076903
0,시간대_5_생활인구_수,0.066573
109,아파트_가격_4_억_세대_수,0.061304
15,남성연령대_20_토요일시간대_5_생활인구_수,0.039061
107,아파트_가격_1_억_세대_수,0.032565
137,목요일_매출_비율,0.032426
170,시간대_건수~24_매출_건수,0.026818
11,남성연령대_20_금요일시간대_2_생활인구_수,0.02589
148,연령대_20_매출_비율,0.025437
175,연령대_30_매출_건수,0.024126


### 최종결론

### Extratree model에 한해서는  df_ada_opt.csv를 적용했을 때 프로젝트 기간 동안의 총 score 중 가장 높았음

### 모든 경우의 수와 모델을 조합한 결과 획득한 best_score와 model은raw_df를 사용한 raw_ML_model_RandomForest에서의 RandomForest_test_param_tuning의 score