### raw_Extra & Gradient

- raw_DataFrame을 사용 ExtraTree/GradientBoost model로 최적화 작업 진행


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold ,StratifiedKFold

In [3]:
etr = ExtraTreesRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

In [4]:
df = pd.read_csv('./data/final_result.csv', index_col=0)
df['기준_분기_코드'] = df['기준_분기_코드'].apply(lambda x : str(x)+'분기')
df = pd.get_dummies(df)
y = df['확진자수']
x = df.drop('확진자수', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=42)

In [5]:
scaler = StandardScaler()
scaler.fit(x_train)

x_train_stan = scaler.transform(x_train)
x_test_stan = scaler.transform(x_test)

### ExtraTreesRegressor

### test_param_tuning

In [None]:
etr = ExtraTreesRegressor(random_state=42)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

1.0
0.18584435721392267


In [None]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=50,min_samples_split=2,max_depth=10)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.7838385430549585
0.2868247148822042


In [None]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=200,min_samples_split=2,max_depth=10)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.7815732084622651
0.2878807078108092


In [None]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=700,min_samples_split=3,max_depth=15)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.8191963681485704
0.29119701288096567


In [None]:
rfr = ExtraTreesRegressor(random_state=42,n_estimators=700,min_samples_split=6,max_depth=15)
rfr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.8191963681485704
0.29119701288096567


In [None]:
etr = ExtraTreesRegressor(random_state=42,n_estimators=50,min_samples_split=3,max_depth=13)
etr.fit(x_train_stan,y_train)
print(etr.score(x_train_stan, y_train))
print(etr.score(x_test_stan, y_test))

0.8007457548056383
0.29446730961649437


### Gridsearch  -  KFold = 5고정, param_tuning

In [None]:
cv = KFold(5, shuffle=True, random_state=42)
params = { 
    'n_estimators': [20,40,60,80,100],
    'max_depth' : [None, 4,8,10,20,30],
    'min_samples_split': [2,4,6]}

In [None]:
grid_e=GridSearchCV(ExtraTreesRegressor(random_state=42),param_grid=params,cv=cv, refit=True, verbose=True, n_jobs=-1) 

In [None]:
grid_e.fit(x_train_stan,y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=ExtraTreesRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 4, 8, 10, 20, 30],
                         'min_samples_split': [2, 4, 6],
                         'n_estimators': [20, 40, 60, 80, 100]},
             verbose=True)

In [None]:
grid_e.best_estimator_ 

ExtraTreesRegressor(max_depth=4, min_samples_split=6, n_estimators=60,
                    random_state=42)

In [None]:
grid_e.best_score_ 

-0.05207003120580571

In [None]:
grid_e.score(x_train_stan,y_train)

0.5277203342002765

In [None]:
grid_e.score(x_test_stan,y_test)

0.23575952528285316

In [None]:
cv = KFold(5, shuffle=True, random_state=42)
params = { 
    'n_estimators': [350,500,600,700,750],
    'max_depth' : [10,12,15,17,19],
    'min_samples_split': [2,4,6,8]}
grid_e=GridSearchCV(ExtraTreesRegressor(random_state=42),param_grid=params,cv=cv, refit=True, verbose=True, n_jobs=-1) 

In [None]:
grid_e.fit(x_train_stan,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=ExtraTreesRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [10, 12, 15, 17, 19],
                         'min_samples_split': [2, 4, 6, 8],
                         'n_estimators': [350, 500, 600, 700, 750]},
             verbose=True)

In [None]:
grid_e.best_estimator_ 

ExtraTreesRegressor(max_depth=12, min_samples_split=8, n_estimators=750,
                    random_state=42)

In [None]:
grid_e.best_score_ 

-0.0751257064132372

In [None]:
grid_e.score(x_train_stan,y_train)

0.7486015389474503

In [None]:
grid_e.score(x_test_stan,y_test)

0.2929364437080988

### feature_importances_

In [None]:
etc_model = ExtraTreesRegressor(random_state=42,n_estimators=50,min_samples_split=3,max_depth=13)
etc_model.fit(x_train_stan, y_train)

feature_list = pd.concat([pd.Series(x.columns), pd.Series(etc_model.feature_importances_)], axis=1)
feature_list.columns = ['features_name', 'importance']
feature_list.sort_values("importance", ascending =False)[:15]

Unnamed: 0,features_name,importance
904,matching_상권_코드_명_용산구청,0.073374
392,여성연령대_30_토요일시간대_5_생활인구_수,0.056385
760,matching_상권_코드_명_디지털미디어시티,0.037634
397,여성연령대_30_일요일시간대_4_생활인구_수,0.036952
706,matching_상권_코드_명_공항대로59다길,0.036689
715,matching_상권_코드_명_구로중앙로28길,0.035362
834,matching_상권_코드_명_서울 용산구 이태원역,0.032733
824,matching_상권_코드_명_서울 마포구 홍대입구역_3,0.031109
145,남성연령대_30_일요일시간대_4_생활인구_수,0.024863
941,matching_상권_코드_명_학동로56길,0.024682


## GradientBoostingRegressor

In [None]:
# loss='ls',
#     learning_rate=0.1,
#     n_estimators=100,
#     subsample=1.0,
#     criterion='friedman_mse',
#     min_samples_split=2,
#     min_samples_leaf=1,
#     min_weight_fraction_leaf=0.0,
#     max_depth=3,
#     min_impurity_decrease=0.0,
#     min_impurity_split=None,
#     init=None,
#     random_state=None,
#     max_features=None,
#     alpha=0.9,
#     verbose=0,
#     max_leaf_nodes=None,
#     warm_start=False,
#     validation_fraction=0.1,
#     n_iter_no_change=None,
#     tol=0.0001,
#     ccp_alpha=0.0,


### test_param_tuning

In [None]:
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.9098341034617643
0.28883890601728757


In [None]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.1,subsample=0.7)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.8951545090946867
0.31409749010863364


In [None]:
# gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.1,subsample=0.3)
# gbr.fit(x_train_stan,y_train)
# print(gbr.score(x_train_stan, y_train))
# print(gbr.score(x_test_stan, y_test))

0.8226970993149983
0.29630575601562725


In [None]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.1,subsample=0.7)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.991044737382711
0.3663167572311079


In [None]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.25,subsample=0.7,n_estimators=30)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))


0.8560720396376961
0.39521563505662627


In [None]:
gbr = GradientBoostingRegressor(random_state=42,learning_rate=0.25,subsample=0.7,n_estimators=21)
gbr.fit(x_train_stan,y_train)
print(gbr.score(x_train_stan, y_train))
print(gbr.score(x_test_stan, y_test))

0.7923572256293517
0.4142803699078096


### Gridsearch  -  KFold = 5고정, param_tuning

In [None]:
cv = KFold(5, shuffle=True, random_state=42)
params = { 
    'learning_rate': np.arange(0.01,0.25,0.01),
    'subsample' : np.arange(0.1,1,0.1),
    'n_estimators': np.arange(5,100,5)}
grid_b=GridSearchCV(GradientBoostingRegressor(random_state=42),param_grid=params,cv=cv, refit=True, verbose=True, n_jobs=-1) 

In [None]:
grid_b.fit(x_train_stan,y_train)

Fitting 5 folds for each of 4104 candidates, totalling 20520 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=GradientBoostingRegressor(random_state=42), n_jobs=-1,
             param_grid={'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24]),
                         'n_estimators': array([ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85,
       90, 95]),
                         'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
             verbose=True)

In [None]:
grid_b.best_estimator_ 

GradientBoostingRegressor(learning_rate=0.01, n_estimators=90, random_state=42,
                          subsample=0.4)

In [None]:
grid_b.best_score_ 

0.11084907030224903

In [None]:
grid_b.score(x_train_stan,y_train)

0.3814796340104245

In [None]:
grid_b.score(x_test_stan,y_test)

0.2345830216304131

### feature_importances_

In [6]:
gbr_model = GradientBoostingRegressor(random_state=42,learning_rate=0.25,subsample=0.7,n_estimators=21)
gbr_model.fit(x_train_stan, y_train)


feature_list = pd.concat([pd.Series(x.columns), pd.Series(gbr_model.feature_importances_)], axis=1)
feature_list.columns = ['features_name', 'importance']
feature_list.sort_values("importance", ascending =False)[:27]

Unnamed: 0,features_name,importance
760,matching_상권_코드_명_디지털미디어시티,0.087761
138,남성연령대_30_토요일시간대_3_생활인구_수,0.069
904,matching_상권_코드_명_용산구청,0.061606
13,시간대_5_생활인구_수,0.054327
535,아파트_가격_3_억_세대_수,0.048688
98,남성연령대_20_토요일시간대_5_생활인구_수,0.039192
647,연령대_30_매출_건수,0.028671
706,matching_상권_코드_명_공항대로59다길,0.026366
600,연령대_20_매출_비율,0.024382
71,남성연령대_20_화요일시간대_2_생활인구_수,0.02327
