# 앙상블(Ensemble)

* 일반화와 강건성(Robustness)을 향상시키기 위해 여러 모델의 예측 값을 결합하는 방법
* 강건성 : 구조적으로 튼튼하고 건강해지는 특성
* 앙상블에는 크게 두가지 종류가 존재
  * 평균 방법
    * 여러개의 추정값을 독립적으로 구한뒤 평균을 취함
    * 결합 추정값은 분산이 줄어들기 때문에 단일 추정값보다 좋은 성능을 보임
  * 부스팅 방법
    * 순차적으로 모델 생성
    * 결합된 모델의 편향을 감소 시키기 위해 노력
    * 부스팅 방법의 목표는 여러개의 약한 모델들을 결합해 하나의 강력한 앙상블 모델을 구축하는 것

## Bagging meta-estimator

* bagging은 bootstrap aggregating의 줄임말
* 원래 훈련 데이터셋의 일부를 사용해 여러 모델을 훈련
* 각각의 결과를 결합해 최종 결과를 생성
* 분산을 줄이고 과적합을 막음
* 강력하고 복잡한 모델에서 잘 동작 / 굉장히 우수한 성능

In [1]:
from sklearn.datasets import load_iris, load_breast_cancer, load_wine, load_boston, load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

In [2]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format = 'retina'

### Bagging을 사용한 분류

#### 데이터셋 불러오기

In [19]:
iris = load_iris()
wine = load_wine()
cancer = load_breast_cancer()

#### KNN

##### 붓꽃 데이터

In [20]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)

bagging_model = BaggingClassifier(base_model,
                                  n_estimators=10,
                                  max_samples=0.5,
                                  max_features=0.5)

In [26]:
cross_val = cross_validate(
    estimator=base_model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.0018561363220214843 (+/- 0.0006530496208083471)
avg score time : 0.003165006637573242 (+/- 0.000503913174898885)
avg test score : 0.96 (+/- 0.024944382578492935)


In [27]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.039535903930664064 (+/- 0.0020783010179425913)
avg score time : 0.015109395980834961 (+/- 0.0005574940284130443)
avg test score : 0.9466666666666665 (+/- 0.03399346342395189)


##### 와인 데이터

In [28]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)

bagging_model = BaggingClassifier(base_model,
                                  n_estimators=10,
                                  max_samples=0.5,
                                  max_features=0.5)

In [29]:
cross_val = cross_validate(
    estimator=base_model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.0021858692169189455 (+/- 0.0010238991780787668)
avg score time : 0.003700590133666992 (+/- 0.0010248302480866008)
avg test score : 0.9493650793650794 (+/- 0.037910929811115976)


In [31]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.039252424240112306 (+/- 0.004531296617613461)
avg score time : 0.01540517807006836 (+/- 0.0005427767644992488)
avg test score : 0.9441269841269841 (+/- 0.03488670790589105)


##### 유방암 데이터

In [32]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)

bagging_model = BaggingClassifier(base_model,
                                  n_estimators=10,
                                  max_samples=0.5,
                                  max_features=0.5)

In [33]:
cross_val = cross_validate(
    estimator=base_model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.0019655227661132812 (+/- 0.0003611615127480492)
avg score time : 0.011516094207763672 (+/- 0.005328152308529014)
avg test score : 0.9648501785437045 (+/- 0.009609970350036127)


In [34]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.04462962150573731 (+/- 0.004210724308435661)
avg score time : 0.031119537353515626 (+/- 0.001442241375466627)
avg test score : 0.9596025461884802 (+/- 0.013079066864621501)


#### SVC

In [40]:
base_model = make_pipeline(
    StandardScaler(),
    SVC(),
)

bagging_model = BaggingClassifier(base_model,
                                  n_estimators=10,
                                  max_samples=0.5,
                                  max_features=0.5)

##### 붓꽃 데이터

In [41]:
cross_val = cross_validate(
    estimator=base_model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.0028365135192871095 (+/- 0.000579892241067346)
avg score time : 0.0011091232299804688 (+/- 0.00022167410215276652)
avg test score : 0.9666666666666666 (+/- 0.02108185106778919)


In [42]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.05234966278076172 (+/- 0.0026160453168196543)
avg score time : 0.008614015579223634 (+/- 0.0004643980072429394)
avg test score : 0.96 (+/- 0.03265986323710903)


##### 와인 데이터

In [43]:
cross_val = cross_validate(
    estimator=base_model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.002207517623901367 (+/- 0.00042878821514459755)
avg score time : 0.0008664608001708984 (+/- 0.00015571528997195813)
avg test score : 0.9833333333333334 (+/- 0.022222222222222233)


In [44]:
cross_val = cross_validate(
    estimator=base_model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.0030369281768798826 (+/- 0.0004621296636626876)
avg score time : 0.0009696006774902344 (+/- 0.00040602238541207164)
avg test score : 0.9833333333333334 (+/- 0.022222222222222233)


##### 유방암 데이터

In [45]:
cross_val = cross_validate(
    estimator=base_model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.006502199172973633 (+/- 0.0012991221755833734)
avg score time : 0.00252680778503418 (+/- 0.0005303153496267931)
avg test score : 0.9736376339077782 (+/- 0.014678541667933545)


In [46]:
cross_val = cross_validate(
    estimator=base_model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.008110332489013671 (+/- 0.0017323651550630364)
avg score time : 0.003919649124145508 (+/- 0.0011337056834039372)
avg test score : 0.9736376339077782 (+/- 0.014678541667933545)


#### Decision Tree

In [47]:
base_model = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier(),
)

bagging_model = BaggingClassifier(base_model,
                                  n_estimators=10,
                                  max_samples=0.5,
                                  max_features=0.5)

##### 붓꽃 데이터

In [48]:
cross_val = cross_validate(
    estimator=base_model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.0021028518676757812 (+/- 0.0005643025590541546)
avg score time : 0.0006976127624511719 (+/- 0.00037735053150881194)
avg test score : 0.9666666666666668 (+/- 0.036514837167011066)


In [49]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.04739022254943848 (+/- 0.0017199157970747478)
avg score time : 0.005189657211303711 (+/- 0.0007804914682483853)
avg test score : 0.9466666666666667 (+/- 0.03399346342395189)


##### 와인 데이터

In [51]:
cross_val = cross_validate(
    estimator=base_model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.002028656005859375 (+/- 0.00021545354983891314)
avg score time : 0.00014910697937011718 (+/- 0.0002039013675332609)
avg test score : 0.8709523809523809 (+/- 0.05700512501657816)


In [53]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.04473419189453125 (+/- 0.008671522061709624)
avg score time : 0.004874420166015625 (+/- 0.0013478889763924578)
avg test score : 0.9553968253968254 (+/- 0.028236127772140195)


##### 유방암 데이터

In [54]:
cross_val = cross_validate(
    estimator=base_model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.010175848007202148 (+/- 0.003047805613393002)
avg score time : 0.000599813461303711 (+/- 0.0005138284997769589)
avg test score : 0.919111939139885 (+/- 0.026433570377351927)


In [55]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.08575587272644043 (+/- 0.024134684550311387)
avg score time : 0.008229255676269531 (+/- 0.0016934803232578053)
avg test score : 0.9543083372147182 (+/- 0.01607149078080657)


### Bagging을 사용한 회귀

#### 데이터셋 불러오기

In [58]:
boston = load_boston()
diabetes = load_diabetes()

#### KNN

In [62]:
base_model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(),
)

bagging_model = BaggingRegressor(base_model,
                                  n_estimators=10,
                                  max_samples=0.5,
                                  max_features=0.5)

##### 보스턴 주택 가격 데이터

In [63]:
cross_val = cross_validate(
    estimator=base_model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.002860546112060547 (+/- 0.0003480018540441986)
avg score time : 0.002646780014038086 (+/- 0.0003971920034081832)
avg test score : 0.47357748833823543 (+/- 0.13243123464477455)


In [64]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.03819551467895508 (+/- 0.004984743111268522)
avg score time : 0.0178652286529541 (+/- 0.002467115736996637)
avg test score : 0.4311994846984667 (+/- 0.12495268948587904)


##### 당뇨병 데이터

In [67]:
cross_val = cross_validate(
    estimator=base_model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.0026103496551513673 (+/- 0.0012892527871185156)
avg score time : 0.0024349212646484373 (+/- 0.00020069675505003299)
avg test score : 0.3689720650295623 (+/- 0.044659049060165365)


In [68]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.03670506477355957 (+/- 0.007248941412013783)
avg score time : 0.0668802261352539 (+/- 0.09586039680499077)
avg test score : 0.4163070665101819 (+/- 0.06334312957293502)


#### SVR

In [75]:
base_model = make_pipeline(
    StandardScaler(),
    SVR(),
)

bagging_model = BaggingRegressor(base_model,
                                  n_estimators=10,
                                  max_samples=0.5,
                                  max_features=0.5)

##### 보스턴 주택 가격 데이터

In [76]:
cross_val = cross_validate(
    estimator=base_model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.019654178619384767 (+/- 0.005791641060132959)
avg score time : 0.009356164932250976 (+/- 0.001991823254153647)
avg test score : 0.17631266230186618 (+/- 0.5224914915128981)


In [77]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.09568767547607422 (+/- 0.004352266292093822)
avg score time : 0.05663719177246094 (+/- 0.0007580245164241572)
avg test score : 0.21364546841074397 (+/- 0.26230309324050094)


##### 당뇨병 데이터

In [79]:
cross_val = cross_validate(
    estimator=base_model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.011864042282104493 (+/- 0.0035704429015614137)
avg score time : 0.006018638610839844 (+/- 0.0025145738329341062)
avg test score : 0.14659936199629428 (+/- 0.021907980033429305)


In [81]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.07552108764648438 (+/- 0.014272631195582133)
avg score time : 0.0453087329864502 (+/- 0.0033887533320828556)
avg test score : 0.05935207349410818 (+/- 0.0446589071470321)


#### Decision Tree

In [82]:
base_model = make_pipeline(
    StandardScaler(),
    DecisionTreeRegressor(),
)

bagging_model = BaggingRegressor(base_model,
                                  n_estimators=10,
                                  max_samples=0.5,
                                  max_features=0.5)

##### 보스턴 주택 가격 데이터

In [85]:
cross_val = cross_validate(
    estimator=base_model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.006621170043945313 (+/- 0.0005879962001917221)
avg score time : 0.0007555961608886718 (+/- 0.00031371570713753755)
avg test score : 0.1820372111934045 (+/- 0.8289116977859968)


In [86]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.058515310287475586 (+/- 0.005044957523268689)
avg score time : 0.005341958999633789 (+/- 0.0004421352670000398)
avg test score : 0.5428443429929491 (+/- 0.2280368273121039)


##### 당뇨병 데이터

In [87]:
cross_val = cross_validate(
    estimator=base_model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.005097770690917968 (+/- 0.0005451357387477487)
avg score time : 0.0009433269500732422 (+/- 0.000466775467938965)
avg test score : -0.18678281107133418 (+/- 0.14381732939063377)


In [88]:
cross_val = cross_validate(
    estimator=bagging_model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.049727773666381835 (+/- 0.010924293273202773)
avg score time : 0.0053116798400878905 (+/- 0.0006788559889584064)
avg test score : 0.35570977100944295 (+/- 0.025054946221333595)


## Forests of randomized trees

* `sklearn.ensemble` 모듈에는 무작위 결정 트리를 기반으로하는 두 개의 평균화 알고리즘이 존재
  * Random Forest
  * Extra-Trees
* 모델 구성에 임의성을 추가해 다양한 모델 집합이 생성
* 앙상블 모델의 예측은 각 모델의 평균

In [101]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

### Random Forests 분류

In [90]:
model = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

In [91]:
cross_val = cross_validate(
    estimator=model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.24234533309936523 (+/- 0.020834594588349444)
avg score time : 0.02151827812194824 (+/- 0.0073406482810394755)
avg test score : 0.96 (+/- 0.024944382578492935)


In [92]:
cross_val = cross_validate(
    estimator=model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.26145339012145996 (+/- 0.022638304787056435)
avg score time : 0.022751617431640624 (+/- 0.001171792660430666)
avg test score : 0.9609523809523809 (+/- 0.02214499050996228)


In [93]:
cross_val = cross_validate(
    estimator=model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.3896982669830322 (+/- 0.020585521478985784)
avg score time : 0.02173953056335449 (+/- 0.002469069962327234)
avg test score : 0.956078248719143 (+/- 0.024789109260795167)


### Random Forests 회귀

In [94]:
model = make_pipeline(
    StandardScaler(),
    RandomForestRegressor()
)

In [95]:
cross_val = cross_validate(
    estimator=model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.6093801021575928 (+/- 0.022296018794226952)
avg score time : 0.021938657760620116 (+/- 0.005789284484995973)
avg test score : 0.6301319618501056 (+/- 0.19782047590241902)


In [96]:
cross_val = cross_validate(
    estimator=model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.5019397258758544 (+/- 0.012632698583519581)
avg score time : 0.023879098892211913 (+/- 0.004637914970035239)
avg test score : 0.4236616592883712 (+/- 0.04878855324396473)


### Extremely Randomized Trees 분류

In [97]:
model = make_pipeline(
    StandardScaler(),
    ExtraTreesClassifier()
)

In [98]:
cross_val = cross_validate(
    estimator=model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.21244435310363768 (+/- 0.042646894075242615)
avg score time : 0.022151088714599608 (+/- 0.006948866058073854)
avg test score : 0.9466666666666667 (+/- 0.039999999999999994)


In [99]:
cross_val = cross_validate(
    estimator=model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.18783507347106934 (+/- 0.022337570200006072)
avg score time : 0.021895408630371094 (+/- 0.0005094953156503005)
avg test score : 0.9777777777777779 (+/- 0.020786985482077462)


In [100]:
cross_val = cross_validate(
    estimator=model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.2253955841064453 (+/- 0.01106308123605771)
avg score time : 0.02423892021179199 (+/- 0.0008886796936744768)
avg test score : 0.9648657040832169 (+/- 0.01566592330434181)


### Extremely Randomized Trees 회귀

In [103]:
model = make_pipeline(
    StandardScaler(),
    ExtraTreesRegressor()
)

In [104]:
cross_val = cross_validate(
    estimator=model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.3675689697265625 (+/- 0.023237726932160356)
avg score time : 0.020147037506103516 (+/- 0.0007336142452885374)
avg test score : 0.6062161780407413 (+/- 0.2925219438620807)


In [105]:
cross_val = cross_validate(
    estimator=model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.33066482543945314 (+/- 0.026499108355507305)
avg score time : 0.021920394897460938 (+/- 0.0022851051439778444)
avg test score : 0.4445641912733319 (+/- 0.0430121555079802)


### Random Forest, Extra Tree 시각화

* 결정 트리, Random Forest, Extra Tree의 결정 경계와 회귀식 시각화

## AdaBoost

* 대표적인 부스팅 알고리즘
* 일련의 약한 모델들을 학습
* 수정된 버전의 데이터를 반복 학습 (가중치가 적용된)
* 가중치 투표(또는 합)을 통해 각 모델의 예측 값을 결합
* 첫 단계에서는 원본 데이터를 학습하고 연속적인 반복마다 개별 샘플에 대한 가중치가 수정되고 다시 모델이 학습
  * 잘못 예측된 샘플은 가중치 증가, 올바르게 예측된 샘플은 가중치 감소
  * 각각의 약한 모델들은 예측하기 어려운 샘플에 집중하게 됨

![AdaBoost](https://scikit-learn.org/stable/_images/sphx_glr_plot_adaboost_hastie_10_2_001.png)

In [106]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor

### AdaBoost 분류

In [108]:
model = make_pipeline(
    StandardScaler(),
    AdaBoostClassifier()
)

In [109]:
cross_val = cross_validate(
    estimator=model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.131325626373291 (+/- 0.0019827326971137787)
avg score time : 0.015561628341674804 (+/- 0.0004880660996809424)
avg test score : 0.9466666666666667 (+/- 0.03399346342395189)


In [110]:
cross_val = cross_validate(
    estimator=model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.14134840965270995 (+/- 0.017416240826568653)
avg score time : 0.01541728973388672 (+/- 0.00046465136001095703)
avg test score : 0.8085714285714285 (+/- 0.16822356718459935)


In [111]:
cross_val = cross_validate(
    estimator=model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.29923286437988283 (+/- 0.04197284873494866)
avg score time : 0.019182205200195312 (+/- 0.0018014693145320827)
avg test score : 0.9701133364384411 (+/- 0.019709915473893072)


### AdaBoost 회귀

In [112]:
model = make_pipeline(
    StandardScaler(),
    AdaBoostRegressor()
)

In [113]:
cross_val = cross_validate(
    estimator=model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.2032069206237793 (+/- 0.02990735856163627)
avg score time : 0.009979724884033203 (+/- 0.0006462544996559159)
avg test score : 0.5919171751197495 (+/- 0.18942551475778335)


In [114]:
cross_val = cross_validate(
    estimator=model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.16580696105957032 (+/- 0.01443250411680169)
avg score time : 0.009401845932006835 (+/- 0.0006231574894104766)
avg test score : 0.41770328410959257 (+/- 0.053933942965769005)


## Gradient Tree Boosting

* 임의의 차별화 가능한 손실함수로 일반화한 부스팅 알고리즘
* 웹 검색, 분류 및 회귀 등 다양한 분야에서 모두 사용 가능

In [120]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

### Gradient Tree Boosting 분류

In [121]:
model = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier()
)

In [122]:
cross_val = cross_validate(
    estimator=model,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.40898962020874025 (+/- 0.024585501811147003)
avg score time : 0.0013871192932128906 (+/- 0.0007738927521449198)
avg test score : 0.9666666666666668 (+/- 0.02108185106778919)


In [123]:
cross_val = cross_validate(
    estimator=model,
    X=wine.data, y=wine.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.608646011352539 (+/- 0.03156907378430226)
avg score time : 0.0015822410583496093 (+/- 0.0004418402440554013)
avg test score : 0.9385714285714286 (+/- 0.032068206474093704)


In [124]:
cross_val = cross_validate(
    estimator=model,
    X=cancer.data, y=cancer.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.8432297706604004 (+/- 0.015326214283518378)
avg score time : 0.0020754337310791016 (+/- 0.0007420364372169891)
avg test score : 0.9631268436578171 (+/- 0.021024240542234)


### Gradient Tree Boosting 회귀

In [131]:
model = make_pipeline(
    StandardScaler(),
    GradientBoostingRegressor()
)

In [132]:
cross_val = cross_validate(
    estimator=model,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.21963753700256347 (+/- 0.020624276821749596)
avg score time : 0.0017985820770263672 (+/- 0.00040332575298369477)
avg test score : 0.6836818274250323 (+/- 0.1512644869187298)


In [133]:
cross_val = cross_validate(
    estimator=model,
    X=diabetes.data, y=diabetes.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.17988576889038085 (+/- 0.0022460708514956272)
avg score time : 0.0015725135803222657 (+/- 0.0003958764175685385)
avg test score : 0.41012286847184853 (+/- 0.06931810062458485)


## 투표 기반 분류 (Voting Classifier)

* 서로 다른 모델들의 결과를 투표를 통해 결합
* 두가지 방법으로 투표 가능
  * 가장 많이 예측된 클래스를 정답으로 채택 (hard voting)
  * 예측된 확률의 가중치 평균 (soft voting)

In [135]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

In [136]:
# hard 방식

model1 = SVC()
model2 = GaussianNB()
model3 = RandomForestClassifier()
vote_model = VotingClassifier(
    estimators=[('svc', model1), ('naive', model2), ('forest', model3)],
    voting='hard'
)

In [138]:
for model in (model1, model2, model3, vote_model):
    model_name = str(type(model)).split('.')[-1][:-2]
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print('Accuracy : %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), model_name))

Accuracy : 0.97 (+/- 0.02) [SVC]
Accuracy : 0.95 (+/- 0.03) [GaussianNB]
Accuracy : 0.95 (+/- 0.03) [RandomForestClassifier]
Accuracy : 0.96 (+/- 0.02) [VotingClassifier]


In [None]:
#soft방식
model1 = SVC(probability=True)
model2 = GaussianNB()
model3 = RandomForestClassifier()
vote_model = VotingClassifier(
    estimators=[('svc', model1), ('naive', model2), ('forest', model3)],
    voting='soft',
    weights=[2,1,2]
)

In [139]:
for model in (model1, model2, model3, vote_model):
    model_name = str(type(model)).split('.')[-1][:-2]
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print('Accuracy : %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), model_name))

Accuracy : 0.97 (+/- 0.02) [SVC]
Accuracy : 0.95 (+/- 0.03) [GaussianNB]
Accuracy : 0.97 (+/- 0.02) [RandomForestClassifier]
Accuracy : 0.97 (+/- 0.02) [VotingClassifier]


### 결정 경계 시각화

## 투표 기반 회귀 (Voting Regressor)

* 서로 다른 모델의 예측 값의 평균을 사용

In [140]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor

In [147]:
model1 = LinearRegression()
model2 = GradientBoostingRegressor()
model3 = RandomForestRegressor()
vote_model = VotingRegressor(
    estimators=[('Linear', model1), ('gbr', model2), ('rfr', model3)],
    weights= [1,1,1]
)

In [148]:
for model in (model1, model2, model3, vote_model):
    model_name = str(type(model)).split('.')[-1][:-2]
    scores = cross_val_score(model, boston.data, boston.target, cv=5)
    print('R2: %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), model_name))

R2: 0.35 (+/- 0.38) [LinearRegression]
R2: 0.67 (+/- 0.17) [GradientBoostingRegressor]
R2: 0.62 (+/- 0.22) [RandomForestRegressor]
R2: 0.65 (+/- 0.21) [VotingRegressor]


### 회귀식 시각화

## 스택 일반화 (Stacked Generalization)

* 각 모델의 예측 값을 최종 모델의 입력으로 사용
* 모델의 편향을 줄이는데 효과적

### 스택 회귀

In [151]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [150]:
estimators = [('ridge', Ridge()),
             ('lasso', Lasso()),
             ('svr', SVR())]

In [153]:
reg = make_pipeline(
    StandardScaler(),
    StackingRegressor(
        estimators=estimators,
        final_estimator=GradientBoostingRegressor()
    )
)

In [154]:
cross_val = cross_validate(
    estimator=reg,
    X=boston.data, y=boston.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.1652303695678711 (+/- 0.009551796141829856)
avg score time : 0.006015777587890625 (+/- 0.0003261018520244199)
avg test score : 0.3199641138499357 (+/- 0.3315014050515194)


#### 회귀식 시각화

### 스택 분류

In [155]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [157]:
estimators = [('logistic', LogisticRegression(max_iter=10000)),
             ('svc', SVC()),
             ('naive', GaussianNB())]

In [158]:
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier()
)

In [159]:
cross_val = cross_validate(
    estimator=clf,
    X=iris.data, y=iris.target,
    cv=5
)
print(f"avg fit time : {cross_val['fit_time'].mean()} (+/- {cross_val['fit_time'].std()})")
print(f"avg score time : {cross_val['score_time'].mean()} (+/- {cross_val['score_time'].std()})")
print(f"avg test score : {cross_val['test_score'].mean()} (+/- {cross_val['test_score'].std()})")

avg fit time : 0.30034122467041013 (+/- 0.01805924199146007)
avg score time : 0.011328697204589844 (+/- 0.001015862145427881)
avg test score : 0.9666666666666666 (+/- 0.02108185106778919)


#### 결정 경계 시각화

In [4]:
plt.barh(list(X), model.feature_importances_)
plt.show()

NameError: name 'plt' is not defined