# 앙상블
- 일반화와 강건성(Robustness)을 향상시키기 위해 여러 모델의 예측 값을 결합하는 방법
- 앙상블에는 크게 두 가지 종류가 존재
    - 평균 방법
        - 여러개의 추정값을 독립적으로 구한 뒤 평균을 취함
        - 결합 추정값은 분산이 줄어들기 때문에 단일 추정값보다 좋은 성능을 보임
    - 부스팅 방법
        - 순차적으로 모델 생성
        - 결합된 모델의 편향을 감소 시키기 위해 노력
        - 부스팅 방법의 목표는 여러개의 약한 모델들을 결합해 하나의 강력한 앙상블 모델을 구축하는 것

# Bagging meta-estimator
- bagging은 bootstrap aggregating의 줄임말(무작위 재추출)
    - 훈련(Train) 데이터를 m개의 바구니에 담는데, 각 바구니에 어떤 데이터를 넣을지는 무작위로 결정
    - 중요한 점은 데이터를 넣을 때 데이터가 중복으로 한 바구니에 들어가도 된다는 점
    - 60% 정도 개수를 바구니에 넣는다고 함
    - 학습한 모델들을 가지고 투표
    - 해당 모델들에게 같은 데이터를 넣고 각각 나오는 결과를 가지고 평균값을 내어(투표를 하여) 정답을 정함
- 원래 훈련 데이터셋의 일부를 사용해 여러 모델을 훈련하는 방식
- 각각의 결과를 결합해 최종 결과를 생성
- 분산을 줄이고 과적합을 막음
- 강력하고 복잡한 모델에서 잘 동작

In [44]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer, load_boston, load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

In [2]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

## 분류

In [4]:
# Classification datasets used throughout the notebook.
iris, wine, cancer = load_iris(), load_wine(), load_breast_cancer()

## KNN

### 붓꽃 데이터

In [6]:
# Baseline: standardize the features, then fit a single k-NN classifier.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [11]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the iris data.
cross_val = cross_validate(estimator=base_model, X=iris.data, y=iris.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0009915351867675782 (+/- 0.0006217428379503782)
avg score time: 0.0015952587127685547 (+/- 0.0004887149373057375)
avg test score: 0.96 (+/- 0.024944382578492935)


In [14]:
# 5-fold cross-validation of the bagging ensemble on the iris data.
cross_val = cross_validate(estimator=bagging_model, X=iris.data, y=iris.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.01337118148803711 (+/- 0.0013473473430168227)
avg score time: 0.004986763000488281 (+/- 1.5295253337671182e-05)
avg test score: 0.9533333333333334 (+/- 0.02666666666666666)


### 와인 데이터

In [15]:
# Baseline: standardize the features, then fit a single k-NN classifier.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [16]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the wine data.
cross_val = cross_validate(estimator=base_model, X=wine.data, y=wine.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0010060787200927735 (+/- 1.3092979373071293e-05)
avg score time: 0.0017810821533203124 (+/- 0.00039120804428973864)
avg test score: 0.9493650793650794 (+/- 0.037910929811115976)


In [17]:
# 5-fold cross-validation of the bagging ensemble on the wine data.
cross_val = cross_validate(estimator=bagging_model, X=wine.data, y=wine.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.013779973983764649 (+/- 0.0011568067158339507)
avg score time: 0.005274009704589844 (+/- 0.0006188708244549561)
avg test score: 0.9607936507936508 (+/- 0.022468028291073656)


### 유방암 데이터

In [19]:
# Baseline: standardize the features, then fit a single k-NN classifier.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [20]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the breast cancer data.
cross_val = cross_validate(estimator=base_model, X=cancer.data, y=cancer.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0018001556396484374 (+/- 0.0007545731542895845)
avg score time: 0.00438995361328125 (+/- 0.000488659927931947)
avg test score: 0.9648501785437045 (+/- 0.009609970350036127)


In [21]:
# 5-fold cross-validation of the bagging ensemble on the breast cancer data.
cross_val = cross_validate(estimator=bagging_model, X=cancer.data, y=cancer.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.015754842758178712 (+/- 0.001157292089758611)
avg score time: 0.009574317932128906 (+/- 0.0004885387067869067)
avg test score: 0.9578326346840551 (+/- 0.015063406185307365)


## SVC

### 붓꽃 데이터

In [25]:
# Baseline: standardize the features, then fit a single support vector classifier.
base_model = make_pipeline(StandardScaler(), SVC())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [26]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the iris data.
cross_val = cross_validate(estimator=base_model, X=iris.data, y=iris.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0011869430541992187 (+/- 0.00040798858812716174)
avg score time: 0.0003997802734375 (+/- 0.0004896311614689141)
avg test score: 0.9666666666666666 (+/- 0.02108185106778919)


In [27]:
# 5-fold cross-validation of the bagging ensemble on the iris data.
cross_val = cross_validate(estimator=bagging_model, X=iris.data, y=iris.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.018749713897705078 (+/- 0.0007464166309289618)
avg score time: 0.002393674850463867 (+/- 0.0004886555075578522)
avg test score: 0.9533333333333334 (+/- 0.02666666666666666)


### 와인 데이터

In [28]:
# Baseline: standardize the features, then fit a single support vector classifier.
base_model = make_pipeline(StandardScaler(), SVC())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [29]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the wine data.
cross_val = cross_validate(estimator=base_model, X=wine.data, y=wine.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0011942863464355468 (+/- 0.00040132427695904456)
avg score time: 0.0006062030792236328 (+/- 0.0004950265982426652)
avg test score: 0.9833333333333334 (+/- 0.022222222222222233)


In [30]:
# 5-fold cross-validation of the bagging ensemble on the wine data.
cross_val = cross_validate(estimator=bagging_model, X=wine.data, y=wine.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.019052839279174803 (+/- 0.0009957444631559194)
avg score time: 0.0021942138671875 (+/- 0.00039986758925676555)
avg test score: 0.9776190476190475 (+/- 0.01119469694127331)


### 유방암 데이터

In [31]:
# Baseline: standardize the features, then fit a single support vector classifier.
base_model = make_pipeline(StandardScaler(), SVC())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [32]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the breast cancer data.
cross_val = cross_validate(estimator=base_model, X=cancer.data, y=cancer.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.002797365188598633 (+/- 0.00040205398111341743)
avg score time: 0.000868844985961914 (+/- 0.000455365165764895)
avg test score: 0.9736376339077782 (+/- 0.014678541667933545)


In [33]:
# 5-fold cross-validation of the bagging ensemble on the breast cancer data.
cross_val = cross_validate(estimator=bagging_model, X=cancer.data, y=cancer.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.023358631134033202 (+/- 0.0007883253610004071)
avg score time: 0.004197502136230468 (+/- 0.0004163265637878461)
avg test score: 0.964803601925167 (+/- 0.017707461458490007)


## Decision Tree

### 붓꽃 데이터

In [34]:
# Baseline: standardize the features, then fit a single decision tree classifier.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [35]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the iris data.
cross_val = cross_validate(estimator=base_model, X=iris.data, y=iris.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0008033275604248047 (+/- 0.0004021379422733048)
avg score time: 0.0003986358642578125 (+/- 0.00048822743987322914)
avg test score: 0.9666666666666668 (+/- 0.036514837167011066)


In [36]:
# 5-fold cross-validation of the bagging ensemble on the iris data.
cross_val = cross_validate(estimator=bagging_model, X=iris.data, y=iris.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.01735200881958008 (+/- 0.0007986365277032389)
avg score time: 0.001396322250366211 (+/- 0.0004885971595254055)
avg test score: 0.9199999999999999 (+/- 0.05416025603090639)


### 와인 데이터

In [37]:
# Baseline: standardize the features, then fit a single decision tree classifier.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [38]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the wine data.
cross_val = cross_validate(estimator=base_model, X=wine.data, y=wine.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0009975433349609375 (+/- 1.669815373614806e-05)
avg score time: 0.0 (+/- 0.0)
avg test score: 0.8765079365079365 (+/- 0.03321445165041705)


In [39]:
# 5-fold cross-validation of the bagging ensemble on the wine data.
cross_val = cross_validate(estimator=bagging_model, X=wine.data, y=wine.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.016954469680786132 (+/- 0.001091710883290007)
avg score time: 0.0013953685760498048 (+/- 0.0004893768069489255)
avg test score: 0.9323809523809523 (+/- 0.05252601366335605)


### 유방암 데이터

In [40]:
# Baseline: standardize the features, then fit a single decision tree classifier.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [41]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the breast cancer data.
cross_val = cross_validate(estimator=base_model, X=cancer.data, y=cancer.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.005169963836669922 (+/- 0.0007486868584791718)
avg score time: 0.0006049156188964843 (+/- 0.0004940654143010502)
avg test score: 0.9191274646793974 (+/- 0.017170600960997876)


In [42]:
# 5-fold cross-validation of the bagging ensemble on the breast cancer data.
cross_val = cross_validate(estimator=bagging_model, X=cancer.data, y=cancer.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.023682594299316406 (+/- 0.0004483056760156679)
avg score time: 0.0017955780029296875 (+/- 0.00040068812246730356)
avg test score: 0.9420121099208197 (+/- 0.014219478795940665)


## Decision Tree

### 붓꽃 데이터

In [34]:
# Baseline: standardize the features, then fit a single decision tree classifier.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [35]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the iris data.
cross_val = cross_validate(estimator=base_model, X=iris.data, y=iris.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0008033275604248047 (+/- 0.0004021379422733048)
avg score time: 0.0003986358642578125 (+/- 0.00048822743987322914)
avg test score: 0.9666666666666668 (+/- 0.036514837167011066)


In [36]:
# 5-fold cross-validation of the bagging ensemble on the iris data.
cross_val = cross_validate(estimator=bagging_model, X=iris.data, y=iris.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.01735200881958008 (+/- 0.0007986365277032389)
avg score time: 0.001396322250366211 (+/- 0.0004885971595254055)
avg test score: 0.9199999999999999 (+/- 0.05416025603090639)


### 와인 데이터

In [37]:
# Baseline: standardize the features, then fit a single decision tree classifier.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [38]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the wine data.
cross_val = cross_validate(estimator=base_model, X=wine.data, y=wine.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0009975433349609375 (+/- 1.669815373614806e-05)
avg score time: 0.0 (+/- 0.0)
avg test score: 0.8765079365079365 (+/- 0.03321445165041705)


In [39]:
# 5-fold cross-validation of the bagging ensemble on the wine data.
cross_val = cross_validate(estimator=bagging_model, X=wine.data, y=wine.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.016954469680786132 (+/- 0.001091710883290007)
avg score time: 0.0013953685760498048 (+/- 0.0004893768069489255)
avg test score: 0.9323809523809523 (+/- 0.05252601366335605)


### 유방암 데이터

In [40]:
# Baseline: standardize the features, then fit a single decision tree classifier.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [41]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the breast cancer data.
cross_val = cross_validate(estimator=base_model, X=cancer.data, y=cancer.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.005169963836669922 (+/- 0.0007486868584791718)
avg score time: 0.0006049156188964843 (+/- 0.0004940654143010502)
avg test score: 0.9191274646793974 (+/- 0.017170600960997876)


In [42]:
# 5-fold cross-validation of the bagging ensemble on the breast cancer data.
cross_val = cross_validate(estimator=bagging_model, X=cancer.data, y=cancer.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.023682594299316406 (+/- 0.0004483056760156679)
avg score time: 0.0017955780029296875 (+/- 0.00040068812246730356)
avg test score: 0.9420121099208197 (+/- 0.014219478795940665)


## 회귀

In [45]:
# Regression datasets used in the cells below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older scikit-learn versions.
boston, diabetes = load_boston(), load_diabetes()

## KNN

### 보스턴 데이터

In [55]:
# Baseline: standardize the features, then fit a single k-NN regressor.
base_model = make_pipeline(StandardScaler(), KNeighborsRegressor())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [56]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the Boston data.
cross_val = cross_validate(estimator=base_model, X=boston.data, y=boston.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0011973381042480469 (+/- 0.00039923209224251426)
avg score time: 0.0009970664978027344 (+/- 2.611744678045111e-07)
avg test score: 0.47357748833823543 (+/- 0.13243123464477455)


In [57]:
# 5-fold cross-validation of the bagging ensemble on the Boston data.
cross_val = cross_validate(estimator=bagging_model, X=boston.data, y=boston.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.014238119125366211 (+/- 0.0009678177628082339)
avg score time: 0.006390810012817383 (+/- 0.00045925392027722434)
avg test score: 0.44193373600241975 (+/- 0.13970460832157108)


### 당뇨 데이터

In [58]:
# Baseline: standardize the features, then fit a single k-NN regressor.
base_model = make_pipeline(StandardScaler(), KNeighborsRegressor())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [59]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the diabetes data.
cross_val = cross_validate(estimator=base_model, X=diabetes.data, y=diabetes.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.0009974956512451172 (+/- 0.0006308265349212175)
avg score time: 0.001197052001953125 (+/- 0.00039997117884336226)
avg test score: 0.3689720650295623 (+/- 0.044659049060165365)


In [60]:
# 5-fold cross-validation of the bagging ensemble on the diabetes data.
cross_val = cross_validate(estimator=bagging_model, X=diabetes.data, y=diabetes.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.013787937164306641 (+/- 0.0007254046972755921)
avg score time: 0.005800914764404297 (+/- 0.00041039032058747476)
avg test score: 0.3926384220065617 (+/- 0.0516867709765904)


## SVR

### 보스턴 데이터

In [61]:
# Baseline: standardize the features, then fit a single support vector regressor.
base_model = make_pipeline(StandardScaler(), SVR())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [62]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the Boston data.
cross_val = cross_validate(estimator=base_model, X=boston.data, y=boston.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.006096363067626953 (+/- 0.00021016215706618516)
avg score time: 0.0009915351867675782 (+/- 1.1329674961565412e-05)
avg test score: 0.17631266230186618 (+/- 0.5224914915128981)


In [63]:
# 5-fold cross-validation of the bagging ensemble on the Boston data.
cross_val = cross_validate(estimator=bagging_model, X=boston.data, y=boston.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.024937009811401366 (+/- 0.001395318628895566)
avg score time: 0.004588651657104492 (+/- 0.0004790896951323353)
avg test score: 0.16534116993152786 (+/- 0.3357066640964653)


### 당뇨 데이터

In [64]:
# Baseline: standardize the features, then fit a single support vector regressor.
base_model = make_pipeline(StandardScaler(), SVR())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [65]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the diabetes data.
cross_val = cross_validate(estimator=base_model, X=diabetes.data, y=diabetes.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.004392290115356445 (+/- 0.00048578321360341354)
avg score time: 0.0009932994842529296 (+/- 1.9664778279282035e-05)
avg test score: 0.14659936199629434 (+/- 0.02190798003342928)


In [66]:
# 5-fold cross-validation of the bagging ensemble on the diabetes data.
cross_val = cross_validate(estimator=bagging_model, X=diabetes.data, y=diabetes.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.02119908332824707 (+/- 0.0009156666043781223)
avg score time: 0.0037950515747070313 (+/- 0.0003759379921557066)
avg test score: 0.057387132572700075 (+/- 0.03923304270522971)


## Decision Tree

### 보스턴 데이터

In [68]:
# Baseline: standardize the features, then fit a single decision tree regressor.
base_model = make_pipeline(StandardScaler(), DecisionTreeRegressor())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [69]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the Boston data.
cross_val = cross_validate(estimator=base_model, X=boston.data, y=boston.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.002782917022705078 (+/- 0.0004072957167497881)
avg score time: 0.0004034996032714844 (+/- 0.0004942641434868503)
avg test score: 0.06462288173334034 (+/- 0.9230667990599801)


In [70]:
# 5-fold cross-validation of the bagging ensemble on the Boston data.
cross_val = cross_validate(estimator=bagging_model, X=boston.data, y=boston.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.020679187774658204 (+/- 0.0009345059857608854)
avg score time: 0.0020174503326416014 (+/- 0.0006096064482304004)
avg test score: 0.4166273377873931 (+/- 0.29369461812382514)


### 당뇨 데이터

In [71]:
# Baseline: standardize the features, then fit a single decision tree regressor.
base_model = make_pipeline(StandardScaler(), DecisionTreeRegressor())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [72]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the diabetes data.
cross_val = cross_validate(estimator=base_model, X=diabetes.data, y=diabetes.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.002175760269165039 (+/- 0.0004085609224424521)
avg score time: 0.0004086017608642578 (+/- 0.0005004329167394967)
avg test score: -0.1923336077431711 (+/- 0.13553162844990252)


In [73]:
# 5-fold cross-validation of the bagging ensemble on the diabetes data.
cross_val = cross_validate(estimator=bagging_model, X=diabetes.data, y=diabetes.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.019965219497680663 (+/- 0.0015490814900504062)
avg score time: 0.0014001846313476563 (+/- 0.0004934191564112623)
avg test score: 0.3592272610957168 (+/- 0.08112187587998057)


- bagging은 트리 구조와 궁합이 잘 맞는다는 결론을 도출함

## SVR

### 보스턴 데이터

In [61]:
# Baseline: standardize the features, then fit a single support vector regressor.
base_model = make_pipeline(StandardScaler(), SVR())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [62]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the Boston data.
cross_val = cross_validate(estimator=base_model, X=boston.data, y=boston.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.006096363067626953 (+/- 0.00021016215706618516)
avg score time: 0.0009915351867675782 (+/- 1.1329674961565412e-05)
avg test score: 0.17631266230186618 (+/- 0.5224914915128981)


In [63]:
# 5-fold cross-validation of the bagging ensemble on the Boston data.
cross_val = cross_validate(estimator=bagging_model, X=boston.data, y=boston.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.024937009811401366 (+/- 0.001395318628895566)
avg score time: 0.004588651657104492 (+/- 0.0004790896951323353)
avg test score: 0.16534116993152786 (+/- 0.3357066640964653)


### 당뇨 데이터

In [64]:
# Baseline: standardize the features, then fit a single support vector regressor.
base_model = make_pipeline(StandardScaler(), SVR())

# Bagging ensemble of 10 estimators built from base_model; each one uses
# about half of the samples and half of the features.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [65]:
# 5-fold cross-validation of the single (non-bagged) pipeline on the diabetes data.
cross_val = cross_validate(estimator=base_model, X=diabetes.data, y=diabetes.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.004392290115356445 (+/- 0.00048578321360341354)
avg score time: 0.0009932994842529296 (+/- 1.9664778279282035e-05)
avg test score: 0.14659936199629434 (+/- 0.02190798003342928)


In [66]:
# 5-fold cross-validation of the bagging ensemble on the diabetes data.
cross_val = cross_validate(estimator=bagging_model, X=diabetes.data, y=diabetes.target, cv=5)

# Report mean and standard deviation of each metric across the folds.
fit_times = cross_val['fit_time']
score_times = cross_val['score_time']
test_scores = cross_val['test_score']
print('avg fit time: {} (+/- {})'.format(fit_times.mean(), fit_times.std()))
print('avg score time: {} (+/- {})'.format(score_times.mean(), score_times.std()))
print('avg test score: {} (+/- {})'.format(test_scores.mean(), test_scores.std()))

avg fit time: 0.02119908332824707 (+/- 0.0009156666043781223)
avg score time: 0.0037950515747070313 (+/- 0.0003759379921557066)
avg test score: 0.057387132572700075 (+/- 0.03923304270522971)


# Forests of randomized trees
- sklearn.ensemble 모듈에는 무작위 결정 트리를 기반으로 하는 두 개의 평균화 알고리즘이 존재
    - Random Forest
    - Extra-Trees
- 모델 구성에 임의성을 추가해 다양한 모델 집합이 생성
- 앙상블 모델의 예측은 각 모델 예측의 평균

In [74]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

## Random Forest 분류

In [76]:
# Classification pipeline: standardize the features, then fit a random forest
# classifier with default hyperparameters.
model = make_pipeline(StandardScaler(), RandomForestClassifier())

In [77]:
# 5-fold cross-validation of the current `model` pipeline on the iris data.
cross_val = cross_validate(model, iris.data, iris.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.08593540191650391 (+/- 0.0012261566091278132)
avg score time: 0.006112337112426758 (+/- 0.0002420083169060051)
avg test score: 0.9666666666666668 (+/- 0.02108185106778919)


In [78]:
# 5-fold cross-validation of the current `model` pipeline on the wine data.
cross_val = cross_validate(model, wine.data, wine.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.08616704940795898 (+/- 0.0024052980711879494)
avg score time: 0.005385351181030273 (+/- 0.0004886555075578522)
avg test score: 0.9720634920634922 (+/- 0.02484722784679302)


In [79]:
# 5-fold cross-validation of the current `model` pipeline on the breast
# cancer data.
cross_val = cross_validate(model, cancer.data, cancer.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.12523837089538575 (+/- 0.0033354746175713)
avg score time: 0.006575918197631836 (+/- 0.000485487991279659)
avg test score: 0.9613569321533924 (+/- 0.018865249100232455)


## Random Forest 회귀

In [81]:
# Regression pipeline: standardize the features, then fit a random forest
# regressor with default hyperparameters.
model = make_pipeline(StandardScaler(), RandomForestRegressor())

In [82]:
# 5-fold cross-validation of the current `model` pipeline on the Boston data.
# NOTE(review): `boston` is presumably built from `load_boston`, which newer
# scikit-learn releases no longer provide -- confirm the dataset is available
# in the target environment.
cross_val = cross_validate(model, boston.data, boston.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.18706040382385253 (+/- 0.004251034968114065)
avg score time: 0.005790090560913086 (+/- 0.00040120324748190177)
avg test score: 0.61876348293949 (+/- 0.21037017404670985)


In [83]:
# 5-fold cross-validation of the current `model` pipeline on the diabetes data.
cross_val = cross_validate(model, diabetes.data, diabetes.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.1541867733001709 (+/- 0.002250888712945491)
avg score time: 0.005386447906494141 (+/- 0.0004882501568058569)
avg test score: 0.4344894906211568 (+/- 0.04964233101755858)


## Extremely Randomized Trees 분류

In [85]:
# Classification pipeline: standardize the features, then fit an extra-trees
# classifier with default hyperparameters.
model = make_pipeline(StandardScaler(), ExtraTreesClassifier())

In [86]:
# 5-fold cross-validation of the current `model` pipeline on the iris data.
cross_val = cross_validate(model, iris.data, iris.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.06029167175292969 (+/- 0.0012900273780444525)
avg score time: 0.005593442916870117 (+/- 0.0004968286829156827)
avg test score: 0.9533333333333334 (+/- 0.03399346342395189)


In [87]:
# 5-fold cross-validation of the current `model` pipeline on the wine data.
cross_val = cross_validate(model, wine.data, wine.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.06283950805664062 (+/- 0.0010841132885564776)
avg score time: 0.005978870391845703 (+/- 1.0521150134504455e-05)
avg test score: 0.9833333333333332 (+/- 0.022222222222222233)


In [88]:
# 5-fold cross-validation of the current `model` pipeline on the breast
# cancer data.
cross_val = cross_validate(model, cancer.data, cancer.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.07479362487792969 (+/- 0.0011242823034822561)
avg score time: 0.006896591186523438 (+/- 0.0005133833022772506)
avg test score: 0.9613569321533924 (+/- 0.018031054391660237)


## Extremely Randomized Trees 회귀

In [94]:
# Regression pipeline: standardize the features, then fit an extra-trees
# regressor with default hyperparameters.
model = make_pipeline(StandardScaler(), ExtraTreesRegressor())

In [96]:
# 5-fold cross-validation of the current `model` pipeline on the Boston data.
# NOTE(review): `boston` is presumably built from `load_boston`, which newer
# scikit-learn releases no longer provide -- confirm the dataset is available
# in the target environment.
cross_val = cross_validate(model, boston.data, boston.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.12211728096008301 (+/- 0.0010871484502227304)
avg score time: 0.005379724502563477 (+/- 0.0004829065623824051)
avg test score: 0.6147661409404155 (+/- 0.2763362078844026)


In [97]:
# 5-fold cross-validation of the current `model` pipeline on the diabetes data.
cross_val = cross_validate(model, diabetes.data, diabetes.target, cv=5)

# Print mean and standard deviation over the folds for each result array.
for report_line, result_key in (
    ('avg fit time: {} (+/- {})', 'fit_time'),
    ('avg score time: {} (+/- {})', 'score_time'),
    ('avg test score: {} (+/- {})', 'test_score'),
):
    fold_values = cross_val[result_key]
    print(report_line.format(fold_values.mean(), fold_values.std()))

avg fit time: 0.10812067985534668 (+/- 0.0014864132153467311)
avg score time: 0.005780982971191406 (+/- 0.0007407075728355784)
avg test score: 0.441359944207888 (+/- 0.029610119400648845)
