<a href="https://colab.research.google.com/github/JUHYUN030/DALC_AI/blob/main/(%EC%8B%A4%EC%8A%B5)DALC_AIstudy_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 앙상블(Ensemble)

## Bagging meta-estimator
* bagging은 bootstrap aggregating의 줄임말
* 원래 훈련 데이터셋의 일부를 사용해 여러 모델을 훈련
* 각각의 결과를 결합해 최종 결과를 생성
* 분산을 줄이고 과적합을 막음
* 강력하고 복잡한 모델에서 잘 동작

### 필요한 데이터 셋

In [72]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.datasets import load_boston, load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

### 분류 모델

In [73]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [74]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

## Bagging을 사용한 분류

데이터셋 불러오기

In [75]:
# Load the three classification datasets used throughout the classification sections.
iris, wine, cancer = load_iris(), load_wine(), load_breast_cancer()

## KNN
붓꽃 데이터

In [76]:
# Base model: standardize the features, then classify with k-nearest neighbours.
# KNN is distance-based, so the features should share a common scale;
# StandardScaler rescales each feature using its mean and standard deviation.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsClassifier())
# Bagging ensemble built from 10 copies of the base pipeline, each trained on a
# random draw of half the samples and half the features.
# random_state fixes the resampling so the scores are reproducible across runs.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

#### 분류기에 사용되는 매개변수
* n_estimators : 앙상블에 사용할 분류기의 수
* max_samples : 무작위로 뽑을 샘플의 수(0~1사이의 수로 지정하면 비율이 되어, 훈련세트에 곱한 값만큼 샘플링)
* max_features : 각 분류기를 학습할 때 무작위로 뽑을 특성(feature)의 수(0~1사이의 수로 지정하면 전체 특성 수에 대한 비율)


In [77]:
# Baseline: 5-fold cross-validation of the un-bagged pipeline on iris.
# X is the feature matrix and y the class labels; cross_validate does the
# train/test splitting itself and returns per-fold fit times, score times
# and test scores.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.0016570091247558594 (+/- 0.00054084088838644)
avg score time: 0.003146934509277344 (+/- 0.0010678183875395513)
avg test score: 0.96 (+/- 0.024944382578492935)


In [78]:
# 5-fold cross-validation of the bagging ensemble on iris.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.022936630249023437 (+/- 0.0012904160303423143)
avg score time: 0.010262823104858399 (+/- 0.0022585869791821494)
avg test score: 0.96 (+/- 0.03265986323710903)


와인 데이터

In [79]:
# Scaled KNN pipeline used as the baseline and as the bagging base estimator.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsClassifier())
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [80]:
# 5-fold cross-validation of the base pipeline on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.0015060901641845703 (+/- 0.0004887569008789552)
avg score time: 0.0027279376983642576 (+/- 0.0006509969492297201)
avg test score: 0.9493650793650794 (+/- 0.037910929811115976)


In [81]:
# 5-fold cross-validation of the bagging ensemble on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.021157264709472656 (+/- 0.0012908942557112051)
avg score time: 0.009592866897583008 (+/- 0.0008105402034488905)
avg test score: 0.9607936507936508 (+/- 0.028521119729020605)


유방암 데이터

In [82]:
# Scaled KNN pipeline used as the baseline and as the bagging base estimator.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsClassifier())
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [83]:
# 5-fold cross-validation of the base pipeline on the breast-cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.0026851654052734374 (+/- 0.0008741134951872069)
avg score time: 0.007773303985595703 (+/- 0.0007155842122188231)
avg test score: 0.9648501785437045 (+/- 0.009609970350036127)


In [84]:
# 5-fold cross-validation of the bagging ensemble on the breast-cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.02351102828979492 (+/- 0.0014416756386614262)
avg score time: 0.01659402847290039 (+/- 0.0008680196808830881)
avg test score: 0.9560627231796305 (+/- 0.014679067143704956)


## SVC
붓꽃 데이터

In [85]:
# Scaled support-vector classifier used as the baseline and as the bagging base estimator.
base_model = make_pipeline(StandardScaler(),
                           SVC())
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [86]:
# 5-fold cross-validation of the base SVC pipeline on iris.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.003131723403930664 (+/- 0.001383031802086435)
avg score time: 0.0008324146270751953 (+/- 0.0001539552488261428)
avg test score: 0.9666666666666666 (+/- 0.02108185106778919)


In [87]:
# 5-fold cross-validation of the bagged SVC ensemble on iris.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.0344174861907959 (+/- 0.005354803337402826)
avg score time: 0.004334545135498047 (+/- 0.0009279673479509549)
avg test score: 0.9533333333333334 (+/- 0.03399346342395189)


와인 데이터

In [88]:
# Scaled support-vector classifier used as the baseline and as the bagging base estimator.
base_model = make_pipeline(StandardScaler(),
                           SVC())
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [89]:
# 5-fold cross-validation of the base SVC pipeline on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.0023375988006591798 (+/- 0.000597736434732994)
avg score time: 0.0006340980529785156 (+/- 8.27985676533387e-05)
avg test score: 0.9833333333333334 (+/- 0.022222222222222233)


In [90]:
# 5-fold cross-validation of the bagged SVC ensemble on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.03483781814575195 (+/- 0.004502545299674488)
avg score time: 0.0051787376403808595 (+/- 0.0008245974274128132)
avg test score: 0.9438095238095239 (+/- 0.017585410476517578)


유방암 데이터

In [91]:
# Scaled support-vector classifier used as the baseline and as the bagging base estimator.
base_model = make_pipeline(StandardScaler(),
                           SVC())
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [92]:
# 5-fold cross-validation of the base SVC pipeline on the breast-cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.006696319580078125 (+/- 0.002563269324206613)
avg score time: 0.0013887405395507813 (+/- 0.00032461504780635967)
avg test score: 0.9736376339077782 (+/- 0.014678541667933545)


In [93]:
# 5-fold cross-validation of the bagged SVC ensemble on the breast-cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.04433259963989258 (+/- 0.00587829055046386)
avg score time: 0.007832574844360351 (+/- 0.000781353803556742)
avg test score: 0.9613569321533924 (+/- 0.01623457791867057)


## Decision Tree
붓꽃 데이터

In [94]:
# Decision-tree pipeline used as the baseline and as the bagging base estimator.
# The tree is seeded as well so the standalone baseline run is reproducible.
base_model = make_pipeline(StandardScaler(),
                           DecisionTreeClassifier(random_state=42))
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [95]:
# 5-fold cross-validation of the base decision-tree pipeline on iris.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.0018742084503173828 (+/- 0.00046117467901657437)
avg score time: 0.0006771087646484375 (+/- 0.00018839280345484596)
avg test score: 0.9533333333333334 (+/- 0.03399346342395189)


In [96]:
# 5-fold cross-validation of the bagged decision-tree ensemble on iris.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.03133902549743652 (+/- 0.0026385820001336704)
avg score time: 0.003690767288208008 (+/- 0.0011508840181777742)
avg test score: 0.9333333333333332 (+/- 0.059628479399994376)


와인 데이터

In [97]:
# Decision-tree pipeline used as the baseline and as the bagging base estimator.
# The tree is seeded as well so the standalone baseline run is reproducible.
base_model = make_pipeline(StandardScaler(),
                           DecisionTreeClassifier(random_state=42))
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [98]:
# 5-fold cross-validation of the base decision-tree pipeline on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.001921367645263672 (+/- 0.0006149941951666092)
avg score time: 0.0005034446716308594 (+/- 5.2546199984374426e-05)
avg test score: 0.8931746031746032 (+/- 0.0541816509305139)


In [99]:
# 5-fold cross-validation of the bagged decision-tree ensemble on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.03230586051940918 (+/- 0.003994720354501961)
avg score time: 0.0036556243896484373 (+/- 0.0009232293619601969)
avg test score: 0.9555555555555555 (+/- 0.062360956446232345)


유방암 데이터

In [100]:
# Decision-tree pipeline used as the baseline and as the bagging base estimator.
# The tree is seeded as well so the standalone baseline run is reproducible.
base_model = make_pipeline(StandardScaler(),
                           DecisionTreeClassifier(random_state=42))
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingClassifier(base_model, n_estimators=10, max_samples=0.5,
                                  max_features=0.5, random_state=42)

In [101]:
# 5-fold cross-validation of the base decision-tree pipeline on the breast-cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.010716104507446289 (+/- 0.005244412308829611)
avg score time: 0.0007656574249267578 (+/- 8.937353144736826e-05)
avg test score: 0.9138177301661233 (+/- 0.02143404052739964)


In [102]:
# 5-fold cross-validation of the bagged decision-tree ensemble on the breast-cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.04514164924621582 (+/- 0.006055959547004355)
avg score time: 0.00399932861328125 (+/- 0.0013746366481267069)
avg test score: 0.9507840397453812 (+/- 0.01629064906082428)


## Bagging을 사용한 회귀
### 데이터셋 불러오기

In [103]:
# Load the two regression datasets used in the regression sections below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed scikit-learn version still provides it.
boston = load_boston()
diabetes = load_diabetes()

## KNN
### 보스턴 주택 가격 데이터

In [104]:
# Scaled KNN regressor used as the baseline and as the bagging base estimator.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsRegressor())
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=0.5,
                                 max_features=0.5, random_state=42)

In [105]:
# 5-fold cross-validation of the base KNN regressor on the Boston housing data.
cross_val = cross_validate(base_model, boston.data, boston.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.003117656707763672 (+/- 0.002708305837678289)
avg score time: 0.0025036811828613283 (+/- 0.0011887776184844258)
avg test score: 0.47357748833823543 (+/- 0.13243123464477455)


In [106]:
# 5-fold cross-validation of the bagged KNN regressor on the Boston housing data.
cross_val = cross_validate(bagging_model, boston.data, boston.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.024677228927612305 (+/- 0.004138462620674289)
avg score time: 0.010866165161132812 (+/- 0.00015090573962548367)
avg test score: 0.4954898218911324 (+/- 0.10307884551724632)


### 당뇨병 데이터

In [107]:
# Scaled KNN regressor used as the baseline and as the bagging base estimator.
base_model = make_pipeline(StandardScaler(),
                           KNeighborsRegressor())
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=0.5,
                                 max_features=0.5, random_state=42)

In [108]:
# 5-fold cross-validation of the base KNN regressor on the diabetes data.
cross_val = cross_validate(base_model, diabetes.data, diabetes.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.0019495010375976563 (+/- 0.0006151086303500321)
avg score time: 0.002288341522216797 (+/- 0.000610137413871883)
avg test score: 0.3689720650295623 (+/- 0.044659049060165365)


In [109]:
# 5-fold cross-validation of the bagged KNN regressor on the diabetes data.
cross_val = cross_validate(bagging_model, diabetes.data, diabetes.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.023909759521484376 (+/- 0.0018947412663379416)
avg score time: 0.012582778930664062 (+/- 0.0022000847666970285)
avg test score: 0.38947113072715805 (+/- 0.06677061173215378)


## SVR
### 보스턴 주택 가격 데이터

In [110]:
# Scaled support-vector regressor used as the baseline and as the bagging base estimator.
base_model = make_pipeline(StandardScaler(),
                           SVR())
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=0.5,
                                 max_features=0.5, random_state=42)

In [111]:
# 5-fold cross-validation of the base SVR pipeline on the Boston housing data.
cross_val = cross_validate(base_model, boston.data, boston.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.014416265487670898 (+/- 0.0014008772314966372)
avg score time: 0.0024796485900878905 (+/- 5.7916304394028096e-05)
avg test score: 0.17631266230186618 (+/- 0.5224914915128981)


In [112]:
# 5-fold cross-validation of the bagged SVR ensemble on the Boston housing data.
cross_val = cross_validate(bagging_model, boston.data, boston.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.053387117385864255 (+/- 0.0047795152493660805)
avg score time: 0.011536645889282226 (+/- 0.003469526745592342)
avg test score: 0.1981409145699493 (+/- 0.24606349749824044)


### 당뇨병 데이터

In [113]:
# Scaled support-vector regressor used as the baseline and as the bagging base estimator.
base_model = make_pipeline(StandardScaler(),
                           SVR())
# random_state fixes the sample/feature resampling so results are reproducible.
bagging_model = BaggingRegressor(base_model, n_estimators=10, max_samples=0.5,
                                 max_features=0.5, random_state=42)

In [114]:
# 5-fold cross-validation of the base SVR pipeline on the diabetes data.
cross_val = cross_validate(base_model, diabetes.data, diabetes.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.010633754730224609 (+/- 0.002514669187823867)
avg score time: 0.00202794075012207 (+/- 3.934683867444042e-05)
avg test score: 0.14659936199629434 (+/- 0.02190798003342928)


In [115]:
# 5-fold cross-validation of the bagged SVR ensemble on the diabetes data.
cross_val = cross_validate(bagging_model, diabetes.data, diabetes.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.0453528881072998 (+/- 0.005262838331487031)
avg score time: 0.008784627914428711 (+/- 0.0013509674982913242)
avg test score: 0.05884252988871217 (+/- 0.0363609991361921)


## Random Forest
* sklearn.ensemble 모듈에는 무작위 결정 트리를 기반으로 하는 두 개의 평균화 알고리즘이 존재
    + Random Forest
    + Extremely Randomized Trees (Extra-Trees)

In [116]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

## Random Forest 분류

In [117]:
# Random-forest classification pipeline.
# random_state fixes the forest's bootstrap and feature sampling so the
# cross-validation scores are reproducible.
base_model = make_pipeline(StandardScaler(),
                           RandomForestClassifier(random_state=42))

In [118]:
# Cross-validate the random-forest pipeline on iris; report timing and test score.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.1508817195892334 (+/- 0.00223855951874184)
avg score time: 0.012100696563720703 (+/- 0.0017355067557496273)
avg test score: 0.96 (+/- 0.024944382578492935)


In [119]:
# Cross-validate the random-forest pipeline on the wine data; report timing and test score.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.15858287811279298 (+/- 0.006185892730050359)
avg score time: 0.011793041229248047 (+/- 0.00039779888357476467)
avg test score: 0.9833333333333332 (+/- 0.022222222222222233)


In [120]:
# Cross-validate the random-forest pipeline on the breast-cancer data; report timing and test score.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.21268105506896973 (+/- 0.004266657599897011)
avg score time: 0.012877607345581054 (+/- 0.00125532872060932)
avg test score: 0.95960254618848 (+/- 0.022574935272912117)


## Random Forest 회귀

In [121]:
# Random-forest regression pipeline.
# random_state fixes the forest's bootstrap and feature sampling so the
# cross-validation scores are reproducible.
model = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(random_state=42))

In [122]:
# 5-fold cross-validation of the random-forest regressor on the Boston housing data.
cross_val = cross_validate(model, boston.data, boston.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.30938029289245605 (+/- 0.008355160923423768)
avg score time: 0.010416698455810548 (+/- 0.00012028716117641036)
avg test score: 0.6221244060985249 (+/- 0.22105709808959323)


In [123]:
# 5-fold cross-validation of the random-forest regressor on the diabetes data.
cross_val = cross_validate(model, diabetes.data, diabetes.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.26178760528564454 (+/- 0.006152511540626241)
avg score time: 0.012453365325927734 (+/- 0.0013799053206812334)
avg test score: 0.41080155808411767 (+/- 0.048991238403201354)


## AdaBoost
* 대표적인 부스팅 알고리즘
* 일련의 약한 모델들을 학습
* 수정된 버전의 데이터를 반복 학습(가중치가 적용된)
* 가중치 투표(또는 합)을 통해 각 모델의 예측 값을 결합
* 첫 단계에서는 원본 데이터를 학습하고 연속적인 반복마다 개별 샘플에 대한 가중치가 수정되고 다시 모델이 학습
    + 잘못 예측된 샘플은 가중치 증가, 올바르게 예측된 샘플은 가중치 감소
    + 각각의 약한 모델들은 예측하기 어려운 샘플에 집중하게 됨

![AdaBoost](https://scikit-learn.org/stable/_images/sphx_glr_plot_adaboost_hastie_10_2_0011.png)

In [124]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor

## AdaBoost 분류

In [125]:
# AdaBoost classification pipeline.
# random_state seeds the boosted estimators so the scores are reproducible.
model = make_pipeline(
    StandardScaler(),
    AdaBoostClassifier(random_state=42))

In [126]:
# 5-fold cross-validation of the AdaBoost classifier on iris.
cross_val = cross_validate(model, iris.data, iris.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.0820643424987793 (+/- 0.008682975161650043)
avg score time: 0.01017451286315918 (+/- 0.0011979289915524367)
avg test score: 0.9466666666666667 (+/- 0.03399346342395189)


In [127]:
# 5-fold cross-validation of the AdaBoost classifier on the wine data.
cross_val = cross_validate(model, wine.data, wine.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.08783283233642578 (+/- 0.004727937835386209)
avg score time: 0.010665798187255859 (+/- 0.0015450058047751566)
avg test score: 0.8085714285714285 (+/- 0.16822356718459935)


In [128]:
# 5-fold cross-validation of the AdaBoost classifier on the breast-cancer data.
cross_val = cross_validate(model, cancer.data, cancer.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.1553345203399658 (+/- 0.0064268288040401685)
avg score time: 0.011599922180175781 (+/- 0.0016500261087349635)
avg test score: 0.9718677224033534 (+/- 0.0195587047134823)


## AdaBoost 회귀

In [129]:
# AdaBoost regression pipeline.
# random_state seeds the boosting procedure so the scores are reproducible.
model = make_pipeline(
    StandardScaler(),
    AdaBoostRegressor(random_state=42))

In [130]:
# 5-fold cross-validation of the AdaBoost regressor on the Boston housing data.
cross_val = cross_validate(model, boston.data, boston.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.10846438407897949 (+/- 0.0051777690684497845)
avg score time: 0.007131385803222656 (+/- 0.0018161302634834787)
avg test score: 0.6160840945348022 (+/- 0.21135705354987636)


In [131]:
# 5-fold cross-validation of the AdaBoost regressor on the diabetes data.
cross_val = cross_validate(model, diabetes.data, diabetes.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.08837351799011231 (+/- 0.022017803489331876)
avg score time: 0.005043220520019531 (+/- 0.0010938088923437234)
avg test score: 0.4171451395318332 (+/- 0.049227586137615256)


## Gradient Tree Boosting
* 임의의 미분 가능한 손실 함수로 일반화한 부스팅 알고리즘
* 웹 검색, 분류 및 회귀 등 다양한 분야에서 모두 사용 가능

In [132]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

## Gradient Tree Boosting 분류

In [133]:
# Gradient-boosting classification pipeline.
# random_state seeds the trees built at each boosting stage so the scores
# are reproducible.
model = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(random_state=42))

In [134]:
# 5-fold cross-validation of the gradient-boosting classifier on iris.
cross_val = cross_validate(model, iris.data, iris.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.23173623085021972 (+/- 0.004153613457429803)
avg score time: 0.001339292526245117 (+/- 0.0002843091162548875)
avg test score: 0.96 (+/- 0.024944382578492935)


In [135]:
# 5-fold cross-validation of the gradient-boosting classifier on the wine data.
cross_val = cross_validate(model, wine.data, wine.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.26354212760925294 (+/- 0.0054722710515464216)
avg score time: 0.0011547088623046875 (+/- 7.268434292544008e-05)
avg test score: 0.9330158730158731 (+/- 0.03296317528191366)


In [136]:
# 5-fold cross-validation of the gradient-boosting classifier on the breast-cancer data.
cross_val = cross_validate(model, cancer.data, cancer.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.3960637092590332 (+/- 0.00902702283154589)
avg score time: 0.0011098384857177734 (+/- 6.721915763488112e-05)
avg test score: 0.95960254618848 (+/- 0.021167671111828695)


## Gradient Tree Boosting 회귀

In [137]:
# Gradient-boosting regression pipeline.
# random_state seeds the trees built at each boosting stage so the scores
# are reproducible.
model = make_pipeline(
    StandardScaler(),
    GradientBoostingRegressor(random_state=42))

In [138]:
# 5-fold cross-validation of the gradient-boosting regressor on the Boston housing data.
cross_val = cross_validate(model, boston.data, boston.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.12599587440490723 (+/- 0.006877300163439731)
avg score time: 0.001219320297241211 (+/- 3.270539827512599e-05)
avg test score: 0.6807482440984366 (+/- 0.15697678240332968)


In [139]:
# 5-fold cross-validation of the gradient-boosting regressor on the diabetes data.
cross_val = cross_validate(model, diabetes.data, diabetes.target, cv=5)
for label, key in (('fit time', 'fit_time'), ('score time', 'score_time'), ('test score', 'test_score')):
    values = cross_val[key]
    print('avg {}: {} (+/- {})'.format(label, values.mean(), values.std()))

avg fit time: 0.09940576553344727 (+/- 0.004817314011788784)
avg score time: 0.0012047767639160156 (+/- 2.1484887559964634e-05)
avg test score: 0.4064036670240882 (+/- 0.06891934812872876)


## 투표 기반 모델(Voting Classifier)

* 서로 다른 모델들의 결과를 투표를 통해 결합
* 두가지 방법으로 투표 가능
    + 가장 많이 예측된 클래스를 정답으로 채택(hard voting)
    + 예측된 확률의 가중치 평균(soft voting)

In [140]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings(action='ignore') # 에러 메시지 안 뜨게 하기 위함

## Hard Voting

In [141]:
# Three classifiers built on different algorithms.
model1 = SVC()
model2 = GaussianNB()
# Fixed seed so the forest, and therefore the vote, is reproducible across runs.
model3 = RandomForestClassifier(random_state=42)
# Hard voting: the ensemble is configured with voting='hard' over the three estimators.
vote_model = VotingClassifier(
    estimators=[('svc', model1), ('naive', model2), ('forest', model3)],  # (name, estimator) pairs
    voting='hard')


In [142]:
# Score each individual classifier and the voting ensemble with 5-fold CV on iris.
for model in (model1, model2, model3, vote_model):
    # type(...).__name__ yields the bare class name without slicing the class repr.
    model_name = type(model).__name__
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print('Accuracy: %0.2f [%s]' % (scores.mean(), model_name))

Accuracy: 0.97 [SVC]
Accuracy: 0.95 [GaussianNB]
Accuracy: 0.96 [RandomForestClassifier]
Accuracy: 0.97 [VotingClassifier]


## Soft Voting

In [143]:
# Soft voting combines predicted class probabilities, so SVC must be created
# with probability=True. The seeds make SVC's probability estimates and the
# forest reproducible across runs.
model1 = SVC(probability=True, random_state=42)
model2 = GaussianNB()
model3 = RandomForestClassifier(random_state=42)
vote_model = VotingClassifier(
    estimators=[('svc', model1), ('naive', model2), ('forest', model3)],
    voting='soft',
    weights=[2, 1, 2])  # one weight per estimator, in the same order as `estimators`

In [144]:
# Score each individual classifier and the soft-voting ensemble with 5-fold CV on iris.
for model in (model1, model2, model3, vote_model):
    # type(...).__name__ yields the bare class name without slicing the class repr.
    model_name = type(model).__name__
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print('Accuracy: %0.2f [%s]' % (scores.mean(), model_name))

Accuracy: 0.97 [SVC]
Accuracy: 0.95 [GaussianNB]
Accuracy: 0.96 [RandomForestClassifier]
Accuracy: 0.96 [VotingClassifier]
