# # 앙상블
-----------------------------------------

## (1) Voting

* 위스콘신 유방암 dataset 을 이용한 실습

In [1]:

import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# dataset 로드
from sklearn.datasets import load_breast_cancer

In [2]:
# 데이터 프레임에 담아주기
cancer = load_breast_cancer()

data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
## 개별 모델은 로지스틱 회귀와 KNN 임
Ir_clf = LogisticRegression()
Knn_clf = KNeighborsClassifier(n_neighbors=8)

In [8]:
# 개별 모델을 소프트 보딩 기반의 앙상블 모델로 구현한 분류기
vo_clf = VotingClassifier(estimators= [('LR', Ir_clf),('KNN', Knn_clf)], voting='soft')

X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size= 0.2, random_state=156)


# voting 분류 학습/예측/평가
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
print("Voting 분류기 정확도 :{0: 4f}".format(accuracy_score(y_test, pred)))


# 개별 모델의 학습/ 예측/ 평가
classifiers = [Ir_clf, Knn_clf]
for classfier in classifiers :
    classfier.fit(X_train, y_train)
    pred = classfier.predict(X_test)
    class_name = classfier.__class__.__name__
    print("{0} 정확도: {1: .4f}".format(class_name, accuracy_score(y_test, pred)))

Voting 분류기 정확도 : 0.947368
LogisticRegression 정확도:  0.9386
KNeighborsClassifier 정확도:  0.9386


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## (2) Bagging : 랜덤 포레스트


      * 랜덤 포레스트 : Bagging(배깅)의 대표적인 알고리즘

sklearn.ensemble.RandomForestClassifier

  * import 방법 : from sklearn.ensemble import RandomForestClassifier

  * 공식홈 : https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=randomforest#sklearn.ensemble.RandomForestClassifier

  * parameter 
> n_estimators=100, \
> criterion='gini', \
> max_depth=None, \
> min_samples_split=2, \
> min_samples_leaf=1, \
> min_weight_fraction_leaf=0.0,\ 
> max_features='sqrt', \
> max_leaf_nodes=None, \
> min_impurity_decrease=0.0,\
> bootstrap=True, \
> oob_score=False, \
> n_jobs=None, \
> random_state=None, \
> verbose=0, \
> warm_start=False, \
> class_weight=None, \
> ccp_alpha=0.0, \
> max_samples=None
-----------------------------------------------------

## (3) Boosting / BGM

        * BGM : 사이킷런에서 제공하는 라이브러리

sklearn.ensemble.GradientBoostingClassifie

  * import 방법 : from sklearn.ensemble import GradientBoostingClassifier

  * 공식홈 : https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html?highlight=gradientboost#sklearn.ensemble.GradientBoostingClassifier

  * parameter 
> loss='log_loss', \
> learning_rate=0.1,\ 
> n_estimators=100, \
> subsample=1.0, \
> criterion='friedman_mse', \
> min_samples_split=2, \
> min_samples_leaf=1, \
> min_weight_fraction_leaf=0.0,\
> max_depth=3, \
> min_impurity_decrease=0.0,\ 
> init=None, \
> random_state=None, \
> max_features=None, \
> verbose=0, \
> max_leaf_nodes=None, \
> warm_start=False, \
> validation_fraction=0.1, \
> n_iter_no_change=None, \
> tol=0.0001, \
> ccp_alpha=0.0
-----------------------------------------------------



In [3]:
# Boosting 라이브러리 로드
from sklearn.ensemble import GradientBoostingClassifier

# 그 외 분석을 위한 라이브러리 로드
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size= 0.2,
                                                    random_state= 110)

In [8]:
# GBM 수행시간 측정을 위해 시작 시간 설정
import time
start_time = time.time()

gb_clf = GradientBoostingClassifier(random_state= 0)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print(f'GBM 정확도 : {gb_accuracy}')
print(f'GBM 수행시간 : {time.time() - start_time}')


GBM 정확도 : 0.9666666666666667
GBM 수행시간 : 0.11908984184265137


In [10]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators' : [100, 500],
    'learning_rate' : [0.05, 0.1]
}
grid_cv = GridSearchCV(gb_clf, param_grid = params, cv= 2, verbose= 1)
grid_cv.fit(X_train, y_train)

print('최적 하이퍼 파라미터 :', grid_cv.best_params_)
print('최고 예측 정확도 :', grid_cv.best_score_)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
최적 하이퍼 파라미터 : {'learning_rate': 0.05, 'n_estimators': 100}
최고 예측 정확도 : 0.95


In [12]:
# Grid Search CV 를 이용하여 최적으로 학습된 estimator 로 predict 수행
gb_pred = grid_cv.best_estimator_.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print("GBM 정확도 :", gb_accuracy)

GBM 정확도 : 0.9666666666666667


## (4) Boosting / XG Boost

In [15]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-win_amd64.whl (125.4 MB)
     -------------------------------------- 125.4/125.4 MB 9.4 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Jin\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


    ** XG Boost 사용 예시 **

> import xgboost as xgb

* read in data
   * dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
   * dtest = xgb.DMatrix('demo/data/agaricus.txt.test')

* specify parameters via map
    * param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
    * num_round = 2
    * bst = xgb.train(param, dtrain, num_round)\

* make prediction
    * preds = bst.predict(dtest)

In [17]:
import xgboost as xgb
from xgboost import XGBClassifier

In [19]:
# Extreme Gradient Boosting (xgboost) 모델 사용

# 1. 모델 선언
xgb  = XGBClassifier()

# 2. 모델 훈련 fit()함수
xgb.fit(X_train, y_train)

# 3. 모델 예측 predict()함수
y_pred = xgb.predict(X_test)

# 4. score()
xgb.score(X_train, y_train)  

## XG Boost 사용 시 계산 시간 : 0.1 초 소요

1.0

In [20]:
# score 계산 2 : 속도를 비교해보자  
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_pred, y_test)
print(acc)

## 계산시간 : 0.4 초 소요됨

0.9666666666666667


- [x] 체크
- [ ] 체크박스

In [1]:
# 테스트 
# 업로딩이 자동으로?