# 앙상블(Ensemble)
여러개의 분류기 생성 후 그 예측 결합 -> 정확환 최종 예측 도출

In [1]:
# 데이터 불러오기
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer() # 객체 생성 

In [45]:
# 스케일 줄이기
# 'breast_cancer' 데이터 차이 너무 커서 스케일 줄일 필요 있음.
from sklearn.preprocessing import MinMaxScaler
cancer_scaler = MinMaxScaler().fit_transform(cancer.data)

In [12]:
# test/train 데이터 
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test=train_test_split(
    cancer_scaler, cancer.target, stratify=cancer.target,test_size=0.2,random_state=2022)

## 1. Voting 방식 
### 1.1 Hard Voting
- 로지스틱 회귀 (Logistic Regression)
- SVC (Support Vector Machine)
- K-최근접 이웃(K Nearest neighbor)


In [7]:
# 분류기 
from sklearn.linear_model import LogisticRegression  # 로지스틱 회귀
from sklearn.svm import SVC                          # Support Vector Machine
from sklearn.neighbors import KNeighborsClassifier   # K-최근접 이웃

In [8]:
lrc = LogisticRegression(random_state=2022)
svc = SVC(random_state=2022)
knn = KNeighborsClassifier()

In [16]:
# Hard Voting 을 위한 앙상블 분류기
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(
    estimators=[('LRC', lrc), ('SVC', svc), ('KNN', knn)] , voting='hard'
)

In [17]:
voc.fit(X_train, y_train)
voc.score(X_test, y_test)

1.0

In [20]:
#개별 분류기의 성능 
lrc.fit(X_train, y_train)
svc.fit(X_train, y_train)
knn.fit(X_train, y_train)
# 점수
print(lrc.score(X_test, y_test),'\n' ,svc.score(X_test, y_test),'\n' ,knn.score(X_test, y_test))

0.9912280701754386 
 1.0 
 0.9824561403508771


### 1.2 Soft Voting

In [21]:
# 객체의 속성과 매서드
dir(lrc)
#-> __ __ : 속성과 연관이 있어!

['C',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_feature_names',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_predict_proba_lr',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'intercept_scaling',
 'l1_ratio',
 'max_iter',
 'multi_class',
 'n_features_in_',
 'n_iter_',
 'n_jobs',
 'penalty',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'random_state',
 'score',
 'set_params',
 'solver',
 'sparsify'

In [22]:
lrc.predict(X_test[:5])

array([0, 1, 0, 1, 0])

In [24]:
lrc.predict_proba(X_test[:5])

array([[0.99792166, 0.00207834],
       [0.07775117, 0.92224883],
       [0.9774613 , 0.0225387 ],
       [0.05952966, 0.94047034],
       [0.99554778, 0.00445222]])

- `SVM`

In [33]:
svc.predict_proba(X_test[:5])
#-> 에러) 원래는 predict_log_proba 지원하지만 기본값 False 라서 오류! 

AttributeError: ignored

In [34]:
svc.get_params()
# 'probability': False, 확인 

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2022,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [31]:
# 
svc2 = SVC(probability=True, random_state=2022 ) # probability=True 정의해주면 predict_proba() 사용 가능! 
svc2.fit(X_train,y_train)
svc2.predict_proba(X_test[:5])

array([[9.99896299e-01, 1.03701492e-04],
       [3.84470713e-03, 9.96155293e-01],
       [9.99896384e-01, 1.03616009e-04],
       [5.97356113e-03, 9.94026439e-01],
       [9.99311796e-01, 6.88204061e-04]])

- `KNN`

In [32]:
knn.predict_proba(X_test[:5]) # KNN은 그냥 사용 가능 

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

- `Soft Voting`

In [44]:
from sklearn.ensemble import VotingClassifier
voc2 = VotingClassifier( 
    estimators=[('LRC', lrc), ('SVC', svc2), ('KNN', knn)] , voting='soft' ) # svc2 = SVC(probability=True, random_state=2022 )

# 학습, 정확도
print(voc2.fit(X_train, y_train).score(X_test, y_test),'\n')
#voc2.score(X_test, y_test)
# 예측 
print(voc2.predict_proba(X_test[:5]))

1.0 

[[9.99272654e-01 7.27346212e-04]
 [2.71986265e-02 9.72801374e-01]
 [9.92452563e-01 7.54743719e-03]
 [2.18344062e-02 9.78165594e-01]
 [9.98286525e-01 1.71347474e-03]]


- `GridSearchCV`

In [46]:
lrc.C, svc2.C

(1.0, 1.0)

In [48]:
params={
    'LRC__C': [0.1, 1,10],
    'SVC__C': [0.1, 1,10]
}
# 파이프라인도 같은 방법 

In [52]:

from sklearn.model_selection import GridSearchCV
grid_voc2 = GridSearchCV(voc2, params, scoring='accuracy', cv=5)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 10, 'SVC__C': 0.1}

In [53]:
# 범위 좁히기 
params={
    'LRC__C': [5, 10, 30],
    'SVC__C': [0.05, 0.1, 0.3]
}
grid_voc2 = GridSearchCV(voc2, params, scoring='accuracy', cv=5)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 10, 'SVC__C': 0.05}

In [54]:
grid_voc2.best_estimator_.score(X_test, y_test)

1.0

In [55]:
# SV 사용시 predict_proba 존재하는지 확인 후 사용
# GridSearchCV 사용시 params는 'LRC__C', 'SVC__C'

## 2. Bagging 방식 - Random Forest

In [60]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2022)
rfc.get_params()
# 'n_estimators': 100 -> 기본 100, 결정 트리의 개수 지정 
# 'max_features': 'auto' -> 기본 'auto'

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [62]:
rfc.fit(X_train, y_train).score(X_test, y_test)

1.0