# 앙상블(Ensemble)

In [2]:
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset bundled with scikit-learn; later cells read
# its `.data` (features) and `.target` (labels) attributes.
cancer = load_breast_cancer()

In [3]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature with MinMaxScaler.
# NOTE(review): the scaler is fit on all of cancer.data, i.e. before the
# train/test split, so rows that later land in the test set also influence the
# scaling (data leakage). Consider fitting the scaler on the training split only.
cancer_scaled = MinMaxScaler().fit_transform(cancer.data)

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; stratify on the labels so both
# splits keep the same class ratio, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_scaled,
    cancer.target,
    test_size=0.2,
    stratify=cancer.target,
    random_state=2022,
)


### 1. voting(투표) 방식
#### 1.1 Hard voting
 - Logistic Regression
 - Support Vector Machine
 - K 최근접 이웃 

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [6]:
# Base estimators for the voting ensembles. The seed is fixed on the two
# estimators that accept one; KNeighborsClassifier is created without arguments.
lrc = LogisticRegression(random_state=2022)
svc = SVC(random_state=2022)
knn = KNeighborsClassifier()

In [7]:
# Ensemble classifier used for hard voting
from sklearn.ensemble import VotingClassifier

# Combine the individual classifiers into a single estimator; each entry is a
# (name, estimator) pair and voting='hard' selects majority-vote on the labels.
voc = VotingClassifier(
    estimators=[
        ('LRC', lrc),
        ('SVC', svc),
        ('KNN', knn),
    ],
    voting='hard',
)

In [8]:
# Train the hard-voting ensemble, then report its accuracy on the held-out set.
voc.fit(X_train, y_train) # train
voc.score(X_test, y_test) # evaluate

1.0

In [9]:
# Check the performance of each individual classifier.
# VotingClassifier fits clones of the estimators it is given, so lrc, svc and
# knn themselves must be fit here before they can be scored on their own.
lrc.fit(X_train, y_train)
svc.fit(X_train, y_train)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [11]:
# Test-set accuracy of each base classifier, displayed as a (LRC, SVC, KNN) tuple.
lrc.score(X_test, y_test), svc.score(X_test, y_test), knn.score(X_test, y_test)

(0.9912280701754386, 1.0, 0.9824561403508771)

#### 1.2 Soft Voting

 - LRC (Logistic Regression Classifier)

In [13]:
# List the attributes and methods of the fitted object (look for predict_proba,
# which soft voting relies on).
dir(lrc)

['C',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_feature_names',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_predict_proba_lr',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'intercept_scaling',
 'l1_ratio',
 'max_iter',
 'multi_class',
 'n_features_in_',
 'n_iter_',
 'n_jobs',
 'penalty',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'random_state',
 'score',
 'set_params',
 'solver',
 'sparsify',
 'tol',
 'verbose',
 'warm_start']

In [14]:
# Predicted class labels for the first five test samples.
lrc.predict(X_test[:5])

array([0, 1, 0, 1, 0])

In [15]:
# Per-class probabilities for the first five test samples (one column per class).
lrc.predict_proba(X_test[:5])

array([[0.99792166, 0.00207834],
       [0.07775117, 0.92224883],
       [0.9774613 , 0.0225387 ],
       [0.05952966, 0.94047034],
       [0.99554778, 0.00445222]])

 - SVC

In [18]:
# Inspect the SVC hyperparameters; note that 'probability' is False here.
svc.get_params() 

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2022,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [19]:
# Soft voting needs class probabilities, so an SVM used for soft voting must be
# created with probability=True; a separate SVC instance is built for that.
svc2 = SVC(probability=True, random_state=2022)
svc2.fit(X_train, y_train)
# Per-class probabilities for the first five test samples.
svc2.predict_proba(X_test[:5])

array([[9.99896299e-01, 1.03701492e-04],
       [3.84470713e-03, 9.96155293e-01],
       [9.99896384e-01, 1.03616009e-04],
       [5.97356113e-03, 9.94026439e-01],
       [9.99311796e-01, 6.88204061e-04]])

 - KNN (K 최근접 이웃)

In [21]:
# Per-class probabilities for the last five test samples.
knn.predict_proba(X_test[-5:])

array([[0.8, 0.2],
       [1. , 0. ],
       [0.8, 0.2],
       [0. , 1. ],
       [0. , 1. ]])

 - Soft Voting

In [23]:
# Soft-voting ensemble: same members as the hard-voting one, except that the
# SVC is replaced by svc2, which was created with probability=True.
voc2 = VotingClassifier(
    estimators=[
        ('LRC', lrc),
        ('SVC', svc2),
        ('KNN', knn),
    ],
    voting='soft',
)


In [24]:
# Train the soft-voting ensemble, then report its accuracy on the held-out set.
voc2.fit(X_train, y_train)
voc2.score(X_test, y_test)

1.0

In [25]:
# Ensemble class probabilities for the first five test samples.
voc2.predict_proba(X_test[:5])

array([[9.99272654e-01, 7.27346212e-04],
       [2.71986265e-02, 9.72801374e-01],
       [9.92452563e-01, 7.54743719e-03],
       [2.18344062e-02, 9.78165594e-01],
       [9.98286525e-01, 1.71347474e-03]])

 - GridSearchCV

In [28]:
# Current value of the C hyperparameter on the logistic regression and the SVC,
# as a starting point for the grid search.
lrc.C, svc2.C

(1.0, 1.0)

In [29]:
# Search grid for the ensemble: '<estimator name>__<parameter>' addresses a
# parameter of the member registered under that name in the VotingClassifier.
params = {
    'LRC__C': [0.1, 1, 10],
    'SVC__C': [0.1, 1, 10],
}

In [30]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated grid search over the soft-voting ensemble, scored by accuracy.
grid_voc2 = GridSearchCV(
    estimator=voc2,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_voc2.fit(X_train, y_train)
# Best parameter combination found by the search.
grid_voc2.best_params_

{'LRC__C': 10, 'SVC__C': 0.1}

In [32]:
# Second, finer search with new candidate values for both C parameters.
params = {
    'LRC__C': [5, 8, 10],
    'SVC__C': [0.05, 0.07, 0.1],
}
grid_voc2 = GridSearchCV(
    estimator=voc2,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_voc2.fit(X_train, y_train)
# Best parameter combination found by the refined search.
grid_voc2.best_params_

{'LRC__C': 8, 'SVC__C': 0.07}

In [34]:
# Accuracy of the best ensemble found by the grid search on the held-out test set.
grid_voc2.best_estimator_.score(X_test,y_test)

1.0

### 2. Bagging 방식 => RandomForest

In [35]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with only the seed set explicitly; every other hyperparameter
# keeps its default value, which get_params() displays.
rfc = RandomForestClassifier(random_state=2022)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [36]:
# Train the random forest, then report its accuracy on the held-out test set.
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

1.0