### Bagging
#### 위스콘신 유방암 데이터

In [5]:
from sklearn.ensemble import BaggingClassifier
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
cancer=load_breast_cancer()
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [3]:
x_train,x_test,y_train,y_test=train_test_split(cancer.data,cancer.target,train_size=0.8,test_size=0.2,random_state=156)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(455, 30) (114, 30) (455,) (114,)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [6]:
lr_cif=LogisticRegression(max_iter=10000)
lr_cif.fit(x_train,y_train)

pred_lr=lr_cif.predict(x_test)

In [7]:
print(accuracy_score(y_test,pred_lr))
print(mean_squared_error(y_test,pred_lr))

0.9649122807017544
0.03508771929824561


In [8]:
bag_cif=BaggingClassifier(base_estimator=lr_cif,n_estimators=5,verbose=1)

In [9]:
lr_cif_bag=bag_cif.fit(x_train,y_train)
pred_lr_bag=lr_cif_bag.predict(x_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [10]:
pred_lr_bag

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [11]:
print(accuracy_score(y_test,pred_lr_bag))
print(mean_squared_error(y_test,pred_lr_bag))

0.956140350877193
0.043859649122807015


In [12]:
from sklearn.tree import DecisionTreeClassifier

dt_cif=DecisionTreeClassifier()
dt_cif.fit(x_train,y_train)
pred_dt=dt_cif.predict(x_test)

print(accuracy_score(y_test,pred_dt))
print(mean_squared_error(y_test,pred_dt))

0.9473684210526315
0.05263157894736842


In [15]:
bag_dt_cif=BaggingClassifier(base_estimator=dt_cif,n_estimators=5,verbose=1)

In [17]:
bag_dt_cif.fit(x_train,y_train)
pred_dt_bag=bag_dt_cif.predict(x_test)

print(accuracy_score(y_test,pred_dt_bag))
print(mean_squared_error(y_test,pred_dt_bag))

0.9649122807017544
0.03508771929824561


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


### RandomForest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
rt_cif=RandomForestClassifier(n_estimators=6, max_depth=3, random_state=102,verbose=1)
rt_cif.fit(x_train,y_train)
pred=rt_cif.predict(x_test)
print(accuracy_score(y_test,pred))

0.9298245614035088


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished


In [19]:
rt_cif2=RandomForestClassifier(n_estimators=500, max_depth=3, random_state=103,verbose=1)
rt_cif2.fit(x_train,y_train)
pred2=rt_cif2.predict(x_test)
print(accuracy_score(y_test,pred2))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.9385964912280702


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished


In [20]:
rt_cif3=RandomForestClassifier(n_estimators=500, max_depth=10, random_state=103,verbose=1)
rt_cif3.fit(x_train,y_train)
pred3=rt_cif3.predict(x_test)
print(accuracy_score(y_test,pred3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.9473684210526315


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.0s finished


In [21]:
rf_cif4=RandomForestClassifier()

In [25]:
params={'n_estimators':[10,100,500,100],
       'max_depth':[3,5,10,15]}
rt_cif4=RandomForestClassifier(random_state=103,n_jobs=-1,verbose=1)
grid_cv=GridSearchCV(rf_cif4,param_grid=params,n_jobs=-1,verbose=1)
grid_cv.fit(x_train,y_train)

print('최적 하이퍼 파라미터: ',grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   11.8s finished


최적 하이퍼 파라미터:  {'max_depth': 15, 'n_estimators': 100}
최고 예측 정확도: 0.9670


In [26]:
rt_cif5=RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=103,verbose=1)
rt_cif5.fit(x_train,y_train)
pred5=rt_cif5.predict(x_test)
print(accuracy_score(y_test,pred5))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    3.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.9473684210526315


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:    0.1s finished
