In [1]:
import numpy as np
import matplotlib.pyplot as plt 
from sklearn import datasets

# Generate a noisy two-class "moons" dataset; the fixed random_state keeps the
# data identical on every run.
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

# Draw each class as its own labelled series and annotate the axes so the
# figure can be understood without reading the code.
fig, ax = plt.subplots()
ax.scatter(X[y == 0, 0], X[y == 0, 1], label="class 0")
ax.scatter(X[y == 1, 0], X[y == 1, 1], label="class 1")
ax.set_title("make_moons (n_samples=500, noise=0.3)")
ax.set_xlabel("feature 0")
ax.set_ylabel("feature 1")
ax.legend()
plt.show()

<Figure size 640x480 with 1 Axes>

#### 使用 OOB（Out-of-Bag，袋外样本）评估

In [2]:
# Decision trees expose many hyperparameters (pruning strategy, leaf-node
# limits, ...), which makes them a good fit as the sub-model of an ensemble.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# First (positional) argument: the learning algorithm used by every member.
# n_estimators: how many decision-tree models are combined.
# max_samples:  number of training samples drawn for each sub-model.
# bootstrap:    whether samples are drawn with replacement.
# oob_score:    evaluate on the samples that were not drawn (out-of-bag).
# random_state: fixes the resampling so the OOB score is reproducible on re-run.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, bootstrap=True, oob_score=True,
                            random_state=42)
bag_clf.fit(X, y)
bag_clf.oob_score_

0.918

#### 使用n_jobs进行并行化处理

In [3]:
%%time
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                            max_samples=100, bootstrap=True, oob_score=True)
bag_clf.fit(X, y)

Wall time: 24.8 s


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [4]:
%%time
bag_clf2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000,
                            max_samples=100, bootstrap=True, 
                            oob_score=True, n_jobs=2)
bag_clf2.fit(X, y) #尴尬，效果不明显

Wall time: 27.7 s


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

#### bootstrap_features 特征可放回随机取样

In [6]:
# bootstrap_features: draw the features with replacement.
# max_features:       number of features drawn for each sub-model.
# Only feature randomness is meant to be shown here, so every sub-model draws
# as many samples as the dataset contains (max_samples=len(X)). bootstrap=True
# is kept because oob_score=True requires it, so rows are still resampled with
# replacement.
# random_state: fixes the sample/feature draws so the OOB score is reproducible.
random_subspaces_clf2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=len(X), bootstrap=True,
                            oob_score=True, n_jobs=2, max_features=1,
                            bootstrap_features=True, random_state=42)
random_subspaces_clf2.fit(X, y)
random_subspaces_clf2.oob_score_

0.828

In [7]:
# Randomize both the samples and the features: every sub-model is trained on
# 100 bootstrapped samples and on 1 feature drawn with replacement.
# random_state: fixes the sample/feature draws so the OOB score is reproducible.
random_patches_clf3 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, bootstrap=True,
                            oob_score=True, n_jobs=2, max_features=1,
                            bootstrap_features=True, random_state=42)
random_patches_clf3.fit(X, y)
random_patches_clf3.oob_score_

0.862