## 一、对分类器进行投票

In [1]:
# 获得数据
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
        voting='hard'
    ) # 为hard voting
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomF...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [3]:
# 观察每个分类器在测试集上的准确率
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.88
SVC 0.888
VotingClassifier 0.888


  if diff:


In [4]:
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42) # 注意这里要设置probability=True

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft') # 为soft voting
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFor...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [5]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.912


  if diff:


## 二、Bagging和Pasting  
用相同的算法，在训练集不同的随机子集上进行训练，得到每个预测器  
Bagging为有放回的，Pasting为无放回的

**2.1 在Scikit-Learn中进行Bagging和Pasting**

In [6]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
        DecisionTreeClassifier(), n_estimators=500,
        max_samples=100, bootstrap=True, n_jobs=-1
    )
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [7]:
accuracy_score(y_test, y_pred)

0.912

**2.2 袋外（OOB）估计**

In [8]:
bag_clf = BaggingClassifier(
        DecisionTreeClassifier(), n_estimators=500,
        bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.8986666666666666

In [9]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

In [10]:
bag_clf.oob_decision_function_

array([[0.44632768, 0.55367232],
       [0.31      , 0.69      ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.1025641 , 0.8974359 ],
       [0.36956522, 0.63043478],
       [0.01886792, 0.98113208],
       [1.        , 0.        ],
       [0.96610169, 0.03389831],
       [0.76237624, 0.23762376],
       [0.01058201, 0.98941799],
       [0.73513514, 0.26486486],
       [0.87349398, 0.12650602],
       [0.96446701, 0.03553299],
       [0.06896552, 0.93103448],
       [0.        , 1.        ],
       [0.98412698, 0.01587302],
       [0.93956044, 0.06043956],
       [0.98888889, 0.01111111],
       [0.00571429, 0.99428571],
       [0.41847826, 0.58152174],
       [0.9047619 , 0.0952381 ],
       [1.        , 0.        ],
       [0.96315789, 0.03684211],
       [0.        , 1.        ],
       [0.99470899, 0.00529101],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.65921788, 0.34078212],
       [0.

## 三、随机贴片和随机子空间  
随机贴片：同时对训练样本数和特征进行取样  
随机子空间：保留所有的训练样本，对特征进行取样

## 四、随机森林

In [11]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [12]:
accuracy_score(y_test, y_pred_rf)

0.912

In [13]:
# 下面这个BaggingClassifier粗略等同于上面的RandomForestClassifier
bag_clf = BaggingClassifier(
        DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
        n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1
    )

**4.1 Extra-Trees**  
对每个特征用随机的阈值，而不是去找到最好可能的阈值  
在Scikit-Learn中，用ExtraTreesClassifier类  
对于用哪个，无法判断，通过用交叉验证进行比较（并且用网格搜索进行调参）

**4.2 特征重要性**  
越重要的特征可能出现在树的根部，越不重要的特征通常出现在树的叶节点

In [14]:
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09877444398444107
sepal width (cm) 0.024364337950547058
petal length (cm) 0.42293676241811373
petal width (cm) 0.4539244556468977


## 五、Boosting  
Boosting指的是任何能够组合几个弱学习器成强学习器的集成方法

**5.1 AdaBoost**  
第一个基分类器（如决策树）被用于训练并在测试集上做预测，然后误分类的训练样本权重增大，更新权重，再训练第二个分类器并做预测，更新权重，以此类推，此过程中每次迭代学习率不断减小一半

第j个预测器的加权错误率：  
$r_{j}=\frac{\sum_{i=1,\hat{y}_{j}^{(i)}\neq y^{(i)}}^{m}w^{(i)}}{\sum_{i=1}^{m}w^{(i)}}$  
其中，$\hat{y}_{j}^{(i)}$是第j个预测器对第i个样本的预测值

预测器权重：  
$\alpha _{j}=\eta log\frac{1-r_{j}}{r_{j}}$  
其中，$\alpha$为预测器权重，$\eta$为学习速率（通常默认为1）

权重更新公式：  
for i = 1, 2, … , m  
$w^{(i)}\leftarrow \left\{\begin{matrix}
w^{(i)} & if\;\hat{y}_{j}^{(i)}=y^{(i)}\\ 
 w^{(i)}exp(\alpha _{j})& if\;\hat{y}_{j}^{i}\neq y^{(i)}
\end{matrix}\right.$

AdaBoost预测：  
$\hat{y}(\textbf{x})=\mathop{\arg\max}_{k}\sum_{j=1,\hat{y}_{j}(\textbf{x})=k}^{N}\alpha _{j}$  
其中，N为预测器的数目

In [15]:
# 基于200个决策树桩训练一个AdaBoost分类器
# 一个决策树桩为一个最大深度为1的决策树，即由一个单独的决策节点和加上两个叶节点组成
# 决策树桩为AdaBoostClassifier类的默认基础估计器
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1), n_estimators=200,
        algorithm="SAMME.R", learning_rate=0.5
    )
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None)

**5.2 Gradient Boosting**

In [16]:
# 首先，在训练集之上拟合一个决策树回归
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [17]:
# 现在，在第一个预测器产生的残差基础上训练第二个决策树回归
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [19]:
# 然后，在第二个预测器产生的残差基础上训练第三个决策树回归
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [27]:
import numpy as np

X_new = np.array([[0.8, 0.6]])

In [28]:
# 最后，我们可以集成以上三个决策树
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [29]:
y_pred

array([0.15060283])

In [30]:
# Scikit-Learn实现
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [32]:
# 用认为指定的树的数目去训练GBRT，通过测量训练的每一步的验证误差，找到树的最优数目
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=75, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [35]:
# 通过设置warm_start=True，这样当调用fit()时保持当前的树数目，允许增量式训练
# 下面代码展示：当验证错误连续五次迭代都没有改善，则停止训练
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error =float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_errror = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break

Stochastic Gradient Boosting：对于每棵树，随机地用训练集的一部分

## 六、Stacking（模型融合）  
① 将数据集分成两个子集，第一个子集用于训练在第一层的预测器  
② 第一层的预测器用于在第二个子集上做预测  
③ 将每个样本得到的三个预测值（若有三个预测器）作为输入特征，目标值保持不变，产生新的训练集  
④ 在这新的训练集上训练出blender，因此它是学习去在给定第一层预测值的基础上去预测目标值