# 第7章 集成学习和随机森林

In [1]:
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn.ensemble import VotingClassifier

In [3]:
from sklearn.linear_model import LogisticRegression

In [4]:
from sklearn.svm import SVC

In [5]:
# Logistic regression with default hyperparameters; one member of the voting ensembles below.
log_clf = LogisticRegression()

In [6]:
# Random forest with default hyperparameters; one member of the voting ensembles below.
rnd_clf = RandomForestClassifier()

In [7]:
# Support vector classifier with default settings; used as a member of the hard-voting ensemble.
svm_clf = SVC()

In [8]:
# Hard voting: the ensemble predicts the class label chosen by the majority of its members.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard',
)

- 软投票

In [21]:
# probability=True enables predict_proba on the SVC, which soft voting needs from every member.
svm_clf2 = SVC(probability=True)

In [22]:
# Soft voting: the ensemble combines the members' predicted class probabilities.
# NOTE: this rebinds voting_clf, replacing the hard-voting ensemble defined earlier.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf2)],
    voting='soft',
)

- 看看每个分类器在测试集上的准确率

In [9]:
from sklearn.metrics import accuracy_score

In [10]:
from sklearn.datasets import make_moons

In [11]:
# Synthetic two-class "moons" dataset with 10,000 noisy samples.
# A fixed random_state makes the generated points (and every score computed from
# them further down) reproducible after Restart Kernel -> Run All.
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [12]:
# Peek at the first five feature rows.
X[:5]

array([[ 1.24229988, -0.51639796],
       [ 2.54602253,  0.18674523],
       [ 1.08990144,  0.55148478],
       [ 0.13585775,  1.66331124],
       [ 1.85888955,  0.81731345]])

In [13]:
# Class labels of the first five samples.
y[:5]

array([1, 1, 0, 0, 1], dtype=int64)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the samples for testing.
# A fixed random_state keeps the split, and therefore the reported accuracies,
# stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [17]:
# Sanity check: (number of training samples, number of features).
X_train.shape

(8000, 2)

In [18]:
# Sanity check: (number of test samples, number of features).
X_test.shape

(2000, 2)

In [23]:
# Fit every individual model and the voting ensemble on the same training split,
# then print each one's accuracy on the held-out test split.
for clf in [log_clf, rnd_clf, svm_clf, voting_clf]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(type(clf).__name__, test_accuracy)



LogisticRegression 0.8355
RandomForestClassifier 0.852
SVC 0.87




VotingClassifier 0.866


In [24]:
# Per-class probability estimates from the fitted soft-voting ensemble, one row per test sample.
voting_clf.predict_proba(X_test)

array([[0.96945942, 0.03054058],
       [0.04639413, 0.95360587],
       [0.04409987, 0.95590013],
       ...,
       [0.54415572, 0.45584428],
       [0.30492191, 0.69507809],
       [0.95020841, 0.04979159]])

## bagging和pasting

In [25]:
from sklearn.ensemble import BaggingClassifier

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
# Bagging: 500 decision trees, each trained on 100 training instances drawn
# with replacement (bootstrap=True). n_jobs=-1 requests all available CPU cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)

In [28]:
# Train the bagging ensemble on the training split.
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [29]:
# Predict test labels with the bagging ensemble (overwrites y_pred from the earlier loop).
y_pred = bag_clf.predict(X_test)

In [30]:
# Test-set accuracy of the bagging ensemble.
accuracy_score(y_test, y_pred)

0.872

## 包外评估

In [31]:
# Bagging ensemble with out-of-bag evaluation switched on: oob_score=True makes
# the fitted model expose oob_score_ and oob_decision_function_.
# max_samples is not set here, so the library default applies.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)

In [32]:
# Train the out-of-bag-enabled bagging ensemble on the training split.
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [33]:
# Out-of-bag accuracy estimate; available because the ensemble was built with oob_score=True.
bag_clf.oob_score_

0.837

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
# Predict test labels so the out-of-bag estimate can be compared with real test accuracy.
y_pred = bag_clf.predict(X_test)

In [36]:
# Test-set accuracy of the out-of-bag-enabled bagging ensemble.
accuracy_score(y_test, y_pred)

0.8425

In [37]:
# Out-of-bag decision function values: one row per training sample, one column per class.
bag_clf.oob_decision_function_

array([[0.64044944, 0.35955056],
       [0.82989691, 0.17010309],
       [1.        , 0.        ],
       ...,
       [0.0625    , 0.9375    ],
       [0.        , 1.        ],
       [1.        , 0.        ]])

## 随机森林

In [3]:
from sklearn.ensemble import RandomForestClassifier

  return f(*args, **kwds)


In [39]:
# Random forest of 500 trees, each limited to at most 16 leaf nodes.
# NOTE: this rebinds rnd_clf, replacing the default forest created earlier.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)

In [40]:
# Train the random forest on the training split.
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [41]:
# Test-set predictions of the random forest, kept under their own name.
y_pred_rf = rnd_clf.predict(X_test)

In [42]:
# Show the predicted labels.
y_pred_rf

array([0, 1, 1, ..., 0, 1, 0], dtype=int64)

In [43]:
# Bagging ensemble of 500 decision trees that use the "random" splitter strategy
# and are limited to 16 leaf nodes, mirroring the leaf limit of the forest above.
# max_samples=1.0 with bootstrap=True draws a full-size sample with replacement per tree.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
)

In [44]:
# Train the random-splitter bagging ensemble on the training split.
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=16,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='random'),
    

In [45]:
from sklearn.ensemble import ExtraTreesClassifier

In [46]:
# Extra-Trees ensemble with the same size and leaf limit as the random forest above.
# NOTE: this reuses the name clf, which was the loop variable of the earlier evaluation loop.
clf = ExtraTreesClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)

In [47]:
# Train the Extra-Trees ensemble on the training split.
clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=16,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [49]:
# Predict test labels with the Extra-Trees ensemble.
y_pred = clf.predict(X_test)

In [50]:
# Test-set accuracy of the Extra-Trees ensemble.
accuracy_score(y_test, y_pred)

0.8545

In [1]:
from sklearn.datasets import load_iris

In [2]:
# Load the iris dataset; its "data", "target" and "feature_names" entries are used below.
iris = load_iris()

In [4]:
# Fresh 500-tree random forest, used to inspect feature importances on iris.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
)

In [5]:
# Fit on the full iris dataset (no train/test split is made in this cell).
rnd_clf.fit(iris["data"], iris["target"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [6]:
# Pair each feature name with the importance the fitted forest assigned to it and print both.
importance_by_feature = zip(iris["feature_names"], rnd_clf.feature_importances_)
for name, score in importance_by_feature:
    print(name, score)

sepal length (cm) 0.08931253184302644
sepal width (cm) 0.022616216120768962
petal length (cm) 0.419302636372133
petal width (cm) 0.46876861566407146
