## 集成学习和随机森林

### 投票分类器

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Synthetic two-class "moons" dataset: 500 noisy points, seeded for reproducibility.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)

# Hold out a test set using train_test_split's default proportion; the seed
# keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

In [5]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different base learners for the ensemble. With these default settings
# only the random forest draws random numbers, so it is seeded to make the
# fitted ensemble (and every score derived from it) reproducible after a
# kernel restart.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

# Hard voting: the ensemble predicts the class picked by the majority of the
# base learners.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [7]:
from sklearn.metrics import accuracy_score
# Compare every base learner with the voting ensemble on the held-out test set.
# The loop fits each model itself rather than relying on earlier cells.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # fit() returns the estimator, so training and prediction can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.896


### bagging和pasting

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble: 500 decision trees, each trained on 100 training instances
# sampled with replacement (bootstrap=True). n_jobs=-1 uses all available CPU
# cores. random_state pins the resampling so the ensemble and its predictions
# are reproducible on re-run.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [10]:
# Rebuild the bagging ensemble (rebinding bag_clf) with out-of-bag evaluation:
# oob_score=True scores each training instance using only the trees that did
# not see it during bootstrap sampling. random_state pins the resampling so
# the OOB score is reproducible on re-run.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.904

In [11]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of the bagging ensemble, for comparison with its
# out-of-bag score.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

In [12]:
# Out-of-bag decision function: one row per training instance, one column per
# class, holding the class probabilities estimated from out-of-bag predictions.
bag_clf.oob_decision_function_

array([[0.40322581, 0.59677419],
       [0.29714286, 0.70285714],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.09356725, 0.90643275],
       [0.31395349, 0.68604651],
       [0.01685393, 0.98314607],
       [0.98895028, 0.01104972],
       [0.96      , 0.04      ],
       [0.8343949 , 0.1656051 ],
       [0.01162791, 0.98837209],
       [0.78089888, 0.21910112],
       [0.87292818, 0.12707182],
       [0.97633136, 0.02366864],
       [0.06666667, 0.93333333],
       [0.00621118, 0.99378882],
       [0.98314607, 0.01685393],
       [0.93258427, 0.06741573],
       [0.98918919, 0.01081081],
       [0.01117318, 0.98882682],
       [0.4       , 0.6       ],
       [0.90526316, 0.09473684],
       [1.        , 0.        ],
       [0.98913043, 0.01086957],
       [0.        , 1.        ],
       [0.99450549, 0.00549451],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.60773481, 0.39226519],
       [0.

### 随机补丁和随机子空间

### 随机森林

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the moons data: 500 trees, each capped at 16 leaf nodes,
# trained on all available CPU cores. random_state makes the forest (and
# therefore y_pred_rf) reproducible on re-run.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [14]:
from sklearn.datasets import load_iris
# Feature importance on the iris dataset. This rebinds rnd_clf to a new forest
# trained on all iris samples; random_state keeps the reported importances
# stable between runs.
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(iris['data'], iris['target'])

# Print each feature name next to its importance score from the fitted forest.
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09391351658224556
sepal width (cm) 0.02214586137746098
petal length (cm) 0.4408314834932202
petal width (cm) 0.44310913854707323
