# Ensemble Learning and Random Forests

### Voting Classifier

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Seed both the noisy two-moons generator and the 80/20 split so the data is
# identical on every "Restart & Run All". The outputs recorded in this notebook
# came from an unseeded run, so re-executed scores may differ slightly from them.
X, y = make_moons(n_samples=1000, noise=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Base learners for the ensemble, all left at their default hyper-parameters.
log_clf, svm_clf, rnd_clf = LogisticRegression(), SVC(), RandomForestClassifier()

# voting='hard' selects majority-rule voting on the predicted class labels.
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('log', log_clf),
        ('svc', svm_clf),
        ('rf', rnd_clf),
    ],
)

voting_clf.fit(X_train, y_train)

In [12]:
from sklearn.metrics import accuracy_score

# Fit every model on the training split, then print its accuracy on the test split.
for clf in (log_clf, svm_clf, rnd_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.875
SVC 0.925
RandomForestClassifier 0.935
VotingClassifier 0.91


### Bagging and Pasting

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees, each trained on 100 bootstrap samples;
# oob_score=True requests an out-of-bag accuracy estimate alongside the fit.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
)

y_pred = bag_clf.fit(X_train, y_train).predict(X_test)
bag_clf.oob_score_

0.8825

In [14]:
# Test-set accuracy of the bagging predictions computed in the previous cell.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.915

In [15]:
# Display the out-of-bag decision function of the fitted bagging ensemble
# (the recorded output shows one row of two class scores per entry).
bag_clf.oob_decision_function_

array([[0.90762125, 0.09237875],
       [0.09069767, 0.90930233],
       [0.922049  , 0.077951  ],
       ...,
       [0.18224299, 0.81775701],
       [0.99779249, 0.00220751],
       [0.12528474, 0.87471526]])

### Random Forests

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to 16 leaf nodes (n_jobs=-1 for parallel fitting).
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
).fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

The random forest above is roughly equivalent to the following bagging ensemble of decision trees:

In [17]:
# Bagging counterpart of the random forest above: every tree is grown on a
# bootstrap sample the size of the training set (max_samples=1.0) and considers
# only a random sqrt-sized subset of the features at each split, which is what
# RandomForestClassifier does by default. splitter='random' would instead draw
# random split thresholds (Extra-Trees behaviour), so it is not used here.
bag_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_features='sqrt', max_leaf_nodes=16),
    n_estimators=500,
    max_samples=1.0, bootstrap=True, n_jobs=-1)

### AdaBoost

In [18]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 500 decision stumps (max_depth=1) with learning_rate=0.5.
# The `algorithm` argument is intentionally omitted: "SAMME.R" was deprecated in
# scikit-learn 1.4 and is rejected by 1.6+, while releases before 1.6 already
# use it as their default, so older environments behave exactly as before.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=500,
    learning_rate=0.5)

ada_clf.fit(X_train, y_train)

### Gradient Boosting

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

# Tiny gradient-boosted ensemble: three depth-2 trees with learning_rate=1.0,
# fitted on the full moons dataset.
gbrt = GradientBoostingRegressor(
    n_estimators=3,
    max_depth=2,
    learning_rate=1.0,
)
gbrt.fit(X, y)

In [20]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Hold out the last 20% of the training split for early stopping, so the test
# set is never used to choose the number of trees (that would leak test
# information into model selection). X_train comes from train_test_split, which
# shuffles by default, so a tail slice is a random hold-out.
n_fit = int(0.8 * len(X_train))
X_fit, y_fit = X_train[:n_fit], y_train[:n_fit]
X_stop, y_stop = X_train[n_fit:], y_train[n_fit:]

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_fit, y_fit)

# Hold-out MSE after each boosting stage; stage i corresponds to i + 1 trees.
errors = [mean_squared_error(y_stop, stage_pred) for stage_pred in gbrt.staged_predict(X_stop)]
best_n_estimators = int(np.argmin(errors)) + 1

# Refit with the selected number of trees on the whole training split.
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gbrt_best.fit(X_train, y_train)

In [21]:
import xgboost

# Default-configured XGBoost regressor: fit on the training split, predict the test split.
xgb_reg = xgboost.XGBRegressor()
y_pred = xgb_reg.fit(X_train, y_train).predict(X_test)

# Exercise - MNIST Ensemble

In [22]:
from sklearn.datasets import fetch_openml

# Fetch the OpenML dataset "mnist_784" (version 1) as arrays rather than DataFrames.
mnist = fetch_openml(
    name='mnist_784',
    version=1,
    parser='auto',
    as_frame=False,
)

In [23]:
# Pull the features and the labels out of the fetched dataset object.
X, y = mnist['data'], mnist['target']

In [24]:
# Positional split: first 50,000 rows for training, next 10,000 for validation,
# and everything from row 60,000 onwards for testing.
X_train, X_val, X_test = X[:50_000], X[50_000:60_000], X[60_000:]
y_train, y_val, y_test = y[:50_000], y[50_000:60_000], y[60_000:]

In [27]:
from sklearn.ensemble import ExtraTreesClassifier

# Candidate classifiers for the MNIST ensemble, all with default settings.
rf_clf, svm_clf, et_clf = RandomForestClassifier(), SVC(), ExtraTreesClassifier()

# Combine them with majority-rule (hard) voting.
vot_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('rf', rf_clf),
        ('svc', svm_clf),
        ('et', et_clf),
    ],
)

In [28]:
# Fit the hard-voting ensemble on the 50,000-row training split.
vot_clf.fit(X_train, y_train)

In [31]:
# Report validation accuracy for each standalone classifier and for the ensemble.
# vot_clf was already fitted in the previous cell, so it is only evaluated here:
# re-fitting it would retrain all three members on the 50,000-row training split
# for no benefit. The standalone estimators still need their own fit because
# VotingClassifier trains clones and leaves the originals unfitted.
for clf in (rf_clf, svm_clf, et_clf, vot_clf):
    if clf is not vot_clf:
        clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print(clf.__class__.__name__, accuracy_score(y_val, y_pred))

RandomForestClassifier 0.9719
SVC 0.9802
ExtraTreesClassifier 0.9754
VotingClassifier 0.9771


In [32]:
# Drop the random forest from the fitted ensemble. Rebuilding the list from
# named_estimators_ (instead of `del vot_clf.estimators_[0]`) makes this cell
# idempotent: re-running it can never remove a second, unintended estimator.
vot_clf.estimators_ = [
    fitted for name, fitted in vot_clf.named_estimators_.items() if name != 'rf'
]

In [33]:
# Validation accuracy of the ensemble after the preceding cell removed an estimator.
accuracy_score(y_true=y_val, y_pred=vot_clf.predict(X_val))

0.9781