In [2]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Toy binary problem: two noisy interleaving half-moons, with fixed seeds so the
# dataset and the train/test split are reproducible.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Ensemble of three different model families; no `voting` argument is given here,
# so the classifier's default voting mode applies.
voting_clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(random_state=42)),
        ("rf", RandomForestClassifier(random_state=42)),
        ("svc", SVC(random_state=42)),
    ],
)
voting_clf.fit(X_train, y_train)

In [4]:
# Accuracy of each fitted base estimator on the held-out test set.
for est_name, fitted_clf in voting_clf.named_estimators_.items():
    test_accuracy = fitted_clf.score(X_test, y_test)
    print(f"{est_name} = {test_accuracy}")

lr = 0.864
rf = 0.896
svc = 0.896


In [5]:
# Test-set accuracy of the combined voting ensemble, for comparison with the
# per-estimator scores printed in the previous cell.
voting_clf.score(X_test, y_test)

0.912

Using soft voting, we weight more confident votes higher:

In [6]:
# Switch the existing ensemble to soft voting and turn on probability
# estimates for the SVC in the same call.
voting_clf.set_params(voting="soft", svc__probability=True)

# Refit so the changed settings take effect, then report test accuracy.
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)


0.92

# Bagging and pasting

We can also train the same classifier on many training subsets. If we choose the subsets with replacement, it is called **bagging**, and without, it is called **pasting**.

This should end up with a classifier with similar bias, but lower variance.

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# 500 decision trees, each fitted on 100 sampled training instances.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    random_state=42,
    n_jobs=-1,
    # bootstrap=False  # uncomment to use pasting instead of bagging
)
bag_clf.fit(X_train, y_train)

For high-dimension models, it can also be advantageous to train on a subset of the features, using `max_features` and `bootstrap_features`.

# Random forests

A random forest is an ensemble of decision trees. To increase tree diversity, each split considers only a random subset of the features (by default $\sqrt{n}$ of the $n$ features).

In [8]:
from sklearn.ensemble import RandomForestClassifier

# 500 trees, each capped at 16 leaf nodes; `fit` returns the estimator itself,
# so it can be chained directly onto the constructor.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predicted class for every test instance (shown as the cell output).
rnd_clf.predict(X_test)

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0], dtype=int64)

## Extremely random forests

These are made by selecting the thresholds randomly, rather than picking the best one, reducing computational costs

## Feature importances

We can estimate feature importances by considering, on average, how much nodes that use a given feature reduce impurity. This can aid with feature selection

In [9]:
from sklearn.datasets import load_iris

# Fit a forest on the full iris data purely to inspect feature importances.
# Note: this rebinds `rnd_clf`, replacing the forest trained in the earlier cell.
iris = load_iris(as_frame=True)
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)

rnd_clf.fit(iris.data, iris.target)

for score, name in zip(rnd_clf.feature_importances_, iris.data.columns):
    # Scale to percent first and round last: `round(score, 2) * 100` multiplies
    # after rounding and can reintroduce float noise
    # (e.g. 0.07 * 100 == 7.000000000000001).
    print(f"{round(score * 100):.1f}%, {name}")

11.0%, sepal length (cm)
2.0%, sepal width (cm)
44.0%, petal length (cm)
42.0%, petal width (cm)


# Boosting

Boosting algorithms aim to combine several weak learners into a strong learner.

## Adaptive Boosting (AdaBoost)

AdaBoost trains a predictor, increases the weights of the training instances that predictor got wrong (the ones it underfit), trains the next predictor on the reweighted data, and repeats this many times.

Each predictor gets a higher weight if it is more accurate, and the final prediction is a weighted combination of the results of all the predictors.

In [10]:
from sklearn.ensemble import AdaBoostClassifier

# 30 boosted depth-1 trees (decision stumps) as the weak learners.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    learning_rate=0.5,
    n_estimators=30,
    random_state=42,
)

ada_clf.fit(X_train, y_train)

## Gradient boosting

This time, rather than reweighting the training instances at every iteration, we fit each new estimator to the residual errors made by the previous ones.

In [11]:
# Generate noisy quadratic dataset and train a DecisionTreeRegressor
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Seed the global NumPy generator so the synthetic data is reproducible.
# Note: `X` and `y` are rebound here and no longer hold the moons dataset.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5  # 100 x-values, uniform in [-0.5, 0.5)
y = 3 * np.square(X[:, 0]) + 0.05 * np.random.randn(100)  # y = 3x² + Gaussian noise

# First weak learner: a shallow tree fitted directly to the targets.
tree_reg1 = DecisionTreeRegressor(random_state=42, max_depth=2)
tree_reg1.fit(X, y)

In [12]:
# Second stage: fit a new shallow tree to what the first tree left unexplained.

y2 = y - tree_reg1.predict(X)  # residuals of the first tree
tree_reg2 = DecisionTreeRegressor(random_state=42, max_depth=2)
tree_reg2.fit(X, y2)

In [13]:
# Third stage: repeat the procedure on the residuals of the second tree.

y3 = y2 - tree_reg2.predict(X)  # residuals still unexplained after two trees
tree_reg3 = DecisionTreeRegressor(random_state=44, max_depth=2)
tree_reg3.fit(X, y3)

In [14]:
# Three probe points; the ensemble prediction is the sum of the three trees'
# outputs, because each later tree was trained on the previous residuals.
X_new = np.array([[-0.4], [0.], [0.5]])
y_pred = (
    tree_reg1.predict(X_new)
    + tree_reg2.predict(X_new)
    + tree_reg3.predict(X_new)
)
# Noise-free target 3x² for comparison.
y_actual = 3 * np.square(X_new[:, 0])

print(y_pred)
print(y_actual)

[0.49484029 0.04021166 0.75026781]
[0.48 0.   0.75]


In [15]:
from sklearn.ensemble import GradientBoostingRegressor

# Same configuration as the three hand-built trees above: depth-2 trees,
# three boosting stages, learning rate 1.0.
gbrt = GradientBoostingRegressor(
    n_estimators=3,
    max_depth=2,
    learning_rate=1.0,
    random_state=42,
)

gbrt.fit(X, y)


We can get it to stop training when the model stops making progress. To judge progress, it splits the data into a training and validation set.

In [17]:
# Early stopping: allow up to 500 trees, with `n_iter_no_change=10` and a 20%
# validation fraction controlling when training halts.
gbrt_best = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=2,
    validation_fraction=0.2,
    n_iter_no_change=10,
    random_state=42,
)
gbrt_best.fit(X, y)

# Number of trees actually fitted.
gbrt_best.n_estimators_

84

By introducing a `subsample` parameter, each tree will only be trained on a random subset of the data, which is called **Stochastic Gradient Boosting**. This speeds up training, increases bias, but reduces variance.

## Histogram-based Gradient Boosting

This time we bin the data, which can massively increase performance for larger datasets (see textbook for intricacies)

# Stacking

Here, we use another machine learning model to combine the predictions from the individual predictors in the ensemble. This extra model is called a blender.

The blender contains one input feature per predictor.

We can also have multiple layers of blenders to get better predictions.

In [18]:
from sklearn.ensemble import StackingClassifier

# Same three base models as the voting ensemble; a random forest serves as the
# blender (`final_estimator`), with 5 cross-validation folds.
stacking_clf = StackingClassifier(
    estimators=[
        ("lr", LogisticRegression(random_state=42)),
        ("rf", RandomForestClassifier(random_state=42)),
        ("svc", SVC(probability=True, random_state=42)),
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5,  # number of cross-validation folds
)
stacking_clf.fit(X_train, y_train)

# 8
Load the MNIST dataset (introduced in Chapter 3), and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing). Then train various classifiers, such as a random forest classifier, an extra-trees classifier, and an SVM classifier. Next, try to combine them into an ensemble that outperforms each individual classifier on the validation set, using soft or hard voting. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

In [21]:
from sklearn.datasets import fetch_openml

# Fetch MNIST from OpenML; as_frame=False asks for array-style data rather than DataFrames.
mnist = fetch_openml(name="mnist_784", as_frame=False)

  warn(


In [25]:
# Feature matrix and labels of the MNIST bunch. `np.ndarray` is the array type;
# `np.array` is only the factory function and is not a valid type annotation.
X_mnist: np.ndarray = mnist.data
y_mnist: np.ndarray = mnist.target

In [26]:
# Split in file order: first 50,000 rows for training, next 10,000 for
# validation, the remainder for testing.
# Note: this rebinds X_train / y_train / X_test / y_test from the earlier cells.
X_train, X_valid, X_test = np.split(X_mnist, [50_000, 60_000])
y_train, y_valid, y_test = np.split(y_mnist, [50_000, 60_000])

In [35]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC

In [39]:
# Individual MNIST classifiers. LinearSVC runs with a small iteration cap and a
# very loose tolerance — presumably to keep training time short (TODO confirm).
random_forest_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=42)
svm_clf = LinearSVC(tol=20, max_iter=100, random_state=42)

# (name, estimator) pairs, reused by the voting and stacking cells below.
base_estimators = [
    ("rnd", random_forest_clf),
    ("ext", extra_trees_clf),
    ("svm", svm_clf),
]

# Fit each model on its own so it can be scored individually.
for _name, estimator in base_estimators:
    estimator.fit(X_train, y_train)



In [40]:
# check the scores
from sklearn.metrics import accuracy_score

# Validation accuracy of each individually fitted classifier.
for name, estimator in base_estimators:
    y_valid_pred = estimator.predict(X_valid)
    valid_accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"{name}: {valid_accuracy}")


rnd: 0.9736
ext: 0.9743
svm: 0.8662


In [41]:
# now combine using voting

from sklearn.ensemble import VotingClassifier

# Majority-vote ensemble over all three base estimators.
hard_voting = VotingClassifier(estimators=base_estimators, voting="hard")

hard_voting.fit(X_train, y_train)



In [44]:
# Validation accuracy of the hard-voting ensemble, for comparison with the
# individual classifiers' validation scores printed earlier.
accuracy_score(y_valid, hard_voting.predict(X_valid))

0.9737

In [46]:
# Soft voting over the first two estimators only (the random forest and the
# extra-trees model); the LinearSVC is left out — presumably because it
# provides no class probabilities (TODO confirm).

soft_voting = VotingClassifier(estimators=base_estimators[:2], voting="soft")

soft_voting.fit(X_train, y_train)

# Validation accuracy of the soft-voting ensemble.
accuracy_score(y_valid, soft_voting.predict(X_valid))

0.9749

# 9. Stacking Classifier

Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image’s class. Train a classifier on this new training set. Congratulations—you have just trained a blender, and together with the classifiers it forms a stacking ensemble! Now evaluate the ensemble on the test set. For each image in the test set, make predictions with all your classifiers, then feed the predictions to the blender to get the ensemble’s predictions. How does it compare to the voting classifier you trained earlier? Now try again using a `StackingClassifier` instead. Do you get better performance? If so, why?

In [52]:
# Blender training set: row i holds each base estimator's predicted label for
# validation image i (one column per estimator, in `base_estimators` order).
# The former `X_stacking = []` initialisation was dead code (overwritten at once).
X_stacking = np.array([e.predict(X_valid) for _, e in base_estimators]).transpose()

[['3' '3' '3']
 ['8' '8' '8']
 ['6' '6' '6']
 ['9' '9' '9']
 ['6' '6' '6']]


array(['3', '8', '6', '9', '6'], dtype=object)

In [53]:
# Hand-rolled blender: an extra-trees model trained on the base estimators'
# validation-set predictions, with the true validation labels as targets.
# Note: this rebinds `stacking_clf`, which previously held the StackingClassifier.
stacking_clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=42)

stacking_clf.fit(X_stacking, y_valid)

In [54]:
# Blender inputs for the test set: one column of predicted labels per base estimator.
test_individual_preds = np.array([e.predict(X_test) for _, e in base_estimators]).transpose()

# The blender maps those base predictions to the final ensemble prediction.
test_preds = stacking_clf.predict(test_individual_preds)

# accuracy_score takes (y_true, y_pred); use the same order as the earlier cells.
accuracy_score(y_test, test_preds)

0.9687