In [None]:
import pathlib
import requests

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import compose, datasets, ensemble, linear_model, metrics
from sklearn import model_selection, neighbors, pipeline
from sklearn import preprocessing, svm, tree

# Ensemble Learning

Building a model on top of many other models is called [ensemble](https://scikit-learn.org/stable/modules/ensemble.html) learning and it is often a great approach to improve the predictions of your machine learning pipeline.

## Load and prepare the data

In [None]:
# Load the diabetes dataset as pandas objects (as_frame=True) and ask for the
# unscaled version of the features (scaled=False).
diabetes_data = datasets.load_diabetes(scaled=False, as_frame=True)

In [None]:
# Pull the feature frame and the regression target out of the loaded dataset.
features_df, target = diabetes_data.data, diabetes_data.target

In [None]:
# Preview the first rows of the feature frame.
features_df.head()

In [None]:
# Display the regression target.
target

### Data preprocessing

In [None]:
# Column groups routed to each branch of the feature union.
categorical_columns = ["sex"]
numeric_columns = ["age", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"]

# Options shared by both column transformers: columns that are not listed are
# dropped, and output feature names are not prefixed with the transformer name.
column_transformer_options = dict(
    remainder="drop",
    verbose=True,
    verbose_feature_names_out=False,
)

# Branch 0: one-hot encode the categorical column, dropping the first level.
sex_encoder = preprocessing.OneHotEncoder(
    drop="first",
    dtype=np.uint8,
    sparse_output=False,
)
transformer_0 = compose.make_column_transformer(
    (sex_encoder, categorical_columns),
    **column_transformer_options,
)

# Branch 1: standardize the numeric columns.
transformer_1 = compose.make_column_transformer(
    (preprocessing.StandardScaler(), numeric_columns),
    **column_transformer_options,
)

# Run both branches and concatenate their outputs; set_output makes the
# union return a pandas DataFrame instead of a NumPy array.
features_preprocessor = pipeline.make_union(
    transformer_0,
    transformer_1,
    n_jobs=-1,
    verbose=True,
)
features_preprocessor.set_output(transform="pandas")

# Log transform for the target, with exp registered as its inverse.
target_preprocessor = preprocessing.FunctionTransformer(
    func=np.log,
    inverse_func=np.exp,
)

### Feature engineering

In [None]:
# Degree-2 polynomial expansion (squares and pairwise products, no constant
# column), configured to return a pandas DataFrame.
_polynomial_features = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
)
feature_engineering = _polynomial_features.set_output(transform="pandas")


## Voting

In [None]:
# IPython help syntax: show the VotingRegressor docstring.
ensemble.VotingRegressor?

In [None]:
# Average the predictions of four different regressors. Every base estimator
# that draws random numbers while fitting is seeded, so re-running the
# notebook reproduces the same ensemble and the same scores.
# KNeighborsRegressor takes no seed.
voting_regressor = ensemble.VotingRegressor(
    estimators=[
        ("sgd_regressor", linear_model.SGDRegressor(random_state=42)),
        ("k_neighbors_regressor", neighbors.KNeighborsRegressor()),
        ("linear_svr", svm.LinearSVR(random_state=42)),
        ("tree", tree.DecisionTreeRegressor(random_state=42)),
    ],
    weights=None,  # no per-estimator weights: plain average of the predictions
    n_jobs=-1,
    verbose=True
)

# Full modelling pipeline: preprocess the raw columns, expand them with
# polynomial features, then fit the voting ensemble.
voting_regressor_pipeline = pipeline.make_pipeline(
    features_preprocessor,
    feature_engineering,
    voting_regressor,
    verbose=True,
)

In [None]:
# Display the assembled pipeline.
voting_regressor_pipeline

In [None]:
# Cross-validate the voting ensemble with 5 folds, the same number used when
# scoring the individual estimators, so the resulting RMSE values are
# directly comparable. Scores are negated RMSE (higher is better).
ensemble_scores = model_selection.cross_val_score(
    voting_regressor_pipeline,
    features_df,
    target,
    cv=5,
    n_jobs=-1,
    scoring="neg_root_mean_squared_error"
)

In [None]:
# Flip the sign of the negated scores and average over the folds to report
# the mean RMSE.
(-ensemble_scores).mean()

### Exercise

Use cross validation to score each of the individual estimators included in the voting regressor above. Compare the results of our ensemble with the best individual model.

#### Solution

In [None]:
# Same preprocessing and feature engineering as the ensemble, with SGD as the
# only model. The estimator is seeded so the cross-validation scores are
# reproducible from run to run.
_sgd_regressor_pipeline = pipeline.make_pipeline(
    features_preprocessor,
    feature_engineering,
    linear_model.SGDRegressor(random_state=42)
)

sgd_regressor_scores = model_selection.cross_val_score(
    _sgd_regressor_pipeline,
    features_df,
    target,
    cv=5,
    n_jobs=1,
    scoring="neg_mean_squared_error"
)

# Scores are negated MSE per fold: convert each fold to RMSE, then average.
print(np.mean(np.sqrt(-sgd_regressor_scores)))

In [None]:
# Same preprocessing and feature engineering as the ensemble, with a linear
# SVR as the only model. The estimator is seeded so the cross-validation
# scores are reproducible from run to run.
_linear_svr_pipeline = pipeline.make_pipeline(
    features_preprocessor,
    feature_engineering,
    svm.LinearSVR(random_state=42)
)

linear_svr_scores = model_selection.cross_val_score(
    _linear_svr_pipeline,
    features_df,
    target,
    cv=5,
    n_jobs=1,
    scoring="neg_mean_squared_error"
)

# Scores are negated MSE per fold: convert each fold to RMSE, then average.
print(np.mean(np.sqrt(-linear_svr_scores)))

In [None]:
# Same preprocessing and feature engineering as the ensemble, with k-nearest
# neighbours as the only model.
_neighbors_pipeline = pipeline.make_pipeline(
    features_preprocessor, feature_engineering, neighbors.KNeighborsRegressor()
)

# 5-fold cross-validation; each score is the negated MSE of one fold.
neighbors_scores = model_selection.cross_val_score(
    estimator=_neighbors_pipeline,
    X=features_df,
    y=target,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=1,
)

# Convert each fold to RMSE, then report the average.
_neighbors_rmse_per_fold = np.sqrt(-neighbors_scores)
print(_neighbors_rmse_per_fold.mean())

In [None]:
# Same preprocessing and feature engineering as the ensemble, with a single
# decision tree as the only model. The tree is seeded so the cross-validation
# scores are reproducible from run to run.
_tree_pipeline = pipeline.make_pipeline(
    features_preprocessor,
    feature_engineering,
    tree.DecisionTreeRegressor(random_state=42)
)

tree_scores = model_selection.cross_val_score(
    _tree_pipeline,
    features_df,
    target,
    cv=5,
    n_jobs=1,
    scoring="neg_mean_squared_error"
)

# Scores are negated MSE per fold: convert each fold to RMSE, then average.
print(np.mean(np.sqrt(-tree_scores)))

## Bagging and Pasting

In [None]:
# IPython help syntax: show the BaggingRegressor docstring.
ensemble.BaggingRegressor?

In [None]:
# Bagged decision trees: every tree is fit on a bootstrap sample drawn with
# replacement (bootstrap=True) whose size is 80% of the training set, using
# all features (max_features=1.0, no feature resampling).
#
# n_estimators is set well above 10 because oob_score=True needs every row to
# be left out of at least one bootstrap sample. With only a handful of trees,
# some rows can end up with no out-of-bag prediction at all, which makes the
# out-of-bag error estimate unreliable.
bagging_regressor = ensemble.BaggingRegressor(
    estimator=tree.DecisionTreeRegressor(),
    n_estimators=100,
    max_samples=0.8,
    bootstrap=True,
    max_features=1.0,
    bootstrap_features=False,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
    verbose=True
)

In [None]:
# Display the configured (not yet fitted) bagging regressor.
bagging_regressor

In [None]:
# Fit the bagging ensemble directly on the raw feature frame. The return value
# is bound to `_` so the fitted estimator is not echoed as the cell output.
_ = bagging_regressor.fit(features_df, target)

In [None]:
# Out-of-bag predictions computed during fitting (available because the
# regressor was built with oob_score=True), scored against the true target.
oob_predictions = bagging_regressor.oob_prediction_
metrics.root_mean_squared_error(y_true=target, y_pred=oob_predictions)

## Random Forests

Let’s try the [`ensemble.RandomForestRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html). Random forests work by training many decision trees, each on a bootstrap sample of the training data and, optionally, with only a random subset of the features considered at each split, then averaging the predictions made by each of the decision trees to arrive at an overall prediction.

In [None]:
# Same preprocessing and feature engineering as before, with a random forest
# as the model. The forest is seeded so the cross-validation scores are
# reproducible from run to run.
_random_forest_pipeline = pipeline.make_pipeline(
    features_preprocessor,
    feature_engineering,
    ensemble.RandomForestRegressor(random_state=42)
)

random_forest_scores = model_selection.cross_val_score(
    _random_forest_pipeline,
    features_df,
    target,
    cv=5,
    n_jobs=1,
    scoring="neg_mean_squared_error"
)

# Scores are negated MSE per fold: convert each fold to RMSE, then average.
print(np.mean(np.sqrt(-random_forest_scores)))

### Exercise

Compare the performance of the ExtraTreesRegressor with the RandomForestRegressor fit above.

In [None]:
# IPython help syntax: show the ExtraTreesRegressor docstring.
ensemble.ExtraTreesRegressor?

#### Solution

In [None]:
# Same preprocessing and feature engineering as before, with extremely
# randomized trees as the model. The estimator is seeded so the
# cross-validation scores are reproducible from run to run.
_extra_trees_pipeline = pipeline.make_pipeline(
    features_preprocessor,
    feature_engineering,
    ensemble.ExtraTreesRegressor(random_state=42)
)

extra_trees_scores = model_selection.cross_val_score(
    _extra_trees_pipeline,
    features_df,
    target,
    cv=5,
    n_jobs=1,
    scoring="neg_mean_squared_error"
)

# Scores are negated MSE per fold: convert each fold to RMSE, then average.
print(np.mean(np.sqrt(-extra_trees_scores)))

### Exercise

Tune the hyperparameters of either the RandomForestRegressor or the ExtraTreesRegressor and see if you can get even better performance.

#### Solution

In [None]:
# IPython help syntax: show the ExtraTreesRegressor docstring, which lists the
# hyperparameters available for tuning.
ensemble.ExtraTreesRegressor?

In [None]:
# Pipeline to tune. The estimator is seeded so that score differences between
# grid points come from the hyperparameters and not from run-to-run
# randomness, and so that the selected parameters are reproducible.
ml_pipeline = pipeline.make_pipeline(
    features_preprocessor,
    feature_engineering,
    ensemble.ExtraTreesRegressor(random_state=42),
)

# Keys use the "<step name>__<parameter>" convention; make_pipeline names the
# final step "extratreesregressor". bootstrap is fixed to True because
# max_samples only applies when bootstrap sampling is enabled.
param_grid = {
    "extratreesregressor__bootstrap": [True],
    "extratreesregressor__max_depth": [1, 2, 4, None],
    "extratreesregressor__max_features": [0.1, 0.5, 1.0],
    "extratreesregressor__max_samples": [0.25, 0.5, 0.75, 1.0],
    "extratreesregressor__n_estimators": [4, 100, 200]
}

# Exhaustive search over the grid with 5-fold cross-validation, ranked by
# negated RMSE (higher is better).
grid_search_cv = model_selection.GridSearchCV(
    ml_pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring="neg_root_mean_squared_error",
    verbose=True,
)

# Bind the result to `_` so the fitted search object is not echoed.
_ = grid_search_cv.fit(
    features_df,
    target
)

In [None]:
# The search was scored with negated RMSE, so flip the sign to read the best
# mean cross-validated RMSE.
-grid_search_cv.best_score_

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
grid_search_cv.best_params_