This example demonstrates how to implement bagging, random forest, and stacking for ensemble learning.

Part I: Manual implementation of Bagging

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from scipy.stats import mode
from sklearn.model_selection import train_test_split

from sklearn.datasets import make_classification

# Create a synthetic, more complex dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5,
                           n_classes=3, flip_y=0.03, class_sep=0.5, random_state=42)


# Split data into training and testing sets (800 training rows, 200 test rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameters
B = 100  # Number of bootstrap samples/models

# Initialize lists to store predictions
consensus_predictions = []
probability_predictions = []

# Bagging
for b in range(B):
    # Bootstrap sample by using scikit-learn library. This approach does not estimate out-of-bag samples but is concise.
    # It randomly selects samples from the training set with replacement, thus creating a new dataset
    # that is the same size as the training set but potentially with some data points repeated and others omitted.
    # Seeding with the loop index gives a different but reproducible draw for every model.
    X_train_b, y_train_b = resample(X_train, y_train, replace=True,
                                    n_samples=len(X_train), random_state=b)

    # Remember that before we used the following masking method from numpy to do bootstrapping
    # to estimate Err^(.632+). It is necessary if we want to know the out-of-bag samples for
    # analysis, but it is more complex:
    #     for _ in range(B): # Run B times
    #         # Use the mask technique to select what are bootstrap sets and what are out of bags
    #         indices = np.arange(N)
    #         bootstrap_indices = np.random.choice(indices, size=N, replace=True)
    #         oob_mask = np.ones(N, dtype=bool)
    #         oob_mask[bootstrap_indices] = 0
    #         X_b, y_b = X[bootstrap_indices], y[bootstrap_indices]
    #         X_oob, y_oob = X[oob_mask], y[oob_mask]

    # Train a model
    # Depth control or split control can be necessary. Otherwise, the individual learners can overfit.
    model = DecisionTreeClassifier(max_depth=10, random_state=b)
    model.fit(X_train_b, y_train_b)

    # Consensus prediction (class prediction)
    # X_test has 200 data points, so after the loop this is a list of 100 arrays of 200 class labels each
    consensus_predictions.append(model.predict(X_test))

    # Probability prediction based on the proportion of the training instances of each class in the leaf node
    # In other words, it is the relative frequency of that class within the leaf node where the test sample falls
    # E.g., if a leaf node contains 20 training samples, with 15 samples of class A and 5 samples of class B,
    # the predicted probability for a sample falling into this node would be 0.75 (or 75%) for class A and 0.25 (or 25%) for class B.
    # After the loop this is a list of 100 arrays, each of shape (200, 3).
    # NOTE: averaging below assumes every bootstrap sample contains all 3 classes, so the columns line up.
    probability_predictions.append(model.predict_proba(X_test))
    #print(model.predict_proba(X_test))

# Stack the per-model class predictions into one array of shape (100, 200); values are class labels 0, 1, 2
consensus_predictions = np.array(consensus_predictions)

# Majority voting: mode() returns a ModeResult object that contains two arrays: .mode and .count.
# Taking the mode along axis=0 picks, for each of the 200 test points, the label with the most votes.
# np.ravel guarantees a (200,) result regardless of the keepdims default of the installed SciPy.
consensus_bagged_prediction = np.ravel(mode(consensus_predictions, axis=0).mode)

# Probability Bagging - predicted probabilities from each model are averaged to make the final prediction
probability_predictions = np.mean(probability_predictions, axis=0)  # (200, 3), averaged over the 100 models
# (200,) Column index of the highest averaged probability; equals the class label because labels are 0, 1, 2
probability_bagged_prediction = np.argmax(probability_predictions, axis=1)

# Evaluate performance
consensus_accuracy = accuracy_score(y_test, consensus_bagged_prediction)
probability_accuracy = accuracy_score(y_test, probability_bagged_prediction)

print(f'Consensus Bagging Accuracy: {consensus_accuracy:.4f}')
print(f'Probability Bagging Accuracy: {probability_accuracy:.4f}')


Part II: Plot how the number of bootstrap samples B relates to the test error


In [None]:
import matplotlib.pyplot as plt

# Question:

# Based on the template above, you can add one extra loop for B to store all the consensus_error and probability_error corresponding to each B.
# Then make two plots: B vs. consensus_error and B vs. probability_error

# This cell reuses X_train, X_test, y_train, y_test and the imports from Part I.

# Initialize lists to store errors for different B values
consensus_errors = []
probability_errors = []

# Max number of bootstrap samples
max_B = 100

for B in range(1, max_B + 1):
    consensus_predictions = []
    probability_predictions = []

    # Bagging for B bootstrap samples.
    # Seeding with b makes the run reproducible; it also means the ensemble of size B
    # consists of the same first B models for every value of B.
    for b in range(B):
        X_train_b, y_train_b = resample(X_train, y_train, replace=True,
                                        n_samples=len(X_train), random_state=b)
        model = DecisionTreeClassifier(max_depth=10, random_state=b)
        model.fit(X_train_b, y_train_b)
        consensus_predictions.append(model.predict(X_test))
        probability_predictions.append(model.predict_proba(X_test))

    # Consensus Bagging: majority vote over the B models, then test error = 1 - accuracy
    consensus_predictions = np.array(consensus_predictions)  # (B, n_test)
    # np.ravel keeps the result 1-D regardless of the keepdims default of the installed SciPy
    consensus_bagged_prediction = np.ravel(mode(consensus_predictions, axis=0).mode)
    consensus_accuracy = accuracy_score(y_test, consensus_bagged_prediction)
    consensus_errors.append(1 - consensus_accuracy)

    # Probability Bagging: average class probabilities over the B models, then pick the largest
    probability_predictions = np.mean(probability_predictions, axis=0)  # (n_test, n_classes)
    probability_bagged_prediction = np.argmax(probability_predictions, axis=1)
    probability_accuracy = accuracy_score(y_test, probability_bagged_prediction)
    probability_errors.append(1 - probability_accuracy)


# Plotting
plt.figure(figsize=(10, 6))
plt.plot(range(1, max_B + 1), consensus_errors, label='Consensus Bagging Error')
plt.plot(range(1, max_B + 1), probability_errors, label='Probability Bagging Error')
plt.xlabel('Number of Bootstrap Samples (B)')
plt.ylabel('Test Error')
plt.title('Test Error vs. Number of Bootstrap Samples')
plt.legend()
plt.show()


Part III: Manual implementation of Random Forest

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from scipy.stats import mode

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameters for Random Forest
n_estimators = 100  # Number of trees in the forest
# max_features is set to the square root of the total number of features (np.sqrt(X_train.shape[1])).
# This is a common heuristic used in Random Forests for classification tasks.
# int() truncates, because DecisionTreeClassifier reads an integer max_features as a feature count.
Nmaxfeatures = int(np.sqrt(X_train.shape[1]))  # Number of features to consider at every split for classification

# Initialize list to store predictions from all trees
forest_predictions = []

# Random Forest Algorithm
for i in range(n_estimators):
    # Create a bootstrap sample of the training data (seeded per tree for reproducibility)
    X_train_b, y_train_b = resample(X_train, y_train, replace=True,
                                    n_samples=len(X_train), random_state=i)

    # Create a decision tree classifier with random subset of features
    # To ensure that each tree in the forest considers a random subset of features at each split,
    # we can use the max_features parameter of the DecisionTreeClassifier.
    # This parameter controls the number of features to consider when looking for the best split at each node.
    # It tells the classifier to consider only this many features at each split. The features are chosen randomly.
    # Depth control is still expected.
    tree = DecisionTreeClassifier(max_features=Nmaxfeatures, max_depth=10, random_state=i)
    tree.fit(X_train_b, y_train_b)

    # Store the predictions: after the loop, a list of n_estimators arrays,
    # each holding one class label (0, 1, 2) per test point
    forest_predictions.append(tree.predict(X_test))

# Aggregate predictions using majority vote
forest_predictions = np.array(forest_predictions)  # shape (n_estimators, n_test)
# Most common label per test point across all trees.
# np.ravel flattens to 1-D so the result is valid for accuracy_score
# regardless of the keepdims default of the installed SciPy.
forest_majority_vote = np.ravel(mode(forest_predictions, axis=0).mode)

# Now use it in accuracy_score
forest_accuracy = accuracy_score(y_test, forest_majority_vote)

print(f"Random Forest Accuracy: {forest_accuracy:.4f}")


Use sklearn.ensemble library to run RandomForestClassifier

In [None]:
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Split the data into training and testing sets
# (X and y are the synthetic dataset created in Part I)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
# 100 trees to match the manual implementation; random_state fixes the bootstrap and feature sampling
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions
predictions = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Random Forest Classifier Accuracy: {accuracy:.4f}")


Part IV: Manual implementation of Stacking

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
import numpy as np


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Stacking in four steps:
# 1. Train Multiple Base Learners: Train different models on the same dataset.
#    These can be models of different types or the same type with different hyperparameters.
# 2. Generate Meta-Features: Use the base learners to make predictions on a validation set.
#    These predictions serve as meta-features for the next step.
# 3. Train the Meta-Learner: Use the meta-features to train another model (the meta-learner).
#    The meta-learner learns how to combine the predictions of the base learners.
# 4. Final Predictions: Use the base learners to make predictions on the test set,
#    then use the meta-learner to combine these predictions into the final output.

# Define base learners
base_learners = [
    RandomForestClassifier(n_estimators=10, random_state=42),
    GradientBoostingClassifier(n_estimators=10, random_state=42)
]

# Train base learners and generate meta-features
# For each fold, a part of the data is used as
# the validation set (the "held-out" part), and the remaining data is used as the training set
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# One row per training sample, one column per base learner
meta_features = np.zeros((len(X_train), len(base_learners)))  # Initialize meta-features

for i, learner in enumerate(base_learners):  # iterates over each base learner.
    # Further iterates over the different folds created by kf.split(X_train),
    # which generates indices for splitting the X_train dataset into training and validation sets
    for train_index, val_index in kf.split(X_train):
        # In each iteration (for each fold), the base learner is trained on the training part of the fold and
        # then makes predictions on the validation part. These predictions are "out-of-sample"
        # because the validation data was not used for training.
        learner.fit(X_train[train_index], y_train[train_index])
        predictions = learner.predict(X_train[val_index])  # out-of-sample prediction
        # The predictions for the validation set are stored in the corresponding positions of the meta_features matrix.
        meta_features[val_index, i] = predictions

# Note for meta learner: cv out-of-sample prediction (meta features) => meta learner (logistic regression) ==> response
# NOTE: the meta-features here are predicted class labels (0, 1, 2), which the logistic regression
# treats as numeric inputs; class probabilities (predict_proba) would give it richer information.
# Train the meta-learner using all of the training data
meta_learner = LogisticRegression(random_state=42)
meta_learner.fit(meta_features, y_train)

# Retrain base learners on the entire training set
# (after the CV loop each learner is only fitted on the last fold's training part)
for learner in base_learners:
    learner.fit(X_train, y_train)

# Generate test meta-features: one column of test-set predictions per base learner
test_meta_features = np.column_stack([learner.predict(X_test) for learner in base_learners])

# Final predictions
final_predictions = meta_learner.predict(test_meta_features)

# Evaluate the model
accuracy = accuracy_score(y_test, final_predictions)
print(f"Stacking Model Accuracy: {accuracy:.4f}")


Use sklearn.ensemble to run StackingClassifier

In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base learners as (name, estimator) pairs, the format StackingClassifier expects
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=10, random_state=42))
]

# Train and evaluate each base learner on its own, as a baseline for the stacked model
for name, learner in base_learners:
    learner.fit(X_train, y_train)
    base_predictions = learner.predict(X_test)
    base_accuracy = accuracy_score(y_test, base_predictions)
    print(f"Base Learner {name} Accuracy: {base_accuracy:.4f}")

# Define the stacking ensemble
# cv=5: the meta-learner is trained on 5-fold out-of-sample predictions of the base learners
stacked_ensemble = StackingClassifier(
    estimators=base_learners, final_estimator=LogisticRegression(random_state=42), cv=5
)

# Train the stacked model
stacked_ensemble.fit(X_train, y_train)

# Make predictions and evaluate the model
predictions = stacked_ensemble.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Stacked Model Accuracy: {accuracy:.4f}")

# Question: Compared to the random forest example before, why is the accuracy worse? How to fix it?
