# Notebook 11 — Ensemble Methods (Solutions)

Here are the completed solutions for Bagging, Random Forests, AdaBoost, Gradient Boosting, and Stacking.

## Solution 1 — Bagging with Decision Trees

In [1]:
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Single Decision Tree
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
tree_acc = accuracy_score(y_test, tree.predict(X_test))

# Bagging Classifier (use 'estimator' for sklearn >= 1.2)
bagging = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, random_state=42)
bagging.fit(X_train, y_train)
bagging_acc = accuracy_score(y_test, bagging.predict(X_test))

print(f"Decision Tree Accuracy: {tree_acc:.3f}")

Decision Tree Accuracy: 0.797


## Solution 2 — Random Forests

In [2]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_acc = accuracy_score(y_test, rf.predict(X_test))

print(f"Random Forest Accuracy: {rf_acc:.3f}")
print("Top 5 Feature Importances:")
importances = sorted(zip(rf.feature_importances_, range(X.shape[1])), reverse=True)[:5]
for imp, idx in importances:
    print(f"Feature {idx}: {imp:.3f}")

Random Forest Accuracy: 0.887
Top 5 Feature Importances:
Feature 12: 0.134
Feature 6: 0.068
Feature 2: 0.067
Feature 5: 0.067
Feature 17: 0.058


## Solution 3 — AdaBoost

In [3]:
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost.fit(X_train, y_train)
adaboost_acc = accuracy_score(y_test, adaboost.predict(X_test))

print(f"AdaBoost Accuracy: {adaboost_acc:.3f}")

AdaBoost Accuracy: 0.833


## Solution 4 — Gradient Boosting

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)
gb_acc = accuracy_score(y_test, gb.predict(X_test))

print(f"Gradient Boosting Accuracy: {gb_acc:.3f}")

Gradient Boosting Accuracy: 0.887


## Solution 5 — Stacking

In [5]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimators = [
    ('dt', DecisionTreeClassifier()),
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42))
]

stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5)
stacking.fit(X_train, y_train)
stacking_acc = accuracy_score(y_test, stacking.predict(X_test))

print(f"Stacking Accuracy: {stacking_acc:.3f}")

Stacking Accuracy: 0.887


## 🔎 Summary of Results
- Decision Tree → baseline performance
- Bagging → reduces variance, generally better than a single tree
- Random Forest → often best among bagging-based methods, adds feature randomness
- AdaBoost → sequentially focuses on hard examples, strong learner
- Gradient Boosting → similar to AdaBoost but uses gradient descent, often best boosting method
- Stacking → can combine different models for potential performance gains

✅ The best model will depend on dataset complexity, but usually **Gradient Boosting or Random Forest** wins.