
# Random Forest — Conceptual Visualizations (Classification)

This notebook provides **visual explanations** of how a Random Forest works:

- A single decision tree produces **jagged** boundaries; a forest **smooths** them by voting.
- **Bootstrapping (bagging)** gives each tree a different view of the data and enables **OOB** estimates.
- More trees → **lower variance** up to diminishing returns.
- We compare **Gini** vs **permutation** importances and show a **PDP** for intuition.

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt, warnings

from sklearn.datasets import make_moons, make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, average_precision_score)
from sklearn.inspection import permutation_importance, PartialDependenceDisplay

# Seed NumPy's global RNG so the np.random draws later in the notebook are repeatable.
np.random.seed(12)
# NOTE(review): this silences every warning for the whole session, not just noisy
# library ones — consider narrowing the filter if real problems may be hidden.
warnings.filterwarnings("ignore")

In [None]:

def plot_decision_boundary(clf, X, y, title="Decision boundary"):
    """Draw the predicted-class regions of a fitted 2D classifier over its data.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict`` on two-column input.
    X : array indexed as ``X[:, 0]`` / ``X[:, 1]`` (the two plotted features).
    y : per-point values used to colour the scatter markers.
    title : text placed above the axes.

    The figure is shown immediately; nothing is returned.
    """
    pad = 0.8          # margin added around the data range on both axes
    resolution = 300   # grid points per axis

    x1, x2 = X[:, 0], X[:, 1]
    x1_axis = np.linspace(x1.min() - pad, x1.max() + pad, resolution)
    x2_axis = np.linspace(x2.min() - pad, x2.max() + pad, resolution)
    xx, yy = np.meshgrid(x1_axis, x2_axis)

    # Predict on every grid node, then fold the flat result back into grid shape.
    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    regions = clf.predict(grid_points).reshape(xx.shape)

    fig, ax = plt.subplots(figsize=(6, 5))
    ax.contourf(xx, yy, regions, alpha=0.25)
    ax.scatter(x1, x2, c=y, s=12, alpha=0.75)
    ax.set_title(title)
    ax.set_xlabel("x1")
    ax.set_ylabel("x2")
    fig.tight_layout()
    plt.show()



## Single Tree vs Forest (2D Moons)

We use a 2D "moons" dataset to literally see the boundary: trees create **piecewise-constant** regions; forests **average** them.


In [None]:

# Noisy two-moons data, split 75/25 with the class ratio preserved in both parts.
X, y = make_moons(n_samples=1200, noise=0.30, random_state=12)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=12
)

# Baseline: a single depth-limited tree.
tree = DecisionTreeClassifier(max_depth=4, random_state=12).fit(X_train, y_train)

# Forest of 300 unrestricted-depth trees; oob_score=True makes rf.oob_score_ available.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=3,
    oob_score=True,
    random_state=12,
    n_jobs=-1,
).fit(X_train, y_train)

tree_acc = accuracy_score(y_test, tree.predict(X_test))
forest_acc = accuracy_score(y_test, rf.predict(X_test))

# Report all three numbers rounded to three decimals.
for label, value in (
    ("Tree  — test accuracy:", tree_acc),
    ("Forest— test accuracy:", forest_acc),
    ("Forest— OOB score   :", rf.oob_score_),
):
    print(label, round(value, 3))

In [None]:

# Boundary of the single max_depth=4 tree, drawn over the training points.
plot_decision_boundary(tree, X_train, y_train, title="Single Decision Tree — Decision Boundary")

In [None]:

# Same plot for the fitted forest `rf`, for side-by-side comparison with the single tree.
plot_decision_boundary(rf, X_train, y_train, title="Random Forest — Decision Boundary (Smoothed by Voting)")


## Bagging / Bootstrapping Intuition

Each tree trains on a **bootstrap sample** of the data (sampled with replacement). Points not sampled for that tree are **OOB (out-of-bag)** for that tree.


In [None]:
# Draw one bootstrap sample of row indices (same size as the training set,
# with replacement) to mimic what a single tree of the forest would see.
n = X_train.shape[0]
idx = np.arange(n)
boot_idx = np.random.choice(idx, size=n, replace=True)

# A row is out-of-bag when its index never appears in the bootstrap draw.
oob_mask = ~np.isin(idx, boot_idx)

fig, ax = plt.subplots(figsize=(6, 5))
for rows, opacity, legend_label in (
    (boot_idx, 0.55, "Bootstrap sample"),
    (oob_mask, 0.95, "OOB (this tree)"),
):
    ax.scatter(X_train[rows, 0], X_train[rows, 1], s=12, alpha=opacity, label=legend_label)
ax.set_title("Bootstrapping: In-Bag vs Out-of-Bag Points")
ax.legend()
fig.tight_layout()
plt.show()


## Voting Intuition (Averaging Trees)

The forest averages per-tree probabilities → **smoother** decisions than any single tree.


In [None]:
# Evaluation grid covering the data range plus a 0.8 margin on every side.
x_min, x_max = X[:,0].min()-0.8, X[:,0].max()+0.8
y_min, y_max = X[:,1].min()-0.8, X[:,1].max()+0.8
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 220),
                     np.linspace(y_min, y_max, 220))
grid = np.c_[xx.ravel(), yy.ravel()]

# Average the class-1 probability over the first trees of the forest (at most
# 60, for speed). The divisor is the number of trees actually used, so the
# average stays correct if the forest is ever fitted with fewer than 60 trees.
vote_trees = rf.estimators_[:60]
probs_grid = np.zeros(grid.shape[0])
for est in vote_trees:
    probs_grid += est.predict_proba(grid)[:,1]
probs_grid /= len(vote_trees)

# Threshold the averaged probability at 0.5 to get the ensemble's class per grid node.
Z = (probs_grid >= 0.5).astype(int).reshape(xx.shape)

plt.figure(figsize=(6,5))
plt.contourf(xx, yy, Z, alpha=0.25)
plt.scatter(X[:,0], X[:,1], c=y, s=12, alpha=0.75)
plt.title("Forest Majority Vote (Averaged over Trees)")
plt.xlabel("x1"); plt.ylabel("x2")
plt.tight_layout(); plt.show()


## OOB vs Validation Metrics

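OOB approximates test performance without a separate validation set: each training point is scored only by the trees whose bootstrap sample did not contain it (a *built-in*, cross-validation-like estimate that comes from bootstrapping).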


In [None]:
# Held-out metrics from the forest's predicted probability of class 1.
y_prob_test = rf.predict_proba(X_test)[:,1]
print("Validation ROC-AUC:", round(roc_auc_score(y_test, y_prob_test), 3))
print("Validation PR-AUC :", round(average_precision_score(y_test, y_prob_test), 3))
print("OOB score (accuracy-like):", round(rf.oob_score_, 3))

# Like-for-like comparison: rf.oob_score_ is an accuracy, so pair it with the
# held-out accuracy, and derive OOB ROC-AUC / PR-AUC from the OOB class
# probabilities so they can be read against the two held-out AUCs above.
print("Validation accuracy:", round(accuracy_score(y_test, rf.predict(X_test)), 3))
oob_prob = rf.oob_decision_function_[:, 1]
# Rows that were in-bag for every tree have no OOB prediction (NaN); skip them.
has_oob = np.isfinite(oob_prob)
print("OOB ROC-AUC:", round(roc_auc_score(y_train[has_oob], oob_prob[has_oob]), 3))
print("OOB PR-AUC :", round(average_precision_score(y_train[has_oob], oob_prob[has_oob]), 3))


## Effect of Number of Trees

More trees reduce variance and stabilize metrics, up to diminishing returns.


In [None]:
# Refit the forest at increasing sizes and record held-out accuracy and ROC-AUC.
sizes = [1, 5, 10, 30, 60, 100, 200, 400]
accs = []; aucs = []
for m in sizes:
    # No oob_score here: the OOB estimate was never read in this sweep, and with
    # only a handful of trees some rows have no out-of-bag prediction at all.
    mrf = RandomForestClassifier(n_estimators=m, min_samples_leaf=3, random_state=12, n_jobs=-1)
    mrf.fit(X_train, y_train)
    p = mrf.predict_proba(X_test)[:,1]
    accs.append(accuracy_score(y_test, mrf.predict(X_test)))
    aucs.append(roc_auc_score(y_test, p))

# One figure per metric, same layout for both.
for metric_values, metric_name in ((accs, "Accuracy"), (aucs, "ROC-AUC")):
    plt.figure(figsize=(7,4))
    plt.plot(sizes, metric_values, marker="o")
    plt.xlabel("# Trees"); plt.ylabel(metric_name); plt.title(f"{metric_name} vs Number of Trees")
    plt.tight_layout(); plt.show()


## What Drives Predictions? (Importances + PDP on Tabular Data)

We switch to a 10‑feature tabular dataset to compare **Gini** vs **Permutation** importances, then plot a **PDP**.


In [None]:
# Synthetic 10-feature problem: 4 informative, 2 redundant, the rest noise.
Xtab, ytab = make_classification(n_samples=2500, n_features=10, n_informative=4, n_redundant=2,
                                 n_clusters_per_class=2, class_sep=1.1, random_state=12)
Xtr, Xte, ytr, yte = train_test_split(Xtab, ytab, test_size=0.25, random_state=12, stratify=ytab)

rf_tab = RandomForestClassifier(n_estimators=500, min_samples_leaf=2, random_state=12, n_jobs=-1, oob_score=True)
rf_tab.fit(Xtr, ytr)

# Impurity-based importances as stored on the fitted forest, sorted high to low.
gini_imp = rf_tab.feature_importances_
order = np.argsort(gini_imp)[::-1]

plt.figure(figsize=(7,4))
plt.bar(range(len(order)), gini_imp[order])
plt.xticks(range(len(order)), [f"x{j}" for j in order])
plt.ylabel("Gini Importance"); plt.title("Model-based (Gini) Importances")
plt.tight_layout(); plt.show()

# Permutation importances on the held-out split, scored by ROC-AUC.
n_repeats = 20
pi = permutation_importance(rf_tab, Xte, yte, n_repeats=n_repeats, random_state=12, n_jobs=-1, scoring="roc_auc")
pi_mean, pi_std = pi.importances_mean, pi.importances_std
order2 = np.argsort(pi_mean)[::-1]

# The bars show the mean over repeats, so a 95% interval for that mean uses the
# standard error (std / sqrt(n_repeats)), not the raw spread of single repeats.
pi_ci95 = 1.96 * pi_std / np.sqrt(n_repeats)

plt.figure(figsize=(7,4))
plt.bar(range(len(order2)), pi_mean[order2], yerr=pi_ci95[order2])
plt.xticks(range(len(order2)), [f"x{j}" for j in order2])
plt.ylabel("Permutation Importance (ΔAUC)"); plt.title("Permutation Importances (95% CI)")
plt.tight_layout(); plt.show()

print("Tabular Validation ROC-AUC:", round(roc_auc_score(yte, rf_tab.predict_proba(Xte)[:,1]), 3))

In [None]:
# Index of the feature with the largest impurity-based importance.
top_feat = int(rf_tab.feature_importances_.argmax())

# Average partial dependence of the prediction on that one feature,
# evaluated on the held-out rows at 30 grid values.
pdp_display = PartialDependenceDisplay.from_estimator(
    rf_tab,
    Xte,
    features=[top_feat],
    kind="average",
    grid_resolution=30,
)
pdp_display.figure_.suptitle(f"Partial Dependence — feature x{top_feat}")
pdp_display.figure_.tight_layout()
plt.show()


### Key Takeaways
- A Random Forest **reduces variance** vs a single tree by averaging many de-correlated trees.
- **OOB** estimates come for free from bootstrapping and usually track validation metrics.
- **Permutation importances** show which features the model's held-out score actually depends on (how much the score drops when a feature is shuffled); **PDP** shows the average shape of a feature's effect.
- Adding more trees helps until the curve flattens; then you’re at diminishing returns.
