# Random Forest — Visualizing Tree Shape & Structure
This notebook gives **practical visuals** to understand the shape and behavior of Random Forests:
1) **Single-tree diagram** (truncated)  
2) **Depth distribution across trees**  
3) **Feature usage frequency**  
4) **1D prediction curve (step-like)**  
5) **2D prediction surface (heatmap)**  
6) **Decision path for one sample (1D tree)**  
7) **Per-tree leaf prediction distribution**  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split

# Notebook-wide Matplotlib defaults, applied in a single call:
# moderate figure size, faint grid, sharper rendering, readable text.
plt.rcParams.update({
    'figure.figsize': (8, 5),
    'axes.grid': True,
    'grid.alpha': 0.3,
    'figure.dpi': 140,
    'font.size': 12,
})


## 1) Datasets
- **1D**: Nonlinear target for step-like RF prediction and decision path.
- **2D**: Nonlinear surface for heatmap and feature-usage analysis.


In [None]:
# One seeded generator drives every random draw in this cell, so the
# synthetic data is reproducible as long as the draw order is unchanged.
rng = np.random.default_rng(seed=42)

# 1D data: n1 inputs drawn uniformly between -3.5 and 3.5
n1 = 200
x1 = rng.uniform(low=-3.5, high=3.5, size=n1)
def f1(t):
    """Noise-free 1D target: the cubic 0.6*t^3 - 1.0*t^2 + 2.2*t + 1.0.

    Uses only arithmetic operators, so it accepts a scalar or a NumPy
    array (evaluated elementwise).
    """
    cubic = 0.6 * t**3
    quadratic = 1.0 * t**2
    linear = 2.2 * t
    return cubic - quadratic + linear + 1.0
# Noisy observations of the cubic (Gaussian noise, sd 2.8)
y1 = f1(x1) + rng.normal(0, 2.8, size=n1)

# Outliers: a random subset of at least 6 points gets extra sd-12 noise
n_outliers = max(6, n1 // 30)
idx_out1 = rng.choice(np.arange(n1), size=n_outliers, replace=False)
y1[idx_out1] += rng.normal(0, 12.0, size=idx_out1.size)

# Column vector: one row per sample, a single feature column
X1 = x1.reshape(-1, 1)
X1_tr, X1_te, y1_tr, y1_te = train_test_split(
    X1, y1, test_size=0.3, random_state=7
)

print(f"1D: n={n1}, train={len(X1_tr)}, test={len(X1_te)}; outliers={idx_out1.size}")

# 2D data: both coordinates drawn independently, uniformly between -3 and 3
n2 = 500
x = rng.uniform(low=-3.0, high=3.0, size=n2)
y = rng.uniform(low=-3.0, high=3.0, size=n2)
def f2(a, b):
    """Noise-free 2D target surface.

    Sum of a sine wave in ``a``, a cosine wave in ``b`` and an ``a*b``
    interaction term. Array inputs are evaluated elementwise.
    """
    wave_a = 2.5 * np.sin(a)
    wave_b = 1.8 * np.cos(1.3 * b)
    interaction = 0.5 * a * b
    return wave_a + wave_b + interaction
# Noisy observations of the surface (Gaussian noise, sd 1.2)
surface = f2(x, y)
z = surface + rng.normal(0, 1.2, size=n2)

# Two feature columns: x first, y second
X2 = np.c_[x, y]
X2_tr, X2_te, z_tr, z_te = train_test_split(
    X2, z, test_size=0.3, random_state=7
)

print(f"2D: n={n2}, train={len(X2_tr)}, test={len(X2_te)}")

## 2) Train Random Forest Models

In [None]:
# Both forests share the same hyperparameters: 200 trees, no depth cap,
# fixed seed for reproducible bootstraps and splits.
forest_kwargs = dict(n_estimators=200, max_depth=None, random_state=42)

# fit() returns the estimator itself, so construction and training chain.
rf1d = RandomForestRegressor(**forest_kwargs).fit(X1_tr, y1_tr)  # 1D model
rf2d = RandomForestRegressor(**forest_kwargs).fit(X2_tr, z_tr)   # 2D model

print("Trained rf1d and rf2d.")

## 3) Single Tree Diagram (Truncated)
Visualize one tree (depth-limited) to see split structure and thresholds.


In [None]:
# First tree of the 1D forest; max_depth=3 limits the drawing to the top
# levels so the node boxes stay legible.
first_tree = rf1d.estimators_[0]

plt.figure(figsize=(12, 6))
plot_tree(first_tree, max_depth=3, filled=True, fontsize=8)
plt.title('One Tree from rf1d (max_depth=3 view)')
plt.tight_layout()
plt.show()

## 4) Depth Distribution Across Trees
How deep do trees grow in the forest?


In [None]:
# Depth reached by each fitted tree in the 1D forest
depths = [est.tree_.max_depth for est in rf1d.estimators_]

# Depths are integers, so use unit-width bins centred on each integer.
# A fixed bin count (previously 15) does not line up with integer values
# and produces spurious empty bins or bars that merge two depths.
depth_bins = np.arange(min(depths) - 0.5, max(depths) + 1.5, 1.0)

plt.figure()
plt.hist(depths, bins=depth_bins)
plt.title('Histogram of Tree Depths (rf1d)')
plt.xlabel('max_depth')
plt.ylabel('count')
plt.tight_layout()
plt.show()

print(f"Mean depth: {np.mean(depths):.2f}, Min: {np.min(depths)}, Max: {np.max(depths)}")

## 5) Feature Usage Frequency (2D)
Count how often each feature appears in splits across all trees (rf2d).

In [None]:
n_features = X2.shape[1]

# Per-node split feature for every tree in the forest, concatenated.
# Leaf nodes store a negative sentinel instead of a feature index, so
# keep only entries >= 0 (the real splits) before counting.
node_features = np.concatenate([est.tree_.feature for est in rf2d.estimators_])
split_features = node_features[node_features >= 0]

# Vectorised tally; minlength keeps a zero entry for features never used.
counts = np.bincount(split_features, minlength=n_features)

plt.figure()
plt.bar(range(n_features), counts)
plt.xticks(range(n_features), [f'feature_{i}' for i in range(n_features)])
plt.title('Split Usage Frequency per Feature (rf2d)')
plt.xlabel('feature index')
plt.ylabel('split count')
plt.tight_layout()
plt.show()

counts

## 6) 1D Prediction Curve (Step-like)
Random Forest in 1D yields a piecewise-constant/step-like prediction. Overlay with training points.


In [None]:
# 600 evenly spaced inputs spanning the observed x-range, as a column vector
x_lo, x_hi = X1.min(), X1.max()
x_grid = np.linspace(x_lo, x_hi, 600).reshape(-1, 1)
y_grid = rf1d.predict(x_grid)

plt.figure()
# Training data first, then the forest's prediction drawn on top
plt.scatter(X1_tr[:, 0], y1_tr, s=14, alpha=0.7, label='train points')
plt.plot(x_grid[:, 0], y_grid, linewidth=2, label='RF (1D) prediction')
plt.title('RF (1D) Step-like Prediction')
plt.xlabel('x')
plt.ylabel('prediction')
plt.legend()
plt.tight_layout()
plt.show()

## 7) 2D Prediction Surface (Heatmap)
Visualize how the forest partitions 2D space by plotting predicted values on a grid.


In [None]:
g = 120  # grid resolution (points per axis)
gx = np.linspace(X2[:,0].min(), X2[:,0].max(), g)
gy = np.linspace(X2[:,1].min(), X2[:,1].max(), g)
GX, GY = np.meshgrid(gx, gy)
G = np.c_[GX.ravel(), GY.ravel()]
# Z has the meshgrid's shape: rows follow gy, columns follow gx
Z = rf2d.predict(G).reshape(GX.shape)

# imshow's extent describes the OUTER edges of the image, whereas gx/gy
# hold the points where predictions were evaluated. Pad by half a grid
# step so every pixel is centred on its own grid point.
half_dx = 0.5 * (gx[1] - gx[0])
half_dy = 0.5 * (gy[1] - gy[0])
extent = [gx[0] - half_dx, gx[-1] + half_dx, gy[0] - half_dy, gy[-1] + half_dy]

plt.figure(figsize=(7,6))
im = plt.imshow(Z, origin='lower', extent=extent, aspect='auto')
plt.scatter(X2_tr[:,0], X2_tr[:,1], s=8, alpha=0.4, label='train pts')
plt.title('RF (2D) Prediction Surface')
plt.xlabel('x0')
plt.ylabel('x1')
plt.legend()
# Colour scale, so the heatmap colours can be read as predicted values
plt.colorbar(im, label='predicted z')
plt.tight_layout()
plt.show()

## 8) Decision Path for One Sample (1D)
Trace which thresholds a given sample crosses in a **single tree** of the 1D forest.


In [None]:
sample_x = np.array([[0.8]])  # pick a point
tree = rf1d.estimators_[1]
node_indicator = tree.decision_path(sample_x)
leaf_id = tree.apply(sample_x)[0]  # id of the leaf the sample ends in

# Node ids visited by sample 0 (its row in the sparse path indicator)
node_index = node_indicator.indices[node_indicator.indptr[0]: node_indicator.indptr[1]]

# Thresholds of the split nodes along the path. The leaf that ends the
# path performs no split, so it is excluded. tolist() converts to plain
# Python floats so the list prints as numbers rather than NumPy scalar reprs.
split_nodes = node_index[node_index != leaf_id]
thresholds = tree.tree_.threshold[split_nodes].tolist()

print("Path node ids:", node_index)
print("Thresholds along path:", thresholds)
print("Predicted value at leaf:", tree.predict(sample_x)[0])

# Visualize thresholds on x-axis
plt.figure()
plt.scatter(X1_tr[:,0], y1_tr, s=10, alpha=0.5, label='train')
for i, thr in enumerate(thresholds):
    # Label only the first dashed line so the legend gets a single entry
    plt.axvline(thr, linestyle='--', linewidth=1,
                label='split thresholds' if i == 0 else None)
plt.axvline(sample_x[0,0], linestyle='-', linewidth=2, label='sample x')
plt.title('Decision Thresholds Along Path (One Tree, 1D)')
plt.xlabel('x')
plt.ylabel('y (scatter shown for context)')
plt.legend()
plt.tight_layout()
plt.show()

## 9) Per-tree Leaf Prediction Distribution (1D)
For the same sample $x = 0.8$, collect each tree's **leaf prediction** and plot a histogram.  
This shows how diverse individual trees are **before averaging**.


In [None]:
sample_x = np.array([[0.8]])

# One prediction per tree in the forest for the single query point
per_tree = []
for member in rf1d.estimators_:
    per_tree.append(member.predict(sample_x)[0])
leaf_preds = np.array(per_tree)

plt.figure()
plt.hist(leaf_preds, bins=20)
plt.title('Per-tree Leaf Predictions for x=0.8 (rf1d)')
plt.xlabel('leaf prediction')
plt.ylabel('count')
plt.tight_layout()
plt.show()

# Compare the plain average over trees with the forest's own prediction
tree_mean = leaf_preds.mean()
forest_pred = rf1d.predict(sample_x)[0]
print(f"Mean per-tree prediction: {tree_mean:.3f}; RF ensemble prediction: {forest_pred:.3f}")