# Random Forest

Train a RandomForestClassifier and examine OOB score (if enabled) and feature importances.

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, warnings
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import numpy as np, matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

!wget -q https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Classification/classification_utils.py
import classification_utils as utils
csv_path = "https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Classification/classification.csv"
warnings.filterwarnings("ignore")

df = pd.read_csv(csv_path, parse_dates=["ts"]).sort_values("ts")
train, test = utils.chrono_split(df, "ts", test_frac=0.2)

features = ["ad_channel","device","region","campaign","spend_l7","pages_per_session","sessions_l30","time_on_site_s","pricing_views_l7","email_opens_l30","past_purchases","tenure_days","discount_flag","competitor_visits"]
target = "converted"

X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

# Column groups routed to each preprocessing branch.
numeric_cols = [
    "spend_l7",
    "pages_per_session",
    "sessions_l30",
    "time_on_site_s",
    "pricing_views_l7",
    "email_opens_l30",
    "past_purchases",
    "tenure_days",
]
categorical_cols = ["ad_channel", "device", "region", "campaign"]
passthrough_cols = ["discount_flag", "competitor_visits"]

# Scale numerics, one-hot encode categoricals (categories unseen at fit time
# are ignored at predict time), and pass the remaining two columns through.
pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("bin", "passthrough", passthrough_cols),
    ]
)

# 300-tree forest with a minimum leaf size of 20; OOB scoring is off here.
forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=20,
    n_jobs=-1,
    random_state=42,
    oob_score=False,
)
rf = Pipeline(steps=[("pre", pre), ("clf", forest)])
rf.fit(X_train, y_train)

# Positive-class probabilities on the held-out split, then the helper's report.
probs = rf.predict_proba(X_test)[:, 1]
_ = utils.evaluate_classifier(y_test, probs, title_prefix="Random Forest")

# Feature importances: pair each transformed column name with the forest's
# importance for it and list the 12 largest.
fitted_steps = rf.named_steps
est = fitted_steps["clf"]
fn = fitted_steps["pre"].get_feature_names_out()
imp = est.feature_importances_

ranking = pd.DataFrame({"feature": fn, "importance": imp})
imp_df = ranking.sort_values("importance", ascending=False)
imp_df.head(12)

### Advanced diagnostics
#### OOB vs Validation PR-AUC / ROC-AUC (side-by-side)
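- OOB (out-of-bag) scoring predicts each training row using only the trees whose bootstrap sample left that row out, which approximates out-of-sample performance without touching held-out data; the "validation" metrics below are computed on the held-out test set.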

In [None]:
# Refit an unfitted copy of the pipeline with OOB scoring switched on.
# n_estimators stays at 300 so the OOB estimate is stable.
rf_oob = clone(rf).set_params(clf__oob_score=True, clf__n_estimators=300)
rf_oob.fit(X_train, y_train)

# Out-of-bag class-1 probabilities for the training rows.
oob_forest = rf_oob.named_steps["clf"]
oob_probs = oob_forest.oob_decision_function_[:, 1]

# Held-out probabilities come from the pipeline fitted earlier.
val_probs = probs

# Ranking metrics: OOB on the training rows, VAL on the held-out rows.
oob_roc, val_roc = roc_auc_score(y_train, oob_probs), roc_auc_score(y_test, val_probs)
oob_pr, val_pr = (
    average_precision_score(y_train, oob_probs),
    average_precision_score(y_test, val_probs),
)

print(f"OOB → ROC-AUC={oob_roc:.3f}  PR-AUC={oob_pr:.3f}")
print(f"VAL → ROC-AUC={val_roc:.3f}  PR-AUC={val_pr:.3f}")

# Grouped bar chart: one pair of bars per metric, OOB on the left.
labels = ["ROC-AUC", "PR-AUC"]
oob_vals = [oob_roc, oob_pr]
val_vals = [val_roc, val_pr]

w = 0.35
x = np.arange(len(labels))

plt.figure(figsize=(6, 4))
for offset, vals, series in ((-w / 2, oob_vals, "OOB"), (w / 2, val_vals, "Validation")):
    plt.bar(x + offset, vals, width=w, label=series)
plt.xticks(x, labels)
plt.ylim(0, 1)
plt.ylabel("Score")
plt.title("OOB vs Validation AUCs")
plt.legend()
plt.tight_layout()
plt.show()

#### Confusion matrix + threshold/utility curve (proposed operating point)
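- The operating threshold is chosen by maximizing a simple expected-value formula, $EV(t) = TP \cdot g_{TP} - FP \cdot c_{FP} - FN \cdot c_{FN} - TN \cdot c_{TN}$, over a grid of thresholds; set the gains and costs to match your own economics.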

In [None]:
# Economics (customize)
gain_tp = 100.0   # benefit when acting on a true positive
cost_fp = 10.0    # cost when acting on a false positive
cost_fn = 40.0    # opportunity cost of missing a positive
cost_tn = 0.0     # cost attached to a true negative


def expected_value(y_true, p, t, g_tp=gain_tp, c_fp=cost_fp, c_fn=cost_fn, c_tn=cost_tn):
    """Return the total payoff of acting on every score at or above ``t``.

    The payoff is ``TP*g_tp - FP*c_fp - FN*c_fn - TN*c_tn``, with the four
    counts taken from the predictions ``p >= t`` against ``y_true``.

    The counts are computed directly with NumPy, so all four are always
    defined. Unpacking a confusion matrix built from the observed labels
    fails when only one class is present (it is then 1x1, not 2x2).

    Args:
        y_true: Array-like of labels; a value equal to 1 is a positive,
            anything else is a negative.
        p: Array-like of scores, same length as ``y_true``.
        t: Decision threshold; ``p >= t`` is predicted positive.
        g_tp: Gain per true positive.
        c_fp: Cost per false positive.
        c_fn: Cost per false negative.
        c_tn: Cost per true negative.

    Returns:
        The expected value as a number.

    Note:
        The keyword defaults are bound when this function is defined;
        changing the module-level constants afterwards has no effect unless
        the function is re-defined or the values are passed explicitly.
    """
    actual_pos = np.asarray(y_true) == 1
    flagged = np.asarray(p) >= t

    tp = int(np.count_nonzero(flagged & actual_pos))
    fp = int(np.count_nonzero(flagged & ~actual_pos))
    fn = int(np.count_nonzero(~flagged & actual_pos))
    tn = int(np.count_nonzero(~flagged & ~actual_pos))
    return tp * g_tp - fp * c_fp - fn * c_fn - tn * c_tn

# Sweep thresholds 0.01 .. 0.99 and keep the one with the highest expected value.
# NOTE(review): t_star is tuned on the same held-out rows it is reported on,
# so the confusion matrix and expected value below are optimistic; consider
# choosing the threshold on a separate validation split.
grid = np.linspace(0.01, 0.99, 99)
evs = [expected_value(y_test, probs, t) for t in grid]
t_star = float(grid[int(np.argmax(evs))])
print(f"Proposed threshold (EV-opt): {t_star:.2f}")

# Confusion matrix at t_star
yhat = (probs >= t_star).astype(int)
# labels=[0, 1] forces a 2x2 matrix, so the 4-way unpack cannot fail when
# only one class is present in y_test and yhat.
tn, fp, fn, tp = confusion_matrix(y_test, yhat, labels=[0, 1]).ravel()
print(f"TP={tp} FP={fp} FN={fn} TN={tn}")

# Plot utility curve
plt.figure(figsize=(6,4))
plt.plot(grid, evs)
plt.axvline(t_star, linestyle="--")  # mark the chosen operating point
plt.xlabel("threshold"); plt.ylabel("Expected value")
plt.title("Threshold vs Expected Value (validation)")
plt.tight_layout(); plt.show()

#### Reliability (calibration) curves pre/post calibration (+ Brier)
- Calibration uses isotonic (non-parametric). Compare Brier scores and reliability curves.

In [None]:
# Wrap an unfitted copy of the pipeline in 3-fold isotonic calibration.
base_pipeline = clone(rf)
rf_cal = CalibratedClassifierCV(estimator=base_pipeline, method="isotonic", cv=3)
rf_cal.fit(X_train, y_train)
probs_cal = rf_cal.predict_proba(X_test)[:, 1]

# Brier score (lower is better) before and after calibration.
brier_raw, brier_cal = (brier_score_loss(y_test, scores) for scores in (probs, probs_cal))
print(f"Brier (raw): {brier_raw:.4f}  |  Brier (calibrated): {brier_cal:.4f}")

# Reliability curves over 10 equal-count (quantile) bins.
# calibration_curve returns (observed fraction of positives, mean predicted probability).
pt_raw, pp_raw = calibration_curve(y_test, probs, n_bins=10, strategy="quantile")
pt_cal, pp_cal = calibration_curve(y_test, probs_cal, n_bins=10, strategy="quantile")

plt.figure(figsize=(6, 5))
for mean_pred, frac_pos, series in ((pp_raw, pt_raw, "raw"), (pp_cal, pt_cal, "calibrated")):
    plt.plot(mean_pred, frac_pos, marker="o", label=series)
# Diagonal reference: predicted probability equals observed frequency.
plt.plot([0, 1], [0, 1], "--", lw=1, label="perfect")
plt.xlabel("Predicted probability")
plt.ylabel("Observed frequency")
plt.title("Calibration (validation)")
plt.legend()
plt.tight_layout()
plt.show()

#### Fairness metrics by segment at the operating threshold

- Fairness metrics give TPR/FPR/Precision/SelectionRate per segment value, plus a disparity range (max − min across the values of each segment) for every metric; interpret the gaps against your org’s fairness policy.

In [None]:
def seg_report(df_eval, seg_col, y_true, y_prob, t):
    """Compute per-segment classification rates at decision threshold ``t``.

    Rows of ``df_eval`` are grouped by ``seg_col``. For each group the labels
    and scores are looked up by index label, so ``y_true`` and ``y_prob``
    must share ``df_eval``'s index. Rows whose segment value is missing are
    skipped, because pandas ``groupby`` drops NaN keys by default.

    Args:
        df_eval: DataFrame containing the segment column.
        seg_col: Name of the column to group by.
        y_true: Series of labels; 1 counts as positive, 0 as negative.
        y_prob: Series of scores; ``score >= t`` counts as selected.
        t: Decision threshold.

    Returns:
        DataFrame with one row per distinct segment value and the columns
        ``segment``, ``value``, ``n``, ``TPR``, ``FPR``, ``Precision`` and
        ``SelectionRate``. A rate whose denominator is zero (for example TPR
        in a segment with no positives) is NaN rather than 0, so undefined
        rates are not mistaken for bad ones and do not inflate max-min
        disparity summaries. Empty input yields an empty frame with the same
        columns.
    """
    columns = ["segment", "value", "n", "TPR", "FPR", "Precision", "SelectionRate"]

    def _rate(numerator, denominator):
        # An undefined ratio is reported as NaN instead of being forced to 0.
        return numerator / denominator if denominator else float("nan")

    rows = []
    for val, idx in df_eval.groupby(seg_col).groups.items():
        labels = y_true.loc[idx]
        selected = y_prob.loc[idx] >= t
        is_pos = labels == 1
        is_neg = labels == 0

        tp = int((selected & is_pos).sum())
        fp = int((selected & is_neg).sum())
        fn = int((~selected & is_pos).sum())
        tn = int((~selected & is_neg).sum())

        rows.append({
            "segment": seg_col,
            "value": val,
            "n": len(idx),
            "TPR": _rate(tp, tp + fn),
            "FPR": _rate(fp, fp + tn),
            "Precision": _rate(tp, tp + fp),
            "SelectionRate": float(selected.mean()),
        })
    return pd.DataFrame(rows, columns=columns)

# Re-index the held-out rows 0..n-1 and align labels and scores to that index
# so seg_report can look them up by label.
test_eval = test.reset_index(drop=True)
eval_index = test_eval.index
y_true_s = pd.Series(y_test.values, index=eval_index, name="y_true")
y_prob_s = pd.Series(probs, index=eval_index, name="y_prob")

# Only report on segment columns that are actually present.
candidate_segments = ["device", "region", "ad_channel", "campaign"]
segments = [col for col in candidate_segments if col in test_eval.columns]

# One report per segment column, all at the chosen operating threshold.
fair_list = []
for seg in segments:
    fair_list.append(seg_report(test_eval, seg, y_true_s, y_prob_s, t_star))
fair_df = pd.concat(fair_list, ignore_index=True)
display(fair_df.sort_values(["segment", "value"]))

# Disparity summary: max - min of each metric across a segment's values.
metric_cols = ["TPR", "FPR", "SelectionRate", "Precision"]
disp = (
    fair_df.groupby("segment")[metric_cols]
    .agg(lambda s: float(s.max() - s.min()))
    .reset_index()
    .rename(columns={
        "TPR": "ΔTPR",
        "FPR": "ΔFPR",
        "SelectionRate": "ΔSelection",
        "Precision": "ΔPrecision",
    })
)
display(disp)

#### Plain-language summary (example): aggregate importances to original features & translate to policy

In [None]:
# Transformed column names and the forest's importance for each, taken from
# the fitted pipeline; these are mapped back to original columns below.
pipeline_steps = rf.named_steps
fn_all = pipeline_steps["pre"].get_feature_names_out()
imp_all = pipeline_steps["clf"].feature_importances_

def to_original(feat_name: str) -> str:
    """Map a transformed feature name back to its source column name.

    Everything up to and including the first ``"__"`` (the transformer
    prefix) is dropped. The remainder is matched first against the one-hot
    encoded columns (``"<column>_<category>"``) and then against the numeric
    and passthrough columns; the first match in list order wins. A name that
    matches nothing is returned with only the prefix removed.
    """
    _, separator, remainder = feat_name.partition("__")
    name = remainder if separator else feat_name

    one_hot_sources = ("ad_channel", "device", "region", "campaign")
    plain_sources = (
        "spend_l7", "pages_per_session", "sessions_l30", "time_on_site_s",
        "pricing_views_l7", "email_opens_l30", "past_purchases", "tenure_days",
        "discount_flag", "competitor_visits",
    )

    # One-hot columns carry a "_<category>" suffix after the source name.
    matched = next((src for src in one_hot_sources if name.startswith(src + "_")), None)
    if matched is None:
        # Numeric and passthrough columns keep their source name as a prefix.
        matched = next((src for src in plain_sources if name.startswith(src)), None)
    return name if matched is None else matched

# Label every transformed column with the original column it came from.
orig_map = pd.Series(list(map(to_original, fn_all)), index=fn_all)

# Sum importances per original column and rank them, largest first.
per_column = pd.DataFrame({"orig_feature": orig_map, "importance": imp_all})
summed = per_column.groupby("orig_feature", as_index=False)["importance"].sum()
agg = summed.sort_values("importance", ascending=False).reset_index(drop=True)

# Running share of total importance down the ranking.
agg["cum_share"] = agg["importance"].cumsum() / agg["importance"].sum()

display(agg)

# Plain-language summary: keep the leading features whose cumulative share
# stays within 70%; if even the first exceeds that, keep just the first.
top = agg.loc[agg["cum_share"] <= 0.70]
if top.empty:
    top = agg.head(1)
covered = top["importance"].sum() / agg["importance"].sum()
feat_list = ", ".join(top["orig_feature"].tolist())

summary = (
    f"Policy summary: The model’s decisions are driven mainly by {feat_list}, "
    f"which together account for ~{covered*100:.0f}% of feature importance. "
    "Prioritize data quality and governance for these fields, and consider segment-specific policies "
    "if fairness gaps align with them."
)
print(summary)