# Density-Based Clustering — DBSCAN / HDBSCAN

Use a k-distance plot to pick eps, run DBSCAN to discover shape-based clusters and noise, and optionally compare to HDBSCAN (if available).

In [None]:
# Optional: uncomment to install hdbscan (only needed for the HDBSCAN comparison cells below).
# !pip install hdbscan

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, warnings
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.utils import resample
from sklearn.neighbors import NearestNeighbors

# Download the course helper module into the current working directory
# (IPython shell escape; -q silences wget's output) so it can be imported below.
!wget -q https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Unsupervised/unsup_utils.py
import unsup_utils as utils
# Remote CSV with the dataset used by every cell in this notebook.
csv_path = "https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Unsupervised/unsup.csv"
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

df = pd.read_csv(csv_path)
# feature_matrix lives in unsup_utils and is not visible here. Presumably X is the
# model-ready feature matrix (embedding features included because use_emb=True),
# cols the feature names and sc the fitted scaler — TODO confirm against unsup_utils.
# Later cells handle X being either an ndarray or a DataFrame.
X, cols, sc = utils.feature_matrix(df, use_emb=True)

# k-distance curve from the course helper (k=5), drawn so the knee can be eyeballed.
kd = utils.k_distance_plot(X, k=5)
plt.figure(figsize=(8, 3.5))
plt.plot(kd)
plt.title("k-distance plot (k=5)")
plt.xlabel("points (sorted)")
plt.ylabel("distance to 5th neighbor")
plt.tight_layout()
plt.show()

# eps heuristic: use the 80th percentile of the k-distances as a stand-in for the knee.
eps = float(np.percentile(kd, 80))

# Fit DBSCAN and keep its labels on the frame; label -1 marks noise.
db = DBSCAN(eps=eps, min_samples=10, n_jobs=-1)
db.fit(X)
df["cluster_db"] = db.labels_

# 2-D PCA projection (via the helper), colored by the DBSCAN label.
X2, _ = utils.pca_2d(X)
utils.plot_xy(
    X2,
    title="PCA (colored by DBSCAN label)",
    labels=df["cluster_db"].values,
)

# Try HDBSCAN if installed.
# Best-effort cell: a missing package and a genuine failure (bad fit, plotting error,
# binary incompatibility at import time) are reported separately so that a real
# problem is not mislabelled as "not installed".
try:
    import hdbscan
    hdb = hdbscan.HDBSCAN(min_cluster_size=25, min_samples=10).fit(X)
    df["cluster_hdb"] = hdb.labels_  # -1 = noise, same convention as DBSCAN
    utils.plot_xy(X2, title="PCA (colored by HDBSCAN label)", labels=df["cluster_hdb"].values)
except ImportError:
    print("hdbscan not installed; skipping HDBSCAN demo.")
except Exception as e:
    print(f"HDBSCAN demo failed ({type(e).__name__}: {e}); skipping.")

In [None]:
# DBSCAN: redraw the k-distance curve with the selected eps as a dashed horizontal line
plt.figure(figsize=(8, 3.5))
plt.plot(kd)
plt.axhline(y=eps, linestyle="--")
plt.xlabel("points (sorted)")
plt.ylabel("distance to 5th neighbor")
plt.title(f"k-distance plot (k=5) — chosen eps ≈ {eps:.3f}")
plt.tight_layout()
plt.show()

# HDBSCAN: draw the condensed tree with the selected clusters marked.
# Skipped with a message when hdbscan is missing, no fitted `hdb` exists, or plotting fails.
try:
    import hdbscan
    if 'hdb' not in globals() or not hasattr(hdb, "condensed_tree_"):
        print("HDBSCAN object not present or no condensed_tree_; skipping condensed tree plot.")
    else:
        _ = hdb.condensed_tree_.plot(select_clusters=True, label_clusters=True)
        plt.title(f"HDBSCAN condensed tree (min_cluster_size={getattr(hdb, 'min_cluster_size', 'n/a')}, "
                  f"min_samples={getattr(hdb, 'min_samples', 'n/a')})")
        plt.tight_layout()
        plt.show()
except Exception:
    print("hdbscan not installed or plotting failed; skipping condensed tree.")

In [None]:
# Cluster summary with sizes & plain-language names
# Work with DBSCAN first; fall back to HDBSCAN if DBSCAN labels absent
label_col = next((c for c in ("cluster_db", "cluster_hdb") if c in df.columns), None)
if label_col is None:
    raise RuntimeError("No cluster label column found (expected cluster_db or cluster_hdb).")

# Sizes (include noise = -1)
sizes = df[label_col].value_counts().sort_index()
display(pd.DataFrame({"label": sizes.index, "size": sizes.values}))

# Numeric feature set excluding embeddings (emb*) and non-features.
# Candidate names come from X's columns when X is a DataFrame, otherwise from `cols`.
non_feat = {label_col, "ts", "id"}
num_cols = [c for c in X.columns if c in df.columns] if hasattr(X, "columns") else []
if not num_cols and 'cols' in globals():
    num_cols = list(cols)

# `c in df.columns` guards the `cols` fallback: a feature name that does not exist
# as a column of df would otherwise raise KeyError in the dtype check.
num_cols = [c for c in num_cols
            if c in df.columns
            and c not in non_feat
            and not str(c).startswith("emb")
            and np.issubdtype(df[c].dtype, np.number)]

if not num_cols:
    print("No numeric columns (excluding emb*) available for plain-language summaries.")
    summaries = pd.DataFrame({"label": sizes.index, "size": sizes.values, "summary": "n/a"})
else:
    # z-score every feature over the whole dataset so per-cluster means are comparable.
    scaler = StandardScaler().fit(df[num_cols])
    Znum = pd.DataFrame(scaler.transform(df[num_cols]), columns=num_cols, index=df.index)

    rows = []
    for k in sorted(df[label_col].unique()):
        idx = df[label_col] == k
        if idx.sum() < 2:
            summary = "too few points"
        else:
            # Features with an undefined mean (all-NaN) cannot be ranked; drop them.
            mean_z = Znum[idx].mean().dropna().sort_values(ascending=False)
            top_pos = mean_z.index[:3].tolist()
            # Most negative feature first (the playbook only reads the first two entries),
            # and never list a feature as both ↑ and ↓ when there are few features.
            top_neg = [c for c in mean_z.index[::-1] if c not in top_pos][:3]
            pos_txt = ', '.join(map(str, top_pos))
            neg_txt = ', '.join(map(str, top_neg)) if top_neg else "n/a"
            summary = f"↑ {pos_txt} | ↓ {neg_txt}"
        rows.append({"label": int(k), "size": int(idx.sum()), "summary": summary})
    summaries = pd.DataFrame(rows).sort_values("label").reset_index(drop=True)

# Auto names & simple playbooks
def make_name(summary_text: str) -> str:
    """Derive a short display name from a cluster summary string.

    A regular summary looks like ``"↑ a, b, c | ↓ x, y, z"``; the name is the
    first two ↑ features joined with ``" & "``.

    Summaries without a ↑ section (the ``"n/a"`` / ``"too few points"``
    placeholders) carry no feature names, so they map to a generic name
    instead of being echoed back as if they were features. The fallback is
    ``"Noise"`` when the text mentions -1 (ASCII hyphen or U+2212 minus),
    otherwise ``"Cluster"``.
    """
    text = str(summary_text)
    fallback = "Noise" if ("-1" in text or "−1" in text) else "Cluster"
    if "↑" not in text:
        return fallback
    head = text.split("|")[0].replace("↑", "")
    ups = [t.strip() for t in head.split(",") if t.strip()][:2]
    return " & ".join(ups) if ups else fallback

def make_playbook(summary_text: str) -> str:
    """Turn a cluster summary string into a one-line action suggestion.

    A regular summary looks like ``"↑ a, b, c | ↓ x, y, z"``. Up to three ↑
    features go into "Lean into …" and up to two ↓ features into "watch …".

    Only segments that actually start with ↑ / ↓ are parsed, so placeholder
    summaries (``"n/a"``, ``"too few points"``) no longer produce text such as
    "Lean into too few points". When no features can be extracted a neutral
    sentence is returned instead.
    """
    def _features(segment: str, marker: str) -> list:
        # Split "↑ a, b" into ["a", "b"], ignoring blanks and the "n/a" placeholder.
        items = (t.strip() for t in segment.replace(marker, "").split(","))
        return [t for t in items if t and t != "n/a"]

    ups, downs = [], []
    for segment in str(summary_text).split("|"):
        segment = segment.strip()
        if segment.startswith("↑"):
            ups = _features(segment, "↑")
        elif segment.startswith("↓"):
            downs = _features(segment, "↓")

    clauses = []
    if ups:
        clauses.append(f"Lean into {', '.join(ups[:3])}")
    if downs:
        clauses.append(f"watch {', '.join(downs[:2])}")
    if not clauses:
        return "No feature profile available; review this cluster manually."

    lead = "; ".join(clauses)
    lead = lead[0].upper() + lead[1:]  # capitalise when the sentence starts with "watch"
    return f"{lead}. Pilot targeted actions and track next-period lift."

summaries["name"] = summaries["summary"].apply(make_name)
summaries["playbook"] = summaries["summary"].apply(make_playbook)

# Label -1 is the noise bucket, not a segment: make_name only sees the summary
# text (never the label), so name the row explicitly and point it at the noise
# triage plan instead of giving it a "Lean into ..." playbook.
is_noise_row = summaries["label"] == -1
summaries.loc[is_noise_row, "name"] = "Noise"
summaries.loc[is_noise_row, "playbook"] = "Not a segment; handle via the noise triage plan."
display(summaries)

In [None]:
# Silhouette on non-noise points (the bootstrap stability check follows below)
labels = df[label_col].values
mask_nn = labels != -1  # True for points that were assigned to a cluster

# Silhouette needs at least two points spread over at least two clusters.
if mask_nn.sum() <= 1 or len(np.unique(labels[mask_nn])) <= 1:
    print("Not enough non-noise clusters for silhouette.")
else:
    try:
        sil_nn = silhouette_score(
            (X if isinstance(X, np.ndarray) else X.values)[mask_nn],
            labels[mask_nn],
        )
        print(f"Silhouette (non-noise): {sil_nn:.3f}")
    except Exception as e:
        print("Silhouette failed:", e)

# Bootstrap stability: fit DBSCAN on resampled data and compute ARI vs the original labels on the sampled rows
def dbscan_bootstrap_ari(Xarr, base_labels, eps, min_samples=10, n_boot=8, frac=0.8, seed=42, replace=True):
    """Estimate DBSCAN label stability with resampling.

    For each of ``n_boot`` rounds, draw ``int(n * frac)`` row positions, refit
    DBSCAN on those rows and compare the new labels with ``base_labels`` for
    the same rows using the adjusted Rand index.

    Parameters
    ----------
    Xarr : array-like, shape (n_samples, n_features)
        Feature matrix the base clustering was fitted on.
    base_labels : array-like, shape (n_samples,)
        Labels of the base clustering. Converted to an ndarray so a pandas
        Series is indexed by position rather than by index label.
    eps, min_samples :
        Passed to ``DBSCAN``.
    n_boot : int
        Number of resampling rounds.
    frac : float
        Fraction of rows drawn per round.
    seed : int
        Seed for the ``RandomState`` that draws the samples.
    replace : bool, default True
        Sample with replacement (original behaviour). Duplicated rows sit at
        distance 0 from each other and therefore raise local density; pass
        ``False`` to subsample without duplicates.

    Returns
    -------
    numpy.ndarray of shape (n_boot,) with one ARI per round.

    Raises
    ------
    ValueError
        If ``base_labels`` does not have one entry per row of ``Xarr`` or the
        requested sample would be empty.
    """
    Xarr = np.asarray(Xarr)
    base_labels = np.asarray(base_labels)
    n = Xarr.shape[0]
    if base_labels.shape[0] != n:
        raise ValueError(f"base_labels has {base_labels.shape[0]} entries but Xarr has {n} rows.")
    sample_size = int(n * frac)
    if sample_size < 1:
        raise ValueError(f"Sample size int({n} * {frac}) is 0; need at least one row per round.")

    rng = np.random.RandomState(seed)
    aris = []
    for _ in range(n_boot):
        idx = rng.choice(n, sample_size, replace=replace)
        db_b = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1).fit(Xarr[idx])
        # Compare labels on sampled indices
        aris.append(adjusted_rand_score(base_labels[idx], db_b.labels_))
    return np.array(aris)

# The bootstrap check refits DBSCAN, so it only applies when the DBSCAN labels are in use.
if label_col != "cluster_db":
    print("Bootstrap stability for HDBSCAN not implemented here (requires prediction utilities).")
else:
    Xarr = X if isinstance(X, np.ndarray) else X.values
    aris = dbscan_bootstrap_ari(Xarr, labels, eps=eps, min_samples=10, n_boot=10)
    print(f"Bootstrap ARI (DBSCAN): mean={aris.mean():.3f} ± {aris.std():.3f}")

In [None]:
# Noise triage plan example (owners & SLA)
# Share and count of rows labelled -1 (noise) under the active label column.
noise_mask = df[label_col].eq(-1)
noise_rate = noise_mask.mean()
print(f"Noise label rate: {noise_rate:.3%}  (count = {noise_mask.sum()})")

# Static example plan: one row per follow-up action.
triage = pd.DataFrame(
    [
        ("P1 (weekly)", "Data Eng",
         "Check feature scaling/missingness; recompute KD plot; verify eps/min_samples.", "7 days"),
        ("P1 (weekly)", "Analytics",
         "Sample 50 noise points; inspect PCA distance; label drift check.", "7 days"),
        ("P2 (monthly)", "Marketing Ops",
         "If segment-skewed noise >10pp, adjust routing thresholds or add rules.", "30 days"),
        ("P3 (quarter)", "Product",
         "Consider richer features or alt. algorithm (HDBSCAN/OPTICS).", "90 days"),
    ],
    columns=["priority", "owner", "action", "SLA"],
)
display(triage)

In [None]:
# Routing function/API for new points
# DBSCAN labels -> nearest-core-sample router (predict-like); otherwise HDBSCAN approximate_predict
if label_col != "cluster_db":
    # HDBSCAN: use approximate_predict if available
    try:
        import hdbscan
        from hdbscan.prediction import approximate_predict

        def route_hdbscan(x_new):
            """
            Route a single point with HDBSCAN's approximate_predict.
            Returns a (label, strength) tuple.
            """
            query = np.asarray(x_new).reshape(1, -1)
            lab, strength = approximate_predict(hdb, query)
            return int(lab[0]), float(strength[0])

        lab_ex, s_ex = route_hdbscan(X[0] if isinstance(X, np.ndarray) else X.values[0])
        print("HDBSCAN router example:", lab_ex, "strength:", round(s_ex, 3))
    except Exception as e:
        print("HDBSCAN routing not available:", e)
else:
    est = DBSCAN(eps=eps, min_samples=10, n_jobs=-1).fit(X if isinstance(X, np.ndarray) else X.values)
    core_idx = getattr(est, "core_sample_indices_", np.array([], dtype=int))

    # Defaults for the "no core samples" case; replaced below when cores exist.
    core_labels = np.array([])
    nn = None
    if core_idx.size > 0:
        core_labels = est.labels_[core_idx]
        # 1-NN index over the core samples only.
        nn = NearestNeighbors(n_neighbors=1).fit((X if isinstance(X, np.ndarray) else X.values)[core_idx])

    def route_dbscan(x_new, max_dist=None):
        """
        Return the cluster label of the core sample closest to x_new.
        Returns -1 (noise) when there are no core samples, or when max_dist
        is given and the closest core sample is farther away than max_dist.
        x_new: array-like shape (n_features,)
        """
        if nn is None or core_idx.size == 0:
            return -1
        query = np.asarray(x_new).reshape(1, -1)
        dist, pos = nn.kneighbors(query, n_neighbors=1, return_distance=True)
        too_far = (max_dist is not None) and (dist[0, 0] > max_dist)
        if too_far:
            return -1
        return int(core_labels[pos[0, 0]])

    # Example usage:
    example_label = route_dbscan((X[0] if isinstance(X, np.ndarray) else X.values[0]), max_dist=eps)
    print("DBSCAN router example label:", example_label)

In [None]:
# Fairness audit focused on NOISE and playbook exposure
# Pick candidate segment columns (low-cardinality categoricals first, then any
# other column with few distinct values). The label column itself is excluded.
MAX_CATEGORICAL_LEVELS = 50   # cap for text/categorical columns; filters out id-like or free-text columns
MAX_OTHER_LEVELS = 8          # cap for every other dtype (original threshold)


def _is_categorical_like(series):
    """True for object, string or pandas Categorical columns."""
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype.
    return (pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype))


cand = [c for c in df.columns
        if c != label_col
        and _is_categorical_like(df[c])
        and df[c].nunique() <= MAX_CATEGORICAL_LEVELS]
cand += [c for c in df.columns
         if c != label_col
         and c not in cand
         and not _is_categorical_like(df[c])
         and df[c].nunique() <= MAX_OTHER_LEVELS]
cand = list(dict.fromkeys(cand))  # dedupe, keep order

def segment_table(col):
    """Per-segment noise statistics for the column ``col`` of the global ``df``.

    Rows are the distinct values of ``df[col]`` (missing values form their own
    group). Columns:

    - ``n``: number of rows in the segment (integer),
    - ``noise_rate``: share of rows whose ``label_col`` equals -1,
    - ``clustered_rate``: share of rows with any other label.

    The table is sorted by ``noise_rate``, highest first. Reads the
    module-level ``df`` and ``label_col``.
    """
    is_noise = df[label_col].eq(-1)
    # Named aggregation keeps `n` an integer; building a mixed pd.Series per
    # group (as groupby.apply would) upcasts the count to float.
    out = is_noise.groupby(df[col], dropna=False).agg(n="size", noise_rate="mean")
    out["clustered_rate"] = 1.0 - out["noise_rate"]
    return out.sort_values("noise_rate", ascending=False)

# Build one noise-rate table per candidate segment column and flag large disparities.
audits = {}
for c in cand[:6]:  # limit to a few
    try:
        tab = segment_table(c)
    except Exception as e:
        # Keep the audit best-effort, but say which segment was skipped and why
        # instead of dropping it silently.
        print(f"\n== Segment: {c} == skipped ({type(e).__name__}: {e})")
        continue
    audits[c] = tab
    print(f"\n== Segment: {c} ==")
    display(tab)
    # Simple flag if disparity > 10 percentage points between max and min noise_rate
    if tab["noise_rate"].max() - tab["noise_rate"].min() > 0.10:
        print("Disparity > 10pp in noise rate across segments — review routing thresholds/features.")

# Optional: list which segments are over/under-exposed to each cluster playbook (excluding noise)
if "name" in summaries.columns:
    # Attach each non-noise row's cluster name via its label.
    exposed = df[df[label_col] != -1].merge(
        summaries[["label", "name"]], left_on=label_col, right_on="label", how="left"
    )
    for c in cand[:4]:
        try:
            share = exposed.pivot_table(index=c, columns="name", values="label", aggfunc='count', fill_value=0)
            share = share.div(share.sum(axis=1), axis=0)  # normalize per segment (rows sum to 1)
        except Exception as e:
            # Best-effort: report the skipped segment (e.g. a column-name clash
            # after the merge) rather than hiding the failure.
            print(f"\nPlaybook exposure by segment: {c} skipped ({type(e).__name__}: {e})")
            continue
        print(f"\nPlaybook exposure by segment: {c}")
        display(share.round(3))