# Hierarchical Clustering

Explore agglomerative (hierarchical) clustering with Ward and Average linkage. Show a truncated dendrogram, then cut the tree either at a chosen distance or at a chosen number of clusters (k).

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, warnings
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score
from sklearn.neighbors import NearestCentroid
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cophenet
from scipy.spatial.distance import pdist

!wget -q https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Unsupervised/unsup_utils.py.py
import unsup_utils as utils
csv_path = "https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Unsupervised/unsup.csv"
warnings.filterwarnings("ignore")

df = pd.read_csv(csv_path)
X, cols, sc = utils.feature_matrix(df, use_emb=True)

# Subsample for dendrogram readability
sub = min(600, X.shape[0])
Xsub = X[:sub]

# Dendrogram on the subsample (skipped only when scipy cannot be imported).
# The guard is limited to the import so that genuine linkage/plotting errors
# surface here instead of being reported as "scipy not available".
try:
    from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
except ImportError:
    print("scipy not available for dendrogram; skipping plot.")
else:
    # Ward linkage matrix of the subsample; Z is reused by later cells.
    Z = linkage(Xsub, method="ward")
    plt.figure(figsize=(10, 4))
    # Show only the last 20 merged clusters to keep the tree readable.
    dendrogram(Z, truncate_mode="lastp", p=20, no_labels=True)
    plt.title("Truncated dendrogram (Ward)")
    plt.tight_layout()
    plt.show()

# Ward-linkage agglomerative clustering on the full matrix, k fixed at 4;
# the resulting labels are stored on the frame as "cluster_ward".
ward = AgglomerativeClustering(linkage="ward", n_clusters=4)
labels = ward.fit_predict(X)
df["cluster_ward"] = labels

# 2-D projection from the project helper, points coloured by cluster label.
X2, p = utils.pca_2d(X)
utils.plot_xy(
    X2,
    labels=df["cluster_ward"].values,
    title="PCA (colored by Ward clusters)",
)
# Final expression of the cell: cluster x doc_type contingency table.
pd.crosstab(index=df["cluster_ward"], columns=df["doc_type"])

In [None]:
# Propose a coarse and a fine cut of the subsample tree from its merge heights
# (third column of the linkage matrix Z).
heights = Z[:, 2]
# Heuristic: coarse cut at the 95th percentile of merge heights, fine cut at
# the 85th percentile.
h_coarse, h_fine = np.percentile(heights, [95, 85])
# Flat cluster labels on the subsample for each cut height
labs_coarse = fcluster(Z, h_coarse, criterion="distance")
labs_fine = fcluster(Z, h_fine, criterion="distance")
k_coarse = np.unique(labs_coarse).size
k_fine = np.unique(labs_fine).size
print(f"Coarse cut @ height≈{h_coarse:.2f} → k={k_coarse}")
print(f"Fine   cut @ height≈{h_fine:.2f} → k={k_fine}")

# Short explanation printed alongside the numbers
print("Rationale: Coarse cut targets large merges near the top (big height jump) to yield few, robust clusters;")
print("Fine cut sets a lower height to capture secondary structure without over-fragmenting.")

In [None]:
# Cluster sizes & plain-language summaries
cluster_col = "cluster_ward"
sizes = df[cluster_col].value_counts().sort_index()
display(pd.DataFrame({"cluster": sizes.index, "size": sizes.values}))

# Numeric columns only; skip label/identifier columns and anything whose name
# starts with "emb".
non_feat = {cluster_col, "doc_type", "ts", "id"}
num_cols = [
    c for c in df.columns
    if c not in non_feat
    and np.issubdtype(df[c].dtype, np.number)
    and not c.startswith("emb")
]

# Always define `summaries` (empty by default) so code that reads it afterwards
# does not hit a NameError when there is nothing to summarize.
summaries = pd.DataFrame(columns=["cluster", "size", "summary"])

if not num_cols:
    print("No usable numeric columns for summaries; skipping.")
else:
    # Z-score every numeric column over the whole frame so per-cluster means
    # are comparable across features.
    scaler = StandardScaler().fit(df[num_cols])
    Znum = pd.DataFrame(scaler.transform(df[num_cols]), columns=num_cols, index=df.index)

    summary_rows = []
    for k in sorted(df[cluster_col].unique()):
        idx = df[cluster_col] == k
        # Per-cluster mean z-score, highest first.
        mean_z = Znum[idx].mean().sort_values(ascending=False)
        top_pos = mean_z.head(3).index.tolist()
        # Take the bottom features only from what remains after the top three,
        # so a feature is never listed as both "up" and "down" when there are
        # fewer than six numeric columns. Order stays descending (lowest last).
        top_neg = mean_z.iloc[len(top_pos):].tail(3).index.tolist()
        summary = f"↑ {', '.join(top_pos)} | ↓ {', '.join(top_neg)}"
        summary_rows.append({
            "cluster": k,
            "size": int(idx.sum()),
            "summary": summary,
        })

    summaries = pd.DataFrame(summary_rows).sort_values("cluster").reset_index(drop=True)
    display(summaries)

In [None]:
# Internal validity & cophenetic correlation
# Scores are computed on the full X with the current labels. Every metric is
# guarded the same way: when one cannot be computed (e.g. fewer than two
# clusters, or not enough memory) it is reported and shown as NaN instead
# of aborting the cell or failing silently.
labels_eval = df[cluster_col].values
metric_scores = {}
for metric_key, metric_fn in (
    ("sil", silhouette_score),
    ("db", davies_bouldin_score),
    ("ch", calinski_harabasz_score),
):
    try:
        metric_scores[metric_key] = metric_fn(X, labels_eval)
    except (ValueError, MemoryError) as err:
        print(f"{metric_fn.__name__} unavailable: {err}")
        metric_scores[metric_key] = np.nan
sil = metric_scores["sil"]
db = metric_scores["db"]
ch = metric_scores["ch"]
print(f"Silhouette: {sil:.3f}  (higher is better)")
print(f"Davies-Bouldin: {db:.3f}  (lower is better)")
print(f"Calinski–Harabasz: {ch:.1f}  (higher is better)")

# Cophenetic correlation (how well the tree preserves distances)
# Pairwise distances and the Ward tree are both built from the subsample Xsub;
# despite its name, Z_full is not computed on the full X.
D = pdist(Xsub)
Z_full = linkage(Xsub, method="ward")
coph_corr, coph_dists = cophenet(Z_full, D)
print(f"Cophenetic correlation (Ward/tree fidelity on subsample): {coph_corr:.3f}")

In [None]:
# Cluster names & playbooks (auto, using summaries w/out emb*)
def make_name(summary_text: str) -> str:
    """Build a cluster name from the first two 'up' features of a summary.

    Only the text before the first "|" is read; the "↑" marker is dropped and
    the remainder is treated as a comma-separated feature list. Returns the
    first two non-empty entries joined by " & ", or "Cluster" if there are none.
    """
    up_section = summary_text.partition("|")[0].replace("↑", "")
    features = []
    for token in up_section.split(","):
        token = token.strip()
        if token:
            features.append(token)
    if not features:
        return "Cluster"
    return " & ".join(features[:2])

def make_playbook(summary_text: str) -> str:
    """Generate a lightweight playbook sentence from a cluster summary.

    The summary is expected in the form "↑ f1, f2, f3 | ↓ g1, g2, g3": the text
    before the first "|" lists the 'up' features, the text after the last "|"
    lists the 'down' features. Up to three 'up' features and up to two 'down'
    features are mentioned.

    The 'down' list is read as ordered from least to most negative (the
    summaries built in this notebook take the tail of a descending sort), so it
    is walked from the end: the most negative feature is always included and
    comes first.

    A summary without "|" has no 'down' features, and a clause whose feature
    list is empty is left out of the sentence.
    """
    parts = summary_text.split("|")
    up_text = parts[0]
    # Without a separator there is no 'down' section; do not reuse the 'up' text.
    down_text = parts[-1] if len(parts) > 1 else ""

    ups = [t.strip() for t in up_text.replace("↑", "").split(",")]
    downs = [t.strip() for t in down_text.replace("↓", "").split(",")]
    ups = [u for u in ups if u][:3]
    # Reverse so the most negative feature leads, then keep two.
    downs = [d for d in downs if d][::-1][:2]

    clauses = []
    if ups:
        clauses.append(f"Lean into {', '.join(ups)}")
    if downs:
        clauses.append(f"watch for {', '.join(downs)}")

    lead = "; ".join(clauses)
    if lead:
        # Capitalize in case the sentence starts with the "watch for" clause.
        lead = lead[0].upper() + lead[1:] + ". "

    return lead + "Use this cluster’s signature to tailor campaigns or ops playbooks."

# Build one card per row of the summaries DataFrame: cluster id, generated
# name, size, the summary text and a generated playbook sentence.
card_columns = ["cluster", "name", "size", "summary", "playbook"]
cards = []
for cluster_id, cluster_size, summary_text in zip(
    summaries["cluster"], summaries["size"], summaries["summary"]
):
    name = make_name(summary_text)
    play = make_playbook(summary_text)
    cards.append({
        "cluster": int(cluster_id),
        "name": name,
        "size": int(cluster_size),
        "summary": summary_text,
        "playbook": play,
    })

# Explicit columns keep "cluster" present even when there are no cards, so the
# sort below works on an empty table instead of raising KeyError.
cluster_cards = (
    pd.DataFrame(cards, columns=card_columns)
    .sort_values("cluster")
    .reset_index(drop=True)
)
display(cluster_cards)

In [None]:
# Anomaly policy: long branches & late joins
# Heuristic policy (both thresholds are computed over ALL samples, not per
# cluster):
#  - Per-sample silhouette in the bottom 5% overall -> "ambiguous membership"
#  - Distance to the own cluster centroid in 2-D PCA space in the top 2%
#    overall -> "outliers / late joins"
from sklearn.metrics import silhouette_samples

# PCA to 2D for distance intuition
X2, _ = utils.pca_2d(X)
cluster_ids = df[cluster_col].values
cent = NearestCentroid().fit(X2, cluster_ids)
# Rows of centroids_ follow cent.classes_ (the sorted unique labels), so map
# each label to its row position instead of using the label value as an index;
# this stays correct when labels are not exactly 0..k-1.
centroid_row = np.searchsorted(cent.classes_, cluster_ids)
d2 = ((X2 - cent.centroids_[centroid_row]) ** 2).sum(axis=1) ** 0.5

# Per-sample silhouette; falls back to all-NaN (which disables the
# low-silhouette flag) when it cannot be computed, e.g. with fewer than two
# clusters.
try:
    sil_samp = silhouette_samples(X, cluster_ids)
except (ValueError, MemoryError) as err:
    print(f"Per-sample silhouette unavailable: {err}")
    sil_samp = np.full(X.shape[0], np.nan)

th_low_sil = np.nanpercentile(sil_samp, 5)
th_hi_dist = np.nanpercentile(d2, 98)

anomalies = pd.DataFrame({
    "low_silhouette": sil_samp < th_low_sil,
    "far_from_centroid": d2 > th_hi_dist,
}, index=df.index)
# A sample is flagged when either rule fires.
anomalies["policy_flag"] = anomalies.any(axis=1)

print(f"Policy thresholds → silhouette < {th_low_sil:.3f}, PCA-distance > {th_hi_dist:.3f}")
print("Flag rate:", anomalies["policy_flag"].mean().round(4))
display(df.loc[anomalies["policy_flag"], [cluster_col, "doc_type"]].head(10))