In [None]:
# Cell 1: assemble summary DataFrame with theme, prevalences, delta, and quotes
import pandas as pd
import joblib
from pathlib import Path

# Relative locations of the processed datasets and of the saved model files.
proc_dir = Path("data", "processed")
model_dir = Path("models")

# (A) Topic themes
# Maps each integer topic id to a human-readable label; a label's position in
# the list below is its topic id (0-based).
topic_keywords = dict(enumerate([
    "Cuisine & Prestige",
    "Desire & Evaluation",
    "Dessert & Comparisons",
    "Queue & Wait Times",
    "Awards & Hype",
    "Reservation Frustration",
    "Ambiance & Location",
    "Menu Variety (Sushi)",
]))

# (B) Topic prevalences, recomputed from the saved models for both review sets
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code; only load model files that come from a trusted source.
# NOTE(review): each slug has its own model file. Presumably the topic ids are
# comparable between the two models; confirm before interpreting the deltas.
def _dominant_topic_share(slug):
    """Return the percentage of rows per dominant topic id for one review set.

    Reads `<slug>_with_clusters.parquet` from `proc_dir` and unpacks a
    (model, vectorizer) pair from `<slug>_lda.pkl` in `model_dir`. The
    "clean_joined" column is vectorized and passed through the model, and each
    row is assigned the column index with the largest value. The result is a
    Series indexed by topic id (ascending) whose values are percentages; ids
    that no row was assigned to are absent from the index.
    """
    reviews = pd.read_parquet(proc_dir / f"{slug}_with_clusters.parquet")
    topic_model, vectorizer = joblib.load(model_dir / f"{slug}_lda.pkl")
    doc_term = vectorizer.transform(reviews["clean_joined"])
    dominant = topic_model.transform(doc_term).argmax(axis=1)
    shares = pd.Series(dominant).value_counts(normalize=True)
    return shares.sort_index() * 100


prev = {
    slug: _dominant_topic_share(slug)
    for slug in ("high_rating", "most_commented")
}

# (C) Collect up to two example quotes per topic (from high_rating)
df_hr = pd.read_parquet(proc_dir / "high_rating_with_clusters.parquet")
# Keyword list used to rank the comments of each topic (project-local module).
from src.topic_inspection import _NEG_KEYWORDS

# Maximum number of characters of a comment shown in the summary table.
QUOTE_MAX_CHARS = 100


def _shorten_quote(text, limit=QUOTE_MAX_CHARS):
    """Return `text` cut to `limit` characters.

    The ellipsis is appended only when characters were actually removed, so a
    short comment is reproduced verbatim instead of looking truncated.
    """
    return text if len(text) <= limit else text[:limit] + "…"


def _keyword_hits(text):
    """Count how many entries of _NEG_KEYWORDS occur in `text`.

    Assumes `text` is a string, so `in` is a substring test — TODO confirm that
    "clean_joined" never contains missing values.
    """
    return sum(kw in text for kw in _NEG_KEYWORDS)


quotes = {}
for t in sorted(df_hr["dominant_topic"].unique()):
    sub = df_hr[df_hr["dominant_topic"] == t]
    ranked = sub.assign(hits=sub["clean_joined"].map(_keyword_hits))
    # A stable sort keeps the original row order among equal hit counts, so the
    # two selected quotes are reproducible from run to run.
    tops = (
        ranked.sort_values("hits", ascending=False, kind="mergesort")
        .head(2)["comment"]
        .tolist()
    )
    quotes[t] = " / ".join(_shorten_quote(q) for q in tops)

# (D) Build the summary table: one row per labelled topic, with the share of
# each review set, their difference, and the example quotes.
rows = []
# Iterate over the mapping itself rather than range(len(...)), so the topic ids
# do not have to be exactly 0..n-1 for the lookup to succeed.
for t, theme in topic_keywords.items():
    # Series.get falls back to 0.0 for a topic id that is absent from the index.
    hr = prev["high_rating"].get(t, 0.0)
    mc = prev["most_commented"].get(t, 0.0)
    rows.append({
        "Topic ID": t,
        "Theme": theme,
        "HighRating (%)": round(hr, 1),
        "MostCommented (%)": round(mc, 1),
        # The difference is rounded from the unrounded shares, so it can differ
        # by 0.1 from the difference of the two displayed columns.
        "Δ (%)": round(hr - mc, 1),
        "Example Quotes": quotes.get(t, ""),
    })

# Largest positive difference first.
summary = pd.DataFrame(rows).sort_values("Δ (%)", ascending=False)
summary

: 

In [None]:
# Cell 2a: save the summary table to CSV (row index omitted)
out_csv = proc_dir / "topic_summary.csv"
summary.to_csv(out_csv, index=False)
print("Saved summary CSV to", out_csv)

# Cell 2b: display as Markdown for copy-paste
# `display` is imported explicitly so the cell does not depend on the kernel
# having injected it into the namespace.
from IPython.display import Markdown, display
# DataFrame.to_markdown requires the optional `tabulate` package.
md = summary.to_markdown(index=False)
display(Markdown(md))

In [None]:
# Cell 3: horizontal bar chart of Δ (%)
import matplotlib.pyplot as plt

# One horizontal bar per theme: green when the difference is positive, red
# otherwise (including zero). A thin vertical line marks zero.
fig, ax = plt.subplots(figsize=(8, 5))
deltas = summary["Δ (%)"]
bar_colors = ["#2ca02c" if delta > 0 else "#d62728" for delta in deltas]
ax.barh(summary["Theme"], deltas, color=bar_colors)
ax.axvline(0, color="black", linewidth=0.8)
ax.set_title("High-Rating vs Most-Commented Δ by Topic")
ax.set_xlabel("Δ = HighRating – MostCommented (%)")
fig.tight_layout()
plt.show()