In [2]:
# bertopic_run.py
from pathlib import Path
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import umap

# Input/output locations, relative to the working directory.
DATA = Path("data")
REPORTS = Path("reports")
# Create the reports folder up front; no error if it already exists.
REPORTS.mkdir(parents=True, exist_ok=True)



ModuleNotFoundError: No module named 'bertopic'

In [None]:
# --- Load texts + precomputed embeddings ---
df = pd.read_parquet(DATA / "shows_with_cluster_labels.parquet")
# Only rows with a non-blank summary can be modelled.
df = df[df["ai_summary"].notna() & (df["ai_summary"].str.strip() != "")].reset_index(drop=True)

# Precomputed SentenceTransformer embeddings. Row i of X is assumed to belong
# to row i of vectors/summaries_index.parquet — TODO confirm against the
# script that wrote both files.
X = np.load("vectors/summaries.npy")
idx = pd.read_parquet("vectors/summaries_index.parquet")
if len(idx) != len(X):
    raise ValueError(
        f"Embedding index has {len(idx)} rows but the embedding matrix has {len(X)}"
    )

# Align to df by id: record each id's row position in X, then look those
# positions up in df order. (Using the merged frame's own 0..n-1 index would
# just select the first n rows of X, regardless of which ids they belong to.)
idx_rows = idx[["id"]].assign(row=np.arange(len(idx))).drop_duplicates("id")
# Left merge keeps exactly one output row per df row, in df order.
align = pd.merge(df[["id"]], idx_rows, on="id", how="left")

# Shows without a stored vector cannot be passed to BERTopic; drop them so
# texts and embeddings stay the same length and order.
has_vec = align["row"].notna().to_numpy()
if not has_vec.all():
    print(f"Dropped {int((~has_vec).sum())} shows without a precomputed embedding")
    df = df[has_vec].reset_index(drop=True)

mask = align.loc[has_vec, "row"].to_numpy().astype(np.intp)  # rows of X, in df order
embeddings = X[mask]                                         # (n, d)
texts = df["ai_summary"].tolist()

print(f"Texts: {len(texts)} | Embeddings: {embeddings.shape}")

# --- Model components ---
# UMAP projects the embeddings down to 5 dimensions before clustering;
# the fixed random_state keeps runs repeatable.
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

# Bag-of-words settings used for the per-topic term representation.
# NOTE(review): BERTopic appears to fit this vectorizer on one concatenated
# document per topic, so min_df/max_df would be counted in topics rather than
# in summaries — confirm these thresholds behave as intended when few topics
# are found.
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
)

# --- Assemble BERTopic from the components above ---
topic_model = BERTopic(
    umap_model=umap_model,              # dimensionality reduction
    vectorizer_model=vectorizer_model,  # c-TF-IDF representation
    min_topic_size=10,                  # tune to the dataset size
    nr_topics=None,                     # set an int to force a target number of topics
    seed_topic_list=None,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the summaries, passing the precomputed embeddings alongside the texts.
topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)
df["bertopic_topic"] = topics
# set() instead of pd.unique(): pandas 2.x deprecates pd.unique on plain lists.
print("Unique topics (incl. -1):", sorted(set(topics)))

# --- Topic info (top terms per topic) ---
topic_info = topic_model.get_topic_info()  # columns: Topic, Count, Name, Representation
topic_terms = []
# Walk the info table once, row by row, instead of re-filtering it for every topic.
for info in topic_info[["Topic", "Count", "Name"]].itertuples(index=False):
    if info.Topic == -1:  # outliers
        continue
    # list of (term, ctfidf); guard against a falsy result for an unknown topic
    terms = topic_model.get_topic(info.Topic) or []
    topic_terms.append({
        "Topic": info.Topic,
        "TopTerms": ", ".join(term for term, _ in terms[:10]),
        "Count": int(info.Count),
        "Name": info.Name,
    })
# Explicit columns keep the frame well-formed (and sortable) even when every
# document was classed as an outlier and topic_terms is empty.
topic_terms_df = pd.DataFrame(
    topic_terms, columns=["Topic", "TopTerms", "Count", "Name"]
).sort_values("Count", ascending=False)

# --- Persist per-show topic assignments and the per-topic term table ---
out_shows = DATA / "shows_with_bertopic.parquet"
out_topics = DATA / "bertopic_topics.csv"
df.to_parquet(out_shows, index=False)
topic_terms_df.to_csv(out_topics, index=False, encoding="utf-8")
# Report the absolute location of each file written.
for written in (out_shows, out_topics):
    print("Saved:", written.resolve())

# --- (Optional) quick text preview for the five largest topics
# Name and TopTerms are already columns of topic_terms_df, so there is no need
# to call get_topic_info() again on every iteration.
for row in topic_terms_df.head(5).itertuples(index=False):
    examples = df.loc[df["bertopic_topic"] == row.Topic, "ai_summary"].head(2).tolist()
    print(f"\nTopic {row.Topic} — {row.Name}")
    print("Top terms:", row.TopTerms)
    for e in examples:
        # Only add the ellipsis when the summary was actually cut short.
        print("-", e if len(e) <= 160 else e[:160] + "…")
