In [3]:
# train_bertopic_unified.py
import os
import json
import pandas as pd
from tqdm import tqdm
import umap
import hdbscan
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# ============================
# Paths
# ============================
CORPUS_CSV = r"D:\newsprep2\data\merged\unified_corpus.csv"
MODELS_DIR = r"D:\newsprep2\backend\app\ml\models\topics"
os.makedirs(MODELS_DIR, exist_ok=True)

# ============================
# Load Data
# ============================
df = pd.read_csv(CORPUS_CSV)

# Upper bound on training documents for faster first-time testing.
# NOTE: this keeps the FIRST rows of the CSV, it is not a random sample.
# Set to None to train on the whole corpus.
MAX_DOCS = 20000

# Drop missing / blank texts *before* casting to str; otherwise NaN cells
# become the literal document "nan" and end up in the topic model.
texts = df["text"].dropna().astype(str).str.strip()
docs = texts[texts != ""].tolist()
if MAX_DOCS is not None:
    docs = docs[:MAX_DOCS]

print("Documents loaded:", len(docs))

# ============================
# Embedding Model
# ============================
EMBEDDER = "all-mpnet-base-v2"   # strong but slower
# EMBEDDER = "all-MiniLM-L6-v2"  # faster for CPU

print(f"Loading SBERT embedder: {EMBEDDER}")
sbert = SentenceTransformer(EMBEDDER)

# ============================
# UMAP + HDBSCAN (legacy compatible)
# ============================
# Fixed seed so that repeated runs produce the same reduced embeddings and
# therefore the same topics (the LDA cell of this notebook is seeded with 42
# as well). Trade-off: a seeded UMAP may run with less parallelism.
RANDOM_STATE = 42

print("Building UMAP model...")
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    metric="cosine",
    low_memory=True,
    random_state=RANDOM_STATE
)

print("Building HDBSCAN model...")
# Clusters the UMAP output; prediction_data=True is kept from the original
# configuration (presumably needed to assign topics to new documents later).
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

# ============================
# BERTopic Model
# ============================
print("Initializing BERTopic...")

# Constructor options gathered in one place: the pre-built embedder, UMAP and
# HDBSCAN components plus the topic-representation settings.
bertopic_options = dict(
    embedding_model=sbert,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True,
)
topic_model = BERTopic(**bertopic_options)

# ============================
# Train Model
# ============================
n_training_docs = len(docs)
print("Training BERTopic model on", n_training_docs, "documents...")
# fit_transform returns the topic assignments and the probabilities for docs.
topics, probs = topic_model.fit_transform(docs)
print("Training complete!")

# ============================
# Save Model
# ============================
save_path = os.path.join(MODELS_DIR, "bertopic_unified")
print("Saving BERTopic model to:", save_path)

topic_model.save(save_path)

# ============================
# Topic Info
# ============================
# Topic overview table, exported both as CSV and as a JSON list of records.
topic_info = topic_model.get_topic_info()

info_csv_path = os.path.join(MODELS_DIR, "bertopic_info.csv")
info_json_path = os.path.join(MODELS_DIR, "bertopic_info.json")
topic_info.to_csv(info_csv_path, index=False)
topic_info.to_json(info_json_path, orient="records", force_ascii=False)

# Save topic keywords: topic id -> list of its words; topic -1 is skipped.
keywords = {
    int(topic_id): [word for word, _score in topic_model.get_topic(topic_id)]
    for topic_id in topic_info.Topic.tolist()
    if topic_id != -1
}

keywords_path = os.path.join(MODELS_DIR, "bertopic_keywords.json")
with open(keywords_path, "w", encoding="utf-8") as f:
    json.dump(keywords, f, indent=2, ensure_ascii=False)

# ============================
# Diagnostics
# ============================
# Share of training documents assigned to topic -1.
frac_outliers = (pd.Series(topics) == -1).mean()
print(f"Outliers Rate (-1): {frac_outliers:.4f}")
print("Sample topic info:\n", topic_info.head())

# ============================
# Optional: Reduce Topics
# ============================
NR_TOPICS_REDUCED = 50  # target number of topics after merging

print(f"\nReducing topics to {NR_TOPICS_REDUCED} (optional)...")
# NOTE(review): reduce_topics appears to update `topic_model` in place and
# return the same object, so after this line `topic_model` and `reduced_model`
# both refer to the reduced model, while `topics`, `topic_info` and the files
# written earlier still describe the unreduced one. The unreduced model stays
# available on disk under "bertopic_unified". Confirm against the installed
# BERTopic version.
reduced_model = topic_model.reduce_topics(docs, nr_topics=NR_TOPICS_REDUCED)
reduced_save_path = os.path.join(MODELS_DIR, "bertopic_unified_reduced")
reduced_model.save(reduced_save_path)

print("Reduced model saved at:", reduced_save_path)
print("\n🎉 BERTopic training completed successfully!")


Documents loaded: 20000
Loading SBERT embedder: all-mpnet-base-v2
Building UMAP model...
Building HDBSCAN model...
Initializing BERTopic...
Training BERTopic model on 20000 documents...


2025-11-21 13:48:41,890 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/625 [00:00<?, ?it/s]

2025-11-21 14:04:43,837 - BERTopic - Embedding - Completed ✓
2025-11-21 14:04:43,890 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-21 14:05:30,075 - BERTopic - Dimensionality - Completed ✓
2025-11-21 14:05:30,079 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-21 14:06:27,790 - BERTopic - Cluster - Completed ✓
2025-11-21 14:06:27,828 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-21 14:06:31,315 - BERTopic - Representation - Completed ✓


Training complete!
Saving BERTopic model to: D:\newsprep2\backend\app\ml\models\topics\bertopic_unified


2025-11-21 14:06:53,599 - BERTopic - Topic reduction - Reducing number of topics
2025-11-21 14:06:53,649 - BERTopic - Representation - Fine-tuning topics using representation models.


Outliers Rate (-1): 0.2276
Sample topic info:
    Topic  Count                                Name  \
0     -1   4553                    -1_the_to_of_and   
1      0    560       0_season_quarterback_ap_coach   
2      1    519  1_bush_kerry_convention_john kerry   
3      2    514             2_najaf_sadr_iraq_iraqi   
4      3    244           3_fund_mutual_enron_funds   

                                      Representation  \
0      [the, to, of, and, in, for, on, with, 39, is]   
1  [season, quarterback, ap, coach, ap ap, opener...   
2  [bush, kerry, convention, john kerry, presiden...   
3  [najaf, sadr, iraq, iraqi, cleric, shrine, al ...   
4  [fund, mutual, enron, funds, securities, shell...   

                                 Representative_Docs  
0  [Stocks Fall on Security, Earnings Worries  NE...  
1  [Warner Snaps Back As rookie Eli Manning strug...  
2  [Anti-Kerry Ads Not Unfair, Laura Bush Says (R...  
3  [Militants Remove Arms From Najaf Shrine NAJAF...  
4  [Two In

2025-11-21 14:06:56,972 - BERTopic - Representation - Completed ✓
2025-11-21 14:06:56,982 - BERTopic - Topic reduction - Reduced number of topics from 293 to 50


Reduced model saved at: D:\newsprep2\backend\app\ml\models\topics\bertopic_unified_reduced

🎉 BERTopic training completed successfully!


In [None]:
from bertopic import BERTopic
import os

MODELS_DIR = r"D:\newsprep2\backend\app\ml\models\topics"
# Load the (unreduced) model written by the training cell.
topic_model = BERTopic.load(os.path.join(MODELS_DIR, "bertopic_unified"))

# Interactive map of the topics. It is rendered once here; the original cell
# ended with a second visualize_topics() call that drew the same figure again.
fig = topic_model.visualize_topics()
fig.show()   # in notebook, this will open interactive plot

# Bar charts of the highest-scoring keywords for the top 20 topics
# (this is a per-topic keyword view, not per-document probabilities).
fig2 = topic_model.visualize_barchart(top_n_topics=20)
fig2.show()


In [None]:
# assign_topics_to_corpus.py
# Loads the trained BERTopic model, assigns a topic to every row of the
# corpus CSV and writes the corpus back out with two extra columns.
import os, pandas as pd
import numpy as np
from bertopic import BERTopic

CSV = r"D:\newsprep2\data\merged\unified_corpus.csv"

MODELS_DIR = r"D:\newsprep2\backend\app\ml\models\topics"
# Same directory as CSV; the separator is now consistent with the other paths.
OUT_CSV = r"D:\newsprep2\data\merged\unified_corpus_with_topics.csv"

topic_model = BERTopic.load(os.path.join(MODELS_DIR, "bertopic_unified"))

df = pd.read_csv(CSV)
# Keep one document per row so the results line up with df; missing texts
# become empty strings instead of the literal word "nan".
docs = df["text"].fillna("").astype(str).tolist()


def _per_doc_probability(probs, n_docs):
    """Collapse BERTopic probabilities to one float (or None) per document.

    `probs` may be None, a 1-D array with one value per document, or a 2-D
    (documents x topics) matrix. The model loaded here was trained with
    calculate_probabilities=True, so the 2-D case is the expected one. For a
    matrix, the row maximum (probability of the strongest topic) is used.
    """
    if probs is None:
        return [None] * n_docs
    arr = np.asarray(probs, dtype=float)
    if arr.ndim == 2:
        if arr.shape[1] == 0:
            return [None] * n_docs
        arr = arr.max(axis=1)
    return arr.tolist()


# Transform in batches to avoid memory issues
batch_size = 5000
topic_ids = []
topic_probs = []
for start in range(0, len(docs), batch_size):
    batch = docs[start:start + batch_size]
    t, p = topic_model.transform(batch)
    topic_ids.extend(int(x) for x in t)
    # The previous float(x)-per-row conversion raised on 2-D output and the
    # broad `except` silently turned the whole column into None.
    topic_probs.extend(_per_doc_probability(p, len(t)))

# Guard against a silent misalignment between results and corpus rows.
assert len(topic_ids) == len(df) == len(topic_probs)

df["bertopic_topic"] = topic_ids
df["bertopic_topic_prob"] = topic_probs
df.to_csv(OUT_CSV, index=False)
print("Saved:", OUT_CSV)


In [None]:
# train_lda.py
# Trains a gensim LDA model on the unified corpus and stores the model, its
# dictionary and the top words of each topic under OUT_DIR.
import os, json
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
from ml.data_preprocessing import clean_text   # if running from ml/ adjust path

CORPUS_CSV = "data/merged/unified_corpus.csv"
OUT_DIR = "backend/app/ml/models/topics/lda"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(CORPUS_CSV)
# Use the cleaned text if present, else preprocess raw.
# Missing cells become "" (no tokens) rather than the literal token "nan".
if "clean_text" in df.columns:
    cleaned_docs = df["clean_text"].fillna("").astype(str).tolist()
else:
    cleaned_docs = [clean_text(t) for t in df["text"].fillna("").astype(str).tolist()]
# Whitespace-tokenise every document; both branches above yield plain strings,
# so there is no need to inspect texts[0] (which fails on an empty corpus).
texts = [doc.split() for doc in cleaned_docs]

# Create Dictionary & Corpus
id2word = corpora.Dictionary(texts)
# Drop tokens seen in fewer than 10 documents or in more than half of them,
# and cap the vocabulary at 50,000 entries.
id2word.filter_extremes(no_below=10, no_above=0.5, keep_n=50000)
corpus = [id2word.doc2bow(t) for t in texts]

# Train LDA
NUM_TOPICS = 20   # tune
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=NUM_TOPICS, random_state=42, passes=10, chunksize=2000)

# Save model & topics
lda_model.save(os.path.join(OUT_DIR, "lda.model"))
id2word.save(os.path.join(OUT_DIR, "id2word.dict"))

# Topic index -> its 10 top words.
lda_topics = {i: [word for word,_ in lda_model.show_topic(i, topn=10)] for i in range(NUM_TOPICS)}
with open(os.path.join(OUT_DIR, "lda_topics.json"), "w", encoding="utf-8") as f:
    json.dump(lda_topics, f, ensure_ascii=False, indent=2)

print("LDA topics saved to", OUT_DIR)


In [None]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Build the visualisation data from the LDA model, bag-of-words corpus and
# dictionary created in the LDA training cell.
vis = gensimvis.prepare(lda_model, corpus, id2word)

# Save a standalone HTML copy first ...
pyLDAvis.save_html(vis, "backend/app/ml/models/topics/lda/lda_vis.html")

# ... and keep display() as the LAST expression of the cell: the notebook only
# renders the value of the final expression, so a display() call in the middle
# of the cell produced no visible output.
pyLDAvis.display(vis)


In [None]:
# generate_topic_labels.py
# Builds a short human-readable label per topic from its top keywords and
# writes the mapping to bertopic_labels.json.
import json
import os  # needed for os.path.join below; was only available via earlier cells
from bertopic import BERTopic
MODELS_DIR = "backend/app/ml/models/topics"
topic_model = BERTopic.load(os.path.join(MODELS_DIR, "bertopic_unified"))
topic_info = topic_model.get_topic_info()

# Auto-generate short labels using top keywords
labels = {}
for t in topic_info.Topic:
    if t == -1:
        # Topic -1 gets a fixed label instead of keywords.
        labels[-1] = "No clear topic"
        continue
    words = topic_model.get_topic(t)
    # Join the first four keywords, e.g. "word1 / word2 / word3 / word4".
    label = " / ".join([w for w,_ in words[:4]])
    labels[int(t)] = label

labels_path = os.path.join(MODELS_DIR, "bertopic_labels.json")
with open(labels_path, "w", encoding="utf-8") as f:
    json.dump(labels, f, ensure_ascii=False, indent=2)

print("Saved topic labels:", labels_path)


In [None]:
# fallback_nearest_topics.py
# Fallback topic lookup: embed a text with SBERT and return the topics whose
# embeddings are most similar to it.
import os  # needed for os.path.join below; was only available via earlier cells

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

MODELS_DIR = "backend/app/ml/models/topics"
topic_model = BERTopic.load(os.path.join(MODELS_DIR, "bertopic_unified"))
# Must be the same embedder the topic model was trained with.
sbert = SentenceTransformer("all-mpnet-base-v2")

# ensure topic embeddings exist
if getattr(topic_model, "topic_embeddings_", None) is not None:
    topic_embs = np.asarray(topic_model.topic_embeddings_)
    topic_info = topic_model.get_topic_info()

    # topic_embeddings_ is expected to hold one row per topic in ascending
    # topic-id order, INCLUDING the outlier topic -1 when present. The old
    # code zipped the non-outlier ids with the first rows, shifting every
    # similarity by one topic. Align ids and rows first, then drop -1.
    all_topics = sorted(int(t) for t in topic_info.Topic)
    if len(all_topics) != len(topic_embs):
        raise ValueError(
            f"Cannot align topics with embeddings: {len(all_topics)} topics "
            f"vs {len(topic_embs)} embedding rows."
        )
    keep_rows = [row for row, topic_id in enumerate(all_topics) if topic_id != -1]
    valid_topics = [all_topics[row] for row in keep_rows]
    valid_topic_embs = topic_embs[keep_rows]

    def nearest_topics_for_text(text, top_k=3):
        """Return up to `top_k` (topic_id, cosine_similarity) pairs for `text`.

        Pairs are sorted by similarity, highest first; the outlier topic -1
        is never returned. An empty list is returned if there are no topics.
        """
        if not valid_topics:
            return []
        emb = sbert.encode([text])
        sims = cosine_similarity(emb, valid_topic_embs)[0]
        pairs = sorted(zip(valid_topics, sims), key=lambda x: x[1], reverse=True)
        return pairs[:top_k]
else:
    # Topic embeddings do not depend on calculate_probabilities, so the old
    # hint to refit with that flag was misleading.
    print("topic_embeddings_ not available. Refit BERTopic with an embedding model or compute centroids manually.")
