# BerTopic

## Install Dependencies

In [1]:
!pip install -r requirements.txt


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Loading Dataset


In [2]:
import random
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset("SetFit/20_newsgroups")
random.seed(42)
text_label = list(zip(dataset["train"]["text"], dataset["train"]["label_text"]))
sampled_text_label = random.sample(text_label, 10000)

Repo card metadata block was not found. Setting CardData to empty.


## Clean Data


In [4]:
import re
def clean_for_embedding(text, max_sentences=5):
    lines = text.split("\n")
    lines = [line for line in lines if not line.strip().startswith(">")]
    lines = [line for line in lines if not re.match(r"^\s*(from|subject|organization|lines|writes|article)\s*:", line, re.IGNORECASE)]
    text = " ".join(lines)
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"[!?]{3,}", "", text)
    sentence_split = re.split(r'(?<=[.!?]) +', text)
    sentence_split = [
        s for s in sentence_split
        if len(s.strip()) > 15 and not s.strip().isupper()
      ]
    return " ".join(sentence_split[:max_sentences])
texts_clean = [clean_for_embedding(text) for text,_ in sampled_text_label]
labels = [label for _, label in sampled_text_label]

## BerTopic

In [5]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model, # Step 1 - Extract embeddings
    umap_model=umap_model, # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model, # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model, # Step 5 - Extract topic words
    representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)
topics, probs = topic_model.fit_transform(texts_clean)

topic_model.save("topic_model", serialization="pytorch")

## Result

## Topic Info (Monogram)

In [6]:
monogram_topic_model = BERTopic.load("topic_model")
monogram_topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3456,-1_different_version_better_does,"[different, version, better, does, doesn, run,...",
1,0,940,0_det_think_stl_pts,"[det, think, stl, pts, win, gm, van, vs, leafs...",
2,1,363,1_funding_contest_gehrels_money,"[funding, contest, gehrels, money, center, bas...",
3,2,332,2_wanted_just__,"[wanted, just, , , , , , , , ]",
4,3,316,3_say_doesn_does_s1,"[say, doesn, does, s1, scheme, s2, details, pr...",
...,...,...,...,...,...
79,78,17,78_informational_ila_officer_impede,"[informational, ila, officer, impede, illinois...",
80,79,16,79_sifting_usnail_vigilanties_cbw,"[sifting, usnail, vigilanties, cbw, protect, f...",
81,80,16,80_format_dlg_nfotis_digital,"[format, dlg, nfotis, digital, files, author, ...",
82,81,16,81_680x0_powerpcs_powerpc_computercity,"[680x0, powerpcs, powerpc, computercity, vram,...",


## Topic Info (Multigram)

In [7]:
multigram_topic_model = BERTopic.load("topic_model")
multigram_topic_model.update_topics(texts_clean, vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(2,3)))
multigram_topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3456,-1_don know_does know_years ago_thanks advance,"[don know, does know, years ago, thanks advanc...",
1,0,940,0_00 00_new york_00 00 00_st john,"[00 00, new york, 00 00 00, st john, los angel...",
2,1,363,1_space station_sci space_archive space_modifi...,"[space station, sci space, archive space, modi...",
3,2,332,2_just wanted know_wanted know_just wanted_,"[just wanted know, wanted know, just wanted, ,...",
4,3,316,3_clipper chip_serial number_escrow houses_s1 s2,"[clipper chip, serial number, escrow houses, s...",
...,...,...,...,...,...
79,78,17,78_concealed carry_carry concealed_concealed w...,"[concealed carry, carry concealed, concealed w...",
80,79,16,79_weapons mass destruction_weapons mass_mass ...,"[weapons mass destruction, weapons mass, mass ...",
81,80,16,80_site maps_map thanks_surf gif_cross streets,"[site maps, map thanks, surf gif, cross street...",
82,81,16,81_lc iii_nubus card_slot pds_lc lc,"[lc iii, nubus card, slot pds, lc lc, pds slot...",


## Comparison

In [8]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

# Tokenize Document
tokenized_texts = [doc.split() for doc in texts_clean]

# Create Dictionary
dictionary = Dictionary(tokenized_texts)

# Extract Topics
topics = [ [word for word, _ in monogram_topic_model.get_topic(topic)] 
           for topic in monogram_topic_model.get_topics().keys()
           if topic != -1 ]  # skip outliers (-1)

# Compute Coherence
coherence_model = CoherenceModel(
    topics=topics,
    texts=tokenized_texts,
    dictionary=dictionary,
    coherence='c_v'
)

monogram_coherence = coherence_model.get_coherence()
print("Monogram C_v Coherence:", monogram_coherence)

Monogram C_v Coherence: 0.32972380315451427


In [9]:
tokenized_texts = [doc.split() for doc in texts_clean]
dictionary = Dictionary(tokenized_texts)

# Topics have to be split into singular words
topics = [
    sum([word.split() for word, _ in multigram_topic_model.get_topic(topic)], [])
    for topic in multigram_topic_model.get_topics().keys()
    if topic != -1
]

coherence_model = CoherenceModel(
    topics=topics,
    texts=tokenized_texts,
    dictionary=dictionary,
    coherence='c_v'
)

multigram_coherence = coherence_model.get_coherence()
print("Multigram C_v Coherence:", multigram_coherence)

Multigram C_v Coherence: 0.4175222509377205


## Using LLM to Improve Representation (WIP)

In [10]:
import os
import openai
from dotenv import load_dotenv
from bertopic.representation import OpenAI

# Load variables from .env file
load_dotenv()

client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
topic_model.update_topics(texts_clean, representation_model=OpenAI(client, model="gpt-4o-mini", delay_in_seconds=3))
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3456,-1_Gun control debate,[Gun control debate],"[Concerning the proposed newsgroup split, I pe..."
1,0,940,0_Hockey Game Statistics,[Hockey Game Statistics],[---------------------------------------------...
2,1,363,1_Space exploration challenges,[Space exploration challenges],[NASA Headquarters distributed the following p...
3,2,332,2_Uncertainty and Inquiry,[Uncertainty and Inquiry],"[, , ]"
4,3,316,3_Clipper chip encryption,[Clipper chip encryption],[One more time... If they released the algorit...
...,...,...,...,...,...
79,78,17,78_Gun carry laws,[Gun carry laws],"[Carrying a pistol, loaded or unloaded, in the..."
80,79,16,79_Political influence of gun organizations,[Political influence of gun organizations],[[This is a co-authored report from two of us ...
81,80,16,80_Graphics and Image Data,[Graphics and Image Data],"[Hello, I've been trying to bump map a gif ont..."
82,81,16,81_PDS and NuBus compatibility,[PDS and NuBus compatibility],[Forgive me if this has been asked before... b...
