# BERTopic

## Install Dependencies

In [1]:
!pip install -r requirements.txt


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Loading Dataset


In [2]:
import random
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the 20 Newsgroups corpus and work with its training split only.
dataset = load_dataset("SetFit/20_newsgroups")
train_split = dataset["train"]

# Pair each post with its `label_text` value so both are sampled together.
text_label = list(zip(train_split["text"], train_split["label_text"]))

# Seed the global RNG right before drawing so the 10,000-document subset is
# the same on every run.
random.seed(42)
sampled_text_label = random.sample(text_label, 10000)

Repo card metadata block was not found. Setting CardData to empty.


## Clean Data


In [4]:
import re
def clean_for_embedding(text: str, max_sentences: int = 5) -> str:
    """Reduce a raw newsgroup post to a short snippet suitable for embedding.

    Processing steps, in order:

    1. Drop quoted-reply lines (those whose first non-blank character is ``>``).
    2. Drop header-like lines that start with ``from:``, ``subject:``,
       ``organization:``, ``lines:``, ``writes:`` or ``article:``
       (case-insensitive, leading whitespace allowed).
    3. Join the remaining lines, remove runs of three or more ``!``/``?``
       characters, then collapse all whitespace to single spaces.
    4. Split into sentences after ``.``, ``!`` or ``?`` followed by spaces, and
       discard fragments of 15 characters or fewer as well as fragments that
       are entirely uppercase.

    Args:
        text: Raw document text, possibly spanning several lines.
        max_sentences: Maximum number of surviving sentences to keep.

    Returns:
        The first ``max_sentences`` surviving sentences joined by single
        spaces; an empty string when nothing survives the filters.
    """
    header_pattern = r"^\s*(from|subject|organization|lines|writes|article)\s*:"
    kept_lines = [
        line
        for line in text.split("\n")
        if not line.strip().startswith(">")
        and not re.match(header_pattern, line, re.IGNORECASE)
    ]
    text = " ".join(kept_lines)
    # Strip punctuation runs before normalising whitespace; otherwise the gap
    # left by a removed run shows up as a double or trailing space.
    text = re.sub(r"[!?]{3,}", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    sentences = [
        s
        for s in re.split(r'(?<=[.!?]) +', text)
        if len(s.strip()) > 15 and not s.strip().isupper()
    ]
    return " ".join(sentences[:max_sentences])
# Each sampled item is a (text, label) pair; clean the text and keep the label
# so that texts_clean[i] and labels[i] refer to the same post.
texts_clean = [clean_for_embedding(pair[0]) for pair in sampled_text_label]
labels = [pair[1] for pair in sampled_text_label]

## BERTopic

In [5]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
print(f"CUDA Available: {cuda_available}")

# Step 1 - Extract embeddings
# `device` is passed by keyword on purpose: the second positional parameter of
# SentenceTransformer is `modules`, not `device`.
# (An earlier experiment used the "all-MiniLM-L6-v2" checkpoint instead.)
embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)

# Step 2 - Reduce dimensionality
# random_state is fixed so the projection is repeatable between runs.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representations
)

# Fit on the cleaned documents; `topics` holds one topic assignment per document.
topics, probs = topic_model.fit_transform(texts_clean)

# Persist the fitted model as "topic_model"; later cells reload it with BERTopic.load.
topic_model.save("topic_model", serialization="pytorch")

CUDA Available: True


## Result

## Topic Info (Monogram)

In [6]:
# Reload the model saved as "topic_model" and display its per-topic summary
# (single-word representations).
monogram_topic_model = BERTopic.load("topic_model")
monogram_topic_info = monogram_topic_model.get_topic_info()
monogram_topic_info



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3038,-1_graphics_heard_version_display,"[graphics, heard, version, display, looking, a...",
1,0,992,0_toyota_ford_didn_don,"[toyota, ford, didn, don, did, think, good, do...",
2,1,412,1_announcement_possible_patent_new,"[announcement, possible, patent, new, make, us...",
3,2,394,2_det_82_81_60,"[det, 82, 81, 60, 73, 58, nhl, van, 78, 72]",
4,3,367,3_offensive_fan_east_don,"[offensive, fan, east, don, hit, houston, defe...",
...,...,...,...,...,...
71,70,17,70_premises_discussion_argue_evolutionism,"[premises, discussion, argue, evolutionism, th...",
72,71,16,71_operators_results_announcing_previdi,"[operators, results, announcing, previdi, aren...",
73,72,15,72_motif_odd_translations_translation,"[motif, odd, translations, translation, insert...",
74,73,15,73_0f000_supposedly_0a000_yielded,"[0f000, supposedly, 0a000, yielded, 0e000, res...",


## Topic Info (Multigram)

In [7]:
# Start from a fresh copy of the saved model so the single-word model above is untouched.
multigram_topic_model = BERTopic.load("topic_model")

# Recompute the topic representations using 2- and 3-word phrases instead of single words.
bigram_trigram_vectorizer = CountVectorizer(stop_words="english", ngram_range=(2, 3))
multigram_topic_model.update_topics(texts_clean, vectorizer_model=bigram_trigram_vectorizer)

multigram_topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3038,-1_don know_does know_years ago_thanks advance,"[don know, does know, years ago, thanks advanc...",
1,0,992,0_send requests_000 miles_don know_new car,"[send requests, 000 miles, don know, new car, ...",
2,1,412,1_clipper chip_serial number_law enforcement_p...,"[clipper chip, serial number, law enforcement,...",
3,2,394,2_st john_cape breton_pts pt_maple leafs,"[st john, cape breton, pts pt, maple leafs, 15...",
4,3,367,3_00 00_00 00 00_01 00_00 01,"[00 00, 00 00 00, 01 00, 00 01, red sox, 00 00...",
...,...,...,...,...,...
71,70,17,70_assume god just_assume god_did create_just ...,"[assume god just, assume god, did create, just...",
72,71,16,71_junk mail_improper etiquette_printing busin...,"[junk mail, improper etiquette, printing busin...",
73,72,15,72_menu accelerators_f1 key_arrow keys_string ...,"[menu accelerators, f1 key, arrow keys, string...",
74,73,15,73_version winbench_stealth 24_revision board_...,"[version winbench, stealth 24, revision board,...",


## Comparison

In [8]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

# Whitespace-tokenize the cleaned documents and build the gensim vocabulary from them.
# NOTE(review): topic words come from a CountVectorizer while the reference
# corpus is split on whitespace only; words absent from the dictionary are
# dropped below — confirm this tokenization mismatch is acceptable.
tokenized_texts = [doc.split() for doc in texts_clean]
dictionary = Dictionary(tokenized_texts)

# Collect the words of every real topic (the -1 outlier topic is skipped),
# keeping only words the dictionary knows and discarding topics left empty.
known_tokens = dictionary.token2id
topics = []
for topic_id, words_probs in monogram_topic_model.get_topics().items():
    if topic_id == -1:
        continue
    in_vocab = [str(word) for word, _ in words_probs if str(word) in known_tokens]
    if in_vocab:
        topics.append(in_vocab)

# Score the topics with the C_v coherence measure.
coherence_model = CoherenceModel(
    coherence='c_v',
    dictionary=dictionary,
    texts=tokenized_texts,
    topics=topics,
)
monogram_coherence = coherence_model.get_coherence()

print("Monogram C_v Coherence:", monogram_coherence)

Monogram C_v Coherence: 0.34327752059953076


In [9]:
# Whitespace-tokenize the cleaned documents and build the gensim vocabulary from them.
tokenized_texts = list(map(str.split, texts_clean))
dictionary = Dictionary(tokenized_texts)

# Each topic entry is a multi-word phrase, so break every phrase into its
# individual words and flatten them into one list per topic. The -1 outlier
# topic is skipped and topics that end up empty are discarded.
# NOTE(review): overlapping phrases can repeat a word within one topic —
# confirm duplicates are acceptable for the C_v score.
topics = []
for topic_id in multigram_topic_model.get_topics():
    if topic_id == -1:
        continue
    flattened = [
        token
        for phrase, _ in multigram_topic_model.get_topic(topic_id)
        for token in phrase.split()
    ]
    if flattened:
        topics.append(flattened)

# Score the topics with the C_v coherence measure.
coherence_model = CoherenceModel(
    coherence='c_v',
    dictionary=dictionary,
    texts=tokenized_texts,
    topics=topics,
)
multigram_coherence = coherence_model.get_coherence()

print("Multigram C_v Coherence:", multigram_coherence)

Multigram C_v Coherence: 0.4047546849501282


## Using LLM to Improve Representation (WIP)

In [10]:
import os
import openai
from dotenv import load_dotenv
from bertopic.representation import OpenAI

# Read environment variables (including OPENAI_API_KEY) from a local .env file
# so the key never appears in the notebook itself.
load_dotenv()

client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Ask gpt-4o-mini to produce the topic representations.
# NOTE(review): delay_in_seconds presumably spaces out requests to stay under
# API rate limits — confirm against the BERTopic docs.
llm_representation = OpenAI(client, model="gpt-4o-mini", delay_in_seconds=3)

# This updates the in-memory `topic_model` fitted earlier, in place.
topic_model.update_topics(texts_clean, representation_model=llm_representation)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3038,-1_Religion and navigation,[Religion and navigation],[Title just 'bout says it all: Grasshopper Rem...
1,0,992,0_Motorcycle purchasing advice,[Motorcycle purchasing advice],[I'm new to motorcycles so no flames please. I...
2,1,412,1_Clipper encryption system,[Clipper encryption system],[One more time... If they released the algorit...
3,2,394,2_NHL Player Statistics,[NHL Player Statistics],[Here is the price list for the week April 13 ...
4,3,367,3_MLB Standings Updates,[MLB Standings Updates],"[MLB Standings and Scores for Tuesday, April 6..."
...,...,...,...,...,...
71,70,17,70_Existence of the Universe,[Existence of the Universe],[= = : [ The discussion begins: why does the u...
72,71,16,71_Usenet Advertising Etiquette,[Usenet Advertising Etiquette],"[""Jack Previdi"" <p00020@psilink.com> writes, i..."
73,72,15,72_Keyboard Event Handling,[Keyboard Event Handling],"[Unfortunately, the key event handling is pret..."
74,73,15,73_Video Card Benchmarks,[Video Card Benchmarks],[On ftp.cica.indiana.edu in pub/pc/win3/misc/w...
