# BerTopic

## Install Dependencies

In [1]:
!pip install -r requirements.txt


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Loading Dataset


In [2]:
import random
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset("SetFit/20_newsgroups")
random.seed(42)
text_label = list(zip(dataset["train"]["text"], dataset["train"]["label_text"]))
sampled_text_label = random.sample(text_label, 500)

Repo card metadata block was not found. Setting CardData to empty.


## Clean Data


In [4]:
import re
def clean_for_embedding(text, max_sentences=5):
    lines = text.split("\n")
    lines = [line for line in lines if not line.strip().startswith(">")]
    lines = [line for line in lines if not re.match(r"^\s*(from|subject|organization|lines|writes|article)\s*:", line, re.IGNORECASE)]
    text = " ".join(lines)
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"[!?]{3,}", "", text)
    sentence_split = re.split(r'(?<=[.!?]) +', text)
    sentence_split = [
        s for s in sentence_split
        if len(s.strip()) > 15 and not s.strip().isupper()
      ]
    return " ".join(sentence_split[:max_sentences])
texts_clean = [clean_for_embedding(text) for text,_ in sampled_text_label]
labels = [label for _, label in sampled_text_label]

## BerTopic

In [5]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()
# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()
# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model, # Step 1 - Extract embeddings
    umap_model=umap_model, # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model, # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model, # Step 5 - Extract topic words
    representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)
topics, probs = topic_model.fit_transform(texts_clean)

In [6]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,84,-1_tc_vga_opinion_clock,"[tc, vga, opinion, clock, unless, col, probabl...","[# #From the Santa Rosa (Cal.) Press-Democrat,..."
1,0,86,0_xterm_sun_doesn_does,"[xterm, sun, doesn, does, floppy, truetype, te...",[one way to get the system going with one flop...
2,1,68,1_hell_sets_jesus_nice,"[hell, sets, jesus, nice, right, expect, belie...","[Pardon me for being a little confused, but at..."
3,2,59,2_window_1993_performance_status,"[window, 1993, performance, status, various, y...",[You'll have fun looking for the rear-end gear...
4,3,57,3_fan_fans_exact_said,"[fan, fans, exact, said, jets, say, news, game...",[Can someone send me ticket ordering informati...
5,4,27,4_unprecedented_reality_eugenic_mainstream,"[unprecedented, reality, eugenic, mainstream, ...","[From Israel Line, Thursday, April 22, 1993: T..."
6,5,22,5_traffic_government_navy_wiretap,"[traffic, government, navy, wiretap, public, p...",[Most of the content of the White House announ...
7,6,19,6_ethically_certainly_commercial_individuals,"[ethically, certainly, commercial, individuals...",[Better idea for use of NASA Shuttle Astronaut...
8,7,19,7_use_disk_netx_support,"[use, disk, netx, support, read, section, vfin...","[DOS 5.0/6.0 cannot read the NTFS file system,..."
9,8,18,8_yeah_buy_new_does,"[yeah, buy, new, does, better, doesn, worthles...",[Essential tremor is a progressive hereditary ...


In [7]:
topic_model.update_topics(texts_clean, vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(2,3)))
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,84,-1_edu au_day wrap_stuff deleted_canberra edu au,"[edu au, day wrap, stuff deleted, canberra edu...","[# #From the Santa Rosa (Cal.) Press-Democrat,..."
1,0,86,0_dist p_c_distsq p_c_tek term_startup screen,"[dist p_c, distsq p_c, tek term, startup scree...",[one way to get the system going with one flop...
2,1,68,1_dead sea scrolls_sea scrolls_dead sea_prince...,"[dead sea scrolls, sea scrolls, dead sea, prin...","[Pardon me for being a little confused, but at..."
3,2,59,2_don think_car driver_road track_harley riders,"[don think, car driver, road track, harley rid...",[You'll have fun looking for the rear-end gear...
4,3,57,3_games pick_exact games_pick exact_pick exact...,"[games pick, exact games, pick exact, pick exa...",[Can someone send me ticket ordering informati...
5,4,27,4_world war_hairenik weekly_pray pray_history ...,"[world war, hairenik weekly, pray pray, histor...","[From Israel Line, Thursday, April 22, 1993: T..."
6,5,22,5_clipper chip_government stop_commit criminal...,"[clipper chip, government stop, commit crimina...",[Most of the content of the White House announ...
7,6,19,6_answers questions_groups individuals_drag fr...,"[answers questions, groups individuals, drag f...",[Better idea for use of NASA Shuttle Astronaut...
8,7,19,7_address space_file manager_config sys_window...,"[address space, file manager, config sys, wind...","[DOS 5.0/6.0 cannot read the NTFS file system,..."
9,8,18,8_anecdotal evidence_cadre dsl_cadre dsl pitt_...,"[anecdotal evidence, cadre dsl, cadre dsl pitt...",[Essential tremor is a progressive hereditary ...
