# BERTopic

## 1. Setup

### Install Dependencies

In [1]:
!pip install -r requirements.txt

ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'

[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


### Config

In [2]:
# Folder that train_bertopic() saves each fitted model into (one entry per model name).
MODEL_FOLDER = "models"
# Dataset folder; not referenced by any cell visible in this notebook.
DATASET_FOLDER = "data"
# CSV log of trained model names and their training times; the benchmark loop
# reads it to skip configurations that were already trained.
MODEL_TRAINING_LOG = "training_log.csv"
# Result CSV; not referenced by any cell visible in this notebook.
RESULT_FILE = "result.csv"

### Common Imports

In [3]:
import pandas as pd
import os
import time

## 2. Data Preprocessing


### Data Loading

In [4]:
import random
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the SetFit/20_newsgroups dataset.
dataset = load_dataset("SetFit/20_newsgroups")

# Pair every training document with its human-readable label.
train_split = dataset["train"]
text_label = [
    (document, label_text)
    for document, label_text in zip(train_split["text"], train_split["label_text"])
]

# Seed the global RNG so the 10,000-document subsample is reproducible.
random.seed(42)
sampled_text_label = random.sample(text_label, 10000)

Repo card metadata block was not found. Setting CardData to empty.


### Clean Data


In [6]:
import re
def clean_for_embedding(text, max_sentences=5):
    """Reduce a raw newsgroup post to its first few informative sentences.

    Processing steps:
      1. Drop quoted reply lines (lines starting with ">") and header-like
         lines ("From:", "Subject:", "Organization:", "Lines:", "Writes:",
         "Article:", case-insensitive).
      2. Remove runs of three or more "!" / "?" characters.
      3. Collapse all whitespace to single spaces and trim the ends.
      4. Split into sentences after ".", "!" or "?" and discard fragments of
         15 characters or fewer as well as all-uppercase sentences.

    Args:
        text: Raw document text. ``None`` or an empty string yields ``""``.
        max_sentences: Maximum number of sentences kept (``None`` keeps all).

    Returns:
        The retained sentences joined by single spaces; may be an empty string.
    """
    # Nothing to clean: avoid an AttributeError on None and skip needless work.
    if not text:
        return ""

    header_pattern = r"^\s*(from|subject|organization|lines|writes|article)\s*:"
    kept_lines = [
        line
        for line in text.split("\n")
        if not line.strip().startswith(">")
        and not re.match(header_pattern, line, re.IGNORECASE)
    ]
    text = " ".join(kept_lines)

    # Strip the punctuation runs *before* normalising whitespace; doing it the
    # other way round leaves double or trailing spaces where a run was removed.
    text = re.sub(r"[!?]{3,}", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    sentences = [
        sentence
        for sentence in re.split(r'(?<=[.!?]) +', text)
        if len(sentence.strip()) > 15 and not sentence.strip().isupper()
    ]
    return " ".join(sentences[:max_sentences])

In [7]:
# Clean every sampled document; `labels` keeps the matching label at the same index.
texts_clean = list(map(clean_for_embedding, (doc for doc, _label in sampled_text_label)))
labels = [pair[1] for pair in sampled_text_label]

## 3. BERTopic Training

In [8]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
import torch

def train_bertopic(embedding_model, n_neighbors, n_components, min_cluster_size):
    """Fit one BERTopic pipeline on ``texts_clean`` and save it under ``MODEL_FOLDER``.

    Args:
        embedding_model: Name of the Sentence-Transformers model to load.
        n_neighbors: Value passed to ``UMAP(n_neighbors=...)``.
        n_components: Value passed to ``UMAP(n_components=...)``.
        min_cluster_size: Value passed to ``HDBSCAN(min_cluster_size=...)``.

    Returns:
        The model name
        ``"{embedding_model}_{n_neighbors}_{n_components}_{min_cluster_size}"``,
        which is also the name the model is saved under inside ``MODEL_FOLDER``.
    """
    cuda_available = torch.cuda.is_available()
    device = "cuda" if cuda_available else "cpu"
    model_name = f"{embedding_model}_{n_neighbors}_{n_components}_{min_cluster_size}"

    # Step 1 - Extract embeddings
    print(f"CUDA Available: {cuda_available}")
    # `device` must be passed by keyword: the second positional parameter of
    # SentenceTransformer is `modules`, so a positional device string never
    # reaches the device setting.
    sentence_model = SentenceTransformer(embedding_model, device=device)

    # Step 2 - Reduce dimensionality (fixed random_state for reproducible projections)
    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words="english")

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer()

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model
    representation_model = KeyBERTInspired()

    # All steps together
    topic_model = BERTopic(
        embedding_model=sentence_model,  # Step 1 - Extract embeddings
        umap_model=umap_model,  # Step 2 - Reduce dimensionality
        hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
        vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
        ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
        representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representations
    )

    # Fit on the notebook-level cleaned corpus; the returned topic assignments
    # and probabilities are not used here, so they are not bound to names.
    topic_model.fit_transform(texts_clean)

    topic_model.save(f"{MODEL_FOLDER}/{model_name}", serialization="pytorch")

    return model_name

## 4. Benchmarking

In [9]:
# Resume from the existing training log if there is one; otherwise start with
# an empty log and write it to disk right away.
if os.path.exists(MODEL_TRAINING_LOG):
    trained_models = pd.read_csv(MODEL_TRAINING_LOG)
else:
    trained_models = pd.DataFrame(columns=["model_name", "train_time"])
    trained_models.to_csv(MODEL_TRAINING_LOG, index=False)

In [10]:
# Hyperparameter grid explored by the benchmark (range upper bounds are exclusive).
embedding_models = ["all-mpnet-base-v2", "all-MiniLM-L6-v2"]
n_neighbors_range = list(range(5, 21))       # 5..20
n_components_range = list(range(5, 21))      # 5..20
min_cluster_size_range = list(range(5, 26))  # 5..25

In [11]:
# Walk the full hyperparameter grid and train only the configurations that are
# not yet recorded in the training log.
for embedding_model in embedding_models:
    for n_neighbors in n_neighbors_range:
        for n_components in n_components_range:
            for min_cluster_size in min_cluster_size_range:
                model_name = f"{embedding_model}_{n_neighbors}_{n_components}_{min_cluster_size}"

                # Guard clause: this configuration is already in the log.
                already_trained = model_name in trained_models["model_name"].values
                if already_trained:
                    print(f"{model_name} had already been trained")
                    continue

                # Train the model and measure the wall-clock duration.
                start_time = time.time()
                model_name = train_bertopic(embedding_model, n_neighbors, n_components, min_cluster_size)
                train_time = time.time() - start_time

                # Append to the log and rewrite the CSV after every model, so
                # finished models are already on disk if the run stops early.
                new_row = pd.DataFrame({"model_name": [model_name], "train_time": [train_time]})
                trained_models = pd.concat([trained_models, new_row], ignore_index=True)
                trained_models.to_csv(MODEL_TRAINING_LOG, index=False)

                # Print Status
                print(f"Trained {model_name} in {train_time}")

all-mpnet-base-v2_5_5_5 had already been trained
all-mpnet-base-v2_5_5_6 had already been trained
all-mpnet-base-v2_5_5_7 had already been trained
all-mpnet-base-v2_5_5_8 had already been trained
all-mpnet-base-v2_5_5_9 had already been trained
all-mpnet-base-v2_5_5_10 had already been trained
all-mpnet-base-v2_5_5_11 had already been trained
all-mpnet-base-v2_5_5_12 had already been trained
all-mpnet-base-v2_5_5_13 had already been trained
all-mpnet-base-v2_5_5_14 had already been trained
all-mpnet-base-v2_5_5_15 had already been trained
all-mpnet-base-v2_5_5_16 had already been trained
all-mpnet-base-v2_5_5_17 had already been trained
all-mpnet-base-v2_5_5_18 had already been trained
all-mpnet-base-v2_5_5_19 had already been trained
all-mpnet-base-v2_5_5_20 had already been trained
all-mpnet-base-v2_5_5_21 had already been trained
all-mpnet-base-v2_5_5_22 had already been trained
all-mpnet-base-v2_5_5_23 had already been trained
all-mpnet-base-v2_5_5_24 had already been trained
all-m

KeyboardInterrupt: 

## 5. Result (Best Model) - WIP

### Topic Info (Monogram)

In [None]:
# Load the model stored at "topic_model" and show its per-topic summary table.
# NOTE(review): `embedding_model` is not assigned in this cell; the only
# top-level assignment visible in this notebook is the benchmark loop variable
# (a model-name string) - confirm that is the intended value.
# NOTE(review): train_bertopic() saves under f"{MODEL_FOLDER}/{model_name}";
# confirm that "topic_model" is the intended path.
monogram_topic_model = BERTopic.load(
    "topic_model",
    embedding_model=embedding_model,
)
monogram_topic_model.get_topic_info()

### Topic Info (Multigram)

In [None]:
# Load a second copy of the saved model, so the monogram model stays untouched.
multigram_topic_model = BERTopic.load(
    "topic_model",
    embedding_model=embedding_model,
)

# Re-derive the topic representations from 2- and 3-word n-grams.
ngram_vectorizer = CountVectorizer(stop_words="english", ngram_range=(2, 3))
multigram_topic_model.update_topics(texts_clean, vectorizer_model=ngram_vectorizer)

multigram_topic_model.get_topic_info()

### Comparison

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

# Whitespace-tokenise every cleaned document.
# NOTE(review): these tokens keep their original case and attached punctuation,
# while the topic words come from a CountVectorizer - presumably lower-cased
# and punctuation-free - so the vocabulary filter below may drop some topic
# words. Confirm that this is acceptable for the comparison.
tokenized_texts = [doc.split() for doc in texts_clean]

# Vocabulary over the tokenised corpus.
dictionary = Dictionary(tokenized_texts)
known_tokens = dictionary.token2id

# Collect each topic's words (skipping the -1 topic), keeping only words that
# exist in the vocabulary and dropping topics that end up empty.
topics = []
for topic_id, words_probs in monogram_topic_model.get_topics().items():
    if topic_id == -1:
        continue
    in_vocab_words = [str(word) for word, _ in words_probs if str(word) in known_tokens]
    if in_vocab_words:
        topics.append(in_vocab_words)

# C_v coherence of the monogram topics against the tokenised corpus.
coherence_model = CoherenceModel(
    topics=topics,
    texts=tokenized_texts,
    dictionary=dictionary,
    coherence='c_v',
)

monogram_coherence = coherence_model.get_coherence()
print("Monogram C_v Coherence:", monogram_coherence)

In [None]:
# Whitespace-tokenise every cleaned document and build the vocabulary
# (same tokenisation as the monogram coherence cell, so the two scores are
# computed against the same corpus representation).
tokenized_texts = [doc.split() for doc in texts_clean]
dictionary = Dictionary(tokenized_texts)

# Topics have to be split into singular words: each n-gram is broken into its
# component words. Compared with a plain `sum(lists, [])` flatten this
#   * flattens with a nested comprehension instead of repeated list concatenation,
#   * keeps only words present in the vocabulary (as the monogram cell does),
#   * removes repeated words within a topic while preserving their order
#     (overlapping n-grams such as "a b" / "b c" would otherwise repeat "b").
topics = []
for topic_id in multigram_topic_model.get_topics():
    if topic_id == -1:
        continue
    flattened_words = [
        word
        for ngram, _ in multigram_topic_model.get_topic(topic_id)
        for word in str(ngram).split()
        if word in dictionary.token2id
    ]
    unique_words = list(dict.fromkeys(flattened_words))
    # Remove empty topics (just in case)
    if unique_words:
        topics.append(unique_words)

coherence_model = CoherenceModel(
    topics=topics,
    texts=tokenized_texts,
    dictionary=dictionary,
    coherence='c_v'
)

multigram_coherence = coherence_model.get_coherence()
print("Multigram C_v Coherence:", multigram_coherence)

## 6. Using LLM to Improve Representation (WIP)

In [None]:
import os
import openai
from dotenv import load_dotenv
from bertopic.representation import OpenAI

# Load variables from .env file
load_dotenv()

# Build the OpenAI client with the key read from the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# NOTE(review): `topic_model` is not assigned at top level in any cell visible
# in this notebook (it only exists as a local inside train_bertopic) - confirm
# which loaded model this cell is meant to update before running it.
topic_model.update_topics(texts_clean, representation_model=OpenAI(client, model="gpt-4o-mini", delay_in_seconds=3))
# Last expression of the cell: displays the updated topic table.
topic_model.get_topic_info()