# BERTopic

## 1. Setup

### Install Dependencies

In [1]:
!pip install -r requirements.txt

ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'

[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


### Config

In [2]:
# Folder that trained BERTopic models are saved to and loaded from.
MODEL_FOLDER = "models"
# Name of the best model's folder — presumably inside MODEL_FOLDER; TODO confirm.
BEST_MODEL_NAME = "best_model"
# NOTE(review): no visible cell reads this constant — confirm it is still needed.
DATASET_FOLDER = "data"
# Folder holding the pickled tokenized texts, gensim dictionary and cleaned texts used for evaluation.
TEST_DATA_FOLDER = "test_data"
# CSV log of trained models (columns: model_name, train_time).
MODEL_TRAINING_LOG = "training_log.csv"
# CSV log of evaluation results (hyperparameters, coherence scores and timings per model).
MODEL_EVALUATION_LOG = "eval_log.csv"
# NOTE(review): no visible cell reads this constant — confirm intended use.
RESULT_FILE = "result.csv"

### Common Imports

In [3]:
import pandas as pd
import os
import time
import pickle
import re

### Packages for BERTopic

In [4]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
import torch

# Run the embedding models on the GPU when PyTorch reports CUDA is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

  from .autonotebook import tqdm as notebook_tqdm


### Packages for Gensim Coherence Score

In [5]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

## 2. Data Preprocessing


### Data Loading

In [6]:
import random
from datasets import load_dataset

In [7]:
# Load the 20 Newsgroups corpus and draw a reproducible random subset of
# (text, label) pairs from its training split.
SAMPLE_SEED = 42
SAMPLE_SIZE = 10000

dataset = load_dataset("SetFit/20_newsgroups")
train_split = dataset["train"]
text_label = list(zip(train_split["text"], train_split["label_text"]))

random.seed(SAMPLE_SEED)
# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of available documents.
sampled_text_label = random.sample(text_label, min(SAMPLE_SIZE, len(text_label)))

Repo card metadata block was not found. Setting CardData to empty.


### Clean Data


In [8]:
import re
def clean_for_embedding(text, max_sentences=5):
    """Strip quoted/header lines and return the first few usable sentences.

    Lines that start with ">" (after stripping) and lines that begin with one
    of the keywords from/subject/organization/lines/writes/article followed
    by a colon are dropped. The remaining text is collapsed to single spaces,
    runs of three or more "!"/"?" are removed, and the text is split into
    sentences after ".", "!" or "?". Sentences of 15 characters or fewer, or
    written entirely in upper case, are discarded.

    Args:
        text: Raw document text.
        max_sentences: Maximum number of surviving sentences to keep.

    Returns:
        The kept sentences joined by single spaces (possibly an empty string).
    """
    header_pattern = r"^\s*(from|subject|organization|lines|writes|article)\s*:"

    kept_lines = []
    for raw_line in text.split("\n"):
        if raw_line.strip().startswith(">"):
            continue
        if re.match(header_pattern, raw_line, re.IGNORECASE):
            continue
        kept_lines.append(raw_line)

    flattened = re.sub(r"\s+", " ", " ".join(kept_lines)).strip()
    flattened = re.sub(r"[!?]{3,}", "", flattened)

    kept_sentences = []
    for sentence in re.split(r'(?<=[.!?]) +', flattened):
        stripped = sentence.strip()
        if len(stripped) <= 15 or stripped.isupper():
            continue
        kept_sentences.append(sentence)

    return " ".join(kept_sentences[:max_sentences])

In [9]:
# Split the sampled (text, label) pairs into a list of cleaned texts and a
# parallel list of labels.
texts_clean = [clean_for_embedding(pair[0]) for pair in sampled_text_label]
labels = [pair[1] for pair in sampled_text_label]

## 3. BERTopic Training

In [10]:
def train_bertopic(embedding_model_name, n_neighbors, n_components, min_cluster_size,
                   embedding_model=None, docs=None, embeddings=None):
    """Fit a BERTopic model for one hyperparameter combination and save it.

    Args:
        embedding_model_name: SentenceTransformer model name. Used to load the
            embedding model when ``embedding_model`` is not given, and as the
            prefix of the saved model's name.
        n_neighbors: ``n_neighbors`` passed to UMAP.
        n_components: ``n_components`` passed to UMAP.
        min_cluster_size: ``min_cluster_size`` passed to HDBSCAN.
        embedding_model: Optional already-loaded embedding model, so callers
            training many configurations can load it once.
        docs: Documents to fit on. Defaults to the notebook-level
            ``texts_clean`` so existing calls keep working.
        embeddings: Optional precomputed document embeddings forwarded to
            ``BERTopic.fit_transform``; when None, BERTopic embeds ``docs``
            itself.

    Returns:
        The model name
        ``"{embedding_model_name}_{n_neighbors}_{n_components}_{min_cluster_size}"``,
        which is also the folder name under ``MODEL_FOLDER`` the model is saved to.
    """
    print(f"CUDA Available: {torch.cuda.is_available()}")

    if docs is None:
        docs = texts_clean

    # Step 1 - Extract embeddings
    if embedding_model is None:
        embedding_model = SentenceTransformer(embedding_model_name, device=DEVICE)

    model_name = f"{embedding_model_name}_{n_neighbors}_{n_components}_{min_cluster_size}"

    # Step 2 - Reduce dimensionality (fixed random_state keeps runs repeatable)
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.0, metric='cosine', random_state=42)

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words="english")

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer()

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model
    representation_model = KeyBERTInspired()

    # All steps together
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
    )
    # The returned topic assignments and probabilities are not needed here;
    # the fitted model is persisted and evaluated later.
    topic_model.fit_transform(docs, embeddings=embeddings)

    topic_model.save(os.path.join(MODEL_FOLDER, model_name), serialization="pytorch")

    return model_name

## 4. Benchmarking

In [11]:
# Resume from the training log when one exists; otherwise start an empty log
# and write it to disk straight away.
if os.path.exists(MODEL_TRAINING_LOG):
    trained_models = pd.read_csv(MODEL_TRAINING_LOG)
else:
    trained_models = pd.DataFrame(columns=["model_name","train_time"])
    trained_models.to_csv(MODEL_TRAINING_LOG, index=False)

In [12]:
# Values to test for
# The training loop iterates over every combination of the four lists below
# (2 * 5 * 5 * 6 = 300 configurations).
# SentenceTransformer models used to embed the documents.
embedding_model_names = ["all-mpnet-base-v2","all-MiniLM-L6-v2"]
# Values for UMAP's `n_neighbors`.
n_neighbors_range = [5, 10, 15, 25, 50]
# Values for UMAP's `n_components`.
n_components_range = [5, 10, 15, 20, 25]
# Values for HDBSCAN's `min_cluster_size`.
min_cluster_size_range = [5, 10, 15, 20, 30, 50]

In [13]:
# Grid search: train one BERTopic model per hyperparameter combination,
# skipping combinations that are already recorded in the training log.
for embedding_model_name in embedding_model_names:
    # Load each embedding model once and share it across its whole sub-grid.
    embedding_model = SentenceTransformer(embedding_model_name, device=DEVICE)
    for n_neighbors in n_neighbors_range:
        for n_components in n_components_range:
            for min_cluster_size in min_cluster_size_range:
                model_name = f"{embedding_model_name}_{n_neighbors}_{n_components}_{min_cluster_size}"
                if model_name in trained_models["model_name"].values:
                    print(f"{model_name} had already been trained")
                    continue

                start_time = time.time()
                model_name = train_bertopic(embedding_model_name,n_neighbors,n_components,min_cluster_size,embedding_model)
                train_time = time.time() - start_time

                # Rewrite the log after every model so an interrupted run can resume.
                new_row = pd.DataFrame({"model_name": [model_name],"train_time":[train_time]})
                trained_models = pd.concat([trained_models, new_row], ignore_index=True)
                trained_models.to_csv(MODEL_TRAINING_LOG, index=False)

                print(f"Trained {model_name} in {train_time}")

all-mpnet-base-v2_5_5_5 had already been trained
all-mpnet-base-v2_5_5_10 had already been trained
all-mpnet-base-v2_5_5_15 had already been trained
all-mpnet-base-v2_5_5_20 had already been trained
all-mpnet-base-v2_5_5_30 had already been trained
all-mpnet-base-v2_5_5_50 had already been trained
all-mpnet-base-v2_5_10_5 had already been trained
all-mpnet-base-v2_5_10_10 had already been trained
all-mpnet-base-v2_5_10_15 had already been trained
all-mpnet-base-v2_5_10_20 had already been trained
all-mpnet-base-v2_5_10_30 had already been trained
all-mpnet-base-v2_5_10_50 had already been trained
all-mpnet-base-v2_5_15_5 had already been trained
all-mpnet-base-v2_5_15_10 had already been trained
all-mpnet-base-v2_5_15_15 had already been trained
all-mpnet-base-v2_5_15_20 had already been trained
all-mpnet-base-v2_5_15_30 had already been trained
all-mpnet-base-v2_5_15_50 had already been trained
all-mpnet-base-v2_5_20_5 had already been trained
all-mpnet-base-v2_5_20_10 had already bee

## 5. Evaluation of Models

In [14]:
# Functions for getting coherence score

def monogram_coherence_score(model,embedding_model,tokenized_texts,dictionary):
    """Compute the C_v coherence of a saved BERTopic model's single-word topics.

    Args:
        model: Path or identifier passed to ``BERTopic.load``.
        embedding_model: Embedding model passed to ``BERTopic.load``.
        tokenized_texts: Tokenized documents passed to gensim as ``texts``.
        dictionary: gensim ``Dictionary``; topic words that are not in its
            ``token2id`` mapping are dropped before scoring.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` (``coherence='c_v'``),
        or NaN when no topic has any word left after filtering.
    """
    monogram_topic_model = BERTopic.load(model,embedding_model=embedding_model)

    # Keep only topic words known to the dictionary; topic -1 is skipped
    # (BERTopic uses -1 for outlier documents).
    topics = []
    for topic_id, words_probs in monogram_topic_model.get_topics().items():
        if topic_id == -1:
            continue
        words = [str(word) for word, _ in words_probs if str(word) in dictionary.token2id]
        if words:
            topics.append(words)

    # Nothing to score: report NaN instead of handing gensim an empty topic list.
    if not topics:
        return float("nan")

    coherence_model = CoherenceModel(
        topics=topics,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v'
    )

    return coherence_model.get_coherence()

def multigram_coherence_score(model,embedding_model,tokenized_texts,dictionary,texts_clean):
    """Compute the C_v coherence of a saved BERTopic model's 2-3-gram topics.

    The model is loaded and its topic representations are recomputed with a
    ``CountVectorizer(stop_words="english", ngram_range=(2,3))``. Each n-gram
    is then split back into single words, because the dictionary and
    ``tokenized_texts`` are word-level.

    Args:
        model: Path or identifier passed to ``BERTopic.load``.
        embedding_model: Embedding model passed to ``BERTopic.load``.
        tokenized_texts: Tokenized documents passed to gensim as ``texts``.
        dictionary: gensim ``Dictionary``; words that are not in its
            ``token2id`` mapping are dropped before scoring, matching
            ``monogram_coherence_score``.
        texts_clean: Documents passed to ``update_topics``.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` (``coherence='c_v'``),
        or NaN when no topic has any word left after filtering.
    """
    multigram_topic_model = BERTopic.load(model,embedding_model=embedding_model)
    multigram_topic_model.update_topics(texts_clean, vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(2,3)))

    # Topics have to be split into singular words. Words unknown to the
    # dictionary are dropped so gensim never sees a token it cannot map;
    # topic -1 is skipped (BERTopic uses -1 for outlier documents).
    topics = []
    for topic_id, words_probs in multigram_topic_model.get_topics().items():
        if topic_id == -1:
            continue
        words = [
            token
            for phrase, _ in words_probs
            for token in str(phrase).split()
            if token in dictionary.token2id
        ]
        if words:
            topics.append(words)

    # Nothing to score: report NaN instead of handing gensim an empty topic list.
    if not topics:
        return float("nan")

    coherence_model = CoherenceModel(
        topics=topics,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v'
    )

    return coherence_model.get_coherence()



In [15]:
# Helper function for extracting hyperparameter from model name
def parse_model_name(model_name):
    """Split a saved model's name back into its hyperparameters.

    Model names have the form
    ``<embedding_model_name>_<n_neighbors>_<n_components>_<min_cluster_size>``;
    the embedding model name may itself contain underscores.

    Args:
        model_name: Name to parse.

    Returns:
        A tuple ``(embedding_model_name, n_neighbors, n_components,
        min_cluster_size)`` with the three numeric fields converted to int.

    Raises:
        ValueError: If ``model_name`` does not end in three underscore-separated
            integers preceded by a non-empty prefix.
    """
    parsed = re.match(r"(.+)_(\d+)_(\d+)_(\d+)$", model_name)
    if parsed is None:
        raise ValueError(f"Invalid model name format: {model_name}")

    # First group is the embedding model name; the remaining three are numeric.
    embedding_name, *numeric_fields = parsed.groups()
    return (embedding_name, *map(int, numeric_fields))

In [16]:
# Create or get test_data

os.makedirs(TEST_DATA_FOLDER, exist_ok=True)

tokenized_texts_file = os.path.join(TEST_DATA_FOLDER, "tokenized_texts.pkl")
dictionary_file = os.path.join(TEST_DATA_FOLDER, "dictionary.pkl")
texts_clean_file = os.path.join(TEST_DATA_FOLDER, "texts_clean.pkl")

# Reuse the cache only when all three files are present. Checking just two of
# them would crash on a missing texts_clean.pkl and could pair a stale cache
# with freshly cleaned texts.
cache_files = (tokenized_texts_file, dictionary_file, texts_clean_file)

if all(os.path.exists(path) for path in cache_files):
    # NOTE: pickle.load can execute arbitrary code — only load files that this
    # notebook wrote itself.
    with open(tokenized_texts_file, "rb") as f:
        tokenized_texts = pickle.load(f)
    with open(dictionary_file, "rb") as f:
        dictionary = pickle.load(f)
    with open(texts_clean_file, "rb") as f:
        texts_clean = pickle.load(f)
else:
    # Tokenize documents on whitespace (str.split() never yields empty tokens)
    tokenized_texts = [doc.split() for doc in texts_clean]
    # Create Dictionary
    dictionary = Dictionary(tokenized_texts)

    # Write all three artefacts together so the cache stays consistent.
    with open(tokenized_texts_file, "wb") as f:
        pickle.dump(tokenized_texts, f)
    with open(dictionary_file, "wb") as f:
        pickle.dump(dictionary, f)
    with open(texts_clean_file, "wb") as f:
        pickle.dump(texts_clean, f)



In [17]:
# Initialise training and evaluation logs.
# Read the training log through the config constant so this cell follows the
# same path the training cells write to.
training_log_df = pd.read_csv(MODEL_TRAINING_LOG)

# Load the evaluation log when one exists, otherwise start an empty one
if os.path.exists(MODEL_EVALUATION_LOG):
    evaluation_log_df = pd.read_csv(MODEL_EVALUATION_LOG)
else:
    evaluation_log_df = pd.DataFrame(columns=["model_name",
                                              "embedding_model",
                                              "n_neighbors",
                                              "n_components",
                                              "min_cluster_size",
                                              "train_time",
                                              "monogram_cv",
                                              "multigram_cv",
                                              "eval_time"])


In [18]:
# Evaluate every trained model that is not yet in the evaluation log.
# Embedding models are cached by name so each one is loaded once instead of
# once per evaluated model.
loaded_embedding_models = {}

for _, row in training_log_df.iterrows():
    model_name = row["model_name"]
    train_time = row["train_time"]

    embedding_model_name,n_neighbors,n_components,min_cluster_size = parse_model_name(model_name)

    # Skip evaluation if evaluated before
    if model_name in evaluation_log_df["model_name"].values:
        print(f"{model_name} has already evaluated.")
        continue

    model = os.path.join(MODEL_FOLDER,model_name)
    if embedding_model_name not in loaded_embedding_models:
        loaded_embedding_models[embedding_model_name] = SentenceTransformer(embedding_model_name,device=DEVICE)
    embedding_model = loaded_embedding_models[embedding_model_name]

    # eval_time covers both coherence computations, including model loading.
    start_time = time.time()
    monogram_cv = monogram_coherence_score(model,embedding_model,tokenized_texts,dictionary)
    multigram_cv = multigram_coherence_score(model,embedding_model,tokenized_texts,dictionary,texts_clean)
    end_time = time.time()
    eval_time = end_time-start_time

    print(f"{model_name} in {eval_time} : {monogram_cv}, {multigram_cv}")

    new_row = {
        "model_name": model_name,
        "embedding_model":embedding_model_name,
        "n_neighbors":n_neighbors,
        "n_components":n_components,
        "min_cluster_size":min_cluster_size,
        "train_time": train_time,
        "monogram_cv": monogram_cv,
        "multigram_cv": multigram_cv,
        "eval_time": eval_time
    }

    # Rewrite the log after every model so an interrupted run can resume.
    evaluation_log_df = pd.concat([evaluation_log_df, pd.DataFrame([new_row])], ignore_index=True)
    evaluation_log_df.to_csv(MODEL_EVALUATION_LOG, index=False)
    


all-mpnet-base-v2_5_5_5 in 82.06482124328613 : 0.3169934458652391, 0.398788266411994
all-mpnet-base-v2_5_5_10 in 40.29389977455139 : 0.31667664430618864, 0.40297466225762124
all-mpnet-base-v2_5_5_15 in 32.284870624542236 : 0.30475759592027607, 0.3765922846713736
all-mpnet-base-v2_5_5_20 in 32.33454966545105 : 0.340007685369821, 0.38996022016882687
all-mpnet-base-v2_5_5_30 in 28.28744602203369 : 0.3743542628814256, 0.44481123297707165
all-mpnet-base-v2_5_5_50 in 29.418080806732178 : 0.41043809380653284, 0.5373819755575944
all-mpnet-base-v2_5_10_5 in 82.65697431564331 : 0.30543355980651477, 0.40334723629343955
all-mpnet-base-v2_5_10_10 in 41.00922417640686 : 0.30996029891519933, 0.40041722357398807
all-mpnet-base-v2_5_10_15 in 33.86795234680176 : 0.30857550056605537, 0.3873626238519822
all-mpnet-base-v2_5_10_20 in 30.947529315948486 : 0.33113750782288054, 0.39218163386109217
all-mpnet-base-v2_5_10_30 in 28.657154321670532 : 0.3477115339440953, 0.45471484532131595
all-mpnet-base-v2_5_10_5

## 6. Result (Best Model) - WIP

### Topic Info (Monogram)

In [19]:
# Load the selected best model from the configured location instead of the
# hard-coded "topic_model" path, which is not a valid model directory.
# NOTE(review): assumes the chosen model has been saved or copied to
# MODEL_FOLDER/BEST_MODEL_NAME — confirm. `embedding_model` is whatever the
# earlier loops left bound; confirm it matches the best model's embedding.
monogram_topic_model = BERTopic.load(os.path.join(MODEL_FOLDER, BEST_MODEL_NAME),embedding_model=embedding_model)
monogram_topic_model.get_topic_info()

ValueError: Make sure to either pass a valid directory or HF model.

### Topic Info (Multigram)

In [None]:
# Load the selected best model from the configured location instead of the
# hard-coded "topic_model" path, then recompute its topic representations
# with 2- to 3-word n-grams.
# NOTE(review): assumes the chosen model has been saved or copied to
# MODEL_FOLDER/BEST_MODEL_NAME — confirm. `embedding_model` is whatever the
# earlier loops left bound; confirm it matches the best model's embedding.
multigram_topic_model = BERTopic.load(os.path.join(MODEL_FOLDER, BEST_MODEL_NAME),embedding_model=embedding_model)
multigram_topic_model.update_topics(texts_clean, vectorizer_model=CountVectorizer(stop_words="english", ngram_range=(2,3)))
multigram_topic_model.get_topic_info()

### Comparison

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

# Whitespace-tokenize every cleaned document and build the gensim dictionary
tokenized_texts = [doc.split() for doc in texts_clean]
dictionary = Dictionary(tokenized_texts)

# For every topic except -1, keep only the words known to the dictionary
candidate_topics = (
    [str(word) for word, _ in words_probs if str(word) in dictionary.token2id]
    for topic_id, words_probs in monogram_topic_model.get_topics().items()
    if topic_id != -1
)

# Drop topics left without any word
topics = [topic for topic in candidate_topics if topic]

# Score the remaining topics with the C_v coherence measure
coherence_model = CoherenceModel(
    topics=topics,
    texts=tokenized_texts,
    dictionary=dictionary,
    coherence='c_v'
)
monogram_coherence = coherence_model.get_coherence()

print("Monogram C_v Coherence:", monogram_coherence)

In [None]:
# Whitespace-tokenize every cleaned document and build the gensim dictionary
tokenized_texts = [doc.split() for doc in texts_clean]
dictionary = Dictionary(tokenized_texts)

# Break every n-gram of every topic (except -1) into its individual words
topics = [
    [token for phrase, _ in multigram_topic_model.get_topic(topic) for token in phrase.split()]
    for topic in multigram_topic_model.get_topics()
    if topic != -1
]

# Drop topics left without any word
topics = [topic_words for topic_words in topics if topic_words]

# Score the remaining topics with the C_v coherence measure
coherence_model = CoherenceModel(
    topics=topics,
    texts=tokenized_texts,
    dictionary=dictionary,
    coherence='c_v'
)
multigram_coherence = coherence_model.get_coherence()

print("Multigram C_v Coherence:", multigram_coherence)

## 7. Using LLM to Improve Representation (WIP)

In [None]:
import os
import openai
from dotenv import load_dotenv
from bertopic.representation import OpenAI

# Load variables from .env file
load_dotenv()

# The API key is read from the OPENAI_API_KEY environment variable rather than
# being hard-coded in the notebook.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# NOTE(review): `topic_model` is not assigned at notebook level in any visible
# cell (it only exists as a local inside train_bertopic) — confirm which loaded
# model this cell is meant to update before running it.
topic_model.update_topics(texts_clean, representation_model=OpenAI(client, model="gpt-4o-mini", delay_in_seconds=3))
topic_model.get_topic_info()