In [None]:
import os
import sys

# Absolute path of the sibling `src` directory, so that the project-local
# modules stored there can be imported from this notebook.
module_path = os.path.abspath(os.path.join(os.pardir, "src"))

# Prepend it only when it is missing, so re-running the cell does not
# accumulate duplicate entries on the import path.
if sys.path.count(module_path) == 0:
    sys.path[:0] = [module_path]

In [None]:
import polars as pl
import numpy as np
import pathlib

# Directory (relative to the notebook) holding the processed datasets and cached artifacts.
DATA_DIR = pathlib.Path("..") / "data" / "processed"

In [None]:
# Read the Yelp reviews parquet file from the processed-data directory and
# show the resulting frame as the cell output.
reviews_path = DATA_DIR / "yelp_reviews.parquet"
df = pl.read_parquet(reviews_path)
df

In [None]:
from sentence_transformers import SentenceTransformer

# Only the first MAX_DOCS reviews are used; embedding and clustering the whole
# corpus would take too long.
MAX_DOCS = 10_000
df = df[:MAX_DOCS]

# Sentence embeddings are expensive to compute, so they are cached on disk.
# The cache is loaded when it exists and (re)built otherwise, which keeps the
# notebook runnable from a fresh checkout instead of failing on a missing file.
# NOTE(review): the cache is not invalidated when `df` or MAX_DOCS changes —
# delete the .npy file manually after changing the input data.
EMBEDDINGS_PATH = DATA_DIR / "test_embeddings.npy"

if EMBEDDINGS_PATH.exists():
    embeddings = np.load(EMBEDDINGS_PATH)
else:
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(df["text"].to_list(), show_progress_bar=True)
    np.save(EMBEDDINGS_PATH, embeddings)

In [None]:
from umap import UMAP
from append_umap import AppendUMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def _build_topic_model(umap_model) -> BERTopic:
    """Assemble a BERTopic pipeline around the given dimensionality-reduction model.

    Every other stage (HDBSCAN clustering, count vectorizer, c-TF-IDF) is
    configured identically, so two models built here differ only in `umap_model`.
    """
    return BERTopic(
        umap_model=umap_model,
        hdbscan_model=HDBSCAN(min_cluster_size=15, prediction_data=True),
        vectorizer_model=CountVectorizer(stop_words="english"),
        ctfidf_model=ClassTfidfTransformer(),
    )


# Reduction settings shared by the baseline and the modified model.
umap_settings = dict(n_components=5, min_dist=0.0, metric="cosine")

# Every column except the review text is handed to AppendUMAP as additional dimensions.
metadata_arr = AppendUMAP.shape_dims(df.drop("text"))

# Baseline: stock UMAP. Modified: project-local AppendUMAP fed with the metadata array.
vanilla_bertopic = _build_topic_model(UMAP(**umap_settings))
modded_bertopic = _build_topic_model(
    AppendUMAP(additional_dimensions=metadata_arr, **umap_settings)
)

In [None]:
# Both models are fitted on the same documents and the same precomputed
# embeddings, so any difference in topics comes from the models themselves.
review_texts = df["text"].to_list()

vanilla_topics, vanilla_probs = vanilla_bertopic.fit_transform(review_texts, embeddings=embeddings)
modded_topics, modded_probs = modded_bertopic.fit_transform(review_texts, embeddings=embeddings)

In [None]:
# Show the topic info reported by the baseline (stock UMAP) model.
vanilla_bertopic.get_topic_info()

In [None]:
# Show the topic info reported by the AppendUMAP-based model, for comparison with the baseline.
modded_bertopic.get_topic_info()

In [None]:
# Show the topics of the baseline model as returned by BERTopic.get_topics().
vanilla_bertopic.get_topics()

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

def coherence(m: BERTopic, docs: list[str], topics: list[int], coherence: str = "c_v") -> float:
    """
    Computes a gensim coherence score for the topics of a fitted BERTopic model.
    Adapted from https://github.com/MaartenGr/BERTopic/issues/90

    Args:
        m: Fitted BERTopic model whose topic words are scored.
        docs: Documents used as the reference texts for the coherence measure.
        topics: Topic id assigned to each document; -1 is treated as the noise
            topic and excluded from the score.
        coherence: Name of the coherence measure, forwarded to gensim's
            ``CoherenceModel`` (default ``"c_v"``).

    Returns:
        The value of ``CoherenceModel.get_coherence()`` over all non-noise topics.
    """
    # Use the vectorizer's full analyzer, not just its tokenizer: the analyzer
    # applies the same preprocessing (e.g. lowercasing, stop-word removal,
    # n-grams) that produced the model's vocabulary, so the tokens in the
    # dictionary line up with the topic words being scored.
    analyzer = m.vectorizer_model.build_analyzer()

    tokens = [analyzer(doc) for doc in docs]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Walk over the topic ids that actually occur instead of assuming that the
    # noise topic -1 is present and that ids form a contiguous 0..n-1 range.
    topic_ids = sorted(set(topics) - {-1})
    topic_words = []
    for topic_id in topic_ids:
        representation = m.get_topic(topic_id) # type: ignore
        if not representation:
            # No representation available for this id — nothing to score.
            continue
        # Drop empty placeholder words, if any; they carry no information.
        words = [word for word, _ in representation if word]
        if words:
            topic_words.append(words)

    cm = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        dictionary=dictionary,
        corpus=corpus,
        coherence=coherence
    )
    return cm.get_coherence()

In [None]:
# Score both fitted models on the same documents and tabulate the results.
scored_docs = df["text"].to_list()
comparison = [
    {"model": label, "coherence": coherence(model, scored_docs, assigned_topics)}
    for label, model, assigned_topics in (
        ("vanilla", vanilla_bertopic, vanilla_topics),
        ("append", modded_bertopic, modded_topics),
    )
]
pl.DataFrame(comparison)