In [None]:
import bertopic
import polars as pl
from bertopic import BERTopic


# Parquet file scanned by the loading cell; the path is relative, so it is
# resolved against the directory the kernel runs in.
DATA = "../data/processed/yelp_reviews_with_embeddings.parquet"
# Number of rows taken from the start of DATA when building `df`.
SAMPLE_SIZE = 5_000

In [None]:
# Scan the parquet file lazily, keep the first SAMPLE_SIZE rows, then
# materialise the result as an eager DataFrame and display it.
df = (
    pl.scan_parquet(DATA)
    .slice(offset=0, length=SAMPLE_SIZE)
    .collect()
)
df

In [None]:
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset

# Reference run with OCTIS' LDA wrapper. The value returned by `train_model`
# is inspected in the following cells (presumably to see the output layout
# that OCTIS models produce — TODO confirm intent).
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")  # dataset chosen by name; this is not the Yelp sample held in `df`

lda = LDA(num_topics=20)
lda_output = lda.train_model(dataset)

In [None]:
# Display the full value returned by `lda.train_model(dataset)`.
lda_output

In [None]:
# List the top-level keys of the LDA output.
lda_output.keys()

In [None]:
# Review texts and their precomputed embeddings; both are read from `df`, so
# element i of `docs` corresponds to row i of `embeddings`.
docs = df["text"].to_list()
# NOTE(review): BERTopic expects `embeddings` to be a 2-D array of shape
# (n_docs, dim) — confirm the "embedding" column converts to that layout.
embeddings = df["embedding"].to_numpy()

# Build the model from the imported class instead of `bertopic.BERTopic()`.
# This cell rebinds the name `bertopic` to the model (later cells rely on that
# name), so on a second run `bertopic.BERTopic` would be looked up on the
# model instance rather than the module and raise AttributeError.
bertopic = BERTopic()
topics, probs = bertopic.fit_transform(docs, embeddings=embeddings)

In [None]:
# Words only (scores dropped) for topic 1, shown as the cell output.
wl = [word for word, _ in bertopic.get_topic(1)]
wl

In [None]:
# Peek at the first three entries of topic 0's representation.
bertopic.get_topic(0)[:3]

In [None]:
from bertopic import BERTopic

def bertopic_output_to_octis(
    m: "BERTopic",
    topic_assignments: list[int],
    topk: int = 10
) -> dict[str, list[list[str]]]:
    """
    Reshapes BERTopic output so that it can be readily passed to OCTIS
    for evaluation.

    Parameters
    ----------
    m
        Fitted topic model. Only ``m.get_topic(topic_id)`` is called, and it
        must return a sequence of ``(word, score)`` pairs for every topic id
        found in ``topic_assignments``.
    topic_assignments
        One topic id per document. The id ``-1`` is treated as the noise
        topic and is left out of the result.
    topk
        Maximum number of words kept per topic.

    Returns
    -------
    dict
        ``{"topics": [[word, ...], ...]}`` with one word list per non-noise
        topic, ordered by ascending topic id.
    """
    # Use the ids that actually occur instead of `len(set(...)) - 1`: that
    # count is one short whenever no document was assigned to the noise
    # topic, and `range(n)` additionally assumes the ids are exactly 0..n-1.
    topic_ids = sorted(set(topic_assignments) - {-1})

    # Keep the first `topk` (word, score) pairs of each topic and drop the score.
    topic_words = []
    for topic_id in topic_ids:
        representation = m.get_topic(topic_id)[:topk]
        topic_words.append([word for word, _ in representation])

    return {"topics": topic_words}

# Try the converter on the fitted model and its per-document topic assignments.
bertopic_output_to_octis(bertopic, topics)

In [None]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

# Convert the fitted model's topics into the {"topics": [...]} mapping and
# score it with OCTIS' TopicDiversity. topk=10 equals the converter's default
# `topk`, i.e. the number of words per topic requested from the converter.
bertopic_output = bertopic_output_to_octis(bertopic, topics)
metric = TopicDiversity(topk=10)
score = metric.score(bertopic_output)
score