This notebook is a short demo that shows how to use BERTopic to cluster text data and illustrates what it can do.

In [None]:
from bertopic import BERTopic
import pandas as pd
import sys
from pathlib import Path
import pickle

#### 1. Data Loading and Analysis

In [None]:
# The reviews and their embeddings were prepared ahead of time and pickled,
# because computing the embeddings without a GPU takes a lot of time.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
with Path("data/demo_reviews.pickle").open("rb") as pickle_file:
    # No protocol argument is needed: pickle detects the version automatically.
    data = pickle.load(pickle_file)

In [None]:
# List the entries stored in the loaded pickle.
data.keys()

In [None]:
# Unpack the precomputed bundle into the objects used throughout the notebook.
reviews_df = data["reviews_df"]
docs = reviews_df["text"]
# Full-size document embeddings, passed to the topic model when fitting.
embeddings = data["embeddings"]
# 2D-reduced embeddings live under their own key; reading "embeddings" here
# would make td_embeddings a second reference to the full-size vectors.
td_embeddings = data["2D_embeddings"]

In [None]:
# Preview the first five rows of the review table.
reviews_df.head(n=5)

In [None]:
def print_example_data(df, n=10):
    """Print a random sample of review texts from ``df``.

    Each sampled review is printed on its own line, followed by a separator
    line of dashes.

    Args:
        df: DataFrame with a ``"text"`` column holding the review texts.
        n: Maximum number of reviews to print. Defaults to 10.
    """
    # Cap the sample size at the number of rows: DataFrame.sample raises a
    # ValueError when asked for more rows than exist (without replacement).
    sample_size = min(n, len(df))
    for review in df["text"].sample(sample_size):
        print(f"Review: {review}")
        print("--"*50)
        
        

In [None]:
# Print a random sample of reviews to get a feel for the texts.
print_example_data(reviews_df)

In [None]:
# Display the precomputed document embeddings loaded from the pickle.
embeddings

In [None]:
# Report the dimensions of the full embeddings and of their 2D reduction.
for label, matrix in (
    ("Shape of the embeddings", embeddings),
    ("Shape of the 2d reduced embeddings", td_embeddings),
):
    print(label, matrix.shape)

#### 2. Perform Topic Modelling

In [None]:
# Fit BERTopic on the reviews. The precomputed embeddings are handed over
# explicitly, together with the documents they belong to.
topic_model = BERTopic(verbose=True)
fit_result = topic_model.fit_transform(docs, embeddings=embeddings)
topics, probs = fit_result

#### 3. Visualize the results

In [None]:
# Plot the hierarchical structure of the discovered topics.
topic_model.visualize_hierarchy()

In [None]:
# Inspect the publication dates; they serve as the timestamps for the topics-over-time analysis.
reviews_df["published_date"]

In [None]:
# Compute the topic representations over time, using each review's
# publication date as its timestamp.
timestamps = data["reviews_df"]["published_date"]
topics_over_time = topic_model.topics_over_time(docs, timestamps)

In [None]:
# Plot the development over time for the selected topics 0, 5 and 10 only.
topic_model.visualize_topics_over_time(topics_over_time, topics=[0, 5, 10])

In [None]:
# Plot the documents using the precomputed 2D embeddings; the 0.05 sample
# setting presumably restricts the plot to a fraction of the documents.
topic_model.visualize_documents(
    docs,
    reduced_embeddings=data["2D_embeddings"],
    sample=0.05,
)