# Setup

In [None]:
# Fetch the demo repository into the current working directory.
# NOTE(review): this cell is not idempotent - after the `%cd` below, running it
# again clones a second copy inside the first one.
!git clone https://github.com/JonasHendl/TopicModelingDemo.git  # clone
# Work from inside the clone so the relative paths used later (e.g. data/...) resolve.
%cd TopicModelingDemo
# Install the packages listed in requirements.txt into the running kernel's
# environment (-q: quiet output, -r: read requirements from a file).
%pip install -qr requirements.txt

### Hint: You can execute a cell with "Ctrl" + "Enter" ("Strg" + "Enter" on German keyboards)

In [None]:
from bertopic import BERTopic
import pandas as pd
import sys
from pathlib import Path
import pickle
from typing import List
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from bertopic.vectorizers import ClassTfidfTransformer
import datetime as dt

# Seed intended for reproducible sampling, e.g. `reviews_df.sample(random_state=random_seed)`.
# Note that `new_model` takes its own `seed` argument and does not read this variable.
random_seed = 42

### Load data

In [None]:
# The path is relative to the working directory, which the setup cell switches
# to the cloned repository.
# SECURITY: unpickling can execute arbitrary code - only load pickle files
# that come from a source you trust.
with Path('data/demo_reviews.pickle').open('rb') as f:
    # No protocol argument is needed: pickle detects the protocol version
    # from the file itself.
    data = pickle.load(f)

In [None]:
# We limit the number of data points for faster processing. If you are using
# your own GPU, feel free to set this to a higher number.
number_of_reviews = 2000

# Take an explicit copy of the slice. A later cell overwrites the
# "published_date" column of `reviews_df`; doing that on a bare slice of the
# loaded frame triggers pandas' SettingWithCopyWarning (when copy-on-write is
# not enabled) and leaves it unclear whether the loaded data is modified too.
reviews_df = data["reviews_df"][:number_of_reviews].copy()

# `reviews_df` is already limited to `number_of_reviews` rows, so the text
# column needs no second slice and stays row-aligned with the frame.
docs = reviews_df["text"]

# Pre-computed embeddings, cut to the same number of leading entries as the reviews.
embeddings = data["embeddings"][:number_of_reviews]

### Examine some examples

In [None]:
number_of_examples_you_want_to_view = 10

# Lift pandas' column-width limit only inside this block so that every review
# is shown with its full text. A different random sample is drawn on every
# run; pass `random_state=random_seed` to `sample(...)` to always get the
# same rows.
with pd.option_context("display.max_colwidth", None):
    display(reviews_df.sample(number_of_examples_you_want_to_view))

The function `new_model` initializes all sub-modules with relevant parameters. I already chose some parameters for you, so you do not need to do anything right now. Later, you can experiment with different values.

In [None]:
    def new_model(
        n_neighbors: int = 15,
        n_components: int = 5,
        min_dist: float = 0.0,
        metric_umap: str = "cosine",
        min_cluster_size: int = 20,
        metric_hdbscan: str = "euclidean",
        cluster_selection_method: str = "leaf",  # eom
        cluster_selection_epsilon: float = 0.2,
        prediction_data: bool = True,
        alpha: float = 1.0,
        max_features: int = None,
        min_samples: int = 5,
        seed_topic_list: List[List[str]] = None,
        seed: int = 42,
    ) -> "BERTopic":
        """
        Build a new, unfitted BERTopic model from explicitly configured sub-models.

        The function creates a UMAP model (dimensionality reduction), an HDBSCAN
        model (clustering), a CountVectorizer and a ClassTfidfTransformer (topic
        representation), wires them into a ``BERTopic`` instance and returns it.
        Nothing is fitted here; call ``fit_transform`` on the returned model.

        Parameters
        ----------
        n_neighbors : int, optional, default=15
            Number of neighbors to consider for UMAP.
        n_components : int, optional, default=5
            Number of components for dimensionality reduction in UMAP.
        min_dist : float, optional, default=0.0
            Minimum distance between points in the low-dimensional representation in UMAP.
        metric_umap : str, optional, default='cosine'
            Metric to use for UMAP.
        min_cluster_size : int, optional, default=20
            Minimum cluster size for HDBSCAN. Lower it to get more granular clusters.
        metric_hdbscan : str, optional, default='euclidean'
            Metric to use for HDBSCAN clustering.
        cluster_selection_method : str, optional, default='leaf'
            Method for selecting clusters in HDBSCAN ('leaf' or 'eom').
        cluster_selection_epsilon : float, optional, default=0.2
            Cluster selection epsilon parameter for HDBSCAN.
            HDBSCAN does not need an epsilon upfront: it first builds a hierarchy
            of clusters over a range of density (distance) thresholds. This
            parameter only comes into play afterwards, when the final clusters
            are selected from that hierarchy, where it acts as a distance
            threshold below which clusters are merged instead of being split
            further.
            Larger values: fewer, larger clusters.
            Smaller values: more, smaller clusters.
            See the HDBSCAN documentation for the exact semantics.
        prediction_data : bool, optional, default=True
            Whether to generate prediction data in HDBSCAN.
        alpha : float, optional, default=1.0
            Alpha parameter for HDBSCAN.
        max_features : int or None, optional, default=None
            Maximum number of features for CountVectorizer (None = no limit).
        min_samples : int, optional, default=5
            Similar to min_cluster_size. In-depth documentation at the HDBSCAN repo.
        seed_topic_list : list of list of str or None, optional, default=None
            Passed unchanged to ``BERTopic(seed_topic_list=...)``.
        seed : int, optional, default=42
            Passed to UMAP as ``random_state``. It is not passed to any of the
            other sub-models.

        Returns
        -------
        BERTopic
            A new model instance configured with the sub-models described above.

        Notes
        -----
        The following settings are fixed inside the function: the vectorizer
        uses English stop words and n-grams of length 1 to 3, the c-TF-IDF
        model is created with ``bm25_weighting=True`` and
        ``reduce_frequent_words=True``, and the BERTopic model is created with
        ``calculate_probabilities=True`` and ``verbose=True``.
        """

        # Dimensionality reduction of the document embeddings; the only
        # sub-model that receives the seed.
        umap_model = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=min_dist,
            metric=metric_umap,
            random_state=seed,
        )

        # Clustering model handed to BERTopic.
        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            metric=metric_hdbscan,
            cluster_selection_method=cluster_selection_method,
            prediction_data=prediction_data,
            cluster_selection_epsilon=cluster_selection_epsilon,
            alpha=alpha,
            min_samples=min_samples,
        )

        # Bag-of-words counts over uni-, bi- and tri-grams without English stop words.
        vectorizer_model = CountVectorizer(
            stop_words="english", max_features=max_features, ngram_range=(1, 3)
        )

        # Class-based TF-IDF weighting used for the topic representations.
        ctfidf_model = ClassTfidfTransformer(
            bm25_weighting=True, reduce_frequent_words=True
        )

        model = BERTopic(
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            calculate_probabilities=True,
            seed_topic_list=seed_topic_list,
            verbose=True
        )
        return model

In [None]:
# Create a model with the default hyper-parameters and fit it on the reviews.
# The pre-computed embeddings are handed to the model together with the documents.
topic_model = new_model()
topics, probs = topic_model.fit_transform(documents=docs, embeddings=embeddings)

We will look at a scatter plot of the reviews. Points with the same color belong to the same topic. You can hover over a point to read the review. Keep in mind that you only see a small fraction (10%) of the data (to save your browser from crashing).

In [None]:
# Share of the documents that is passed to the plot as `sample`.
fraction_of_answers_visulaized = 0.10

# The pre-computed 2D embeddings are cut to the same number of reviews as `docs`.
topic_model.visualize_documents(
    docs,
    sample=fraction_of_answers_visulaized,
    reduced_embeddings=data["2D_embeddings"][:number_of_reviews],
)

In [None]:
# Id of the topic to inspect; this variable is reused by the next cell.
topic_number = 5
topic_model.get_topic(topic=topic_number)

In [None]:
# Representative documents for the topic chosen via `topic_number`.
topic_model.get_representative_docs(topic_number)

In [None]:
# Plot the hierarchy of the topics found by the first model.
topic_model.visualize_hierarchy()  # shows hierarchy

In [None]:
# Render BERTopic's topic-similarity heatmap for the first model.
topic_model.visualize_heatmap()

In [None]:
# LOWER the minimum number of messages per cluster to get more granular clusters.
topic_model_2 = new_model(min_cluster_size=10)

# NOTE: `topics` and `probs` are rebound here; from now on they hold the
# output of this second model, not of `topic_model`.
topics, probs = topic_model_2.fit_transform(documents=docs, embeddings=embeddings)

fraction_of_answers_visulaized = 0.10
topic_model_2.visualize_documents(
    docs,
    sample=fraction_of_answers_visulaized,
    reduced_embeddings=data["2D_embeddings"][:number_of_reviews],
)

In [None]:
# Break the topics of the first model down by the reviews' `rating` column
# and plot the result.
topics_per_class = topic_model.topics_per_class(docs, classes=reviews_df["rating"])
topic_model.visualize_topics_per_class(topics_per_class)

The cherry on top - Topics over time

In [None]:
# For every review, we have the date it was published on.
# Display that column to get a quick look at its values.
reviews_df["published_date"]

In [None]:
# Replace every publication date by the first timestamp of the calendar
# quarter it falls in, so that reviews are grouped per quarter.
# Done as one chained assignment so the column is overwritten exactly once.
reviews_df["published_date"] = (
    pd.to_datetime(reviews_df["published_date"], utc=True)
    # Periods cannot carry a timezone. Dropping the UTC timezone explicitly
    # keeps the same wall-clock values and avoids the "will drop timezone
    # information" UserWarning that `to_period` emits for tz-aware data.
    .dt.tz_localize(None)
    .dt.to_period("Q")
    .dt.start_time
)

In [None]:
# Ids of the topics to include in the plot; adjust them to the topics you
# found interesting above.
list_of_interesting_topics = [0, 8, 9]

# Topic representations per timestamp of the `published_date` column.
topics_over_time = topic_model.topics_over_time(
    docs, timestamps=reviews_df["published_date"]
)
topic_model.visualize_topics_over_time(
    topics_over_time, topics=list_of_interesting_topics
)