In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.io as pio
import torch
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap.umap_ import UMAP

In [3]:
# Check whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

True

In [2]:
# Load data/reddit_prep.csv into a DataFrame.
df = pd.read_csv("data/reddit_prep.csv")

In [3]:
# Documents to model: the "body" column of df. The name `abstracts` is what the
# embedding, fitting and topics-over-time cells consume.
abstracts = df["body"]

In [None]:
# Display the selected documents as a quick sanity check.
abstracts

In [None]:
# Pre-calculate the document embeddings once so they can be saved and handed
# directly to BERTopic's fit_transform.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# encode() is documented for a string or a list of strings and indexes its
# input by integer position; pass a plain list so the call does not depend on
# the pandas index of `abstracts`.
embeddings = embedding_model.encode(abstracts.tolist(), show_progress_bar=True)

In [8]:
# Save embeddings
# np.save does not create missing directories, so make sure ./model exists
# before writing (otherwise a fresh checkout raises FileNotFoundError).
embeddings_path = Path("./model/embeddings_250710.npy")
embeddings_path.parent.mkdir(parents=True, exist_ok=True)
np.save(embeddings_path, embeddings)

In [10]:
# UMAP configuration handed to BERTopic as its umap_model.
# An alternative setting kept for reference used n_components=5 with all other
# arguments unchanged.
umap_model = UMAP(
    n_neighbors=15,
    n_components=7,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

In [11]:
# HDBSCAN configuration handed to BERTopic as its hdbscan_model.
hdbscan_model = HDBSCAN(
    min_cluster_size=500,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

In [12]:
# Bag-of-words vectorizer handed to BERTopic as its vectorizer_model:
# English stop words, min_df=2, unigrams and bigrams.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

In [14]:
# Maximal Marginal Relevance representation model with diversity=0.3.
# NOTE(review): its only visible use, representation_model=mmr_model in the
# BERTopic constructor, is commented out, so this object is currently unused
# in the visible part of the notebook.
mmr_model = MaximalMarginalRelevance(diversity=0.3)

In [None]:
# Training: assemble the BERTopic pipeline from the components configured in
# the earlier cells and fit it on the documents.
# BERTopic is already imported in the notebook's import cell, so it is not
# re-imported here.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # representation_model=mmr_model,

    # Hyperparameters
    top_n_words=10,
    verbose=True,
    calculate_probabilities=True,
    # NOTE: per the BERTopic documentation, min_topic_size is not used when a
    # custom hdbscan_model is supplied; the effective minimum is that model's
    # min_cluster_size. Kept equal to it (500) so the two do not disagree.
    min_topic_size=500,
)

# Pass the pre-computed embeddings alongside the documents.
topics, probs = topic_model.fit_transform(abstracts, embeddings)

In [30]:
# Collect the topic overview table from the fitted model.
topic_info_df = topic_model.get_topic_info()

In [31]:
# Write the topic overview table to disk. to_csv does not create missing
# directories, so make sure ./results exists first.
topic_info_path = Path("./results/topic_info_df.csv")
topic_info_path.parent.mkdir(parents=True, exist_ok=True)
topic_info_df.to_csv(topic_info_path, index=False)

In [None]:
# Inspect topic 1. NOTE(review): full=True presumably returns every stored
# representation of the topic rather than only the default one; confirm
# against the BERTopic docs for the installed version.
topic_model.get_topic(1, full=True)

In [35]:
# Save the fitted model in safetensors format together with its c-TF-IDF data.
# With safetensors serialization, save_embedding_model is expected to be a
# string naming a Hugging Face model (it is written to the saved config), not
# the SentenceTransformer object itself. Pass the identifier of the
# "all-MiniLM-L6-v2" model used for the embeddings so the saved model records
# which embedder it was trained with.
topic_model.save(
    "./model/topic_model",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
# To reload the saved model later:
# topic_model = BERTopic.load("./model/topic_model", embedding_model=embedding_model)

In [17]:
# Build the topic hierarchy figure; it is displayed by the following cell.
figure = topic_model.visualize_hierarchy()

In [None]:
# Show the hierarchy figure with custom options for the toolbar's
# image-export button.
config = dict(
    toImageButtonOptions=dict(
        format="png",  # one of png, svg, jpeg, webp
        filename="custom_image",
        height=1400,
        width=600,
        scale=6,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)
figure.show(config=config)

In [None]:
# Display BERTopic's visualize_topics() figure for the fitted model.
topic_model.visualize_topics()

In [None]:
# Display BERTopic's visualize_heatmap() figure for the fitted model.
topic_model.visualize_heatmap()

In [None]:
# Per-document timestamps for the topics-over-time analysis, taken from the
# 'created_utc' column. NOTE(review): the column's dtype/format (e.g. epoch
# seconds vs. datetime strings) is not visible here; confirm before
# interpreting the resulting time bins.
timestamps = df['created_utc']
timestamps

In [None]:
# Compute topics over time for the documents, grouping the timestamps into
# 20 bins, with both global and evolutionary tuning switched on.
topics_over_time = topic_model.topics_over_time(
    docs=abstracts,
    timestamps=timestamps,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20,
)