In [12]:
!pip install nbformat>=4.2.0


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from umap import UMAP
from datasets import load_dataset
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from datetime import datetime

In [3]:
dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]
abstracts = dataset["abstract"][:2_000]

Generating train split: 100%|██████████| 117592/117592 [00:02<00:00, 53278.03 examples/s]


In [28]:
# For illustration purposes, we make sure the output is fixed when running this code multiple times
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# We can choose any number of seed words for which we want their representation
# to be strengthen. We increase the importance of these words as we want them to be more
# likely to end up in the topic representations.
ctfidf_model = ClassTfidfTransformer(
    seed_words=["agent", "robot", "behavior", "policies", "environment"], 
    seed_multiplier=2,
    reduce_frequent_words=True
)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

vectorizer_model = CountVectorizer(min_df=10, stop_words="english")

In [29]:
# We run the topic model with the seeded words
topic_model = BERTopic(
    umap_model=umap_model,
    min_topic_size=15,
    ctfidf_model=ctfidf_model,
    embedding_model=embedding_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model
).fit(abstracts)


In [30]:
topic_labels = topic_model.generate_topic_labels(nr_words=3,
                                            topic_prefix=True,
                                            word_length=10,
                                            separator=", ")

In [None]:
topic_model.set_topic_labels(topic_labels)
topic_model.custom_labels_


{0: '-1, behavior, agent, used',
 1: '0, agent, behavior, value',
 2: '1, classifier, classifica, classifier',
 3: '2, clustering, clusters, means',
 4: '3, dimension, distributi, log',
 5: '4, online, descent, gradient',
 6: '5, images, image, object',
 7: '6, regret, sqrt, action',
 8: '7, graphical, structure, bayesian',
 9: '8, users, agent, game',
 10: '9, topic, words, modeling',
 11: '10, group, sparsity, penalty',
 12: '11, ranking, pairwise, search',
 13: '12, belief, propagatio, free',
 14: '13, rank, matrix, low',
 15: '14, kernel, kernels, valued',
 16: '15, equivalenc, variables, conditiona',
 17: '16, experts, randomized, prediction',
 18: '17, metric, distance, metrics',
 19: '18, coding, sparse, elements',
 20: '19, collaborat, filtering, implicit',
 21: '20, behavior, detection, agent',
 22: '21, active, label, noise',
 23: '22, network, agent, latent',
 24: '23, gaussian, expectatio, predictive',
 25: '24, manifold, embedding, spectral',
 26: '25, sensing, signal, rec

In [None]:
custom_label = topic_model.custom_labels_[0]
if isinstance(custom_label, str):
	i, labels = custom_label.split(", ")
	print(labels)
else:
	print("The custom label is not a string.")

ValueError: too many values to unpack (expected 2)

In [46]:
def create_custom_labels_dict(custom_labels):
    custom_labels_dict = {}
    for custom_label in custom_labels:
        labels = custom_label.split(", ")
        i = labels.pop(0)
        custom_labels_dict[i] = ", ".join(labels)
    
    return custom_labels_dict
    

custom_labels_dict = create_custom_labels_dict(topic_model.custom_labels_)
custom_labels_dict

{'-1': 'behavior, agent, used',
 '0': 'agent, behavior, value',
 '1': 'classifier, classifica, classifier',
 '2': 'clustering, clusters, means',
 '3': 'dimension, distributi, log',
 '4': 'online, descent, gradient',
 '5': 'images, image, object',
 '6': 'regret, sqrt, action',
 '7': 'graphical, structure, bayesian',
 '8': 'users, agent, game',
 '9': 'topic, words, modeling',
 '10': 'group, sparsity, penalty',
 '11': 'ranking, pairwise, search',
 '12': 'belief, propagatio, free',
 '13': 'rank, matrix, low',
 '14': 'kernel, kernels, valued',
 '15': 'equivalenc, variables, conditiona',
 '16': 'experts, randomized, prediction',
 '17': 'metric, distance, metrics',
 '18': 'coding, sparse, elements',
 '19': 'collaborat, filtering, implicit',
 '20': 'behavior, detection, agent',
 '21': 'active, label, noise',
 '22': 'network, agent, latent',
 '23': 'gaussian, expectatio, predictive',
 '24': 'manifold, embedding, spectral',
 '25': 'sensing, signal, recovery',
 '26': 'svm, margin, support',
 '27'

In [38]:
topic_model.custom_labels_[0]

'-1, behavior, agent, used'

In [32]:
# Get the current date
current_date = datetime.now().strftime("%Y-%m-%d")

# Create the unique model path with the current date
unique_model_path = f"../src/models/model_{current_date}/"

# Save the topic model
topic_model.save(unique_model_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

In [33]:
topic_model = BERTopic.load(unique_model_path, embedding_model=embedding_model)

In [34]:
topics, _ = topic_model.transform(["this is a new abstract"])

In [49]:
custom_labels_dict[str(topics[0])]

'classical, probabilit, separation'

In [11]:
topic_model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed