In [22]:
# !python -m pip install -e ".[dev]"

In [1]:
%reload_ext autoreload
%autoreload 2
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


## Test Case from MaartenGr No. 2 - Zero Shot

### Before delete:

In [None]:
from pathlib import Path

import numpy as np
from datasets import load_dataset
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from umap import UMAP

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Zero-shot scenario: fit BERTopic on the first 20,000 abstracts with two
# predefined zero-shot topics, so the topic table can be inspected afterwards.
docs = load_dataset("CShorten/ML-ArXiv-Papers")["train"]["abstract"][:20_000]

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encoding all documents is slow, so the embeddings are cached on disk:
# reuse the saved array when it exists, otherwise compute it once and save it.
# This keeps the cell runnable on a fresh checkout (Restart Kernel -> Run All).
EMBEDDINGS_PATH = Path("test_ArXiv_embeddings_zeroshot_example.npy")
if EMBEDDINGS_PATH.exists():
    loaded_embeddings = np.load(EMBEDDINGS_PATH)
else:
    loaded_embeddings = embedding_model.encode(docs, show_progress_bar=True)
    np.save(EMBEDDINGS_PATH, loaded_embeddings)

# Use sub-models (UMAP is seeded so the reduction is repeatable)
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, random_state=42)
hdbscan_model = HDBSCAN(min_samples=5, gen_min_span_tree=True, prediction_data=True)

# Representation models
# NOTE(review): representation_model is built here but is NOT passed to
# BERTopic below, so the fitted model uses the default representation only.
# Confirm whether that is intentional before wiring it in.
keybert_model = KeyBERTInspired()
mmr_model = MaximalMarginalRelevance(diversity=0.3)
representation_model = {
  "KeyBERT": keybert_model,
  "MMR": mmr_model,
}

# BERTopic: fit on the documents together with their pre-computed embeddings
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    zeroshot_topic_list=["topic modeling", "large language models"],
    verbose=True,
).fit(docs, loaded_embeddings)

2025-05-27 16:51:28,462 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


In [71]:
# Preview the first five rows of the topic table before any deletion.
topic_model.get_topic_info().head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7624,-1_the_of_to_and,"[the, of, to, and, in, we, that, is, for, lear...",[ A crucial task in system identification pro...
1,0,4,topic modeling,"[topic, papers, svd, topics, allocation, conta...",[ Topic models have emerged as fundamental to...
2,1,24,1_modulation_radio_channel_transmitters,"[modulation, radio, channel, transmitters, sig...",[ We survey the latest advances in machine le...
3,2,7,2_pain_discomfort_diagnostic_facial,"[pain, discomfort, diagnostic, facial, intensi...",[ Pain is a complex and subjective experience...
4,3,6,3_quantum_entanglement_wave_annealers,"[quantum, entanglement, wave, annealers, conva...",[ Modern deep learning has enabled unpreceden...


In [None]:
# Snapshot the topic table before deletion; later cells compare against it.
original_topic_info = topic_model.get_topic_info()
# Look for a topic named exactly after the second zero-shot label.
original_topic_info.loc[original_topic_info['Name'].eq('large language models')]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs


In [None]:
# Five largest topics (by document count) in the pre-deletion snapshot.
original_topic_info.sort_values(by='Count', ascending=False).head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7624,-1_the_of_to_and,"[the, of, to, and, in, we, that, is, for, lear...",[ A crucial task in system identification pro...
264,263,510,263_generative_gan_gans_generator,"[generative, gan, gans, generator, adversarial...",[ Generative Adversarial Networks (GANs) are ...
324,323,403,323_quantization_hardware_pruning_gpu,"[quantization, hardware, pruning, gpu, precisi...",[ Deep neural networks (DNNs) are used by dif...
99,98,269,98_recommendation_user_items_item,"[recommendation, user, items, item, recommende...",[ Matrix factorization techniques have been w...
29,28,196,28_privacy_private_differential_differentially,"[privacy, private, differential, differentiall...",[ The process of data mining with differentia...


In [None]:
# Row of the pre-deletion snapshot whose Name is exactly the zero-shot label "topic modeling".
original_topic_info.loc[original_topic_info['Name'].eq('topic modeling')]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
1,0,4,topic modeling,"[topic, papers, svd, topics, allocation, conta...",[ Topic models have emerged as fundamental to...


In [None]:
# Rows of the pre-deletion snapshot whose Name contains the keyword string of topic 1.
original_topic_info.loc[
    original_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')
]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
2,1,24,1_modulation_radio_channel_transmitters,"[modulation, radio, channel, transmitters, sig...",[ We survey the latest advances in machine le...


### Run delete:

In [77]:
# Re-check the first five rows of the topic table right before running the deletion.
topic_model.get_topic_info().head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7624,-1_the_of_to_and,"[the, of, to, and, in, we, that, is, for, lear...",[ A crucial task in system identification pro...
1,0,4,topic modeling,"[topic, papers, svd, topics, allocation, conta...",[ Topic models have emerged as fundamental to...
2,1,24,1_modulation_radio_channel_transmitters,"[modulation, radio, channel, transmitters, sig...",[ We survey the latest advances in machine le...
3,2,7,2_pain_discomfort_diagnostic_facial,"[pain, discomfort, diagnostic, facial, intensi...",[ Pain is a complex and subjective experience...
4,3,6,3_quantum_entanglement_wave_annealers,"[quantum, entanglement, wave, annealers, conva...",[ Modern deep learning has enabled unpreceden...


In [54]:
# topic_model.merge_topics(docs, [1, 2])

In [None]:
# topic_model.delete_topics([0])

In [78]:
# Delete topic 1 (listed above as "1_modulation_radio_channel_transmitters", 24 documents).
# The text printed under this cell is output emitted while the call runs.
topic_model.delete_topics([1])

len of mapping: 375
topics_from: [np.int64(0)] and topic_to: 0
zeroshot_labels: ['topic modeling']
cosine_similarities: [0.31793424]
best_cosine_similarity: 0.3179342448711395
new_topic_id_to_zeroshot_topic_idx: {}
after add_mappings: topic_model._topic_id_to_zeroshot_topic_idx: {}


In [88]:
# The library call was updated to
# self.topic_mapper_.add_mappings(mapping, topic_model=deepcopy(self))
# to avoid unnecessary updates in add_mappings.
# Inspect the topic-id -> zero-shot-topic-index mapping held by the model
# (presumably to confirm it survives the deletion — verify against the library change).
topic_model._topic_id_to_zeroshot_topic_idx

{0: 0}

In [79]:
# Snapshot the topic table after topic 1 was deleted; later cells compare it
# with original_topic_info.
after_delete_one_topic_info = topic_model.get_topic_info()
# Display the post-deletion table as the cell output.
after_delete_one_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7648,-1_the_of_to_and,"[the, of, to, and, in, we, that, is, for, lear...",[ A crucial task in system identification pro...
1,0,4,topic modeling,"[topic, papers, svd, topics, allocation, conta...",[ Topic models have emerged as fundamental to...
2,2,7,2_pain_discomfort_diagnostic_facial,"[pain, discomfort, diagnostic, facial, intensi...",[ Pain is a complex and subjective experience...
3,3,6,3_quantum_entanglement_wave_annealers,"[quantum, entanglement, wave, annealers, conva...",[ Modern deep learning has enabled unpreceden...
4,4,74,4_quantum_classical_qubits_states,"[quantum, classical, qubits, states, circuit, ...",[ Quantum machine learning witnesses an incre...
...,...,...,...,...,...
369,369,8,369_chaos_initialization_jacobian_isometry,"[chaos, initialization, jacobian, isometry, de...",[ It is well known that the initialization of...
370,370,37,370_relu_depth_activation_mathbb,"[relu, depth, activation, mathbb, functions, w...",[ We study the necessary and sufficient compl...
371,371,16,371_generalization_nonvacuous_explain_sensitivity,"[generalization, nonvacuous, explain, sensitiv...",[ Neural networks exhibit good generalization...
372,372,71,372_minima_mathbf_loss_relu,"[minima, mathbf, loss, relu, activation, layer...",[ Deep learning models are often successfully...


In [80]:
# The zero-shot topic "topic modeling" should still be listed after the deletion.
after_delete_one_topic_info.loc[
    after_delete_one_topic_info['Name'].str.contains('topic modeling')
]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
1,0,4,topic modeling,"[topic, papers, svd, topics, allocation, conta...",[ Topic models have emerged as fundamental to...


In [81]:
# The deleted topic's keyword string should no longer match any row.
after_delete_one_topic_info.loc[
    after_delete_one_topic_info['Name'].str.contains('modulation_radio_channel_transmitters')
]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs


In [82]:
def _name_suffixes(topic_info):
    """Return each topic Name with everything up to the first "_" removed.

    Names that contain no underscore (e.g. "topic modeling") are returned
    unchanged.
    """
    names = topic_info['Name']
    has_underscore = names.str.contains('_')
    after_first_underscore = names.str.split('_', n=1).str[1]
    return after_first_underscore.where(has_underscore, names)


# Suffixes make names comparable across snapshots regardless of the leading topic id.
orig_suffixes = _name_suffixes(original_topic_info)
after_suffixes = _name_suffixes(after_delete_one_topic_info)

# Collapse to unique values
orig_set = set(orig_suffixes)
after_set = set(after_suffixes)

# Suffixes present before the deletion but absent afterwards
missing = orig_set.difference(after_set)

# Report
if not missing:
    print("All Name suffixes are preserved")
else:
    print("Missing Name suffixes after delete/merge:")
    print("\n".join(sorted(missing)))

print("\nSummary:")
print(f"Original unique suffix count: {len(orig_set)}")
print(f"After delete/merge unique suffix count: {len(after_set)}")

Missing Name suffixes after delete/merge:
modulation_radio_channel_transmitters

Summary:
Original unique suffix count: 375
After delete/merge unique suffix count: 374


In [86]:
# Five largest topics in the pre-deletion snapshot, for side-by-side comparison.
original_topic_info.sort_values(by='Count', ascending=False).head(5)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7624,-1_the_of_to_and,"[the, of, to, and, in, we, that, is, for, lear...",[ A crucial task in system identification pro...
264,263,510,263_generative_gan_gans_generator,"[generative, gan, gans, generator, adversarial...",[ Generative Adversarial Networks (GANs) are ...
324,323,403,323_quantization_hardware_pruning_gpu,"[quantization, hardware, pruning, gpu, precisi...",[ Deep neural networks (DNNs) are used by dif...
99,98,269,98_recommendation_user_items_item,"[recommendation, user, items, item, recommende...",[ Matrix factorization techniques have been w...
29,28,196,28_privacy_private_differential_differentially,"[privacy, private, differential, differentiall...",[ The process of data mining with differentia...


In [85]:
# Five largest topics in the post-deletion snapshot.
after_delete_one_topic_info.sort_values(by='Count', ascending=False).head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7648,-1_the_of_to_and,"[the, of, to, and, in, we, that, is, for, lear...",[ A crucial task in system identification pro...
263,263,510,263_generative_gan_gans_generator,"[generative, gan, gans, generator, adversarial...",[ Generative Adversarial Networks (GANs) are ...
323,323,403,323_quantization_hardware_pruning_gpu,"[quantization, hardware, pruning, gpu, precisi...",[ Deep neural networks (DNNs) are used by dif...
98,98,269,98_recommendation_user_items_item,"[recommendation, user, items, item, recommende...",[ Matrix factorization techniques have been w...
28,28,196,28_privacy_private_differential_differentially,"[privacy, private, differential, differentiall...",[ The process of data mining with differentia...
