In [5]:
%pip install bertopic nltk rdflib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("memento_data/html_results.csv")

In [7]:
df.head()

Unnamed: 0,source,channel_title,channel_description,title,description,link,pubDate,author,html
0,https://machinelearningmastery.com/feed,MachineLearningMastery.com,Making developers awesome at machine learning,3 Ways Vibe Coding and AI-Assisted Development...,Vibe coding and AI-assisted development are tw...,https://machinelearningmastery.com/3-ways-vibe...,"Mon, 31 Mar 2025 11:00:41 +0000",Iván Palomares Carrascosa,
1,https://machinelearningmastery.com/feed,MachineLearningMastery.com,Making developers awesome at machine learning,Advanced Q&A Features with DistilBERT,This post is divided into three parts; they ar...,https://machinelearningmastery.com/advanced-qa...,"Sat, 29 Mar 2025 18:33:57 +0000",Muhammad Asad Iqbal Khan,
2,https://machinelearningmastery.com/feed,MachineLearningMastery.com,Making developers awesome at machine learning,A Gentle Introduction to Attention and Transfo...,This post is divided into three parts; they ar...,https://machinelearningmastery.com/a-gentle-in...,"Fri, 28 Mar 2025 14:38:37 +0000",Adrian Tam,
3,https://machinelearningmastery.com/feed,MachineLearningMastery.com,Making developers awesome at machine learning,Building a Recommender System From Scratch wit...,"In this article, we will build step by step a ...",https://machinelearningmastery.com/building-a-...,"Fri, 28 Mar 2025 12:00:08 +0000",Iván Palomares Carrascosa,
4,https://machinelearningmastery.com/feed,MachineLearningMastery.com,Making developers awesome at machine learning,The Beginner’s Guide to Machine Learning with ...,Machine learning has become an essential tool ...,https://machinelearningmastery.com/the-beginne...,"Wed, 26 Mar 2025 16:20:41 +0000",Jayita Gulati,


In [8]:
# Loading premade topic for guided topic modeling
from rdflib import Graph

files = ["CSO.3.4.1.owl", "NLP-Taxonomy.owl"]
g = Graph()
for file in files:
    g.parse(f"topic_data/{file}", format="xml")

topics = []
for s, p, o in g:
    if p.endswith("label"):
        topics.append(str(o))

In [9]:
docs = (df["title"].fillna("") + " " + df["description"].fillna("") + df["html"].fillna("")).tolist()

In [10]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

topic_model = BERTopic(
    calculate_probabilities=True,
    verbose=True,
    zeroshot_topic_list=topics,
    min_topic_size=10,
    zeroshot_min_similarity=0.85,
    representation_model=KeyBERTInspired(),
)
topics, probs = topic_model.fit_transform(docs)

2025-05-02 22:12:57,171 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/812 [00:00<?, ?it/s]

2025-05-02 22:21:47,672 - BERTopic - Embedding - Completed ✓
2025-05-02 22:21:47,673 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-02 22:22:18,769 - BERTopic - Dimensionality - Completed ✓
2025-05-02 22:22:18,771 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2025-05-02 22:22:30,723 - BERTopic - Zeroshot Step 1 - Completed ✓
2025-05-02 22:22:47,347 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-02 22:23:54,847 - BERTopic - Cluster - Completed ✓
2025-05-02 22:23:54,847 - BERTopic - Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...
2025-05-02 22:23:54,891 - BERTopic - Zeroshot Step 2 - Completed ✓
2025-05-02 22:23:54,893 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-02 22:24:49,315 - BERTopic - Representation - Completed ✓


In [11]:
# multilabel_topics = []
# for prob_dist in probs:
#     multilabel = [i for i, prob in enumerate(prob_dist) if prob > 0.3]
#     multilabel_topics.append(multilabel)
multilabel_topics_top = [list(np.argsort(prob)[-5:][::-1]) for prob in probs]

In [12]:
from sklearn.cluster import DBSCAN
import numpy as np
multilabel_topics = []

for prob in probs:
    # Reshape to 2D for clustering
    prob_2d = np.array(prob).reshape(-1, 1)

    # Cluster the probabilities
    clustering = DBSCAN(eps=0.05, min_samples=1).fit(prob_2d)

    # Find the cluster with the highest mean probability
    labels = clustering.labels_
    cluster_means = {label: prob_2d[labels == label].mean() for label in set(labels)}
    top_cluster_label = max(cluster_means, key=cluster_means.get)

    # Get topic indices in that top cluster
    selected_topics = [
        i for i, label in enumerate(labels) if label == top_cluster_label
    ]
    
    # If selected_topics is greater than 5, keep only the top 5
    if len(selected_topics) > 5:
        selected_topics = sorted(selected_topics, key=lambda x, p=prob: p[x], reverse=True)[:3]
    multilabel_topics.append(selected_topics)

In [13]:
topic_info = topic_model.get_topic_info()
topic_info.to_csv("memento_data/bertopic_topics.csv", index=False)

In [14]:
topic_model.visualize_topics().show()

In [15]:
topic_info = topic_model.get_topic_info()
id2label = dict(zip(topic_info.Topic, topic_info.Name))

In [16]:
topics = [id2label[t] for t in topics]

In [17]:
multilabel_topics = [
    [id2label[t] for t in multilabel if t in id2label]
    for multilabel in multilabel_topics
]

In [18]:
multilabel_topics_top = [
    [id2label[t] for t in multilabel if t in id2label]
    for multilabel in multilabel_topics_top
]

In [19]:
print("Number of topics:", len(set(topics)))
print(topics[:10])
print(multilabel_topics[:10]) 
print(multilabel_topics_top[:10])

Number of topics: 247
['26_ai_chatbots_intelligence_machines', '243_tutoring_answering_tutors_grading', '234_attention_transformers_softmax_transformer', '49_recommender_factorization_recommendation_recommendations', '9_learning_courses_ai_teaching', '99_graphbased_networks_graphs_graphstructured', '-1_learning_optimal_models_stochastic', '239_translations_multilingual_bilingual_translation', '243_tutoring_answering_tutors_grading', '-1_learning_optimal_models_stochastic']
[['27_adversarial_adversarially_adversary_adversaries'], ['244_dropout_underfitting_overfitting_regularization'], ['235_memory_attention_memoryefficient_decoding'], ['50_ranking_rankings_ranked_rank'], ['10_learning_theory_algorithmic_papers'], ['100_reinforcement_policies_learning_policy'], ['16_microsoft_technologies_researchers_technology', '243_tutoring_answering_tutors_grading', '244_dropout_underfitting_overfitting_regularization'], ['240_adaptivemcmc_mcmc_hamiltonian_monte'], ['219_robust_robustly_outliers_est

In [20]:
df.loc[: len(multilabel_topics) - 1, "assigned_topic_name"] = pd.Series(
    multilabel_topics
)
df["assigned_topic_name"] = df["assigned_topic_name"].apply(
    lambda x: ", ".join(x) if isinstance(x, list) else ""
)

In [21]:
import pandas as pd
import os
output_dir = "memento_data"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"topic_results.csv")
df.to_csv(output_file, index=False)
print(f"\nSaved unique items to {output_file}")


Saved unique items to memento_data\topic_results.csv
