In [1]:
import numpy as np
import pandas as pd
from bertopic import BERTopic
from hdbscan import HDBSCAN
from scipy.cluster import hierarchy
from scipy.cluster import hierarchy as sch
from scipy.spatial.distance import squareform
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP

# Shared corpus for the 20 Newsgroups sections below: the 'all' subset with
# headers, footers and quoted text removed from every post.
newsgroups = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)
docs = newsgroups["data"]

# 923 - 1: 20 Newsgroups, precomputed all-MiniLM-L12-v2 embeddings

In [2]:


# Candidate sentence-transformers checkpoints; only 'medium' is used below.
models = {'small': 'all-MiniLM-L6-v2',
          'medium': 'all-MiniLM-L12-v2',
          'large': 'all-mpnet-base-v2'}

# Encode the corpus once and hand the vectors to BERTopic explicitly, so
# fitting does not have to embed the documents again.
sentence_model = SentenceTransformer(models['medium'])
embeddings = sentence_model.encode(docs, show_progress_bar=True)

topic_model = BERTopic(embedding_model=sentence_model)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Recompute the topic representations with English stop words removed and
# n-grams of up to five tokens.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(docs, vectorizer_model=vectorizer_model)

# Build the topic hierarchy and draw it as a dendrogram with the leaves along
# the x-axis (orientation='bottom'); the next section reads the tick labels
# back from `fig`.
hierarchical_topics = topic_model.hierarchical_topics(docs)
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics,
                                      orientation='bottom',
                                      custom_labels=False)
fig.show()

Batches:   0%|          | 0/589 [00:00<?, ?it/s]

100%|██████████| 234/234 [00:12<00:00, 18.93it/s]


## SciPy Hierarchy

In [4]:
# Rebuild the dendrogram leaf order directly with SciPy and check that it
# matches the tick labels of the BERTopic figure (`fig`) drawn above.

# All topics except the outlier topic (-1), in ascending id order.
topic_freq = topic_model.get_topic_freq()
has_outlier_topic = bool((topic_freq.Topic == -1).any())
freq_df = topic_freq.loc[topic_freq.Topic != -1, :]
topic_ids = sorted(freq_df.Topic.to_list())

# Labels as in _hierarchy: "<id>_<word1>_<word2>_<word3>", cut to 27
# characters plus "..." once they reach 30 characters.
new_labels = []
for topic_id in topic_ids:
    top_words = [entry[0] for entry in topic_model.get_topic(topic_id)[:3]]
    label = "_".join([str(topic_id)] + top_words)
    new_labels.append(label if len(label) < 30 else label[:27] + "...")

# Distance and linkage as in _hierarchy: cosine distance + Ward linkage.
distance_function = lambda X: 1 - cosine_similarity(X)
linkage_function = lambda X: hierarchy.linkage(
    X, 'ward', optimal_ordering=True)

# Topic-term matrix without the outlier row. Only skip row 0 when an outlier
# topic actually exists; otherwise row 0 is a real topic and must be kept.
# Assumes the rows of c_tf_idf_ are ordered by ascending topic id.
first_topic_row = 1 if has_outlier_topic else 0
topic_embeddings = topic_model.c_tf_idf_[first_topic_row:]
assert topic_embeddings.shape[0] == len(new_labels), (
    "c-TF-IDF rows and topic labels are out of sync"
)

# scipy linkage on the condensed distance matrix
distances = distance_function(topic_embeddings)
np.fill_diagonal(distances, 0)  # remove float noise on the diagonal before condensing
Z = linkage_function(squareform(distances))
leaves = hierarchy.leaves_list(Z)
scipy_labels = [new_labels[i] for i in leaves]

# The figure was drawn with orientation='bottom', so the leaves are on the x-axis.
bertopic_labels = list(fig.layout["xaxis"]["ticktext"])
assert scipy_labels == bertopic_labels, (
    "SciPy leaf order differs from the BERTopic dendrogram tick labels"
)
print("Labels are the same!")

Labels are the same!


# 923 - 2: Corona tweets, custom UMAP + HDBSCAN

In [5]:


# Minimum cluster size handed to HDBSCAN (and mirrored in min_topic_size).
MIN_CLUSTER_SIZE = 341

data = pd.read_csv('Corona_NLP_test.csv')
# Use a plain list of strings for every text-consuming call below instead of
# a pandas Series, so positional access never depends on the frame's index.
tweets = data.OriginalTweet.tolist()

sentence_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
sentence_embeddings = sentence_model.encode(tweets, show_progress_bar=False)

# Reduce the sentence embeddings to 5 dimensions before fitting.
# NOTE(review): BERTopic is created without a custom umap_model, so these
# already-reduced vectors are presumably reduced a second time inside
# fit_transform - confirm this is intended.
embeddings = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric='cosine', random_state=42).fit_transform(sentence_embeddings)
hdbscan_model = HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, min_samples=3)

# NOTE(review): min_topic_size is likely ignored when an explicit
# hdbscan_model is supplied - verify against the BERTopic docs.
topic_model = BERTopic(hdbscan_model=hdbscan_model, n_gram_range=(1, 3),
                       min_topic_size=MIN_CLUSTER_SIZE)
topics, probs = topic_model.fit_transform(tweets, embeddings)

# Default orientation is overridden to 'left' explicitly, so the leaf labels
# are read from the y-axis in the next section.
hierarchical_topics = topic_model.hierarchical_topics(tweets)
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics,
                                      orientation='left', color_threshold=0)
fig.show()

100%|██████████| 1/1 [00:00<00:00, 92.24it/s]


## SciPy Hierarchy

In [6]:
# Rebuild the dendrogram leaf order directly with SciPy and check that it
# matches the tick labels of the BERTopic figure (`fig`) drawn above.

# All topics except the outlier topic (-1), in ascending id order.
topic_freq = topic_model.get_topic_freq()
has_outlier_topic = bool((topic_freq.Topic == -1).any())
freq_df = topic_freq.loc[topic_freq.Topic != -1, :]
topic_ids = sorted(freq_df.Topic.to_list())

# Labels as in _hierarchy: "<id>_<word1>_<word2>_<word3>", cut to 27
# characters plus "..." once they reach 30 characters.
new_labels = []
for topic_id in topic_ids:
    top_words = [entry[0] for entry in topic_model.get_topic(topic_id)[:3]]
    label = "_".join([str(topic_id)] + top_words)
    new_labels.append(label if len(label) < 30 else label[:27] + "...")

# Distance and linkage as in _hierarchy: cosine distance + Ward linkage.
distance_function = lambda X: 1 - cosine_similarity(X)
linkage_function = lambda X: hierarchy.linkage(
    X, 'ward', optimal_ordering=True)

# Topic-term matrix without the outlier row. With a large min_cluster_size
# there may be very few topics, so only skip row 0 when an outlier topic
# actually exists; otherwise row 0 is a real topic and must be kept.
# Assumes the rows of c_tf_idf_ are ordered by ascending topic id.
first_topic_row = 1 if has_outlier_topic else 0
topic_embeddings = topic_model.c_tf_idf_[first_topic_row:]
assert topic_embeddings.shape[0] == len(new_labels), (
    "c-TF-IDF rows and topic labels are out of sync"
)

# scipy linkage on the condensed distance matrix
distances = distance_function(topic_embeddings)
np.fill_diagonal(distances, 0)  # remove float noise on the diagonal before condensing
Z = linkage_function(squareform(distances))
leaves = hierarchy.leaves_list(Z)
scipy_labels = [new_labels[i] for i in leaves]

# The figure was drawn with orientation='left', so the leaves are on the y-axis.
bertopic_labels = list(fig.layout["yaxis"]["ticktext"])
assert scipy_labels == bertopic_labels, (
    "SciPy leaf order differs from the BERTopic dendrogram tick labels"
)
print("Labels are the same!")

Labels are the same!


# 1063: 20 Newsgroups, precomputed all-MiniLM-L6-v2 embeddings

In [7]:
# Fit BERTopic on the 20 Newsgroups documents using embeddings that are
# computed up front with the same model that is handed to BERTopic.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model=embedding_model)

embeddings = embedding_model.encode(docs, show_progress_bar=True)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Hierarchy plus dendrogram with the plotting defaults; the next section
# reads the leaf labels back from `fig`.
hierarchical_topics = topic_model.hierarchical_topics(docs)
fig = topic_model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics,
)
fig.show()

Batches:   0%|          | 0/589 [00:00<?, ?it/s]

100%|██████████| 214/214 [00:00<00:00, 245.06it/s]


## SciPy Hierarchy

In [8]:
# Rebuild the dendrogram leaf order directly with SciPy and check that it
# matches the tick labels of the BERTopic figure (`fig`) drawn above.

# All topics except the outlier topic (-1), in ascending id order.
topic_freq = topic_model.get_topic_freq()
has_outlier_topic = bool((topic_freq.Topic == -1).any())
freq_df = topic_freq.loc[topic_freq.Topic != -1, :]
topic_ids = sorted(freq_df.Topic.to_list())

# Labels as in _hierarchy: "<id>_<word1>_<word2>_<word3>", cut to 27
# characters plus "..." once they reach 30 characters.
new_labels = []
for topic_id in topic_ids:
    top_words = [entry[0] for entry in topic_model.get_topic(topic_id)[:3]]
    label = "_".join([str(topic_id)] + top_words)
    new_labels.append(label if len(label) < 30 else label[:27] + "...")

# Distance and linkage as in _hierarchy: cosine distance + Ward linkage.
distance_function = lambda X: 1 - cosine_similarity(X)
linkage_function = lambda X: hierarchy.linkage(
    X, 'ward', optimal_ordering=True)

# Topic-term matrix without the outlier row. Only skip row 0 when an outlier
# topic actually exists; otherwise row 0 is a real topic and must be kept.
# Assumes the rows of c_tf_idf_ are ordered by ascending topic id.
first_topic_row = 1 if has_outlier_topic else 0
topic_embeddings = topic_model.c_tf_idf_[first_topic_row:]
assert topic_embeddings.shape[0] == len(new_labels), (
    "c-TF-IDF rows and topic labels are out of sync"
)

# scipy linkage on the condensed distance matrix
distances = distance_function(topic_embeddings)
np.fill_diagonal(distances, 0)  # remove float noise on the diagonal before condensing
Z = linkage_function(squareform(distances))
leaves = hierarchy.leaves_list(Z)
scipy_labels = [new_labels[i] for i in leaves]

# The leaf labels are read from the y-axis of the figure.
bertopic_labels = list(fig.layout["yaxis"]["ticktext"])
assert scipy_labels == bertopic_labels, (
    "SciPy leaf order differs from the BERTopic dendrogram tick labels"
)
print("Labels are the same!")

Labels are the same!


# 1021: default BERTopic, single-linkage topic hierarchy

In [9]:

# Fit BERTopic with its defaults (verbose logging on), letting it embed the
# documents itself.
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(docs)


def linkage_function(x):
    """Single-linkage clustering of `x` with optimal leaf ordering."""
    return sch.linkage(x, 'single', optimal_ordering=True)


hierarchical_topics = topic_model.hierarchical_topics(
    docs,
    linkage_function=linkage_function,
)

# NOTE(review): the custom linkage is only passed to hierarchical_topics, not
# to visualize_hierarchy, so the dendrogram presumably falls back to its own
# default linkage - confirm whether both calls should share the function.
fig = topic_model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics,
)
fig.show()

Batches:   0%|          | 0/589 [00:00<?, ?it/s]

2023-04-08 18:27:20,197 - BERTopic - Transformed documents to Embeddings
2023-04-08 18:27:28,017 - BERTopic - Reduced dimensionality
2023-04-08 18:27:29,188 - BERTopic - Clustered reduced embeddings
100%|██████████| 213/213 [00:02<00:00, 94.22it/s] 


## SciPy Hierarchy

In [10]:
# Rebuild the dendrogram leaf order directly with SciPy and check that it
# matches the tick labels of the BERTopic figure (`fig`) drawn above.

# All topics except the outlier topic (-1), in ascending id order.
topic_freq = topic_model.get_topic_freq()
has_outlier_topic = bool((topic_freq.Topic == -1).any())
freq_df = topic_freq.loc[topic_freq.Topic != -1, :]
topic_ids = sorted(freq_df.Topic.to_list())

# Labels as in _hierarchy: "<id>_<word1>_<word2>_<word3>", cut to 27
# characters plus "..." once they reach 30 characters.
new_labels = []
for topic_id in topic_ids:
    top_words = [entry[0] for entry in topic_model.get_topic(topic_id)[:3]]
    label = "_".join([str(topic_id)] + top_words)
    new_labels.append(label if len(label) < 30 else label[:27] + "...")

# Cosine distance + Ward linkage.
# NOTE(review): the preceding cell built hierarchical_topics with 'single'
# linkage, yet this check uses Ward; that it passes suggests the figure is
# drawn with its own default linkage - confirm this is the intended check.
distance_function = lambda X: 1 - cosine_similarity(X)
linkage_function = lambda X: hierarchy.linkage(
    X, 'ward', optimal_ordering=True)

# Topic-term matrix without the outlier row. Only skip row 0 when an outlier
# topic actually exists; otherwise row 0 is a real topic and must be kept.
# Assumes the rows of c_tf_idf_ are ordered by ascending topic id.
first_topic_row = 1 if has_outlier_topic else 0
topic_embeddings = topic_model.c_tf_idf_[first_topic_row:]
assert topic_embeddings.shape[0] == len(new_labels), (
    "c-TF-IDF rows and topic labels are out of sync"
)

# scipy linkage on the condensed distance matrix
distances = distance_function(topic_embeddings)
np.fill_diagonal(distances, 0)  # remove float noise on the diagonal before condensing
Z = linkage_function(squareform(distances))
leaves = hierarchy.leaves_list(Z)
scipy_labels = [new_labels[i] for i in leaves]

# The leaf labels are read from the y-axis of the figure.
bertopic_labels = list(fig.layout["yaxis"]["ticktext"])
assert scipy_labels == bertopic_labels, (
    "SciPy leaf order differs from the BERTopic dendrogram tick labels"
)
print("Labels are the same!")

Labels are the same!
