# BERTopic

# imports and preliminaries

In [81]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("stopwords")
nltk.download('punkt_tab')

path_to_data = "./text_chunks.tsv"
stopwords = stopwords.words("english")
stopwords.extend(["the", "in", "of"])

#print(stopwords)

# Open training data
training_data = pd.read_csv(path_to_data, sep="\t", header=0)

# Tokenise text and remove stopwords
training_data["TEXT"] = training_data["TEXT"].apply(lambda x: word_tokenize(x))
training_data["TEXT"] = training_data["TEXT"].apply(lambda x: ' '.join([word for word in x if word.lower() not in stopwords]))

#training_data.head(20)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## So, is there an ERNIETopic?

In [82]:
# prepare embeddings
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(training_data["TEXT"], show_progress_bar=True)
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)


# Train BERTopic
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(training_data["TEXT"])
hierarchical_topics = topic_model.hierarchical_topics(training_data["TEXT"])

# Save model
topic_model.save("./output/bertopic_model", serialization="safetensors", save_ctfidf=True)

Batches:   0%|          | 0/98 [00:00<?, ?it/s]

2024-11-15 15:45:06,813 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/98 [00:00<?, ?it/s]

2024-11-15 15:45:22,606 - BERTopic - Embedding - Completed ✓
2024-11-15 15:45:22,610 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-15 15:45:29,366 - BERTopic - Dimensionality - Completed ✓
2024-11-15 15:45:29,372 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-15 15:45:29,469 - BERTopic - Cluster - Completed ✓
2024-11-15 15:45:29,524 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-15 15:45:29,716 - BERTopic - Representation - Completed ✓
100%|██████████| 50/50 [00:00<00:00, 779.07it/s]


In [84]:
topic_model.get_document_info(training_data["TEXT"])

similar_topics, similarity = topic_model.find_topics("composition", top_n=5)
topic_model.get_topic(similar_topics[0])

[('composition', 0.19144879634682016),
 ('everything', 0.13094539581058962),
 ('naturally', 0.09163811222897467),
 ('different', 0.08567724234208268),
 ('beginning', 0.08510833571172136),
 ('time', 0.0774172724691031),
 ('present', 0.06974068245956487),
 ('thing', 0.06701065207716213),
 ('continuous', 0.06051138948966097),
 ('simply', 0.05867872056659128)]