# BERTopic

In [41]:
# imports and preliminaries

import pandas as pd
from bertopic import BERTopic
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below (stopword lists and tokenizer data).
# NLTK reports "already up-to-date" when they are present locally.
nltk.download("stopwords")
nltk.download('punkt_tab')

path_to_data = "./text_chunks.tsv"

# NOTE: this rebinds `stopwords` from the imported NLTK corpus reader to a plain
# list of words. The name is kept so any later cell that reads the list still works.
stopwords = stopwords.words("english")
stopwords.extend(["the", "in", "of"])

# Set copy of the list so the per-token membership test below is O(1)
# instead of a linear scan of the list for every token.
stopword_set = set(stopwords)


def remove_stopwords(text):
    """Tokenise ``text`` with NLTK and drop stopword tokens.

    Args:
        text: Raw text of one chunk.

    Returns:
        The remaining tokens joined by single spaces. The comparison against
        the stopword set is case-insensitive; kept tokens retain their
        original casing. Punctuation tokens are not removed.
    """
    return " ".join(
        token for token in word_tokenize(text) if token.lower() not in stopword_set
    )


# Open training data
training_data = pd.read_csv(path_to_data, sep="\t", header=0)

# Tokenise text and remove stopwords.
# read_csv parses empty cells as NaN, which word_tokenize cannot handle;
# substitute "" so every row stays in the frame, aligned with its IDNO.
training_data["TEXT"] = (
    training_data["TEXT"].fillna("").astype(str).map(remove_stopwords)
)

training_data.head(20)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/joshua/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,FILE,IDNO,TEXT
0,2-12-delius,doc_0_0_0,DELIUS
1,2-12-delius,doc_0_1_0,purpose present essay biographical . volume De...
2,2-12-delius,doc_0_1_1,"one opinion ; moreover , lack perception excus..."
3,2-12-delius,doc_0_2_0,"position Delius , viewed historically , partic..."
4,2-12-delius,doc_0_2_1,unmistakable Brigg Fair smaller idylls . part ...
5,2-12-delius,doc_0_3_0,"question immediate derivation , although signi..."
6,2-12-delius,doc_0_4_0,"One must admit , however , development indepen..."
7,2-12-delius,doc_0_5_0,"clear grasp Delius ’ music entirety , question..."
8,2-12-delius,doc_0_6_0,distinction encourages critic claim Delius pos...
9,2-12-delius,doc_0_7_0,intensely chromatic character Delius ’ music f...


In [42]:
# Train BERTopic
RANDOM_SEED = 42

topic_model = BERTopic(verbose=True)

# The default dimensionality-reduction step is stochastic, so topics change
# between runs unless its seed is fixed. Set it on the reducer BERTopic
# created for us; this needs no extra import.
# Note: a fixed random_state may make this step slower (UMAP documents that
# it gives up parallelism for reproducibility).
topic_model.umap_model.set_params(random_state=RANDOM_SEED)

# BERTopic documents its input as a list of strings, so hand it a plain list
# rather than the pandas Series.
topics, probs = topic_model.fit_transform(training_data["TEXT"].tolist())

# Save model
topic_model.save("./output/bertopic_model", serialization="safetensors", save_ctfidf=True)

2024-11-15 14:46:33,904 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/98 [00:00<?, ?it/s]

2024-11-15 14:46:58,109 - BERTopic - Embedding - Completed ✓
2024-11-15 14:46:58,110 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-15 14:47:08,601 - BERTopic - Dimensionality - Completed ✓
2024-11-15 14:47:08,603 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-15 14:47:08,673 - BERTopic - Cluster - Completed ✓
2024-11-15 14:47:08,681 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-15 14:47:08,791 - BERTopic - Representation - Completed ✓


In [43]:
# Show the per-topic summary table for the fitted model: one row per topic,
# with the Topic / Count / Name / Representation / Representative_Docs columns
# seen in the output below.
# NOTE(review): per the BERTopic docs, Topic -1 is the outlier bucket (documents
# not assigned to any cluster) — confirm before interpreting its keywords.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1057,-1_one_poetry_life_would,"[one, poetry, life, would, verse, may, poets, ...","[surprise Poe considered highest importance , ..."
1,0,285,0_art_aesthetic_artist_form,"[art, aesthetic, artist, form, one, works, emo...",[simple case presents easy form problems confr...
2,1,142,1_light_sleep_thou_night,"[light, sleep, thou, night, flowers, thee, lik...","[Since winter approaching , Henry James begun ..."
3,2,140,2_women_men_woman_life,"[women, men, woman, life, human, individual, c...",[already admitted difference men women relativ...
4,3,125,3_enemy_men_yards_front,"[enemy, men, yards, front, line, fire, battali...",[morning thickly misty . artillery fire desult...
5,4,81,4_delius_music_composer_wagner,"[delius, music, composer, wagner, bach, presen...","[DELIUS, position Delius , viewed historically..."
6,5,55,5_composition_everything_different_naturally,"[composition, everything, different, naturally...",[understood time everything except composition...
7,6,50,6_etheredge_hobart_temple_wilmot,"[etheredge, hobart, temple, wilmot, sarah, roc...","[Etheredge . compliment , Mr FitzJames , upon ..."
8,7,50,7_verlaine_life_wanted_never,"[verlaine, life, wanted, never, could, remaine...","[R. good ... tried conquer running away , got ..."
9,8,50,8_fitzjames_george_sir_father,"[fitzjames, george, sir, father, wolf, would, ...","[FitzJames . violent humour , Sir George ?, Fi..."
