First, install BERTopic with `pip install bertopic`.

In [6]:
import os.path as osp
import pandas as pd

from util import stopwords, viet_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

Read data

In [7]:
# Directory that holds the CSV inputs read by this notebook.
temp_dir = osp.join('.', 'temp')

# Build both file paths first, then load each CSV as UTF-8.
data_path = osp.join(temp_dir, 'data.csv')
features_path = osp.join(temp_dir, 'features.csv')

data = pd.read_csv(data_path, encoding='utf-8')
features = pd.read_csv(features_path, encoding='utf-8')

The BERTopic pipeline is as follows:
1. Embed documents: vectorise the documents; sentence-transformers is the default
2. Dimensionality reduction: reduce the high-dimensional embeddings to lower the computational cost; UMAP is the default technique in BERTopic
3. Cluster documents: cluster the reduced data to find topics using HDBSCAN, a density-based clustering technique
4. Tokenise topics: vectorise topics; c-TF-IDF (class-based TF-IDF) is the default
5. Topic representation: create the topic representation
6. Fine-tune topic representation: fine-tune to get the best representation of topics; KeyBERTInspired is the model used in this notebook

Convert the articles to the input format BERTopic requires: a list of `str`.

In [8]:
# Materialise the 'article' column as a plain Python list for BERTopic.
docs = data['article'].tolist()

# Build & Train

In [9]:
# Seed for UMAP so that repeated runs yield the same topics.
SEED = 42

# Embed documents
# BERTopic accepts a scikit-learn model as an embedding backend only when it is a
# Pipeline (e.g. make_pipeline(TfidfVectorizer(...), TruncatedSVD(...))); a bare
# TfidfVectorizer is not recognised and gets replaced by a default
# sentence-transformers model. `language` is likewise not used once an embedding
# model is supplied, so the multilingual model is named explicitly instead.
embedding_model = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Dimensionality reduction
# random_state makes UMAP deterministic (at the cost of its parallelism).
umap_model = UMAP(n_neighbors=15, n_components=3, min_dist=0.0, metric='cosine', random_state=SEED)

# Perform clustering
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Tokenize topics
# NOTE(review): this uses CountVectorizer's default tokenizer; if `stopwords`
# assumes `viet_tokenize` output, pass tokenizer=viet_tokenize here - confirm.
vectorizer_model = CountVectorizer(stop_words=stopwords)

# Create topic representation
ctfidf_model = ClassTfidfTransformer()

# (Optional) Fine-tune topic representations with KeyBERTInspired
representation_model = KeyBERTInspired()

topic_model = BERTopic(
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  representation_model=representation_model,
)


: 

In [10]:
# Fit the topic model on the corpus and keep both values it returns.
topics, probs = topic_model.fit_transform(documents=docs)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:
# Fetch the topic overview; the trailing expression displays it in the cell output.
topic_info = topic_model.get_topic_info()
topic_info

# Visualisation

In [None]:
# Build the topic visualisation; the trailing expression renders it in the cell output.
topics_figure = topic_model.visualize_topics()
topics_figure

# Save

In [None]:
# Persist the fitted model to `temp_dir` as safetensors, including the c-TF-IDF data.
# `save_embedding_model` is documented as a bool or a Hugging Face model-name
# string, so pass True rather than a model object.
topic_model.save(temp_dir, serialization="safetensors", save_ctfidf=True, save_embedding_model=True)