First, install BERTopic with `pip install bertopic`.

In [6]:
import os.path as osp
import pandas as pd
import joblib

from util import stopwords, viet_tokenize, draw_wordcloud, draw_silhouette

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Read data

In [7]:
# Both input tables live in ./temp and are read as UTF-8 encoded CSV files.
temp_dir = osp.join('.', 'temp')
data, features = (
    pd.read_csv(osp.join(temp_dir, csv_name), encoding='utf-8')
    for csv_name in ('data.csv', 'features.csv')
)

BERTopic requires its input as a list of `str`, so convert the `article` column accordingly.

In [8]:
# One list entry per row of the `article` column, in row order.
docs = data['article'].tolist()

# Build & Train

The BERTopic pipeline is as follows:
1. Embed documents: vectorise documents using TfidfVectorizer
2. Dimensionality reduction: reduce dimension using PCA
3. Cluster documents: clustering data to find topics using kmeans
4. Tokenise topics: vectorise topics using tfidf
5. Topic representation: create topic representation using c-TF-IDF (class-based TF-IDF)
6. Fine-tune topic representation: fine-tune to get the best representation of topics; KeyBERTInspired is the model used here

In [None]:
# TF-IDF settings shared by the document vectoriser and the topic vectoriser.
tfidf_kwargs = dict(
    tokenizer=viet_tokenize,
    stop_words=stopwords,
    ngram_range=(1, 3),
    min_df=.05,
    max_df=.7,
)

# NOTE(review): BERTopic's documented scikit-learn embedding backend expects an
# sklearn Pipeline; a bare TfidfVectorizer may not be picked up as the embedder.
# Confirm which embedding backend is actually used at fit time.
embedding_model = TfidfVectorizer(**tfidf_kwargs)
pca = PCA(n_components=300)
cluster_model = KMeans(n_clusters=6, random_state=0, max_iter=3000)
vectorizer_model = TfidfVectorizer(**tfidf_kwargs)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=False)
representation_model = KeyBERTInspired()

# PCA and KMeans are handed to BERTopic through its `umap_model` and
# `hdbscan_model` slots, replacing the dimensionality-reduction and clustering steps.
# NOTE(review): `n_gram_range` and `language` are presumably ignored once custom
# vectorizer/embedding models are supplied - verify against the BERTopic docs.
topic_model = BERTopic(
  language='multilingual',
  n_gram_range=(1, 3),
  embedding_model=embedding_model,
  umap_model=pca,
  hdbscan_model=cluster_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  representation_model=representation_model,
)

In [10]:
# Fit the topic model on the documents. `topics` is reused below as the
# per-document cluster label; `probs` is the second value returned by fit_transform.
topics, probs = topic_model.fit_transform(docs)
# Last expression of the cell, so the topic summary is displayed as the cell output.
topic_model.get_topic_info()

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


# Visualisation

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Single-column frame holding the topic assigned to each document, in input order.
clustered_docs = pd.DataFrame({'cluster': topics})

In [None]:
# Silhouette plot of the BERTopic cluster labels over the pre-computed features.
# NOTE(review): `features` comes from features.csv, which is presumably not the
# embedding space the model clustered in - confirm the comparison is intended.
cluster_labels = clustered_docs['cluster']
silhouette_ax = plt.gca()
draw_silhouette(silhouette_ax, features, cluster_labels, 'BERTopic')

In [None]:
# Draw one word cloud per cluster. The cluster ids are taken from the labels
# actually produced by the model instead of being hard-coded as 0..5, so this
# cell stays correct if the number of clusters changes.
for cluster_id in sorted(clustered_docs['cluster'].unique()):
    # A fresh figure per cluster keeps the clouds from being drawn over each other.
    fig, ax = plt.subplots()
    cluster_articles = data.loc[clustered_docs['cluster'] == cluster_id, 'article']
    draw_wordcloud(cluster_articles, ax, background_color='white')
    ax.set_title(f'Cluster {cluster_id}')
    plt.show()

# Save

In [None]:
from pathlib import Path

# Results are written to ../Output/BERTopic, with the model under its `model` sub-directory.
output_dir = osp.join('..', 'Output', 'BERTopic')
model_dir = osp.join(output_dir, 'model')

# parents=True creates ../Output when it is missing; exist_ok=True makes the
# cell safe to re-run and still creates `model` when only output_dir exists.
Path(model_dir).mkdir(parents=True, exist_ok=True)

# One row per document: features, assigned cluster and the article link.
pd.concat([features, clustered_docs, data['link']], axis=1)\
  .to_csv(osp.join(output_dir, 'clustered.csv'), index=False)

# NOTE(review): `save_embedding_model` is documented as a bool or a string
# pointing to a model when using safetensors; passing the vectoriser object is
# presumably ignored - confirm against the BERTopic.save documentation.
topic_model.save(model_dir, serialization="safetensors", save_ctfidf=True,
                 save_embedding_model=embedding_model)