Demo ipynb for BERTopic

Testing the pipeline for a single game

Ref

BERTopic tutorial

https://colab.research.google.com/drive/1FieRA9fLdkQEGDIMYl0I3MCjSUKVF8C-?usp=sharing#scrollTo=ScBUgXn06IK6


BERTopic Best Practices

https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=m3aN-f9B4rmU


BERTopic Big data (for improving the speed of the training pipeline, on GPU)

https://colab.research.google.com/drive/1W7aEdDPxC29jP99GGZphUlqjMFFVKtBC?usp=sharing#scrollTo=Ls2Q-iccGs7O


BERTopic Topic Modelling with Llama2

https://colab.research.google.com/drive/1QCERSMUjqGetGGujdrvv_6_EeoIcd_9M?usp=sharing#scrollTo=4Uj8MYhCafmX

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

import gensim

import nltk

import pyLDAvis

In [2]:
dataset_path = Path('../../dataset/topic_modelling/top_10_games/00_Terraria.pkl')

dataset = pd.read_pickle(dataset_path)

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 81776 entries, 63365 to 145140
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         81776 non-null  int64 
 1   app_id        81776 non-null  int64 
 2   app_name      81776 non-null  object
 3   review_text   81776 non-null  object
 4   review_score  81776 non-null  int64 
 5   review_votes  81776 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 4.4+ MB


In [3]:
%load_ext autoreload

In [4]:
# data preprocessing

import re

import sys
sys.path.append('../../sa')

%autoreload 2
import str_cleaning_functions


def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

def cleaning_strlist(str_list):
    str_list = list(map(lambda x: str_cleaning_functions.remove_links(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.remove_links2(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.clean(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.deEmojify(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
    return str_list

In [5]:
cleaning(dataset, 'review_text')

In [6]:
X = dataset['review_text'].values

Training

for small documents, simply run with the BERTopic encapsulated function and the training is all done.

for large documents, it's better to pre-calculate embeddings and prepare vocab b4 training to reduce memory usage.

In [7]:
# small documents

# from bertopic import BERTopic

# TOP_N_WORDS = 10                # number of words per topic
# N_GRAM_RANGE = (1, 2)           # n-gram

# topic_model = BERTopic(language="english", top_n_words=TOP_N_WORDS, calculate_probabilities=True, verbose=True)
# topics, probs = topic_model.fit_transform(X)

In [8]:
# large documents

# pre-calculate embeddings

from sentence_transformers import SentenceTransformer
import torch

# Create embeddings

SENTENCE_TRANSFORMERS_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

model = SentenceTransformer(SENTENCE_TRANSFORMERS_NAME, device='cuda' if torch.cuda.is_available() else 'cpu')
embeddings = model.encode(X, show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
Batches:   1%|▏         | 33/2556 [00:17<21:48,  1.93it/s] 


KeyboardInterrupt: 

In [9]:
# save the embeddings

embedding_path = Path('00_Terraria_embeddings.pkl')

if not embedding_path.exists():
    with open(embedding_path, 'wb') as f:
        np.save(f, embeddings)

# load the embeddings
if embedding_path.exists():
    with open(embedding_path, 'rb') as f:
        embeddings = np.load(f)

In [8]:
# prepare vocabulary before training such that tokenizer does not need to do the calculations itself

import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

# Extract vocab to be used in BERTopic
vocab = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for doc in tqdm(X):
  vocab.update(tokenizer(doc))
vocab = [word for word, frequency in vocab.items() if frequency >= 15]; len(vocab)    # set the minimum frequency to reduce the vocabulary size

100%|██████████| 81776/81776 [00:00<00:00, 103987.28it/s]


6891

In [None]:
# not using GPU acceleration as the dependency is fking messy
# and the model is deployed on a CPU only server

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

# parameter optimization
# UMAP
UMAP_N_COMPONENTS = 5
UMAP_N_NEIGHBORS = 50

# HDBSCAN
HDBSCAN_MIN_CLUSTER_SIZE = 150
HDBSCAN_MIN_SAMPLES = 20

# BERTopic
N_TOPICS = 20

# check: https://maartengr.github.io/BERTopic/faq.html#which-embedding-model-should-i-choose 
# for more parameter optimization on the UMAP and HDBSCAN models

# Prepare sub-models
# the HDBSCAN and UMAP are (Nvidia) GPU-accelerated versions
embedding_model = SentenceTransformer(SENTENCE_TRANSFORMERS_NAME)       # use the model as the embedding model
umap_model = UMAP(n_components=UMAP_N_COMPONENTS, n_neighbors=UMAP_N_NEIGHBORS, min_dist=0.0, random_state=42, metric="cosine", verbose=True)       # set random_state for reproductability
hdbscan_model = HDBSCAN( min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE, min_samples=HDBSCAN_MIN_SAMPLES, gen_min_span_tree=True, prediction_data=True)
vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english")

# Fit BERTopic without actually performing any clustering
topic_model = BERTopic(
        nr_topics=N_TOPICS + 1,                 # add 1 as the topic with id = '-1' represents outliers, and should be typically ignored
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=True,
        
        verbose=True
)

topics, probs = topic_model.fit_transform(X, embeddings=embeddings)

2024-01-11 18:24:19,915 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, n_neighbors=50, random_state=42, verbose=True)
Thu Jan 11 18:24:19 2024 Construct fuzzy simplicial set
Thu Jan 11 18:24:20 2024 Finding Nearest Neighbors
Thu Jan 11 18:24:20 2024 Building RP forest with 19 trees
Thu Jan 11 18:24:22 2024 NN descent for 16 iterations
	 1  /  16
	 2  /  16
	 3  /  16
	 4  /  16
	 5  /  16
	Stopping threshold met -- exiting after 5 iterations
Thu Jan 11 18:24:36 2024 Finished Nearest Neighbor Search
Thu Jan 11 18:24:38 2024 Construct embedding


Epochs completed:   2%| ▏          4/200 [00:00]

	completed  0  /  200 epochs


Epochs completed:  11%| █          22/200 [00:04]

	completed  20  /  200 epochs


Epochs completed:  20%| ██         41/200 [00:08]

	completed  40  /  200 epochs


Epochs completed:  30%| ███        61/200 [00:12]

	completed  60  /  200 epochs


Epochs completed:  41%| ████       82/200 [00:17]

	completed  80  /  200 epochs


Epochs completed:  50%| █████      101/200 [00:21]

	completed  100  /  200 epochs


Epochs completed:  60%| ██████     121/200 [00:25]

	completed  120  /  200 epochs


Epochs completed:  70%| ███████    141/200 [00:29]

	completed  140  /  200 epochs


Epochs completed:  80%| ████████   161/200 [00:34]

	completed  160  /  200 epochs


Epochs completed:  90%| █████████  181/200 [00:38]

	completed  180  /  200 epochs


Epochs completed: 100%| ██████████ 200/200 [00:42]


Thu Jan 11 18:25:27 2024 Finished embedding


2024-01-11 18:25:28,099 - BERTopic - Dimensionality - Completed ✓
2024-01-11 18:25:28,101 - BERTopic - Cluster - Start clustering the reduced embeddings
  self._all_finite = is_finite(X)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TO

In [None]:
#get top5 topic frequency table
freq = topic_model.get_topic_freq()
print(freq)
print('Num of topics:', len(freq))

    Topic  Count
1      -1  28269
4       0  15871
0       1  14415
2       2  11068
9       3   3472
5       4   1332
10      5   1090
13      6   1056
3       7    811
11      8    783
15      9    513
6      10    464
16     11    431
20     12    426
19     13    323
18     14    293
7      15    287
8      16    279
12     17    231
14     18    181
17     19    181
Num of topics: 21


In [None]:
# get topic frequency table
freq = topic_model.get_topic_freq()
print(freq)
print('Num of topics:', len(freq))

    Topic  Count
1      -1  28269
4       0  15871
0       1  14415
2       2  11068
9       3   3472
5       4   1332
10      5   1090
13      6   1056
3       7    811
11      8    783
15      9    513
6      10    464
16     11    431
20     12    426
19     13    323
18     14    293
7      15    287
8      16    279
12     17    231
14     18    181
17     19    181
Num of topics: 21


In [37]:
# reduce outlier: https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html

# https://medium.com/@n83072/topic-modeling-bertopic-ca1b73a035f2

# Reduce outliers using the `probabilities` strategy
# This strategy uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document.
# To use this, make sure to calculate the probabilities beforehand by instantiating BERTopic with calculate_probabilities=True.
new_topics = topic_model.reduce_outliers(X, topics, probabilities=probs, strategy="probabilities")


# Use the topic distributions, as calculated with .approximate_distribution
# to find the most frequent topic in each outlier document.
# You can use the distributions_params variable to tweak the parameters of .approximate_distribution.
# Reduce outliers using the `distributions` strategy
new_topics = topic_model.reduce_outliers(X, topics, strategy="distributions")


# Reduce outliers using the `c-tf-idf` strategy
# Calculate the c-TF-IDF representation for each outlier document 
# and find the best matching c-TF-IDF topic representation using cosine similarity.
new_topics = topic_model.reduce_outliers(X, topics, strategy="c-tf-idf")

# Reduce outliers using the `embeddings` strategy
# but it costs huge reduction in npmi score
# maybe other less aggressive strategies should be used
# new_topics = topic_model.reduce_outliers(X, topics, strategy="embeddings")

In [43]:
from collections import Counter
new_topic_dict = dict(Counter(new_topics))


new_topic_dict_df = pd.DataFrame(list(new_topic_dict.items()), columns=['topic_id', 'count'])
new_topic_dict_df = new_topic_dict_df.sort_values(by=['count'], ascending=False)

new_topic_dict_df

Unnamed: 0,topic_id,count
3,0,29399
0,1,14477
5,2,13139
8,3,8073
7,4,2529
12,6,1726
6,7,1547
13,5,1500
17,12,1409
14,9,1353


In [45]:
new_topic_dict_df[new_topic_dict_df['topic_id'] == '-1']

Unnamed: 0,topic_id,count


In [46]:
# try to apply the topic reduction to the BERTopic model

topic_model.update_topics(X, topics=new_topics)



In [13]:
# save the model (different from the func for small documents)
from datetime import datetime

topic_model_folder_path = Path(f'my_model_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
topic_model.save(
    path=topic_model_folder_path,
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model=SENTENCE_TRANSFORMERS_NAME
)

In [14]:
# reload the trained topic model for faster inference
# del topic_model

In [10]:
# load the model

from bertopic import BERTopic

# when loading the model, the public attributes are not stored
topic_model = BERTopic.load('my_model_20240118_163626')

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
embeddings.shape

(81769, 384)

In [11]:
# load the embeddings
# embedding_path = Path('00_Terraria_embeddings.pkl')
# embeddings = np.load(embedding_path)

# inference to get the topics and prob for evaluation
# hence, we need the probs to get topic-doc-matrix
topics, probs = topic_model.transform(X, embeddings=embeddings)

ValueError: Make sure that the embeddings are a numpy array with shape: (len(docs), vector_dim) where vector_dim is the dimensionality of the vector embeddings. 

In [16]:
probs.shape

NameError: name 'probs' is not defined

---

Get the docs with the highest probability in each topic when transform with a new set of documents

In [None]:
# how about we use the topics and probs variable to calculate the top N representative docs
top_N = 10

idx = np.argpartition(-probs, top_N, axis=0)[:top_N]

In [None]:
# row = document, col = topic
idx.shape

(10, 21)

In [None]:
idx[:, -1]

array([66922, 60612, 39721, 41823, 34887, 66124,  5826, 44161, 76701,
       76489])

In [None]:
probs[idx[:, -1], -1]

array([0.8847593 , 0.88933086, 0.89252526, 0.87341017, 0.86458516,
       0.86464214, 0.87115467, 0.86157316, 0.8588035 , 0.8588035 ],
      dtype=float32)

In [None]:
for i in idx[:, -1]:
    print(X[i])

Such a great Game 10/10 -Ign
I LOVE THIS GAME ign 10/10
this game is amazing 10/10 IGN
I LOVE THIS GAME 10/10 BEST GAVE EVER IGN
its a great game 10/10 IGN rating 
This is one of the best games ever. It got 9/10 IGN
this Game is amazing 10/1o ign
Great game 10/10 IGN :)
Great game, IGN 11/10
Great game, IGN 11/10


In [None]:
scores = probs[idx[:, 0]]

In [None]:
scores

array([[0.83925736, 0.74457836, 0.80703235, 0.6795908 , 0.4112299 ,
        0.60110843, 0.32747   , 0.4742515 , 0.57148874, 0.11809592,
        0.37909943, 0.57450265, 0.4534629 , 0.4331435 , 0.5107883 ,
        0.52181256, 0.52877164, 0.5952083 , 0.51749295, 0.2519888 ,
        0.3827619 ],
       [0.83819544, 0.6792901 , 0.82225263, 0.59778   , 0.4467274 ,
        0.71488297, 0.4026624 , 0.4975381 , 0.5864483 , 0.17567718,
        0.37321538, 0.6265934 , 0.49665412, 0.45622283, 0.5865581 ,
        0.57748616, 0.50651133, 0.5680938 , 0.5247112 , 0.29756355,
        0.419262  ],
       [0.862828  , 0.71632946, 0.8440254 , 0.6514621 , 0.38810313,
        0.73899895, 0.35407072, 0.51394963, 0.6051907 , 0.10856348,
        0.3591198 , 0.58158875, 0.474846  , 0.46928063, 0.5575608 ,
        0.5806221 , 0.5548263 , 0.5549511 , 0.5808619 , 0.23594311,
        0.4032487 ],
       [0.8423841 , 0.7527422 , 0.79166555, 0.6698292 , 0.42994094,
        0.6526007 , 0.42818356, 0.46310222, 0.551491 

In [None]:
scores.shape

(10, 21)

In [None]:
# # load the embeddings
# embedding_path = Path('00_Terraria_embeddings.pkl')
# embeddings = np.load(embedding_path)

# # inference to get the topics and prob for evaluation
# # hence, we need the probs to get topic-doc-matrix
# topics, probs = topic_model.transform(X, embeddings=embeddings)

---

Extracting Topics

In [18]:
# look at the most frequent topics 

freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,28269,-1_game_10_fun_play,"[game, 10, fun, play, hours, great, best, good...","[This game is mint., A great game to get into ..."
1,0,15871,0_game_fun_great_10,"[game, fun, great, 10, play, friends, hours, g...","[This game is fun with friends!, Great game to..."
2,1,14415,1_terraria_game_minecraft_like,"[terraria, game, minecraft, like, bosses, just...",[People always compare Terraria to Minecraft b...
3,2,11068,2_minecraft_game_2d_like,"[minecraft, game, 2d, like, sandbox, better, f...","[This game is what Minecraft should be., If yo..."
4,3,3472,3_game_best_great_games,"[game, best, great, games, played, fun, love, ...","[this is the best game i have ever played , Th..."


In [19]:
topic_model.get_topic(0)  # Select the most frequent topic

[('game', 0.04897517005070272),
 ('fun', 0.03681612297312159),
 ('great', 0.028061810313746726),
 ('10', 0.026495364566666124),
 ('play', 0.026040356378824243),
 ('friends', 0.026031871120225824),
 ('hours', 0.025632506264549243),
 ('good', 0.024068693232027892),
 ('played', 0.02277120709839394),
 ('best', 0.021386578408234158)]

(Copy from BERTopic ipynb in colab)

There are a number of attributes that you can access after having trained your BERTopic model:


| Attribute | Description |
|------------------------|---------------------------------------------------------------------------------------------|
| topics_               | The topics that are generated for each document after training or updating the topic model. |
| probabilities_ | The probabilities that are generated for each document if HDBSCAN is used. |
| topic_sizes_           | The size of each topic                                                                      |
| topic_mapper_          | A class for tracking topics and their mappings anytime they are merged/reduced.             |
| topic_representations_ | The top *n* terms per topic and their respective c-TF-IDF values.                             |
| c_tf_idf_              | The topic-term matrix as calculated through c-TF-IDF.                                       |
| topic_labels_          | The default labels for each topic.                                                          |
| custom_labels_         | Custom labels for each topic as generated through `.set_topic_labels`.                                                               |
| topic_embeddings_      | The embeddings for each topic if `embedding_model` was used.                                                              |
| representative_docs_   | The representative documents for each topic if HDBSCAN is used.                                                |

Save and load BERTopic models and components

Visualization

In [47]:
# visualize topics

topic_model.visualize_topics()

In [48]:
# visualize topic probabilities
# to understand how confident BERTopic is that certain topics are present in the documents

topic_model.visualize_distribution(probs[200], min_probability=0.001)

In [49]:
# visualize how topics are hierarchically reduced

topic_model.visualize_hierarchy(top_n_topics=50)


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [50]:
# visualize selecteed terms for a few topics
# creating bar charts out of the c-TF-IDF scores for each topic representation.

topic_model.visualize_barchart(top_n_topics=5)

In [51]:
# visualize topic similarity
# Having generated topic embeddings, through both c-TF-IDF and embeddings,
# we can create a similarity matrix by simply applying cosine similarities through those topic embeddings.
# The result will be a matrix indicating how similar certain topics are to each other.

topic_model.visualize_heatmap(n_clusters=10, width=1000, height=1000)

Evaluation

Calculate metrics with octis

Reference

https://www.theanalyticslab.nl/topic-modeling-with-bertopic/

In [52]:
result_bertopic = {}

top_words = 10     # the functions will only return that number of top words
def _get_topics(topic_model):
    topic_list = []
    empty_topic_l_idx = []

    for idx, topics in topic_model.get_topics().items():
        if idx < 0:
            continue

        topics_sorted = sorted(topics, key=lambda x: x[1], reverse=True)
        topic_l = [t[0] for t in topics_sorted if t[0].strip() != '']

        # it's possible that resulting in an empty list
        # also, topic with only one word fails at calculating NPMI
        if len(topic_l) <= 1:
            empty_topic_l_idx.append(idx)
            continue

        topic_list.append(topic_l)
        # print(len(topic_l))

    return topic_list, empty_topic_l_idx

def _get_topic_word_matrix(topic_model, empty_topic_idxs):

    # use ctfidf value to calculate the probability of a word assigned to a topic
    # but this is not the probability of a word in a topic
    # maybe there's a better way

    c_tfidf_all = topic_model.c_tf_idf_.todense()

    topic_word_matrix = np.exp(c_tfidf_all) / np.exp(c_tfidf_all).sum(axis=1)

    # remove empty topics from the largest index
    for idx in empty_topic_idxs[::-1]:
        topic_word_matrix = np.delete(topic_word_matrix, idx, axis=0)

    # a better way: https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-probablities-or-distribution
    # TODO: implement that for topic-word-matrix
    

    return topic_word_matrix

def _get_topic_document_matrix(probabilities, empty_topic_idxs):

    topic_document_matrix = probabilities.T

    for idx in empty_topic_idxs[::-1]:
        topic_document_matrix = np.delete(topic_document_matrix, idx, axis=1)

    return topic_document_matrix

result_bertopic['topics'], empty_topic_idxs = _get_topics(topic_model)
result_bertopic['topic-word-matrix'] = _get_topic_word_matrix(topic_model, empty_topic_idxs)
result_bertopic['topic-document-matrix'] = _get_topic_document_matrix(probs, empty_topic_idxs)

In [53]:
result_bertopic['topics'], result_bertopic['topic-word-matrix'], result_bertopic['topic-document-matrix']

([['game', 'this', 'it', 'and', 'the', 'to', 'of', 'you', 'is', 'fun'],
  ['terraria', 'the', 'and', 'to', 'you', 'is', 'of', 'it', 'game', 'that'],
  ['minecraft', 'and', 'game', 'this', 'it', 'is', 'of', 'you', 'the', 'to'],
  ['game', 'this', 'best', 'great', 'ever', 'love', 'is', 'good', 'one', 'it'],
  ['10',
   'again',
   'killed',
   'would',
   'the',
   'my',
   'you',
   'and',
   'to',
   'unicorn'],
  ['my', 'it', 'but', 'fix', 'the', 'and', 'game', 'to', 'me', 'this'],
  ['addictive',
   'addicting',
   'fun',
   'very',
   'addicted',
   'game',
   'hours',
   'and',
   'this',
   'it'],
  ['10', 'would', 'again', '11', 'ign', 'play', 'life', 'tunk', 'my', 'good'],
  ['good',
   'ok',
   'its',
   'pretty',
   'alright',
   'it',
   'guess',
   'cool',
   'yeah',
   'okay'],
  ['bye',
   'cool',
   'slit',
   'dink',
   'so',
   'tickle',
   'pickle',
   'zone',
   'it',
   'let'],
  ['review',
   'reviews',
   'badgei',
   'le',
   'this',
   'the',
   'game',
   'badge

In [57]:
topic_freq = topic_model.get_topic_freq()
topic_freq[topic_freq['Topic'] != -1]

Unnamed: 0,Topic,Count
3,0,29399
0,1,14477
5,2,13139
8,3,8073
7,4,2529
12,6,1726
6,7,1547
13,5,1500
17,12,1409
14,9,1353


Evaluation with gensim

(as gives more freedom to control the CoherenceModel by gensim)

In [59]:
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

# https://stackoverflow.com/questions/70548316/gensim-coherencemodel-gives-valueerror-unable-to-interpret-topic-as-either-a-l

# filter topics that contain only one word from the corpus for calculating npmi
# https://github.com/piskvorky/gensim/issues/3328


topic_words, empty_topic_l_idx = _get_topics(topic_model)

documents = pd.DataFrame({"Document": X,
                          "ID": range(len(X)),
                          "Topic": topics})

# remove documents which their topic contains 1<= words
documents = documents[~documents['Topic'].isin(empty_topic_idxs)]

documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

bertopic_vectorizer = topic_model.vectorizer_model
bertopic_analyzer = bertopic_vectorizer.build_analyzer()

words = bertopic_vectorizer.get_feature_names_out()
tokens = [bertopic_analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

In [60]:
# ~3 min on i714700 with CountVectorizer ~ 6000 words

# we first analysze NPMI

coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                corpus=corpus,
                                dictionary=dictionary,
                                topn=10,
                                coherence='c_v')

# npmi = Coherence(texts=tokens,topk=10, measure='c_npmi')
# nmpi_score = npmi.score(result_bertopic)

cv_score = coherence_model.get_coherence()
cv_score


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.3994560925733617

In [61]:
coherence_model_npmi = CoherenceModel(topics=topic_words,
                                    texts=tokens,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    topn=10,
                                    coherence='c_npmi')

npmi_score = coherence_model_npmi.get_coherence()
npmi_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

-0.0029256775418027474

In [31]:
def get_topic_diversity(topics, topk=10):
    ''' Topic Diversity as the percentage of unique words in the top M words of all topics
    Modified from octis implementation
    
    Parameters
    ----------
    topics : list of list of str
        List of topics, where each topic is a list of words.
    topk : int, optional
    '''
    if topics is None:
        return 0
    # if topk > len(topics[0]):
    #     raise Exception('Words in topics are less than ' + str(self.topk))

    unique_words = set()
    for topic in topics:
        unique_words = unique_words.union(set(topic[:topk]))
    td = len(unique_words) / (topk * len(topics))
    return td

get_topic_diversity(topic_words)

0.655

In [33]:
import itertools

import sys
sys.path.append('../')

from rbo import rbo

def get_word2index(list1, list2):
    words = set(list1)
    words = words.union(set(list2))
    word2index = {w: i for i, w in enumerate(words)}
    return word2index

def get_inverted_RBO(topics, topk=10, weight=0.9):
    ''' Inverted Rank-Biased Overlap (iRBO)
    to measure the diversity of the topics
    Modified from octis implementation

    Parameters
    ----------
    topics : list of list of str
        List of topics, where each topic is a list of words.
    topk : int, optional
    weight : float, optional
    '''

    if topics is None:
        return 0
    if topk > len(topics[0]):
        raise Exception('Words in topics are less than topk')
    else:
        collect = []
        for list1, list2 in itertools.combinations(topics, 2):
            word2index = get_word2index(list1, list2)
            indexed_list1 = [word2index[word] for word in list1]
            indexed_list2 = [word2index[word] for word in list2]
            rbo_val = rbo(indexed_list1[:topk], indexed_list2[:topk], p=weight)[2]
            collect.append(rbo_val)
        return 1 - np.mean(collect)
    
get_inverted_RBO(topic_words)

0.9363353717539098

In [34]:
def _KL(P, Q):
    """
    Perform Kullback-Leibler divergence

    Parameters
    ----------
    P : distribution P
    Q : distribution Q

    Returns
    -------
    divergence : divergence from Q to P
    """
    # add epsilon to grant absolute continuity
    epsilon = 0.00001
    P = P+epsilon
    Q = Q+epsilon

    divergence = np.sum(np.multiply(P, np.log(P/Q)))        # changed the operator from * to np.multiply to do element-wise multiplication
    return divergence

def get_kl_divergence(topic_word_metrix):
    """Compute KL divergence between topic-word distributions
    to measure document covrage
    Modified from octis implementation
    https://github.com/MIND-Lab/OCTIS/blob/master/octis/evaluation_metrics/diversity_metrics.py#L209

    Parameters
    ----------
    topic_word_metrix : topic-word distribution matrix
    """
    beta = topic_word_metrix
    kl_div = 0
    count = 0
    for i, j in itertools.combinations(range(len(beta)), 2):
        kl_div += _KL(beta[i], beta[j])
        count += 1
    return kl_div / count

get_kl_divergence(result_bertopic['topic-word-matrix'])

0.00022574783055084367

In [35]:
result_bertopic['topic-word-matrix'].shape

(21, 6968)