Demo ipynb for BERTopic

Testing the pipeline for a single game

Ref

BERTopic tutorial

https://colab.research.google.com/drive/1FieRA9fLdkQEGDIMYl0I3MCjSUKVF8C-?usp=sharing#scrollTo=ScBUgXn06IK6


BERTopic Best Practices

https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=m3aN-f9B4rmU


BERTopic Big data (for improving the speed of the training pipeline, on GPU)

https://colab.research.google.com/drive/1W7aEdDPxC29jP99GGZphUlqjMFFVKtBC?usp=sharing#scrollTo=Ls2Q-iccGs7O


BERTopic Topic Modelling with Llama2

https://colab.research.google.com/drive/1QCERSMUjqGetGGujdrvv_6_EeoIcd_9M?usp=sharing#scrollTo=4Uj8MYhCafmX

In [2]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
from datetime import datetime

import gensim

import nltk

import pyLDAvis

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

In [3]:
# Alternative input kept for reference (the Terraria file under top_10_games):
# dataset_path = Path('../../dataset/topic_modelling/top_10_games/00_Terraria.pkl')
# Active input: pickled review DataFrame for the "Indie" genre (going by the file name).
dataset_path = Path('../../dataset/topic_modelling/top_11_genres/01_Indie.pkl')

# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
dataset = pd.read_pickle(dataset_path)

# Print column names, dtypes and non-null counts as a quick sanity check.
dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 741913 entries, 25636 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         741913 non-null  int64 
 1   app_id        741913 non-null  int64 
 2   app_name      741913 non-null  object
 3   review_text   741913 non-null  object
 4   review_score  741913 non-null  int64 
 5   review_votes  741913 non-null  int64 
 6   genre_id      741913 non-null  object
 7   category_id   741913 non-null  object
dtypes: int64(4), object(4)
memory usage: 50.9+ MB


In [4]:
%load_ext autoreload

In [5]:
# data preprocessing

import re

import sys
sys.path.append('../../sa')

%autoreload 2
import str_cleaning_functions


def cleaning(df, review):
    '''Run the text-cleaning pipeline over one DataFrame column, in place.

    The column is overwritten after every step, so the caller's ``df`` is
    mutated and nothing is returned.

    Args:
        df: DataFrame that holds the text column.
        review: name of the column to clean.
    '''
    # the steps run in exactly this order; each name is a function of str_cleaning_functions
    pipeline = ('remove_links', 'remove_links2', 'clean', 'deEmojify', 'unify_whitespaces')

    for step_name in pipeline:
        step = getattr(str_cleaning_functions, step_name)
        df[review] = df[review].apply(step)

def cleaning_strlist(str_list):
    '''Run the same cleaning pipeline as ``cleaning`` over an iterable of strings.

    Args:
        str_list: iterable of raw strings.

    Returns:
        A new list with every string passed through, in order: remove_links,
        remove_links2, clean, deEmojify and unify_whitespaces.
    '''
    texts = [str_cleaning_functions.remove_links(text) for text in str_list]
    texts = [str_cleaning_functions.remove_links2(text) for text in texts]
    texts = [str_cleaning_functions.clean(text) for text in texts]
    texts = [str_cleaning_functions.deEmojify(text) for text in texts]
    texts = [str_cleaning_functions.unify_whitespaces(text) for text in texts]
    return texts

In [6]:
# Clean the review text in place: cleaning() overwrites the column and returns None.
cleaning(dataset, 'review_text')

In [7]:
# Pull the cleaned review texts out of the DataFrame as a NumPy array.
X = dataset['review_text'].values

In [8]:
# drop the reviews that are empty strings

X = [review for review in X if len(review) > 0]

Training

For small document collections, simply call BERTopic's built-in `fit_transform` and the whole training is done in one step.

For large document collections, it is better to pre-compute the embeddings and prepare the vocabulary before training to reduce memory usage.

In [9]:
# small documents

# from bertopic import BERTopic

# TOP_N_WORDS = 10                # number of words per topic
# N_GRAM_RANGE = (1, 2)           # n-gram

# topic_model = BERTopic(language="english", top_n_words=TOP_N_WORDS, calculate_probabilities=True, verbose=True)
# topics, probs = topic_model.fit_transform(X)

In [10]:
import platform
import torch

# Pick the best available torch device: CUDA GPU first, then Apple-silicon MPS, else CPU.
# Each backend is probed explicitly instead of inferring it from the OS name, so a
# machine without a usable MPS backend falls back to the CPU instead of failing
# later when tensors are moved to 'mps'.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')        # m-series machine
else:
    device = torch.device('cpu')

print(device)

cuda


Build the hyperparameter selection (typically the number of topics, but UMAP and HDBSCAN also have hyperparameters),

using grid search or random search.

In [11]:
from gensim.models import CoherenceModel
from copy import deepcopy

from sklearn.model_selection import ParameterGrid, ParameterSampler

sys.path.append('../')

from eval_metrics import compute_inverted_rbo, compute_topic_diversity, compute_pairwise_jaccard_similarity, \
                        METRICS, SEARCH_BEHAVIOUR, COHERENCE_MODEL_METRICS

# TODO: write sth like lda_demo_gridsearch.ipynb to select hyperparameters

In [12]:
def _print_message(message):
    '''Print message with a timestamp in front of it

    Timestamp format: YYYY-MM-DD HH:MM:SS,mmm
    '''
    print(f'{datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]} - {message}')

In [13]:
from typing import Iterable, List, Tuple, Union


def _init_sentence_transformers_params(model_name_or_path: str = None):
    
    params_dict = {}
    params_dict['model_name_or_path'] = model_name_or_path

    return params_dict

def _init_vocab_tokenizer_params(n_frequency:int = 0, ngram_range:Tuple[int, int] = (1, 1)):

    params_dict = {}
    params_dict['n_frequency'] = n_frequency
    params_dict['ngram_range'] = ngram_range

    return params_dict

def _init_umap_params(n_neighbors:int = 15,     # the number of neighbors to consider when approximating the local metric
                      n_components:int = 5,     # the target embedding dimension, its effect is largest on the performance of HDBSCAN. Increasing this value too much and HDBSCAN will have a hard time clustering the high-dimensional embeddings
                      metric:str = 'cosine',
                      min_dist:float = 0.1,     # the desired separation between close points in the embedding space
                      n_epochs:int = None,      # the number of training epochs to use when optimizing the low dimensional representation
                      low_memory:bool = False,
                      random_state:int = None):
    
    '''
    Suggested parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html
    '''

    params_dict = {}
    params_dict['n_neighbors'] = n_neighbors
    params_dict['n_components'] = n_components
    params_dict['metric'] = metric
    params_dict['min_dist'] = min_dist
    params_dict['n_epochs'] = n_epochs
    params_dict['low_memory'] = low_memory
    params_dict['random_state'] = random_state

    return params_dict

def _init_hdbscan_params(min_cluster_size:int = 5,
                            min_samples:int = None, 
                            metric:str = 'euclidean',
                            prediction_data:bool = True):
        '''
        Suggested parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html
        '''
    
        params_dict = {}
        params_dict['min_cluster_size'] = min_cluster_size      # equivalent to min_topic_size in BERTopic params
        params_dict['min_samples'] = min_samples
        params_dict['metric'] = metric              # options are those in sklearn.metrics.pairwise_distances. [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’]
        params_dict['prediction_data'] = prediction_data
    
        return params_dict

def _init_bertopic_params(language: str = "english",        # used to simplify the selection of sentence-transformers models, but since we are passing our own sbert model, this can be ignored.
                 top_n_words: int = 10,
                #  n_gram_range: Tuple[int, int] = (1, 1),
                #  min_topic_size: int = 10,
                 nr_topics: Union[int, str] = None,
                #  low_memory: bool = False,
                 calculate_probabilities: bool = False,
                #  seed_topic_list: List[List[str]] = None,
                #  zeroshot_topic_list: List[str] = None,
                #  zeroshot_min_similarity: float = .7
                 ):
    
    '''
    Parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html
    '''
     
    params_dict = {}
    params_dict['language'] = language                     # put in the params_dict for completeness (as it appears in Parameter tuning in BERTopic page)
    params_dict['top_n_words'] = top_n_words                # this also affects the number of top N words used to calculate metrics
    # params_dict['n_gram_range'] = n_gram_range            # this is controlled by the vocab_tokenizer_params
    # params_dict['min_topic_size'] = min_topic_size        # this is controlled by the hdbscan_params
    params_dict['nr_topics'] = nr_topics
    # params_dict['low_memory'] = low_memory                # this is controlled by the umap_params
    params_dict['calculate_probabilities'] = calculate_probabilities
    # params_dict['seed_topic_list'] = seed_topic_list
    # params_dict['zeroshot_topic_list'] = zeroshot_topic_list
    # params_dict['zeroshot_min_similarity'] = zeroshot_min_similarity

    return params_dict


In [14]:
def _normalise_for_json(value):
    '''Return ``value`` with every tuple replaced by a list, recursively.

    ``json.dump`` stores tuples as lists, so a value read back from config.json
    never compares equal to a tuple input such as ``ngram_range=(1, 1)`` unless
    the input is normalised first.
    '''
    if isinstance(value, dict):
        return {key: _normalise_for_json(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_normalise_for_json(item) for item in value]
    return value


def _init_config_dict(config_path:Path, model_name:str, hyperparameters:dict, search_space_dict:dict, 
                      metrics:list[METRICS], monitor:METRICS,
                      search_behaviour:SEARCH_BEHAVIOUR, search_rs:int, search_n_iter:int):
    '''Create config.json for a new search, or load and validate an existing one.

    When ``config_path`` does not exist, the config is assembled from the inputs,
    written to ``config_path`` as JSON and returned. Hyperparameters that are part
    of the search space are removed from the fixed per-group hyperparameters, so
    they are stored under ``"search_space"`` only.

    When ``config_path`` exists, it is loaded and checked against the inputs. Any
    inconsistency raises ``AssertionError``. Tuples in the inputs are compared as
    lists because that is how JSON stores them.

    Args:
        config_path: location of config.json.
        model_name: stored under ``"model"``.
        hyperparameters: dict with one dict of fixed hyperparameters for each of
            'sbert_params', 'vocab_tokenizer_params', 'umap_params',
            'hdbscan_params' and 'bertopic_params'. Every group must be present
            (it may be empty); missing values are filled with the defaults of the
            matching ``_init_*_params`` function.
        search_space_dict: dict of group name -> {hyperparameter name: candidates}.
            Groups may be omitted.
        metrics: metrics to compute; their ``.value`` is stored.
        monitor: metric used for model selection; its ``.value`` is stored.
        search_behaviour: grid or random search; its ``.value`` is stored.
        search_rs: random state, stored / checked for random search only.
        search_n_iter: number of iterations, stored / checked for random search only.

    Returns:
        The config dict (freshly built, or as loaded from ``config_path``).

    Raises:
        AssertionError: the existing config file is inconsistent with the inputs.
        KeyError: a hyperparameter group is missing from ``hyperparameters``.
    '''

    # hyperparameter group -> function that fills in the defaults of that group
    # (insertion order is the order in which the groups are written to config.json)
    group_initialisers = {
        'sbert_params': _init_sentence_transformers_params,
        'vocab_tokenizer_params': _init_vocab_tokenizer_params,
        'umap_params': _init_umap_params,
        'hdbscan_params': _init_hdbscan_params,
        'bertopic_params': _init_bertopic_params,
    }

    # fixed hyperparameters of every group, defaults included
    group_params = {group: init(**hyperparameters[group])
                    for group, init in group_initialisers.items()}

    metric_values = [metric.value for metric in metrics]
    is_random_search = search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH

    if not config_path.exists():
        config = {'model': model_name}

        for group, params in group_params.items():
            # remove hyperparameters that are in the search_space_dict
            for searched_key in search_space_dict.get(group, {}):
                params.pop(searched_key, None)
            config[group] = params

        config['search_space'] = search_space_dict
        config['metrics'] = metric_values
        config['monitor'] = monitor.value
        config['search_behaviour'] = search_behaviour.value
        if is_random_search:
            config['search_rs'] = search_rs
            config['search_n_iter'] = search_n_iter

        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        _print_message('Created config file at {}'.format(config_path))

    else:
        with open(config_path, 'r') as f:
            config = json.load(f)

        # check whether the config file is consistent with the input parameters
        assert config['model'] == model_name, 'input model_name is not consistent with config["model"]'
        assert config['metrics'] == metric_values, 'input metrics is not consistent with config["metrics"]'
        assert config['monitor'] == monitor.value, 'input monitor is not consistent with config["monitor"]'
        assert config['search_behaviour'] == search_behaviour.value, 'input search_behaviour is not consistent with config["search_behaviour"]'
        if is_random_search:
            assert config['search_rs'] == search_rs, 'input search_rs is not consistent with config["search_rs"]'
            assert config['search_n_iter'] == search_n_iter, 'input search_n_iter is not consistent with config["search_n_iter"]'

        # the stored fixed hyperparameters must be a subset of the known ones ...
        for group, expected in group_params.items():
            assert config[group].keys() <= expected.keys(), f'existing config["{group}"] contains additional hyperparameters'

        # ... and must carry the same values as the inputs
        # (searched hyperparameters were removed from the stored groups, so the key intersection skips them)
        for group, expected in group_params.items():
            stored = config[group]
            for key in expected.keys() & stored.keys():
                assert _normalise_for_json(expected[key]) == stored[key], f'existing config["{group}"] contains different hyperparameters'

        # check that the stored search space and the input search space are the same;
        # a group missing on either side is treated as an empty group so that the
        # comparison fails with an assertion instead of a KeyError
        stored_space = config['search_space']
        input_space = _normalise_for_json(search_space_dict)
        for group in group_initialisers:
            stored_group = stored_space.get(group, {})
            input_group = input_space.get(group, {})
            assert stored_group.keys() == input_group.keys(), f'input search_space_dict["{group}"] contains different hyperparameter keys than existing config["search_space"]["{group}"]'
            for key in input_group:
                assert input_group[key] == stored_group[key], f'input search_space_dict["{group}"]["{key}"] contains a different value than existing config["search_space"]["{group}"]["{key}"]'

        _print_message('Loaded existing config file from {}'.format(config_path))
        _print_message('Hyperparameters and search space are consistent with the input parameters')

    return config

In [15]:
def _init_result_dict(result_path: Path, monitor_type:str):
        
    if not result_path.exists():
        result = {}

        result['best_metric'] = -float('inf')
        result['best_model_checkpoint'] = ""
        result['best_hyperparameters'] = dict()
        result["monitor_type"] = monitor_type
        result["log_history"] = list()
        
    else:
        with open(result_path, 'r') as f:
            result = json.load(f)

        assert result['monitor_type'] == monitor_type

        _print_message('Loaded existing result file from {}'.format(result_path))
        # print('Loaded existing result file from {}'.format(result_path))
    
    return result

In [16]:
def _get_topics(topic_model):
    topic_list = []
    empty_topic_l_idx = []

    for idx, topics in topic_model.get_topics().items():
        if idx < 0:
            continue

        topics_sorted = sorted(topics, key=lambda x: x[1], reverse=True)
        topic_l = [t[0] for t in topics_sorted if t[0].strip() != '']

        # it's possible that resulting in an empty list
        # also, topic with only one word fails at calculating NPMI
        if len(topic_l) <= 1:
            empty_topic_l_idx.append(idx)
            continue

        topic_list.append(topic_l)
        # print(len(topic_l))

    return topic_list, empty_topic_l_idx

def _get_topic_word_matrix(topic_model, empty_topic_idxs):
    # use ctfidf value to calculate the probability of a word assigned to a topic
    # but this is not the probability of a word in a topic
    # maybe there's a better way

    c_tfidf_all = topic_model.c_tf_idf_.todense()

    topic_word_matrix = np.exp(c_tfidf_all) / np.exp(c_tfidf_all).sum(axis=1)

    # remove empty topics from the largest index
    for idx in empty_topic_idxs[::-1]:
        topic_word_matrix = np.delete(topic_word_matrix, idx, axis=0)

def _get_topic_document_matrix(probs, empty_topic_idxs):
    topic_document_matrix = probs

    for idx in empty_topic_idxs[::-1]:
        topic_document_matrix = np.delete(topic_document_matrix, idx, axis=0)

    return topic_document_matrix.T

In [17]:
from bertopic import BERTopic

def _load_bertopic_model(model_path:Path):
    '''Load a saved BERTopic model.

    Args:
        model_path: location of the saved model; it is passed to
            ``BERTopic.load`` as a string.

    Returns:
        Whatever ``BERTopic.load`` returns for that path.
    '''
    return BERTopic.load(str(model_path))

In [18]:
class Dimensionality:
    """ Use this for pre-calculated reduced embeddings

    Stand-in for a dimensionality-reduction model: it performs no computation
    and simply hands back the embeddings it was constructed with.
    """

    def __init__(self, reduced_embeddings):
        # embeddings that were already reduced elsewhere
        self.reduced_embeddings = reduced_embeddings

    def fit(self, X):
        """ Nothing to fit; ``X`` is ignored and the instance itself is returned """
        return self

    def transform(self, X):
        """ Ignore ``X`` and return the stored reduced embeddings """
        return self.reduced_embeddings

In [19]:
from gensim import corpora

from sentence_transformers import SentenceTransformer

import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

from bertopic import BERTopic
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import cuml


def model_search(X, hyperparameters:dict, search_space:dict, save_folder:Path,
                 additional_stopwords:list[str]=None, cuml_accl:bool=False,
                metrics:list[METRICS]=[METRICS.C_NPMI], monitor:METRICS=METRICS.C_NPMI, 
                save_each_models=True, run_from_checkpoints=False,
                search_behaviour=SEARCH_BEHAVIOUR.GRID_SEARCH, search_rs=42, search_n_iter=10):
    
    config_json_path = save_folder.joinpath('config.json')
    result_json_path = save_folder.joinpath('result.json')

    if monitor not in metrics:
        raise Exception('monitor is not in metrics. Please modify the metrics passed in.')

    if run_from_checkpoints:
        if not save_folder.exists():
            _print_message('Save folder:' + str(save_folder.resolve()) + ' does not exist. Function terminates.')
            # print('Save folder:' + str(save_folder.resolve()) + ' does not exist. Function terminates.')
            raise Exception('No checkpoints found. Function terminates.')
        
        # check for existing configs
        if not config_json_path.exists():
            raise Exception('No config.json found. Function terminates.')
        
        # check for existing results
        if not result_json_path.exists():
            _print_message('No result.json is found. Assuming no existing checkpoints.')
            # print('no result.json is found. Assuming no existing checkpoints.')
    else:
        if save_folder.exists():
            raise Exception('Checkpoints found. Please delete the checkpoints or set run_from_checkpoints=True. Function terminates.')

    if not save_folder.exists():
        save_folder.mkdir()

    config = _init_config_dict(config_json_path, 'bertopic', hyperparameters, search_space, 
                               metrics, monitor, search_behaviour, search_rs, search_n_iter)
    
    result = _init_result_dict(result_json_path, monitor.value)

    _print_message('Search folder: {}'.format(save_folder))

    # init
    best_model_path = result['best_model_checkpoint']
    best_metric_score = result['best_metric']
    best_model = _load_bertopic_model(Path(best_model_path)) if best_model_path != "" else None
    best_hyperparameters = result['best_hyperparameters']

    _print_message('Best model checkpoint: {}'.format(best_model_path))
    _print_message('Best metric score: {}'.format(best_metric_score))
    _print_message('Best model: {}'.format(best_model))

    # print(f'Best model checkpoint: {best_model_path}')
    # print(f'Best metric score: {best_metric_score}')
    # print(f'Best model: {best_model}')

    # search
    # create a temporary dict for initiating the search space by sklearn parameter grid / parameter sampler
    temp_search_space = {}
    for k, v in search_space.items():
        for kk, vv in v.items():
            temp_search_space[k + '__' + kk] = vv

    if search_behaviour == SEARCH_BEHAVIOUR.GRID_SEARCH:
        search_iterator = ParameterGrid(temp_search_space)
    elif search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
        search_iterator = ParameterSampler(temp_search_space, n_iter=search_n_iter, random_state=search_rs)

    print('\n')

    for search_space_dict in search_iterator:
        # unwrap the search space dict

        model_name = ''

        _sbert_params = {}
        _vocab_tokenizer_params = {}
        _umap_params = {}
        _hdbscan_params = {}
        _bertopic_params = {}

        for k, v in search_space_dict.items():
            if k.startswith('sbert_params'):
                _sbert_params[k.split('__')[1]] = v
                model_name += 'sb_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('vocab_tokenizer_params'):
                _vocab_tokenizer_params[k.split('__')[1]] = v
                model_name += 'vt_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('umap_params'):
                _umap_params[k.split('__')[1]] = v
                model_name += 'um_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('hdbscan_params'):
                _hdbscan_params[k.split('__')[1]] = v
                model_name += 'hs_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('bertopic_params'):
                _bertopic_params[k.split('__')[1]] = v
                model_name += 'bt_' + k.split('__')[1] + '_' + str(v) + '_'
            else:
                raise Exception('Unknown key: {}'.format(k))
            
        model_name = model_name[:-1]       # remove the last '_'

        # create the model path to save the model
        model_path = save_folder.joinpath(config['model'] + '_' + model_name)

        # check whether the model exists
        if model_path.exists():
            _print_message(f'Skipping current search space: {search_space_dict}')
            # print(f'Skipping current search space: {search_space_dict}')
            continue

        ##########
        # Training starts
        ##########

        _print_message(f'Current search space: {search_space_dict}')
        # print(f'Current search space: {search_space_dict}')

        sbert_params = deepcopy(config['sbert_params'])     # deepcopy just for data safety (not messing up with the original config dict)
        vocab_tokenizer_params = deepcopy(config['vocab_tokenizer_params'])
        umap_params = deepcopy(config['umap_params'])
        hdbscan_params = deepcopy(config['hdbscan_params'])
        bertopic_params = deepcopy(config['bertopic_params'])

        sbert_params.update(_sbert_params)
        vocab_tokenizer_params.update(_vocab_tokenizer_params)
        umap_params.update(_umap_params)
        hdbscan_params.update(_hdbscan_params)
        bertopic_params.update(_bertopic_params)

        # create embeddings
        if platform.system() == 'Linux' or platform.system() == 'Windows':
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            device = torch.device('mps')        # m-series machine
        
        sent_transformers = SentenceTransformer(**sbert_params,
                                                device=device)
        
        # load existing embeddings in the search folder to reuse the embeddings
        embeddings_path = save_folder.joinpath(f'embeddings_{sbert_params["model_name_or_path"]}.pkl')
        if embeddings_path.exists():
            with open(embeddings_path, 'rb') as f:
                embeddings = np.load(f)

            _print_message(f'Found existing sbert embeddings at {embeddings_path}. Reusing them.')
            # print(f'Found existing sbert embeddings at {embeddings_path}. Reusing them.')
        else:
            embeddings = sent_transformers.encode(X, show_progress_bar=True, batch_size=64)
            with open(embeddings_path, 'wb') as f:
                np.save(f, embeddings)
            
            _print_message(f'Saved sbert embeddings at: {embeddings_path}')
            # print('Saved sbert embeddings at:', embeddings_path)


        # Structure of the BERTopic
        # (Clustering: Topic Creation)
        # 1. SBERT to create document embeddings
        # 2. UMAP to reduce dimensionality
        # 3. HDBSCAN to cluster embedddings
        # (Representation: Label topics)
        # 4. CountVectorizer to tokenize words
        # 5. c-TF-IDF to weight the words and select the most important words

        # prepare the vocabulary (for c-TFIDF) b4 training
        # TODO: remove any numbers in the vocabulary
        vocab = collections.Counter()
        tokenizer = CountVectorizer().build_tokenizer()
        for doc in tqdm(X):
            vocab.update(tokenizer(doc))
        vocab = [word for word, frequency in vocab.items() if frequency >= vocab_tokenizer_params['n_frequency']]       # set the minimum frequency to reduce the vocabulary size
        _print_message('Number of vocabulary: {}'.format(len(vocab)))

        del vocab_tokenizer_params['n_frequency']       # not used in the vectorizer model for training
        vocab_tokenizer_params['ngram_range'] = tuple(vocab_tokenizer_params['ngram_range'])       # convert list to tuple

        # prepare the sub models of BERTopic
        embedding_model = SentenceTransformer(**sbert_params)       # use the model as the embedding model

        # tokenize the words (for representation)
        # maybe we can do more pre-processing to the CV vocab to eliminate more words
        # like in LLM-take: remove common adj
        vectorizer_model = CountVectorizer(
            vocabulary=vocab, 
            stop_words="english" if additional_stopwords is None else list(ENGLISH_STOP_WORDS.union(additional_stopwords)),
            analyzer='word',
            **vocab_tokenizer_params)              # for computing c-tfidf (first creating a count matrix, then let c-tfidf to calculate the c-tfidf representation)

        bertopic_params['nr_topics'] += 1       # add 1 BERTopic will produce an extra topic for outliers

        # using cuml for faster training
        if cuml_accl:
            umap_model = cuml.manifold.UMAP(**umap_params, verbose=True)
            # TODO: save the reduced embeddings for reuse
            reduced_embeddings = umap_model.fit_transform(embeddings)

            hdbscan_model = cuml.cluster.hdbscan.HDBSCAN(**hdbscan_params, gen_min_span_tree=True)
            # clusters = hdbscan_model.fit_predict(reduced_embeddings).labels_
            clusters = hdbscan_model.fit_predict(reduced_embeddings)

            # Fit BERTopic without actually performing any clustering
            topic_model= BERTopic(
                    **bertopic_params,
                    embedding_model=embedding_model,
                    umap_model=Dimensionality(reduced_embeddings),
                    hdbscan_model= BaseCluster(),
                    vectorizer_model=vectorizer_model,
                    verbose=True
            ).fit(X, embeddings=embeddings, y=clusters)

            topics, probs = topic_model.transform(X, embeddings=embeddings)

            # print(topics)
            # print(topics.shape)
            # print(probs)
            # print(probs.shape)

        else:
            umap_model = UMAP(**umap_params, verbose=True)       # set random_state for reproductability
            hdbscan_model = HDBSCAN(**hdbscan_params, gen_min_span_tree=True)
            

            topic_model = BERTopic(**bertopic_params,
                embedding_model=embedding_model, 
                vectorizer_model=vectorizer_model,
                umap_model=umap_model, 
                hdbscan_model=hdbscan_model,
                # calculate_probabilities=True,     # already in bertopic_params
                verbose=True)
        
            topics, probs = topic_model.fit_transform(X, embeddings=embeddings)

        ##########
        # Training ends
        ##########

        ##########
        # Evaluation starts
        ##########

        # init data for gensim coherence model
        topic_words, empty_topic_idxs = _get_topics(topic_model)

        documents = pd.DataFrame({"Document": X,
                                "ID": range(len(X)),
                                "Topic": topics})

        # remove documents which their topic contains 1<= words
        documents = documents[~documents['Topic'].isin(empty_topic_idxs)]

        documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
        cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

        bertopic_vectorizer = topic_model.vectorizer_model
        bertopic_analyzer = bertopic_vectorizer.build_analyzer()

        words = bertopic_vectorizer.get_feature_names_out()
        tokens = [bertopic_analyzer(doc) for doc in cleaned_docs]
        dictionary = corpora.Dictionary(tokens)
        corpus = [dictionary.doc2bow(token) for token in tokens]

        _print_message('Computing evaluation metrics')
        # print('Computing evaluation metrics')

        
        topn = bertopic_params['top_n_words']
    
        # init octis format result for convenience
        result_octis = {}
        result_octis['topics'] = topic_words
        result_octis['topic-word-matrix'] = _get_topic_word_matrix(topic_model, empty_topic_idxs)
        result_octis['topic-document-matrix'] = _get_topic_document_matrix(probs, empty_topic_idxs)


        metrics_score = dict()

        for metric in metrics:
            if metric in COHERENCE_MODEL_METRICS:
                # compute the coherence
                coherencemodel = CoherenceModel(topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, topn=topn, coherence=metric.value, processes=3)
                score = coherencemodel.get_coherence()              

            elif metric == METRICS.TOPIC_DIVERSITY:
                # compute the coherence
                score = compute_topic_diversity(result_octis, topk=10)


            elif metric == METRICS.INVERTED_RBO:
                # compute the coherence
                score = compute_inverted_rbo(result_octis, topk=10)

            elif metric == METRICS.PAIRWISE_JACCARD_SIMILARITY:
                # compute the coherence
                score = compute_pairwise_jaccard_similarity(result_octis, topk=10)

            else:
                raise Exception(f'Unknown metric: {metric.value}')
            
            metrics_score[metric.value] = score

            _print_message(f'Evaluation metric ({metric.value}): {score}')
            # print(f'Evaluation metric ({metric.value}): {score}')
            
        # get the monitor score
        monitor_score = metrics_score[monitor.value]

        ##########
        # Evaluation ends
        ##########
            
        ##########
        # Save models
        ##########
            
        if not model_path.exists():
            model_path.mkdir()

        if save_each_models:
            topic_model.save(
                path = model_path,
                serialization="safetensors",
                save_ctfidf=True,
                save_embedding_model=sbert_params['model_name_or_path']
            )

            _print_message('Model saved at: {}'.format(model_path))
            # print('Model saved at:', model_path)

        ##########
        # Save models ends
        ##########

        ###########
        # Update result dict and json file
        ###########
        
        # rebuild the model_hyperparameters dict
        model_hyperparameters = {
            'sbert_params': sbert_params,
            'vocab_tokenizer_params': vocab_tokenizer_params,
            'umap_params': umap_params,
            'hdbscan_params': hdbscan_params,
            'bertopic_params': bertopic_params
        }

        if monitor_score > best_metric_score:
            best_metric_score = monitor_score
            best_model = topic_model
            best_model_path = model_path
            best_hyperparameters = model_hyperparameters

        model_log_history = dict()
        model_log_history.update(metrics_score)         # add the metrics score values to the log history
        model_log_history['model_name'] = model_name
        model_log_history['hyperparameters'] = model_hyperparameters

        result['best_metric'] = best_metric_score
        result['best_model_checkpoint'] = str(best_model_path)      # relative path
        result['best_hyperparameters'] = best_hyperparameters
        result["log_history"].append(model_log_history)

        # print(result)

        # save result
        with open(result_json_path, 'w') as f:
            json.dump(result, f, indent=2)

        _print_message('Saved result.json at: {}'.format(result_json_path))
        # print("Saved result.json at:", result_json_path)
        print('\n\n')
    
    _print_message('Search ends')
    # print('Search ends')
    return best_model, best_model_path, best_hyperparameters


In [20]:
# Build the stopword set: project-specific lists stored as txt files (one entry per line)
# in the dataset folder, plus the English stopword list shipped with nltk.
from pathlib import Path

from nltk.corpus import stopwords

custom_stopwords_path = Path('../../dataset/topic_modelling/stopwords.txt')
# (sic) the "stowords" spelling is kept because other cells may refer to these names
custom_stowords_games_path = Path('../../dataset/topic_modelling/stopwords_games.txt')
game_name_list_path = Path('../../dataset/topic_modelling/game_name_list.txt')

# read with an explicit encoding so the result does not depend on the machine's locale
custom_stopwords = custom_stopwords_path.read_text(encoding='utf-8').splitlines()
custom_stowords_games = custom_stowords_games_path.read_text(encoding='utf-8').splitlines()
game_name_list = game_name_list_path.read_text(encoding='utf-8').splitlines()

# also include the stopword list from nltk
nltk_stopwords = stopwords.words('english')

# merge all lists and de-duplicate
custom_stopwords = set(custom_stopwords + custom_stowords_games + game_name_list + nltk_stopwords)

# the merged set is very large, so report its size and a small sample instead of every entry
print(len(custom_stopwords))
print(sorted(custom_stopwords)[:20])


155928


In [29]:
# grid search / random search
#
# Builds the baseline hyperparameter dicts, declares the search space, and launches `model_search`.
# Results are written into a folder named after the search strategy and the start time.

# hyperparameters (baseline values; entries listed in `search_space_dict` are the ones varied per run)
sbert_params = _init_sentence_transformers_params(model_name_or_path='all-MiniLM-L6-v2')
vocab_tokenizer_params = _init_vocab_tokenizer_params(n_frequency=70, ngram_range=[1, 1])       # pass ngram_range as list for type-value check against config.json
# NOTE: with cuml_accl=True the recorded log shows cuML reporting `low_memory` as an unused keyword parameter
umap_params = _init_umap_params(n_neighbors=15, n_components=5, metric='cosine', min_dist=0.1, n_epochs=None, low_memory=False)
hdbscan_params = _init_hdbscan_params(min_cluster_size=15, min_samples=5, metric='euclidean', prediction_data=True)
# NOTE(review): nr_topics=20 looks like a placeholder here — the recorded runs use the values
# from `search_space_dict` below (50, 60, ...); confirm that the search space overrides it.
bertopic_params = _init_bertopic_params(
    nr_topics=20, 
    top_n_words=10, 
    calculate_probabilities=True)

# search space dict
# Full search space kept for reference (disabled); keys are grouped by the hyperparameter dict they belong to.
# search_space_dict = {
#     'sbert_params':{
#         'model_name_or_path': ['all-MiniLM-L12-v1', 'all-mpnet-base-v1']
#     },
#     # 'vocab_tokenizer_params':{
#     #     'ngram_range': [[1, 1], [1, 2]]     # datatype is list as json does not support tuple
#     # },
#     # 'umap_params':{
#     #     'n_neighbors': [15, 20, 25],
#     #     'n_components': [5, 10],
#     #     'min_dist': [0.0, 0.1],
#     # },
#     'hdbscan_params':{
#         'min_cluster_size': [30, 60, 90, 120, 150, 180, 200],      #  It controls the minimum size of a cluster
#         'min_samples': [10, 20, 30, 40, 50],                    # controls the number of outliers generated.  Setting this value significantly lower than min_cluster_size might help you reduce the amount of noise you will get.
#     },
#     'bertopic_params':{
#         'top_n_words': [10, 20, 30],     # number of words per topic
#         'nr_topics': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],     # number of topics
#     }
# }

# active search space: only the number of topics is varied
search_space_dict = {
    'bertopic_params':{
        'nr_topics': [50, 60, 70],     # number of topics
    }
}

# search strategy; its value is also embedded in the output folder name below
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH

# a fresh timestamp creates a new search folder
training_datetime = datetime.now()
# to point at the folder of an earlier run instead, hard-code its start time:
# training_datetime = datetime(2024, 1, 27, 0, 25, 24)
training_folder = Path(f'bertopic_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

# `monitor` is the metric whose score decides which model is returned as the best one;
# every metric in `metrics` is computed and logged for each run.
best_model, best_model_path, best_hyperparameters = model_search(
    X,
    hyperparameters={
        'sbert_params': sbert_params,
        'vocab_tokenizer_params': vocab_tokenizer_params,
        'umap_params': umap_params,
        'hdbscan_params': hdbscan_params,
        'bertopic_params': bertopic_params
    },
    search_space=search_space_dict,
    save_folder=training_folder,
    additional_stopwords=custom_stopwords, cuml_accl=True,
    metrics=[METRICS.C_NPMI, METRICS.C_V, METRICS.UMASS, METRICS.C_UCI, METRICS.TOPIC_DIVERSITY, METRICS.INVERTED_RBO, METRICS.PAIRWISE_JACCARD_SIMILARITY],
    monitor=METRICS.C_NPMI,
    save_each_models=True,
    run_from_checkpoints=False,
    search_behaviour=search_behaviour,
    # only relevant for random search (presumably seed and number of sampled configurations — confirm in model_search)
    # search_rs=42,
    # search_n_iter=80
)

2024-01-29 22:34:05,105 - Created config file at bertopic_grid_search_20240129_223405/config.json
2024-01-29 22:34:05,105 - Search folder: bertopic_grid_search_20240129_223405
2024-01-29 22:34:05,105 - Best model checkpoint: 
2024-01-29 22:34:05,105 - Best metric score: -inf
2024-01-29 22:34:05,105 - Best model: None


2024-01-29 22:34:05,105 - Current search space: {'bertopic_params__nr_topics': 50}


Batches: 100%|██████████| 11591/11591 [03:42<00:00, 52.02it/s] 


2024-01-29 22:37:57,310 - Saved sbert embeddings at: bertopic_grid_search_20240129_223405/embeddings_all-MiniLM-L6-v2.pkl


100%|██████████| 741791/741791 [00:07<00:00, 103852.11it/s]


2024-01-29 22:38:04,472 - Number of vocabulary: 16226
[I] [22:38:04.574289] Unused keyword parameter: low_memory during cuML estimator initialization
[D] [22:38:05.898959] /__w/cuml/cuml/cpp/src/umap/runner.cuh:108 n_neighbors=15
[D] [22:38:05.900075] /__w/cuml/cuml/cpp/src/umap/runner.cuh:130 Calling knn graph run
[D] [22:38:24.577301] /__w/cuml/cuml/cpp/src/umap/runner.cuh:136 Done. Calling fuzzy simplicial set
[D] [22:38:24.580559] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [22:38:24.582866] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.02215, 0.0227345, 0.220146, 0.0242783, 0.0453489, 0.0499585, 0.0133889, 0.0589283, 0.198677, 0.0345178, 0.300059, 0.0193553, 0.00802296, 0.169545, 0.239354, 0.0249778, 0.0136317, 0.0172867, 0.200801, 0.190983, 0.0299821, 0.207464, 0.251967, 0.208841, 0.0130431 ]

[D] [22:38:24.582908] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 0.259338, 0.252034, 1.78814e-07, 0.2

2024-01-29 22:39:21,689 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-29 22:39:21,690 - BERTopic - Dimensionality - Completed ✓
2024-01-29 22:39:21,708 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-29 22:39:21,792 - BERTopic - Cluster - Completed ✓
2024-01-29 22:39:21,793 - BERTopic - Representation - Extracting topics from clusters using representation models.
  idf = np.log((avg_nr_samples / df)+1)
2024-01-29 22:39:43,609 - BERTopic - Representation - Completed ✓
2024-01-29 22:39:43,612 - BERTopic - Topic reduction - Reducing number of topics
  idf = np.log((avg_nr_samples / df)+1)
2024-01-29 22:40:04,590 - BERTopic - Topic reduction - Reduced number of topics from 3823 to 51
2024-01-29 22:40:05,167 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


2024-01-29 22:40:21,155 - Computing evaluation metrics
2024-01-29 22:41:28,581 - Evaluation metric (c_npmi): 0.06648423236338818
2024-01-29 22:44:51,830 - Evaluation metric (c_v): 0.5025011402602899
2024-01-29 22:44:51,892 - Evaluation metric (u_mass): -0.309989664374845
2024-01-29 22:45:58,616 - Evaluation metric (c_uci): -0.09259203095686708
2024-01-29 22:45:58,616 - Evaluation metric (topic_diversity): 0.836
2024-01-29 22:45:58,655 - Evaluation metric (inverted_rbo): 0.9840613192839782
2024-01-29 22:45:58,656 - Evaluation metric (pairwise_jaccard_similarity): 0.012472179944063397
2024-01-29 22:45:58,892 - Model saved at: bertopic_grid_search_20240129_223405/bertopic_bt_nr_topics_50
2024-01-29 22:45:58,893 - Saved result.json at: bertopic_grid_search_20240129_223405/result.json



2024-01-29 22:45:58,893 - Current search space: {'bertopic_params__nr_topics': 60}
2024-01-29 22:45:59,686 - Found existing sbert embeddings at bertopic_grid_search_20240129_223405/embeddings_all-MiniLM-L6-

100%|██████████| 741791/741791 [00:07<00:00, 93499.14it/s] 


2024-01-29 22:46:07,651 - Number of vocabulary: 16226
[I] [22:46:07.788557] Unused keyword parameter: low_memory during cuML estimator initialization
[D] [22:46:07.931539] /__w/cuml/cuml/cpp/src/umap/runner.cuh:108 n_neighbors=15
[D] [22:46:07.932520] /__w/cuml/cuml/cpp/src/umap/runner.cuh:130 Calling knn graph run
[D] [22:46:26.847568] /__w/cuml/cuml/cpp/src/umap/runner.cuh:136 Done. Calling fuzzy simplicial set
[D] [22:46:26.850928] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [22:46:26.860760] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.02215, 0.0227345, 0.220146, 0.0242783, 0.0453489, 0.0499585, 0.0133889, 0.0589283, 0.198677, 0.0345178, 0.300059, 0.0193553, 0.00802296, 0.169545, 0.239354, 0.0249778, 0.0136317, 0.0172867, 0.200801, 0.190983, 0.0299821, 0.207464, 0.251967, 0.208841, 0.0130431 ]

[D] [22:46:26.860947] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 0.259338, 0.252034, 1.78814e-07, 0.2

In [27]:
# Check that a model reloaded from disk gives the same predictions as the in-memory `best_model`.
# NOTE: `best_model` and `X` are not created here; they must already exist in the kernel.

# locate the search folder from the search strategy and the start time of that run
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
training_datetime = datetime(2024, 1, 29, 21, 8, 12)
training_folder = Path(f'bertopic_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

# result.json holds the best checkpoint path and the best hyperparameters of the search
training_result_json_path = training_folder / 'result.json'
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

# cached sentence embeddings; the file name carries the name of the embedding model
embeddings_path = training_folder / (
    f'embeddings_{training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"]}.pkl'
)
if not embeddings_path.exists():
    raise Exception('No embeddings found. Function terminates.')
with open(embeddings_path, 'rb') as f:
    embeddings = np.load(f)

# reload the best checkpoint from disk
best_model_checkpoint_path = training_result['best_model_checkpoint']
best_model_loaded = _load_bertopic_model(best_model_checkpoint_path)

# predict with both models on the same documents and embeddings
topics, probs = best_model.transform(X, embeddings=embeddings)
topics2, probs2 = best_model_loaded.transform(X, embeddings=embeddings)

# topic assignments must match ...
assert topics.shape == topics2.shape
np.testing.assert_allclose(topics, topics2, rtol=1e-5, atol=1e-5)

# ... and so must the probabilities (up to a small numerical tolerance)
assert probs.shape == probs2.shape
np.testing.assert_allclose(probs, probs2, rtol=1e-5, atol=1e-5)

2024-01-29 21:43:10,260 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.
2024-01-29 21:43:12,895 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [28]:
# how about we calculate the embeddings on the fly
# (instead of loading the cached embeddings, encode X again and compare the predictions
#  with `topics` / `probs` that are already in the kernel)

best_model_loaded2 = _load_bertopic_model(best_model_checkpoint_path)

# NOTE(review): `sbert_params` and `device` are notebook globals, not values read from result.json —
# confirm they match the embedding model the checkpoint was trained with
sent_transformers2 = SentenceTransformer(
    **sbert_params,
    device=device
)

embeddings3 = sent_transformers2.encode(X, show_progress_bar=True, batch_size=64)

topics3, probs3 = best_model_loaded2.transform(X, embeddings=embeddings3)
assert topics.shape == topics3.shape
assert probs.shape == probs3.shape

# values must agree within a small numerical tolerance
np.testing.assert_allclose(topics, topics3, rtol=1e-5, atol=1e-5)
np.testing.assert_allclose(probs, probs3, rtol=1e-5, atol=1e-5)

Batches: 100%|██████████| 11591/11591 [03:29<00:00, 55.34it/s] 
2024-01-29 22:20:50,829 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


The above confirms that reproducing the same pre-processing steps is sufficient to deploy a trained model.

As the 


In [20]:
# Load the best model of a finished search, together with its cached embeddings,
# and predict the topics of X with it.
# NOTE(review): the recorded output of `best_model_checkpoint_path` shows a `bertopic_random_search_...`
# path, which does not match the grid-search folder built here — confirm which run is intended.

# locate the search folder from the search strategy and the start time of that run
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
training_datetime = datetime(2024, 1, 24, 15, 30, 50)
training_folder = Path(f'bertopic_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

# result.json holds the best checkpoint path and the best hyperparameters of the search
training_result_json_path = training_folder / 'result.json'
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

# cached sentence embeddings; the file name carries the name of the embedding model
embeddings_path = training_folder / (
    f'embeddings_{training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"]}.pkl'
)
if not embeddings_path.exists():
    raise Exception('No embeddings found. Function terminates.')
with open(embeddings_path, 'rb') as f:
    embeddings = np.load(f)

# load the checkpoint; `topic_model` is the name the following cells work with
best_model_checkpoint_path = training_result['best_model_checkpoint']
topic_model = best_model = _load_bertopic_model(best_model_checkpoint_path)

topics, probs = topic_model.transform(X, embeddings=embeddings)

Downloading .gitattributes: 100%|██████████| 737/737 [00:00<00:00, 147kB/s]
Downloading 1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 118kB/s]
Downloading README.md: 100%|██████████| 9.85k/9.85k [00:00<00:00, 18.9MB/s]
Downloading config.json: 100%|██████████| 591/591 [00:00<00:00, 1.46MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 194kB/s]
Downloading data_config.json: 100%|██████████| 15.7k/15.7k [00:00<00:00, 13.7MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:44<00:00, 9.82MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 139kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 1.14MB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.57MB/s]
Downloading tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 864kB/s]
Downloading train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 27.6MB/s]
Downloading vocab.txt: 

In [21]:
# show the checkpoint path of the best model, as read from result.json
best_model_checkpoint_path

'bertopic_random_search_20240124_153050/bertopic_sb_model_name_or_path_all-mpnet-base-v1_hs_min_samples_40_hs_min_cluster_size_180_bt_top_n_words_10_bt_nr_topics_70'

In [23]:
# topic frequency table: one row per topic with its document count
freq = topic_model.get_topic_freq()
print(freq)
print('Num of topics:', len(freq))
print('\n\n')

# document counts: overall, assigned to the outlier topic (-1), and assigned to a real topic
n_docs_total = freq['Count'].sum()
n_docs_outlier = freq.loc[freq['Topic'] == -1, 'Count'].sum()
n_docs_inlier = n_docs_total - n_docs_outlier

print('Total number of docs:', n_docs_total)
print('Number of in-liers:', n_docs_inlier)
print('Ratio of in-liners:', n_docs_inlier / float(n_docs_total))

    Topic   Count
0      -1  365992
1       0  242984
44      1   13984
34      2   12013
2       3   10084
..    ...     ...
63     65     233
57     66     211
55     67     196
38     68     192
51     69     184

[71 rows x 2 columns]
Num of topics: 71



Total number of docs: 741791
Number of in-liers: 375799
Ratio of in-liners: 0.5066103525116913


Outlier reduction

In [28]:
# Reduce outliers: re-assign documents of the outlier topic (-1) to regular topics.
# Docs:    https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html
# Article: https://medium.com/@n83072/topic-modeling-bertopic-ca1b73a035f2
#
# The alternative strategies are kept below (commented out) for reference.

# Strategy `probabilities`
# Uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document.
# To use this, make sure to calculate the probabilities beforehand by instantiating BERTopic with calculate_probabilities=True.
# It will reduce the number of outliers to 0.
# new_topics = topic_model.reduce_outliers(X, topics, probabilities=probs, strategy="probabilities")


# Strategy `distributions`
# Uses the topic distributions, as calculated with .approximate_distribution,
# to find the most frequent topic in each outlier document.
# The distributions_params variable can be used to tweak the parameters of .approximate_distribution.
# new_topics = topic_model.reduce_outliers(X, topics, strategy="distributions")


# Strategy `c-tf-idf`
# Calculates the c-TF-IDF representation for each outlier document
# and finds the best matching c-TF-IDF topic representation using cosine similarity.
# Depending on the similarity ratio, it will not reduce the number of outliers to 0.
# new_topics = topic_model.reduce_outliers(X, topics, strategy="c-tf-idf")

# Strategy `embeddings` (the one used here)
# It also reduces the number of outliers to 0,
# but it costs a huge reduction in the NPMI score;
# maybe other, less aggressive strategies should be used.
new_topics = topic_model.reduce_outliers(X, topics, strategy="embeddings")

In [29]:
from collections import Counter

# number of documents per topic id after outlier reduction
new_topic_dict = dict(Counter(new_topics))

# same counts as a table, largest topic first
new_topic_dict_df = (
    pd.DataFrame(list(new_topic_dict.items()), columns=['topic_id', 'count'])
    .sort_values(by='count', ascending=False)
)

new_topic_dict_df

Unnamed: 0,topic_id,count
0,0,58320
4,1,5512
6,3,3376
7,2,3228
1,4,2710
2,5,1243
5,6,448
9,7,315
3,8,271
8,9,71


In [27]:
# check whether any document is still assigned to the outlier topic (-1); an empty frame means none
new_topic_dict_df[new_topic_dict_df['topic_id'] == -1]

Unnamed: 0,topic_id,count


In [46]:
# try to apply the outlier-reduced topic assignments (`new_topics`) to the BERTopic model

topic_model.update_topics(X, topics=new_topics)



In [24]:
# save the model (different from the func for small documents)
from datetime import datetime

# timestamped folder name, so each save goes to its own folder
topic_model_folder_path = Path(f'my_model_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
# NOTE(review): SENTENCE_TRANSFORMERS_NAME is not defined in this cell — it must already exist in the kernel
topic_model.save(
    path=topic_model_folder_path,
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model=SENTENCE_TRANSFORMERS_NAME
)

In [24]:
# get the representative docs for each topic (nr_repr_docs=5 per topic in the call below)

# Approximate the most representative documents per topic by sampling
# a subset of the documents in each topic and calculating which are
# most representative of their topic based on the cosine similarity between
# c-TF-IDF representations

# the method is called internally in the fit_transform method;
# it is called manually here so that .get_topic_info() can work properly when the model was reloaded from disk
# NOTE: `_extract_representative_docs` is a private BERTopic method (leading underscore), so its
# signature may differ between BERTopic versions
repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids = topic_model._extract_representative_docs(
    topic_model.c_tf_idf_,
    pd.DataFrame({"Document": X, "ID": range(len(X)), "Topic": topics}),
    topic_model.topic_representations_,
    nr_samples=500,
    nr_repr_docs=5
)

In [15]:
# inspect a single document of X by its index
X[13283]

"This game. THIS GAME. I'ts freaking AWESOME!!! Some people say it's a 'cheap 2D rip-off of Minecraft', and to that i say NOOOO!!! This game has so many aspects in its gameplay other than mining and building (and I don't say Minecraft is bad), it's just that these two have different 'categories', Terraria is a '2D Sandbox Adventure game with an enphasis on combat', while Minecraft is a '3D, Sandbox Survival Adventure game based around building', without counting that Minecraft has official mod compatibility. Terraria doesn't need mods to have a lot of content, as a 'good patch' in Terraria equals to 6 new bosses, 43 new enemies (each with their own variants), 168 new items, 45 new weapons with wacky effects, 4 new NPCs, 3 new biomes, and an ENTIRE DIFFICULTY SETTING (add that to the fact that Terraria haas had at least 4 'good patches' and you got yourself a game for an entire year!) :D The replay value is excellent, because of the completely random worlds, along with the added difficu

In [25]:
# mapping from topic id to its representative documents (see the output: keys such as -1, values are lists of texts)
# the mapping is in no particular order

repr_docs_mappings

{-1: [" I originally Kickstarted this one a long time ago before its second name change. What was originally known as Ravensdale is now finally released as Rogue Stormers. I didn't have that high of but perhaps too high, as I'm quite disappointed and underwhelmed by this final release. I don't care that much how it changed over development like others seem I just care about what we have now. I put in over a dozen hours, unlocked 2 extra characters, got a bunch of and I'm honestly bored with it, despite never seeing to the end of the game. This is probably as much as I'll ever play it. When you first start playing, it's impressive. The theme is clearly Warhammer, but for whatever reason, they didn't license it, even though that license is given out in backalleys at this point to anyone that wants it. Have you seen some of the Warhammer games on Steam these In any case, it's space marines and orks. It looks nice. It does have good controls, and switches between controller/keyboard on the

In [34]:
# IDs of the representative documents, one inner list per topic
# a topic can list several IDs that show the same text, possibly because the documents became identical after cleaning

repr_docs_ids

[[54289, 13639, 52940, 48158, 58598],
 [67579, 20794, 59433, 3479, 17498],
 [57212, 15351, 2215, 840, 16126],
 [25920, 53572, 58212, 47275, 19912],
 [73778, 12575, 58789, 45070, 48797],
 [75408, 33888, 57220, 64615, 31561],
 [21605, 43324, 30557, 61891, 29372],
 [39756, 23704, 16397, 15297, 128],
 [18683, 20753, 71660, 45220, 25136],
 [18950, 51877, 29764, 40571, 7625],
 [3931, 20742, 4400, 2910, 251]]

In [33]:
# print the representative documents of the last entry in repr_docs_ids
for i in repr_docs_ids[-1]:
    print(X[i])

its a fun game, buy it heres a wicked pie recipe Ingredients: 1. 6 tablespoons unsalted butter 2. 1/2 cup white sugar 3. 1/2 cup brown sugar 4. 1/4 cup water 5. 1/4 teaspoon cinnamon 6. 1 pinch salt 7. 5 apples - peeled, cored and sliced 8. 1 pastry for double-crust pie Directions: 1. Preheat oven to 425 degrees F (220 degrees C). 2. Combine butter, white sugar, brown sugar, water, cinnamon, and salt in a saucepan over medium heat. Bring to a boil, remove from heat and set aside. 3. Roll out half the pastry to fit a 9-inch pie plate. Place bottom crust in pie plate; pour in apple slices. 4. Roll out top crust into a 10-inch circle. Cut into 8 (1-inch) wide strips with a sharp paring knife or pastry wheel. Weave the pastry strips, one at a time, into a lattice pattern. Fold the ends of the lattice strips under the edge of the bottom crust and crimp to seal. 5. Pour butter-sugar mixture over top of pie, coating the lattice, and allowing any remaining sauce to drizzle through the crust. 6

In [35]:
# shape of the probability matrix returned by transform
probs.shape

(75494, 11)

---

Get the documents with the highest probability for each topic when transforming a new set of documents

In [62]:
# how about we use the topics and probs variable to calculate the top N representative docs
top_N = 10

# argpartition on the negated matrix along axis 0: for every column of probs, the first top_N rows
# of the result hold the row indices of the top_N largest values (in no particular order among themselves)
idx = np.argpartition(-probs, top_N, axis=0)[:top_N]

In [63]:
# in probs: row = document, col = topic
# idx therefore has top_N rows and one column per column of probs
idx.shape

(10, 21)

In [65]:
# document indices selected for the last column of probs
idx[:, -1]

array([66922, 60612, 39721, 41823, 34887, 66124,  5826, 44161, 76701,
       76489])

In [66]:
# probabilities of those documents for the last column of probs
probs[idx[:, -1], -1]

array([0.8847593 , 0.88933086, 0.89252526, 0.87341017, 0.86458516,
       0.86464214, 0.87115467, 0.86157316, 0.8588035 , 0.8588035 ],
      dtype=float32)

In [75]:
# print the documents selected for the last column of probs
for i in idx[:, -1]:
    print(X[i])

Such a great Game 10/10 -Ign
I LOVE THIS GAME ign 10/10
this game is amazing 10/10 IGN
I LOVE THIS GAME 10/10 BEST GAVE EVER IGN
its a great game 10/10 IGN rating 
This is one of the best games ever. It got 9/10 IGN
this Game is amazing 10/1o ign
Great game 10/10 IGN :)
Great game, IGN 11/10
Great game, IGN 11/10


In [77]:
# full probability rows (all columns) of the documents selected for column 0
# NOTE(review): unlike `probs[idx[:, -1], -1]`, this keeps every column, not only column 0 — confirm this is intended
scores = probs[idx[:, 0]]

In [78]:
# display the selected probability rows
scores

array([[0.83925736, 0.74457836, 0.80703235, 0.6795908 , 0.4112299 ,
        0.60110843, 0.32747   , 0.4742515 , 0.57148874, 0.11809592,
        0.37909943, 0.57450265, 0.4534629 , 0.4331435 , 0.5107883 ,
        0.52181256, 0.52877164, 0.5952083 , 0.51749295, 0.2519888 ,
        0.3827619 ],
       [0.83819544, 0.6792901 , 0.82225263, 0.59778   , 0.4467274 ,
        0.71488297, 0.4026624 , 0.4975381 , 0.5864483 , 0.17567718,
        0.37321538, 0.6265934 , 0.49665412, 0.45622283, 0.5865581 ,
        0.57748616, 0.50651133, 0.5680938 , 0.5247112 , 0.29756355,
        0.419262  ],
       [0.862828  , 0.71632946, 0.8440254 , 0.6514621 , 0.38810313,
        0.73899895, 0.35407072, 0.51394963, 0.6051907 , 0.10856348,
        0.3591198 , 0.58158875, 0.474846  , 0.46928063, 0.5575608 ,
        0.5806221 , 0.5548263 , 0.5549511 , 0.5808619 , 0.23594311,
        0.4032487 ],
       [0.8423841 , 0.7527422 , 0.79166555, 0.6698292 , 0.42994094,
        0.6526007 , 0.42818356, 0.46310222, 0.551491 

In [79]:
# (number of selected documents, number of columns of probs)
scores.shape

(10, 21)

In [16]:
# # load the embeddings
# embedding_path = Path('00_Terraria_embeddings.pkl')
# embeddings = np.load(embedding_path)

# # inference to get the topics and prob for evaluation
# # hence, we need the probs to get topic-doc-matrix
# topics, probs = topic_model.transform(X, embeddings=embeddings)

In [17]:
# shape of the probability matrix currently held in `probs`
probs.shape

(81776, 20)

Extracting Topics

In [36]:
# look at the most frequent topics 
# (get_topic_info() returns one row per topic; only the first 5 rows are displayed)

freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,27879,-1_game_10_fun_play,"[game, 10, fun, play, hours, great, good, like...",
1,0,42731,0_game_terraria_minecraft_like,"[game, terraria, minecraft, like, fun, play, j...",
2,1,1750,1_10_slime_ign_killed,"[10, slime, ign, killed, moon, unicorn, 11, lo...",
3,2,1052,2_mac_game_play_fix,"[mac, game, play, fix, help, controller, wont,...",
4,3,890,3_good_awesome_pretty_cool,"[good, awesome, pretty, cool, awsome, love, be...",


In [37]:
# words of topic 0 with their scores
topic_model.get_topic(0)  # Select the most frequent topic

[['game', 0.05619772068148005],
 ['terraria', 0.04560588141268186],
 ['minecraft', 0.04180504922424392],
 ['like', 0.03053447443751355],
 ['fun', 0.02781572270180542],
 ['play', 0.023692604503819516],
 ['just', 0.02246930526909072],
 ['2d', 0.02236894176737179],
 ['games', 0.021693425049604708],
 ['great', 0.02095166088788123]]

(Copied from the BERTopic notebook on Colab)

There are a number of attributes that you can access after having trained your BERTopic model:


| Attribute | Description |
|------------------------|---------------------------------------------------------------------------------------------|
| topics_               | The topics that are generated for each document after training or updating the topic model. |
| probabilities_ | The probabilities that are generated for each document if HDBSCAN is used. |
| topic_sizes_           | The size of each topic                                                                      |
| topic_mapper_          | A class for tracking topics and their mappings anytime they are merged/reduced.             |
| topic_representations_ | The top *n* terms per topic and their respective c-TF-IDF values.                             |
| c_tf_idf_              | The topic-term matrix as calculated through c-TF-IDF.                                       |
| topic_labels_          | The default labels for each topic.                                                          |
| custom_labels_         | Custom labels for each topic as generated through `.set_topic_labels`.                                                               |
| topic_embeddings_      | The embeddings for each topic if `embedding_model` was used.                                                              |
| representative_docs_   | The representative documents for each topic if HDBSCAN is used. (This affects evaluation when calling `get_topic_info()`; to re-calculate them, transform the provided data to obtain the topics and probabilities.) |

Save and load BERTopic models and components

Visualization

In [38]:
# visualize topics (interactive figure rendered as the cell output)

topic_model.visualize_topics()

In [40]:
# visualize topic probabilities
# to understand how confident BERTopic is that certain topics are present in the documents
# here: the probability row of the document at index 100, passed with min_probability=0.001

topic_model.visualize_distribution(probs[100], min_probability=0.001)

In [43]:
# visualize how topics are hierarchically reduced (called with top_n_topics=50)

topic_model.visualize_hierarchy(top_n_topics=50)


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [44]:
# visualize selected terms for a few topics
# by creating bar charts out of the c-TF-IDF scores for each topic representation.

topic_model.visualize_barchart(top_n_topics=10)

In [57]:
# visualize topic similarity
# Having generated topic embeddings, through both c-TF-IDF and embeddings,
# we can create a similarity matrix by simply applying cosine similarities through those topic embeddings.
# The result is a matrix indicating how similar certain topics are to each other.

topic_model.visualize_heatmap(top_n_topics=11, width=1000, height=1000)

Evaluation

Calculate metrics with octis

Reference

https://www.theanalyticslab.nl/topic-modeling-with-bertopic/

In [52]:
# container for the model output in the format used by the octis-style metrics
# (filled with the keys 'topics', 'topic-word-matrix' and 'topic-document-matrix' at the end of this cell)
result_bertopic = {}

# NOTE(review): `top_words` is not referenced by the helper functions defined in this cell
top_words = 10     # the functions will only return that number of top words
def _get_topics(topic_model):
    topic_list = []
    empty_topic_l_idx = []

    for idx, topics in topic_model.get_topics().items():
        if idx < 0:
            continue

        topics_sorted = sorted(topics, key=lambda x: x[1], reverse=True)
        topic_l = [t[0] for t in topics_sorted if t[0].strip() != '']

        # it's possible that resulting in an empty list
        # also, topic with only one word fails at calculating NPMI
        if len(topic_l) <= 1:
            empty_topic_l_idx.append(idx)
            continue

        topic_list.append(topic_l)
        # print(len(topic_l))

    return topic_list, empty_topic_l_idx

def _get_topic_word_matrix(topic_model, empty_topic_idxs):

    # use ctfidf value to calculate the probability of a word assigned to a topic
    # but this is not the probability of a word in a topic
    # maybe there's a better way

    c_tfidf_all = topic_model.c_tf_idf_.todense()

    topic_word_matrix = np.exp(c_tfidf_all) / np.exp(c_tfidf_all).sum(axis=1)

    # remove empty topics from the largest index
    for idx in empty_topic_idxs[::-1]:
        topic_word_matrix = np.delete(topic_word_matrix, idx, axis=0)

    # a better way: https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-probablities-or-distribution
    

    return topic_word_matrix

def _get_topic_document_matrix(probabilities, empty_topic_idxs):

    topic_document_matrix = probabilities.T

    for idx in empty_topic_idxs[::-1]:
        topic_document_matrix = np.delete(topic_document_matrix, idx, axis=1)

    return topic_document_matrix

# fill the result dict; the ids of the topics dropped by `_get_topics` are passed on
# so that both matrices are filtered with the same topic ids as the word lists
result_bertopic['topics'], empty_topic_idxs = _get_topics(topic_model)
result_bertopic['topic-word-matrix'] = _get_topic_word_matrix(topic_model, empty_topic_idxs)
result_bertopic['topic-document-matrix'] = _get_topic_document_matrix(probs, empty_topic_idxs)

In [53]:
# inspect the three entries of the result dict (displayed as a tuple)
result_bertopic['topics'], result_bertopic['topic-word-matrix'], result_bertopic['topic-document-matrix']

([['game', 'this', 'it', 'and', 'the', 'to', 'of', 'you', 'is', 'fun'],
  ['terraria', 'the', 'and', 'to', 'you', 'is', 'of', 'it', 'game', 'that'],
  ['minecraft', 'and', 'game', 'this', 'it', 'is', 'of', 'you', 'the', 'to'],
  ['game', 'this', 'best', 'great', 'ever', 'love', 'is', 'good', 'one', 'it'],
  ['10',
   'again',
   'killed',
   'would',
   'the',
   'my',
   'you',
   'and',
   'to',
   'unicorn'],
  ['my', 'it', 'but', 'fix', 'the', 'and', 'game', 'to', 'me', 'this'],
  ['addictive',
   'addicting',
   'fun',
   'very',
   'addicted',
   'game',
   'hours',
   'and',
   'this',
   'it'],
  ['10', 'would', 'again', '11', 'ign', 'play', 'life', 'tunk', 'my', 'good'],
  ['good',
   'ok',
   'its',
   'pretty',
   'alright',
   'it',
   'guess',
   'cool',
   'yeah',
   'okay'],
  ['bye',
   'cool',
   'slit',
   'dink',
   'so',
   'tickle',
   'pickle',
   'zone',
   'it',
   'let'],
  ['review',
   'reviews',
   'badgei',
   'le',
   'this',
   'the',
   'game',
   'badge

In [57]:
# topic frequencies as reported by the model, shown without topic -1
topic_freq = topic_model.get_topic_freq()
topic_freq[topic_freq['Topic'] != -1]

Unnamed: 0,Topic,Count
3,0,29399
0,1,14477
5,2,13139
8,3,8073
7,4,2529
12,6,1726
6,7,1547
13,5,1500
17,12,1409
14,9,1353


Evaluation with gensim

(gensim's CoherenceModel gives more freedom to control how the coherence is calculated)

In [59]:
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

# https://stackoverflow.com/questions/70548316/gensim-coherencemodel-gives-valueerror-unable-to-interpret-topic-as-either-a-l

# filter topics that contain only one word from the corpus for calculating npmi
# https://github.com/piskvorky/gensim/issues/3328


# Word lists of the usable topics, plus the ids of the topics that were dropped
# because at most one usable word was left (those break the NPMI computation).
topic_words, empty_topic_l_idx = _get_topics(topic_model)

documents = pd.DataFrame({"Document": X,
                          "ID": range(len(X)),
                          "Topic": topics})

# Remove the documents whose topic contains at most one word.
# The id list returned just above is used here; the cell previously read
# `empty_topic_idxs`, a variable that only exists if the octis cell has been
# executed earlier in the same kernel session.
documents = documents[~documents['Topic'].isin(empty_topic_l_idx)]

# Join all documents of a topic into a single string per topic, then run the
# model's own text preprocessing over those strings.
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Tokenise with the analyzer of the model's vectorizer so the tokens are built
# by the same analyzer the vectorizer uses.
bertopic_vectorizer = topic_model.vectorizer_model
bertopic_analyzer = bertopic_vectorizer.build_analyzer()

words = bertopic_vectorizer.get_feature_names_out()      # vectorizer vocabulary (not used further in this cell)
tokens = [bertopic_analyzer(doc) for doc in cleaned_docs]

# gensim inputs for CoherenceModel: token dictionary and bag-of-words corpus
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

In [60]:
# ~3 min on i714700 with CountVectorizer ~ 6000 words

# first compute the C_V coherence (coherence='c_v'); NPMI is computed separately in the next cell

coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                corpus=corpus,
                                dictionary=dictionary,
                                topn=10,
                                coherence='c_v')

# earlier octis-based alternative, kept for reference only:
# npmi = Coherence(texts=tokens,topk=10, measure='c_npmi')
# nmpi_score = npmi.score(result_bertopic)

cv_score = coherence_model.get_coherence()
cv_score


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.3994560925733617

In [61]:
# NPMI coherence (coherence='c_npmi') over the same topics, tokens, corpus and dictionary
# as the C_V computation, again using the top 10 words per topic
coherence_model_npmi = CoherenceModel(topics=topic_words,
                                    texts=tokens,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    topn=10,
                                    coherence='c_npmi')

npmi_score = coherence_model_npmi.get_coherence()
npmi_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

-0.0029256775418027474

In [31]:
def get_topic_diversity(topics, topk=10):
    ''' Topic Diversity as the proportion of unique words in the top M words of all topics
    Modified from octis implementation

    Parameters
    ----------
    topics : list of list of str
        List of topics, where each topic is a list of words.
    topk : int, optional
        Number of leading words of each topic that are taken into account.

    Returns
    -------
    td : float
        Number of distinct words among the top ``topk`` words of all topics,
        divided by ``topk * len(topics)``. 0 is returned when ``topics`` is
        ``None`` or contains no topic.
    '''
    # nothing to score: an empty topic list is treated like None instead of
    # failing with a division by zero
    if topics is None or len(topics) == 0:
        return 0

    # Unlike octis, topics with fewer than `topk` words are accepted here; they
    # still count as `topk` words in the denominator.
    unique_words = set().union(*(topic[:topk] for topic in topics))
    td = len(unique_words) / (topk * len(topics))
    return td

# topic diversity of the filtered topic word lists (default topk=10)
get_topic_diversity(topic_words)

0.655

In [33]:
import itertools

import sys
sys.path.append('../')

from rbo import rbo

def get_word2index(list1, list2):
    """Assign an integer index to every distinct word found in the two lists.

    The indices run from 0 to (number of distinct words - 1); which word gets
    which index follows the iteration order of the combined set.
    """
    vocabulary = set(list1) | set(list2)
    return dict(zip(vocabulary, range(len(vocabulary))))

def get_inverted_RBO(topics, topk=10, weight=0.9):
    ''' Inverted Rank-Biased Overlap (iRBO), a measure of topic diversity.
    Modified from octis implementation

    Every pair of topics is compared with ``rbo`` on their first ``topk``
    words; the result is one minus the mean of those pairwise values.

    Parameters
    ----------
    topics : list of list of str
        List of topics, where each topic is a list of words.
    topk : int, optional
        Number of leading words of each topic that are compared.
    weight : float, optional
        Passed to ``rbo`` as its ``p`` argument.

    Returns
    -------
    0 when ``topics`` is None, otherwise ``1 - mean(pairwise rbo values)``.

    Raises
    ------
    Exception
        If the first topic has fewer than ``topk`` words.
    '''
    if topics is None:
        return 0
    # only the first topic's length is checked against topk
    if len(topics[0]) < topk:
        raise Exception('Words in topics are less than topk')

    def _pair_overlap(first, second):
        # rbo works on integer ids, so translate both word lists first
        lookup = get_word2index(first, second)
        ids_first = [lookup[word] for word in first]
        ids_second = [lookup[word] for word in second]
        return rbo(ids_first[:topk], ids_second[:topk], p=weight)[2]

    overlaps = [_pair_overlap(left, right)
                for left, right in itertools.combinations(topics, 2)]
    return 1 - np.mean(overlaps)
    
# inverted RBO of the filtered topic word lists (default topk=10, weight=0.9)
get_inverted_RBO(topic_words)

0.9363353717539098

In [34]:
def _KL(P, Q):
    """
    Perform Kullback-Leibler divergence

    Parameters
    ----------
    P : distribution P
    Q : distribution Q

    Returns
    -------
    divergence : divergence from Q to P
    """
    # add epsilon to grant absolute continuity
    epsilon = 0.00001
    P = P+epsilon
    Q = Q+epsilon

    divergence = np.sum(np.multiply(P, np.log(P/Q)))        # changed the operator from * to np.multiply to do element-wise multiplication
    return divergence

def get_kl_divergence(topic_word_metrix):
    """Average pairwise KL divergence between the rows of a topic-word matrix.
    Used here as a measure of document coverage.
    Modified from octis implementation
    https://github.com/MIND-Lab/OCTIS/blob/master/octis/evaluation_metrics/diversity_metrics.py#L209

    Parameters
    ----------
    topic_word_metrix : topic-word distribution matrix

    Returns
    -------
    Sum of ``_KL(row_i, row_j)`` over all index pairs ``i < j``, divided by
    the number of such pairs.
    """
    row_pairs = list(itertools.combinations(range(len(topic_word_metrix)), 2))

    total = 0
    for first, second in row_pairs:
        total += _KL(topic_word_metrix[first], topic_word_metrix[second])

    return total / len(row_pairs)

# average pairwise KL divergence between the rows of the topic-word matrix
get_kl_divergence(result_bertopic['topic-word-matrix'])

0.00022574783055084367

In [35]:
# sanity check: dimensions of the topic-word matrix
result_bertopic['topic-word-matrix'].shape

(21, 6968)

Inference Test

In [None]:
# Sample reviews for a quick inference check: one long review, one short
# one-line review, and one multi-line review containing text art with non-ASCII characters.
inference_test = ["well its been fun guys, but that's it, no more updates, that one was the last one, there is no longer going to be anymore content for this game anymore, there is no way to replay it as there won't be any updates, nope, that was it, the last update, nothing more, this game has no new ways to experience it as there is no more content updates, nothing new to freshen up the experience, its such a shame that this game has no replay-ability, once you beat the game there is like no point to playing again, as they said guys 1.2 will be they final update. nothing more after 1.2, there is no chance they will make another final update right? several years and final updates later: alright, thats it, no more updates we wont be getting anymore, thats it, nothing more, no more updates, for real this time... oh god, redigit made another tweet.",
                  "keeps forcing me to play it",
'''I will leave the cat here, so that everybody who passes by can pet it and give it a thumbs up and awards
　　　 　　／＞　　フ
　　　 　　| 　_　 _ l
　 　　 　／` ミ＿xノ
　　 　 /　　　 　 |
　　　 /　 ヽ　　 ﾉ
　 　 │　　|　|　|
　／￣|　　 |　|　|
　| (￣ヽ＿_ヽ_)__)
　＼二つ''']

In [1]:
from bertopic import BERTopic