Demo ipynb for BERTopic

Testing the pipeline for a single game

Ref

BERTopic tutorial

https://colab.research.google.com/drive/1FieRA9fLdkQEGDIMYl0I3MCjSUKVF8C-?usp=sharing#scrollTo=ScBUgXn06IK6


BERTopic Best Practices

https://colab.research.google.com/drive/1BoQ_vakEVtojsd2x_U6-_x52OOuqruj2?usp=sharing#scrollTo=m3aN-f9B4rmU


BERTopic Big data (for improving the speed of the training pipeline, on GPU)

https://colab.research.google.com/drive/1W7aEdDPxC29jP99GGZphUlqjMFFVKtBC?usp=sharing#scrollTo=Ls2Q-iccGs7O


BERTopic Topic Modelling with Llama2

https://colab.research.google.com/drive/1QCERSMUjqGetGGujdrvv_6_EeoIcd_9M?usp=sharing#scrollTo=4Uj8MYhCafmX

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
from datetime import datetime

import gensim

import nltk

import pyLDAvis

In [2]:
# Load the pickled review DataFrame for a single game and print its schema.
dataset_dir = Path('../../dataset/topic_modelling/top_10_games')
dataset_path = dataset_dir / '00_Terraria.pkl'

dataset = pd.read_pickle(dataset_path)
dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 75499 entries, 57735 to 133233
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         75499 non-null  int64 
 1   app_id        75499 non-null  int64 
 2   app_name      75499 non-null  object
 3   review_text   75499 non-null  object
 4   review_score  75499 non-null  int64 
 5   review_votes  75499 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 4.0+ MB


In [3]:
%load_ext autoreload

In [4]:
# data preprocessing

import re

import sys
sys.path.append('../../sa')

%autoreload 2
import str_cleaning_functions


def cleaning(df, review):
    """Clean the text column ``review`` of ``df`` in place.

    The column is passed, in order, through the ``str_cleaning_functions``
    helpers ``remove_links``, ``remove_links2``, ``clean``, ``deEmojify`` and
    ``unify_whitespaces``. The cleaned values are written back to
    ``df[review]``; nothing is returned.
    """
    pipeline = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.unify_whitespaces,
    )

    # each step sees the output of the previous one
    for step in pipeline:
        df[review] = df[review].apply(step)

def cleaning_strlist(str_list):
    """Return a new list with every string of ``str_list`` cleaned.

    Applies the same ``str_cleaning_functions`` steps, in the same order, as
    ``cleaning`` does for a DataFrame column: ``remove_links``,
    ``remove_links2``, ``clean``, ``deEmojify`` and ``unify_whitespaces``.
    """
    pipeline = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.unify_whitespaces,
    )

    cleaned = list(str_list)
    # run one step over the whole list before moving on to the next step
    for step in pipeline:
        cleaned = [step(text) for text in cleaned]
    return cleaned

In [5]:
# clean the 'review_text' column in place (cleaning() writes the result back into the DataFrame)
cleaning(dataset, 'review_text')

In [6]:
# the cleaned review texts are the documents fed to the topic model
X = dataset['review_text'].values

In [7]:
# drop reviews that are empty strings; X becomes a plain Python list

X = [doc for doc in X if len(doc) > 0]

Training

For small document collections, simply call BERTopic's built-in `fit_transform` and the training is done in one step.

For large document collections, it is better to pre-calculate the embeddings and prepare the vocabulary before training to reduce memory usage.

In [8]:
# small documents

# from bertopic import BERTopic

# TOP_N_WORDS = 10                # number of words per topic
# N_GRAM_RANGE = (1, 2)           # n-gram

# topic_model = BERTopic(language="english", top_n_words=TOP_N_WORDS, calculate_probabilities=True, verbose=True)
# topics, probs = topic_model.fit_transform(X)

In [9]:
import platform
import torch

# Pick the torch device used for computing sentence embeddings:
#   - Linux / Windows: CUDA when available, otherwise CPU
#   - anything else (macOS): MPS when this torch build/machine supports it, otherwise CPU
if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
elif torch.backends.mps.is_available():
    device = torch.device('mps')        # m-series machine
else:
    device = torch.device('cpu')        # e.g. Intel Mac, or torch built without MPS support

print(device)

mps


Build the hyperparameter selection (typically the number of topics, but UMAP and HDBSCAN also have hyperparameters)

with grid/random search

In [10]:
from gensim.models import CoherenceModel
from copy import deepcopy

from sklearn.model_selection import ParameterGrid, ParameterSampler

sys.path.append('../')

from eval_metrics import compute_inverted_rbo, compute_topic_diversity, compute_pairwise_jaccard_similarity, \
                        METRICS, SEARCH_BEHAVIOUR, COHERENCE_MODEL_METRICS

# TODO: write sth like lda_demo_gridsearch.ipynb to select hyperparameters

In [11]:
from typing import Iterable, List, Tuple, Union


def _init_sentence_transformers_params(model_name_or_path: str = None):
    
    params_dict = {}
    params_dict['model_name_or_path'] = model_name_or_path

    return params_dict

def _init_vocab_tokenizer_params(n_frequency:int = 0, ngram_range:Tuple[int, int] = (1, 1)):

    params_dict = {}
    params_dict['n_frequency'] = n_frequency
    params_dict['ngram_range'] = ngram_range

    return params_dict

def _init_umap_params(n_neighbors:int = 15,     # the number of neighbors to consider when approximating the local metric
                      n_components:int = 5,     # the target embedding dimension, its effect is largest on the performance of HDBSCAN. Increasing this value too much and HDBSCAN will have a hard time clustering the high-dimensional embeddings
                      metric:str = 'cosine', 
                      min_dist:float = 0.1,     # the desired separation between close points in the embedding space
                      n_epochs:int = None,      # the number of training epochs to use when optimizing the low dimensional representation
                      low_memory:bool = False,
                      random_state:int = None):
    
    '''
    Suggested parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html
    '''

    params_dict = {}
    params_dict['n_neighbors'] = n_neighbors
    params_dict['n_components'] = n_components
    params_dict['metric'] = metric
    params_dict['min_dist'] = min_dist
    params_dict['n_epochs'] = n_epochs
    params_dict['low_memory'] = low_memory
    params_dict['random_state'] = random_state

    return params_dict

def _init_hdbscan_params(min_cluster_size:int = 5,
                            min_samples:int = None, 
                            metric:str = 'euclidean',
                            prediction_data:bool = True):
        '''
        Suggested parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html
        '''
    
        params_dict = {}
        params_dict['min_cluster_size'] = min_cluster_size
        params_dict['min_samples'] = min_samples
        params_dict['metric'] = metric
        params_dict['prediction_data'] = prediction_data
    
        return params_dict

def _init_bertopic_params(language: str = "english",        # used to simplify the selection of sentence-transformers models, but since we are passing our own sbert model, this can be ignored.
                 top_n_words: int = 10,
                #  n_gram_range: Tuple[int, int] = (1, 1),
                 min_topic_size: int = 10,
                 nr_topics: Union[int, str] = None,
                 low_memory: bool = False,
                 calculate_probabilities: bool = False,
                 seed_topic_list: List[List[str]] = None,
                 zeroshot_topic_list: List[str] = None,
                 zeroshot_min_similarity: float = .7):
    
    '''
    Parameter tuning: https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html
    '''
     
    params_dict = {}
    params_dict['language'] = language                     # put in the params_dict for completeness (as it appears in Parameter tuning in BERTopic page)
    params_dict['top_n_words'] = top_n_words                # this also affects the number of top N words used to calculate metrics
    # params_dict['n_gram_range'] = n_gram_range            # this is controlled by the vocab_tokenizer_params
    params_dict['min_topic_size'] = min_topic_size
    params_dict['nr_topics'] = nr_topics
    params_dict['low_memory'] = low_memory
    params_dict['calculate_probabilities'] = calculate_probabilities
    params_dict['seed_topic_list'] = seed_topic_list
    params_dict['zeroshot_topic_list'] = zeroshot_topic_list
    params_dict['zeroshot_min_similarity'] = zeroshot_min_similarity

    return params_dict


In [34]:
def _init_config_dict(config_path:Path, model_name:str, hyperparameters:dict, search_space_dict:dict, 
                      metrics:list[METRICS], monitor:METRICS,
                      search_behaviour:SEARCH_BEHAVIOUR, search_rs:int, search_n_iter:int):
    '''
    Create the search config file, or load an existing one and validate it against the inputs.

    If config_path does not exist, a config dict is built and written to it as JSON. It holds:
    the model name, the five hyperparameter groups (each completed with the defaults of its
    _init_* helper, minus every hyperparameter that appears in search_space_dict), the search
    space, the metric values, the monitor value, the search behaviour and, for random search,
    search_rs / search_n_iter.

    If config_path exists, it is loaded and checked for consistency with the inputs.

    Parameters:
        config_path: location of config.json
        model_name: value stored under / compared with config['model']
        hyperparameters: dict with the keys 'sbert_params', 'vocab_tokenizer_params',
            'umap_params', 'hdbscan_params' and 'bertopic_params'; each value is passed as
            keyword arguments to the matching _init_* helper
        search_space_dict: {group name: {hyperparameter name: candidate values}}
        metrics, monitor, search_behaviour: enum members; their .value is stored / compared
        search_rs, search_n_iter: only stored / compared for SEARCH_BEHAVIOUR.RANDOM_SEARCH

    Returns:
        the config dict

    Raises:
        AssertionError: the existing config file is inconsistent with the inputs
        KeyError: a hyperparameter group is missing from hyperparameters
    '''

    # (config key, helper that fills in the defaults), in the order they are written to the file
    param_groups = (
        ('sbert_params', _init_sentence_transformers_params),
        ('vocab_tokenizer_params', _init_vocab_tokenizer_params),
        ('umap_params', _init_umap_params),
        ('hdbscan_params', _init_hdbscan_params),
        ('bertopic_params', _init_bertopic_params),
    )

    def _resolve_hyperparameters():
        # complete every group with the defaults of its _init_* helper
        return {group: init(**hyperparameters[group]) for group, init in param_groups}

    def _as_json(value):
        # JSON has no tuple type: a tuple written to the config file is read back as a list.
        # Normalise the in-memory value so that e.g. ngram_range=(1, 2) equals the stored [1, 2].
        if isinstance(value, (list, tuple)):
            return [_as_json(item) for item in value]
        if isinstance(value, dict):
            return {key: _as_json(item) for key, item in value.items()}
        return value

    metric_values = [metric.value for metric in metrics]

    if not config_path.exists():
        resolved = _resolve_hyperparameters()

        config = {}
        config['model'] = model_name

        # hyperparameters that are in the search_space_dict are left out of the fixed groups
        for group, _ in param_groups:
            searched = search_space_dict.get(group, {})
            config[group] = {key: value for key, value in resolved[group].items() if key not in searched}

        config['search_space'] = search_space_dict

        config['metrics'] = metric_values

        config['monitor'] = monitor.value

        config['search_behaviour'] = search_behaviour.value
        if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
            config['search_rs'] = search_rs
            config['search_n_iter'] = search_n_iter

        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        print('Created config file at {}'.format(config_path))

    else:
        with open(config_path, 'r') as f:
            config = json.load(f)

        # check whether the config file is consistent with the input parameters
        assert config['model'] == model_name, 'input model_name is not consistent with config["model"]'
        assert config['metrics'] == metric_values, 'input metrics is not consistent with config["metrics"]'
        assert config['monitor'] == monitor.value, 'input monitor is not consistent with config["monitor"]'
        assert config['search_behaviour'] == search_behaviour.value, 'input search_behaviour is not consistent with config["search_behaviour"]'
        if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
            assert config['search_rs'] == search_rs, 'input search_rs is not consistent with config["search_rs"]'
            assert config['search_n_iter'] == search_n_iter, 'input search_n_iter is not consistent with config["search_n_iter"]'

        # check the fixed hyperparameters: the stored groups may only contain known keys
        # (searched keys were removed when the file was created) and must hold the same values
        resolved = _resolve_hyperparameters()

        for group, _ in param_groups:
            stored = config[group]
            expected = resolved[group]

            assert stored.keys() <= expected.keys(), f'existing config["{group}"] contains additional hyperparameters'
            for key in stored:
                assert _as_json(expected[key]) == stored[key], f'existing config["{group}"] contains different hyperparameters'

        # check the search space: same groups, same keys per group, same candidate values
        stored_space = config['search_space']

        assert stored_space.keys() == search_space_dict.keys(), 'input search_space_dict contains different parameter groups than existing config["search_space"]'
        for group in stored_space:
            assert stored_space[group].keys() == search_space_dict[group].keys(), f'input search_space_dict["{group}"] contains different hyperparameter keys than existing config["search_space"]["{group}"]'
            for key in search_space_dict[group]:
                assert _as_json(search_space_dict[group][key]) == stored_space[group][key], f'input search_space_dict["{group}"]["{key}"] contains different value than existing config["search_space"]["{group}"]["{key}"]'

        print('Loaded existing config file from {}'.format(config_path))
        print('Hyperparameters and search space are consistent with the input parameters')

    return config

In [22]:
def _init_result_dict(result_path: Path, monitor_type:str):
        
    if not result_path.exists():
        result = {}

        result['best_metric'] = -float('inf')
        result['best_model_checkpoint'] = ""
        result['best_hyperparameters'] = dict()
        result["monitor_type"] = monitor_type
        result["log_history"] = list()

        # with open(result_path, 'w') as f:
        #     json.dump(result, f, indent=2)

        # print('Created result file at {}'.format(result_path))

    else:
        with open(result_path, 'r') as f:
            result = json.load(f)

        assert result['monitor_type'] == monitor_type

        print('Loaded existing result file from {}'.format(result_path))
    
    return result

In [14]:
def _get_topics(topic_model):
    topic_list = []
    empty_topic_l_idx = []

    for idx, topics in topic_model.get_topics().items():
        if idx < 0:
            continue

        topics_sorted = sorted(topics, key=lambda x: x[1], reverse=True)
        topic_l = [t[0] for t in topics_sorted if t[0].strip() != '']

        # it's possible that resulting in an empty list
        # also, topic with only one word fails at calculating NPMI
        if len(topic_l) <= 1:
            empty_topic_l_idx.append(idx)
            continue

        topic_list.append(topic_l)
        # print(len(topic_l))

    return topic_list, empty_topic_l_idx

def _get_topic_word_matrix(topic_model, empty_topic_idxs):
    # use ctfidf value to calculate the probability of a word assigned to a topic
    # but this is not the probability of a word in a topic
    # maybe there's a better way

    c_tfidf_all = topic_model.c_tf_idf_.todense()

    topic_word_matrix = np.exp(c_tfidf_all) / np.exp(c_tfidf_all).sum(axis=1)

    # remove empty topics from the largest index
    for idx in empty_topic_idxs[::-1]:
        topic_word_matrix = np.delete(topic_word_matrix, idx, axis=0)

def _get_topic_document_matrix(probs, empty_topic_idxs):
    topic_document_matrix = probs.T

    for idx in empty_topic_idxs[::-1]:
        topic_document_matrix = np.delete(topic_document_matrix, idx, axis=1)

    return topic_document_matrix

In [15]:
from bertopic import BERTopic

def _load_bertopic_model(model_path:Path):
    '''Load and return the BERTopic model saved at model_path (passed to BERTopic.load as a str).'''
    return BERTopic.load(str(model_path))

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
from gensim import corpora

from sentence_transformers import SentenceTransformer

import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN


def model_search(X, hyperparameters:dict, search_space:dict, save_folder:Path,
                metrics:list[METRICS]=[METRICS.C_NPMI], monitor:METRICS=METRICS.C_NPMI, 
                save_each_models=True, run_from_checkpoints=False,
                search_behaviour=SEARCH_BEHAVIOUR.GRID_SEARCH, search_rs=42, search_n_iter=10):
    '''
    Grid / random search over BERTopic pipelines, keeping the model with the best monitor score.

    For every candidate of the search space a BERTopic model (SentenceTransformer embeddings,
    UMAP, HDBSCAN, CountVectorizer) is trained on X, scored with the requested metrics and
    logged to result.json inside save_folder. A folder is created per candidate, so candidates
    whose folder already exists are skipped when the search is resumed.

    Parameters:
        X: the documents (strings) to train on
        hyperparameters: fixed hyperparameters, one dict per group ('sbert_params',
            'vocab_tokenizer_params', 'umap_params', 'hdbscan_params', 'bertopic_params')
        search_space: {group name: {hyperparameter name: candidate values}}
        save_folder: folder holding config.json, result.json, the cached embeddings and
            one sub-folder per candidate
        metrics: metrics to compute for every candidate (the default list is never mutated)
        monitor: the metric used to pick the best model; must be in metrics
        save_each_models: save every trained model into its candidate folder
        run_from_checkpoints: resume a previous search stored in save_folder
        search_behaviour: SEARCH_BEHAVIOUR.GRID_SEARCH or SEARCH_BEHAVIOUR.RANDOM_SEARCH
        search_rs, search_n_iter: random state / number of candidates for random search

    Returns:
        (best_model, best_model_path, best_hyperparameters). When no candidate of this call
        beats the stored best, these are the values restored from result.json (or
        None / "" / {} if there is no previous result).

    Raises:
        Exception: monitor not in metrics, missing / unexpected checkpoints, unknown search
            behaviour, unknown search-space key or unknown metric
    '''

    config_json_path = save_folder.joinpath('config.json')
    result_json_path = save_folder.joinpath('result.json')

    if monitor not in metrics:
        raise Exception('monitor is not in metrics. Please modify the metrics passed in.')

    if run_from_checkpoints:
        if not save_folder.exists():
            print('Save folder:' + str(save_folder.resolve()) + ' does not exist. Function terminates.')
            raise Exception('No checkpoints found. Function terminates.')
        
        # check for existing configs
        if not config_json_path.exists():
            raise Exception('No config.json found. Function terminates.')
        
        # check for existing results
        if not result_json_path.exists():
            print('no result.json is found. Assuming no existing checkpoints.')
    else:
        if save_folder.exists():
            raise Exception('Checkpoints found. Please delete the checkpoints or set run_from_checkpoints=True. Function terminates.')

    if not save_folder.exists():
        save_folder.mkdir(parents=True)

    config = _init_config_dict(config_json_path, 'bertopic', hyperparameters, search_space, 
                               metrics, monitor, search_behaviour, search_rs, search_n_iter)
    
    result = _init_result_dict(result_json_path, monitor.value)

    print('Search folder: {}'.format(save_folder))

    # init (restore the best-so-far state, so that a resumed search which finds nothing better
    # still returns and stores the previous best)
    best_model_path = result['best_model_checkpoint']
    best_metric_score = result['best_metric']
    best_hyperparameters = result['best_hyperparameters']
    best_model = _load_bertopic_model(Path(best_model_path)) if best_model_path != "" else None

    print(f'Best model checkpoint: {best_model_path}')
    print(f'Best metric score: {best_metric_score}')
    print(f'Best model: {best_model}')

    # search
    # create a temporary dict for initiating the search space by sklearn parameter grid / parameter sampler
    # keys are flattened to '<group>__<hyperparameter>'
    temp_search_space = {}
    for k, v in search_space.items():
        for kk, vv in v.items():
            temp_search_space[k + '__' + kk] = vv

    if search_behaviour == SEARCH_BEHAVIOUR.GRID_SEARCH:
        search_iterator = ParameterGrid(temp_search_space)
    elif search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
        search_iterator = ParameterSampler(temp_search_space, n_iter=search_n_iter, random_state=search_rs)
    else:
        raise Exception('Unknown search behaviour: {}'.format(search_behaviour))

    print('\n')

    for search_space_dict in search_iterator:
        # unwrap the search space dict

        model_name = ''

        _sbert_params = {}
        _vocab_tokenizer_params = {}
        _umap_params = {}
        _hdbscan_params = {}
        _bertopic_params = {}

        for k, v in search_space_dict.items():
            if k.startswith('sbert_params'):
                _sbert_params[k.split('__')[1]] = v
                model_name += 'sb_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('vocab_tokenizer_params'):
                _vocab_tokenizer_params[k.split('__')[1]] = v
                model_name += 'vt_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('umap_params'):
                _umap_params[k.split('__')[1]] = v
                model_name += 'um_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('hdbscan_params'):
                _hdbscan_params[k.split('__')[1]] = v
                model_name += 'hs_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('bertopic_params'):
                _bertopic_params[k.split('__')[1]] = v
                model_name += 'bt_' + k.split('__')[1] + '_' + str(v) + '_'
            else:
                raise Exception('Unknown key: {}'.format(k))

        # create the model path to save the model
        model_path = save_folder.joinpath(config['model'] + '_' + model_name[:-1])      # remove the last '_'

        # check whether the model exists (the folder is created for every evaluated candidate)
        if model_path.exists():
            print(f'Skipping current search space: {search_space_dict}')
            continue

        ##########
        # Training starts
        ##########

        print(f'Current search space: {search_space_dict}')

        # fixed hyperparameters from the config, overridden by the current candidate
        sbert_params = config['sbert_params'].copy()
        vocab_tokenizer_params = config['vocab_tokenizer_params'].copy()
        umap_params = config['umap_params'].copy()
        hdbscan_params = config['hdbscan_params'].copy()
        bertopic_params = config['bertopic_params'].copy()

        sbert_params.update(_sbert_params)
        vocab_tokenizer_params.update(_vocab_tokenizer_params)
        umap_params.update(_umap_params)
        hdbscan_params.update(_hdbscan_params)
        bertopic_params.update(_bertopic_params)

        # load existing embeddings in the search folder to reuse the embeddings
        # NOTE(review): the cache is shared by all candidates, so it is only valid while the
        # sbert model is not part of the search space
        embeddings_path = save_folder.joinpath('embeddings.pkl')
        if embeddings_path.exists():
            with open(embeddings_path, 'rb') as f:
                embeddings = np.load(f)

            print(f'Found existing sbert embeddings at {embeddings_path}. Reusing them.')
        else:
            # create embeddings (the encoder is only loaded when there is nothing to reuse)
            if platform.system() == 'Linux' or platform.system() == 'Windows':
                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            elif torch.backends.mps.is_available():
                device = torch.device('mps')        # m-series machine
            else:
                device = torch.device('cpu')        # e.g. Intel Mac, or torch built without MPS support

            sent_transformers = SentenceTransformer(**sbert_params,
                                                    device=device)

            embeddings = sent_transformers.encode(X, show_progress_bar=True)
            with open(embeddings_path, 'wb') as f:
                np.save(f, embeddings)
            
            print('Saved sbert embeddings at:', embeddings_path)

        # prepare the vocabulary before training
        vocab = collections.Counter()
        tokenizer = CountVectorizer().build_tokenizer()
        for doc in tqdm(X):
            vocab.update(tokenizer(doc))
        # set the minimum frequency to reduce the vocabulary size
        vocab = [word for word, frequency in vocab.items() if frequency >= vocab_tokenizer_params['n_frequency']]

        del vocab_tokenizer_params['n_frequency']       # not used in the vectorizer model for training
        vocab_tokenizer_params['ngram_range'] = tuple(vocab_tokenizer_params['ngram_range'])       # convert list to tuple

        # prepare the sub models of BERTopic
        embedding_model = SentenceTransformer(**sbert_params)       # use the model as the embedding model
        umap_model = UMAP(**umap_params, verbose=True)       # random_state (if set in umap_params) makes the reduction reproducible
        hdbscan_model = HDBSCAN(**hdbscan_params, gen_min_span_tree=True)
        vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english", **vocab_tokenizer_params)              # for computing c-tfidf (first creating a count matrix, then let c-tfidf to calculate the c-tfidf representation)

        # add 1 as BERTopic will produce an extra topic for outliers;
        # None / 'auto' are passed through untouched
        if isinstance(bertopic_params['nr_topics'], int):
            bertopic_params['nr_topics'] += 1
        topic_model = BERTopic(**bertopic_params,
            embedding_model=embedding_model, 
            vectorizer_model=vectorizer_model,
            umap_model=umap_model, 
            hdbscan_model=hdbscan_model,
            # calculate_probabilities=True,     # already in bertopic_params
            verbose=True)
        
        topics, probs = topic_model.fit_transform(X, embeddings=embeddings)

        ##########
        # Training ends
        ##########

        ##########
        # Evaluation starts
        ##########

        # init data for gensim coherence model
        topic_words, empty_topic_idxs = _get_topics(topic_model)

        documents = pd.DataFrame({"Document": X,
                                "ID": range(len(X)),
                                "Topic": topics})

        # remove documents whose topic contains at most one word
        documents = documents[~documents['Topic'].isin(empty_topic_idxs)]

        # one concatenated document per topic
        documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
        cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

        bertopic_vectorizer = topic_model.vectorizer_model
        bertopic_analyzer = bertopic_vectorizer.build_analyzer()

        tokens = [bertopic_analyzer(doc) for doc in cleaned_docs]
        dictionary = corpora.Dictionary(tokens)
        corpus = [dictionary.doc2bow(token) for token in tokens]

        topn = bertopic_params['top_n_words']
    
        # init octis format result for convenience
        result_octis = {}
        result_octis['topics'] = topic_words
        result_octis['topic-word-matrix'] = _get_topic_word_matrix(topic_model, empty_topic_idxs)
        result_octis['topic-document-matrix'] = _get_topic_document_matrix(probs, empty_topic_idxs)

        print('Computing evaluation metrics')

        metrics_score = dict()

        for metric in metrics:
            if metric in COHERENCE_MODEL_METRICS:
                # compute the coherence
                coherencemodel = CoherenceModel(topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, topn=topn, coherence=metric.value)
                score = coherencemodel.get_coherence()              

            elif metric == METRICS.TOPIC_DIVERSITY:
                # compute the topic diversity
                score = compute_topic_diversity(result_octis, topk=10)

            elif metric == METRICS.INVERTED_RBO:
                # compute the inverted RBO
                score = compute_inverted_rbo(result_octis, topk=10)

            elif metric == METRICS.PAIRWISE_JACCARD_SIMILARITY:
                # compute the pairwise Jaccard similarity
                score = compute_pairwise_jaccard_similarity(result_octis, topk=10)

            else:
                raise Exception(f'Unknown metric: {metric.value}')
            
            metrics_score[metric.value] = score

            print(f'Evaluation metric ({metric.value}): {score}')
            
        # get the monitor score
        monitor_score = metrics_score[monitor.value]

        ##########
        # Evaluation ends
        ##########
            
        ##########
        # Save models
        ##########
            
        # the folder is created even when the model itself is not saved: its existence marks
        # the candidate as done for a resumed search
        if not model_path.exists():
            model_path.mkdir()

        if save_each_models:
            topic_model.save(
                path = model_path,
                serialization="safetensors",
                save_ctfidf=True,
                save_embedding_model=sbert_params['model_name_or_path']
            )

            print('Model saved at:', model_path)

        ##########
        # Save models ends
        ##########

        ###########
        # Update result dict and json file
        ###########
        
        # rebuild the model_hyperparameters dict
        # (as trained: 'n_frequency' has been removed and an integer 'nr_topics' includes the +1)
        model_hyperparameters = {
            'sbert_params': sbert_params,
            'vocab_tokenizer_params': vocab_tokenizer_params,
            'umap_params': umap_params,
            'hdbscan_params': hdbscan_params,
            'bertopic_params': bertopic_params
        }

        if monitor_score > best_metric_score:
            best_metric_score = monitor_score
            best_model = topic_model
            best_model_path = model_path
            best_hyperparameters = model_hyperparameters

        model_log_history = dict()
        model_log_history.update(metrics_score)         # add the metrics score values to the log history
        model_log_history['hyperparameters'] = model_hyperparameters

        result['best_metric'] = best_metric_score
        result['best_model_checkpoint'] = str(best_model_path)      # relative path
        result['best_hyperparameters'] = best_hyperparameters       # of the best model, not of the latest one
        result["log_history"].append(model_log_history)

        # save result (rewritten after every candidate so an interrupted search can be resumed)
        with open(result_json_path, 'w') as f:
            json.dump(result, f, indent=2)

        print("Saved result.json at:", result_json_path)
        print('\n\n')
    
    print('Search ends')
    return best_model, best_model_path, best_hyperparameters


In [41]:
# grid search / random search
# Runs model_search over the BERTopic pipeline: base hyperparameters are given per
# component, and search_space_dict lists the candidate values to try for some of them.

# hyperparameters
# base values for each pipeline component (sentence-transformers, vocabulary/tokenizer,
# UMAP, HDBSCAN, BERTopic); keys that also appear in search_space_dict are varied per trial
sbert_params = _init_sentence_transformers_params(model_name_or_path='sentence-transformers/all-MiniLM-L6-v2')
vocab_tokenizer_params = _init_vocab_tokenizer_params(n_frequency=15, ngram_range=(1, 2))
umap_params = _init_umap_params(n_neighbors=15, n_components=5, metric='cosine', min_dist=0.0, n_epochs=None, low_memory=False)
hdbscan_params = _init_hdbscan_params(min_cluster_size=15, min_samples=20, metric='euclidean', prediction_data=True)
bertopic_params = _init_bertopic_params(
    nr_topics=20, 
    top_n_words=10, 
    min_topic_size=15, 
    low_memory=False, 
    calculate_probabilities=True)

# search space dict
# outer key = parameter group, inner key = parameter name, value = list of candidates
search_space_dict = {
    'vocab_tokenizer_params':{
        'ngram_range': [[1, 1], [1, 2]]     # datatype is list as json does not support tuple
    },
    'umap_params':{
        'n_neighbors': [15, 20, 25],
        'n_components': [5, 10],
        'min_dist': [0.0, 0.1],
    },
    'bertopic_params':{
        'nr_topics': [10, 20, 30],
        'min_topic_size': [15, 20, 25]
    }
}

# search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
search_behaviour = SEARCH_BEHAVIOUR.RANDOM_SEARCH

# training_datetime = datetime.now()
# pinned to the timestamp of an earlier run so that the same folder (and its
# config / embeddings / checkpoints) is reused together with run_from_checkpoints=True
training_datetime = datetime(2024, 1, 21, 15, 43, 45)
training_folder = Path(f'bertopic_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

# `monitor` is the metric whose score decides which model is kept as the best one;
# every metric in `metrics` is evaluated and logged for each trial
best_model, best_model_path, best_hyperparameters = model_search(
    X,
    hyperparameters={
        'sbert_params': sbert_params,
        'vocab_tokenizer_params': vocab_tokenizer_params,
        'umap_params': umap_params,
        'hdbscan_params': hdbscan_params,
        'bertopic_params': bertopic_params
    },
    search_space=search_space_dict,
    save_folder=training_folder,
    metrics=[METRICS.C_NPMI, METRICS.C_V, METRICS.UMASS, METRICS.C_UCI, METRICS.TOPIC_DIVERSITY, METRICS.INVERTED_RBO, METRICS.PAIRWISE_JACCARD_SIMILARITY],
    monitor=METRICS.C_NPMI,
    save_each_models=True,
    run_from_checkpoints=True,
    search_behaviour=search_behaviour
)

no result.json is found. Assuming no existing checkpoints.
Loaded existing config file from bertopic_random_search_20240121_154345/config.json
Hyperparameters and search space are consistent with the input parameters
Search folder: bertopic_random_search_20240121_154345
Best model checkpoint: 
Best metric score: -inf
Best model: None


Current search space: {'vocab_tokenizer_params__ngram_range': [1, 2], 'umap_params__n_neighbors': 20, 'umap_params__n_components': 10, 'umap_params__min_dist': 0.0, 'bertopic_params__nr_topics': 30, 'bertopic_params__min_topic_size': 25}
Found existing sbert embeddings at bertopic_random_search_20240121_154345/embeddings.pkl. Reusing them.


100%|██████████| 75494/75494 [00:00<00:00, 96446.04it/s] 
2024-01-21 16:40:08,139 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=10, n_neighbors=20, verbose=True)
Sun Jan 21 16:40:08 2024 Construct fuzzy simplicial set
Sun Jan 21 16:40:08 2024 Finding Nearest Neighbors
Sun Jan 21 16:40:08 2024 Building RP forest with 19 trees
Sun Jan 21 16:40:08 2024 NN descent for 16 iterations
	 1  /  16
	 2  /  16
	 3  /  16
	 4  /  16
	 5  /  16
	 6  /  16
	 7  /  16
	Stopping threshold met -- exiting after 7 iterations
Sun Jan 21 16:40:15 2024 Finished Nearest Neighbor Search
Sun Jan 21 16:40:15 2024 Construct embedding


Epochs completed:   3%| ▎          6/200 [00:00]

	completed  0  /  200 epochs


Epochs completed:  12%| █▏         24/200 [00:01]

	completed  20  /  200 epochs


Epochs completed:  22%| ██▎        45/200 [00:02]

	completed  40  /  200 epochs


Epochs completed:  32%| ███▏       63/200 [00:03]

	completed  60  /  200 epochs


Epochs completed:  42%| ████▏      84/200 [00:04]

	completed  80  /  200 epochs


Epochs completed:  52%| █████▎     105/200 [00:05]

	completed  100  /  200 epochs


Epochs completed:  62%| ██████▏    123/200 [00:06]

	completed  120  /  200 epochs


Epochs completed:  72%| ███████▎   145/200 [00:07]

	completed  140  /  200 epochs


Epochs completed:  82%| ████████▏  163/200 [00:08]

	completed  160  /  200 epochs


Epochs completed:  92%| █████████▏ 184/200 [00:09]

	completed  180  /  200 epochs


Epochs completed: 100%| ██████████ 200/200 [00:10]


Sun Jan 21 16:40:28 2024 Finished embedding


2024-01-21 16:40:28,921 - BERTopic - Dimensionality - Completed ✓
2024-01-21 16:40:28,922 - BERTopic - Cluster - Start clustering the reduced embeddings
  self._all_finite = is_finite(X)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TO

In [10]:
# large documents

# pre-calculate embeddings
# the document embeddings are computed once here and passed to BERTopic later via
# the `embeddings=` argument

from sentence_transformers import SentenceTransformer

# Create embeddings

SENTENCE_TRANSFORMERS_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# NOTE(review): `device` is not defined in this cell; it is expected to come from an earlier cell
model = SentenceTransformer(SENTENCE_TRANSFORMERS_NAME, device=device)
embeddings = model.encode(X, show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 2556/2556 [01:27<00:00, 29.16it/s]


In [11]:
# save the embeddings
# NOTE: despite the .pkl suffix the file is written with np.save (NumPy .npy format),
# so it must be read back with np.load, not pickle

embedding_path = Path('00_Terraria_embeddings.pkl')

# an existing file is never overwritten: if X changes, delete the file manually,
# otherwise stale embeddings stay on disk
if not embedding_path.exists():
    with open(embedding_path, 'wb') as f:
        np.save(f, embeddings)

In [19]:
# prepare vocabulary before training such that tokenizer does not need to do the calculations itself
# this limits the vocab considered by c-tfidf in BERTopic
# creating vocab before training to reduce RAM

import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

# Extract vocab to be used in BERTopic
# count every token over all documents, using CountVectorizer's default tokenizer
vocab = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for doc in tqdm(X):
  vocab.update(tokenizer(doc))
# keep only tokens seen at least 15 times; `vocab` becomes a plain list of words
# and the trailing `len(vocab)` displays the resulting vocabulary size
vocab = [word for word, frequency in vocab.items() if frequency >= 15]; len(vocab)    # set the minimum frequency to reduce the vocabulary size

100%|██████████| 81776/81776 [00:00<00:00, 102273.93it/s]


6891

In [20]:
# not using GPU acceleration as the GPU dependencies are hard to set up
# and the model is deployed on a CPU only server

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN


# parameter optimization
# UMAP
UMAP_N_COMPONENTS = 5
UMAP_N_NEIGHBORS = 50

# HDBSCAN
HDBSCAN_MIN_CLUSTER_SIZE = 150
HDBSCAN_MIN_SAMPLES = 20

# BERTopic
N_TOPICS = 20

# check: https://maartengr.github.io/BERTopic/faq.html#which-embedding-model-should-i-choose 
# for more parameter optimization on the UMAP and HDBSCAN models

# Prepare sub-models
# UMAP and HDBSCAN are the regular implementations imported from the `umap` and `hdbscan` packages
embedding_model = SentenceTransformer(SENTENCE_TRANSFORMERS_NAME)       # use the model as the embedding model
umap_model = UMAP(n_components=UMAP_N_COMPONENTS, n_neighbors=UMAP_N_NEIGHBORS, min_dist=0.0, random_state=42, metric="cosine", verbose=True)       # set random_state for reproducibility
hdbscan_model = HDBSCAN( min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE, min_samples=HDBSCAN_MIN_SAMPLES, gen_min_span_tree=True, prediction_data=True)
vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english", ngram_range=(1,1))              # for computing c-tfidf (first creating a count matrix, then let c-tfidf to calculate the c-tfidf representation)
# vectorizer_model = ClassTfidfTransformer()

# Assemble the BERTopic pipeline from the sub-models above
topic_model = BERTopic(
        nr_topics=N_TOPICS + 1,                 # add 1 as the topic with id = '-1' represents outliers, and should be typically ignored
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=True,
        
        verbose=True
)

# fit on the documents, reusing the pre-computed sentence embeddings;
# this runs the dimensionality reduction and the clustering (see the log output of this cell)
topics, probs = topic_model.fit_transform(X, embeddings=embeddings)

2024-01-18 16:32:56,917 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, n_neighbors=50, random_state=42, verbose=True)
Thu Jan 18 16:32:57 2024 Construct fuzzy simplicial set
Thu Jan 18 16:32:57 2024 Finding Nearest Neighbors
Thu Jan 18 16:32:57 2024 Building RP forest with 19 trees
Thu Jan 18 16:32:59 2024 NN descent for 16 iterations
	 1  /  16
	 2  /  16
	 3  /  16
	 4  /  16
	 5  /  16
	Stopping threshold met -- exiting after 5 iterations
Thu Jan 18 16:33:13 2024 Finished Nearest Neighbor Search
Thu Jan 18 16:33:16 2024 Construct embedding


Epochs completed:   2%| ▏          3/200 [00:00]

	completed  0  /  200 epochs


Epochs completed:  10%| █          21/200 [00:05]

	completed  20  /  200 epochs


Epochs completed:  20%| ██         41/200 [00:11]

	completed  40  /  200 epochs


Epochs completed:  30%| ███        61/200 [00:18]

	completed  60  /  200 epochs


Epochs completed:  40%| ████       81/200 [00:24]

	completed  80  /  200 epochs


Epochs completed:  50%| █████      101/200 [00:30]

	completed  100  /  200 epochs


Epochs completed:  60%| ██████     121/200 [00:36]

	completed  120  /  200 epochs


Epochs completed:  70%| ███████    141/200 [00:43]

	completed  140  /  200 epochs


Epochs completed:  80%| ████████   161/200 [00:49]

	completed  160  /  200 epochs


Epochs completed:  90%| █████████  181/200 [00:55]

	completed  180  /  200 epochs


Epochs completed: 100%| ██████████ 200/200 [01:01]


Thu Jan 18 16:34:26 2024 Finished embedding


2024-01-18 16:34:26,778 - BERTopic - Dimensionality - Completed ✓
2024-01-18 16:34:26,779 - BERTopic - Cluster - Start clustering the reduced embeddings
  self._all_finite = is_finite(X)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TO

In [22]:
# get topic frequency table
freq = topic_model.get_topic_freq()
print(freq)
print('Num of topics:', len(freq))

    Topic  Count
1      -1  30807
0       0  19379
7       1  15848
2       2   4146
12      3   2342
4       4   1548
6       5   1237
10      6   1049
14      7   1029
3       8    776
16      9    628
5      10    475
15     11    473
19     12    431
11     13    374
9      14    284
8      15    276
13     16    175
18     17    174
20     18    173
17     19    152
Num of topics: 21


Outlier reduction

In [37]:
# reduce outlier: https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html

# https://medium.com/@n83072/topic-modeling-bertopic-ca1b73a035f2

# NOTE(review): the three calls below are alternatives, not a chain. Each one starts
# again from the original `topics` and overwrites `new_topics`, so only the result of
# the last call (the c-tf-idf strategy) is kept when the whole cell is run.

# Reduce outliers using the `probabilities` strategy
# This strategy uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document.
# To use this, make sure to calculate the probabilities beforehand by instantiating BERTopic with calculate_probabilities=True.
new_topics = topic_model.reduce_outliers(X, topics, probabilities=probs, strategy="probabilities")


# Use the topic distributions, as calculated with .approximate_distribution
# to find the most frequent topic in each outlier document.
# You can use the distributions_params variable to tweak the parameters of .approximate_distribution.
# Reduce outliers using the `distributions` strategy
new_topics = topic_model.reduce_outliers(X, topics, strategy="distributions")


# Reduce outliers using the `c-tf-idf` strategy
# Calculate the c-TF-IDF representation for each outlier document 
# and find the best matching c-TF-IDF topic representation using cosine similarity.
new_topics = topic_model.reduce_outliers(X, topics, strategy="c-tf-idf")

# Reduce outliers using the `embeddings` strategy
# but it costs huge reduction in npmi score
# maybe other less aggressive strategies should be used
# new_topics = topic_model.reduce_outliers(X, topics, strategy="embeddings")

In [23]:
# Count how many documents ended up in each topic after outlier reduction.
# Requires `new_topics` from the outlier-reduction cell to be defined first.
from collections import Counter
new_topic_dict = dict(Counter(new_topics))


# one row per topic id, largest topics first
new_topic_dict_df = pd.DataFrame(list(new_topic_dict.items()), columns=['topic_id', 'count'])
new_topic_dict_df = new_topic_dict_df.sort_values(by=['count'], ascending=False)

new_topic_dict_df

NameError: name 'new_topics' is not defined

In [45]:
# show how many documents are still assigned to the outlier topic
# topic ids are integers, so match against the integer -1; comparing with the
# string '-1' never matches and always yields an empty frame
new_topic_dict_df[new_topic_dict_df['topic_id'] == -1]

Unnamed: 0,topic_id,count


In [46]:
# try to apply the outlier-reduced topic assignments to the BERTopic model
# (passes `new_topics` from the outlier-reduction cell to the model)

topic_model.update_topics(X, topics=new_topics)



In [24]:
# save the model (different from the func for small documents)
from datetime import datetime

# each save goes to a new folder named after the current timestamp
topic_model_folder_path = Path(f'my_model_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
# the embedding model is stored by name (SENTENCE_TRANSFORMERS_NAME), not as a model object
topic_model.save(
    path=topic_model_folder_path,
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model=SENTENCE_TRANSFORMERS_NAME
)

In [26]:
# reload the trained topic model for faster inference
# drop the in-memory model here; a later cell loads the saved copy with BERTopic.load
del topic_model

In [16]:
# load the embeddings
# the file was written with np.save, so it is read back with np.load
# (the .pkl suffix is misleading: this is NumPy .npy data, not a pickle)

embedding_path = Path('00_Terraria_embeddings.pkl')

if embedding_path.exists():
    with open(embedding_path, 'rb') as f:
        embeddings = np.load(f)
else:
    # fail loudly instead of continuing without embeddings
    raise FileNotFoundError('Embedding file not found')

In [12]:
# load the model

from bertopic import BERTopic

# NOTE: when loading a saved model, some public attributes are not restored,
# e.g. representative_docs_ (it can be re-computed by calling
# topic_model._extract_representative_docs, as done in a later cell)
topic_model = BERTopic.load('my_model_20240118_163626')

# topics: list of topic ids (n,) where n is the number of documents
# probs: list of probabilities for each topic (n, d) where n is the number of documents and d is the number of topics (including outlier)
topics, probs = topic_model.transform(X, embeddings=embeddings)

2024-01-19 23:38:56,151 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [13]:
# get the representative docs for each topic (nr_repr_docs=5 requested per topic)

# Approximate most representative documents per topic by sampling
# a subset of the documents in each topic (nr_samples=500) and calculating which are
# most representative to their topic based on the cosine similarity between
# c-TF-IDF representations

# this private method is called internally by fit_transform; it is called by hand here
# because the model was reloaded from disk
# NOTE(review): the results are only returned here, not stored on the model — confirm
# whether .get_topic_info() picks them up
repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids = topic_model._extract_representative_docs(
    topic_model.c_tf_idf_,
    pd.DataFrame({"Document": X, "ID": range(len(X)), "Topic": topics}),
    topic_model.topic_representations_,
    nr_samples=500,
    nr_repr_docs=5
)

In [15]:
X[13283]

"This game. THIS GAME. I'ts freaking AWESOME!!! Some people say it's a 'cheap 2D rip-off of Minecraft', and to that i say NOOOO!!! This game has so many aspects in its gameplay other than mining and building (and I don't say Minecraft is bad), it's just that these two have different 'categories', Terraria is a '2D Sandbox Adventure game with an enphasis on combat', while Minecraft is a '3D, Sandbox Survival Adventure game based around building', without counting that Minecraft has official mod compatibility. Terraria doesn't need mods to have a lot of content, as a 'good patch' in Terraria equals to 6 new bosses, 43 new enemies (each with their own variants), 168 new items, 45 new weapons with wacky effects, 4 new NPCs, 3 new biomes, and an ENTIRE DIFFICULTY SETTING (add that to the fact that Terraria haas had at least 4 'good patches' and you got yourself a game for an entire year!) :D The replay value is excellent, because of the completely random worlds, along with the added difficu

In [14]:
# the mapping is in no particular order

repr_docs_mappings

{-1: ["This game is not for me. I don't know I see a lot of people playing it, and I'm sure they're having fun with it, but not me. I think it's downright boring. I had the same feeling with Minecraft. Sure, I knew before I bought this game that it would be somewhat similar to Minecraft, but still, I like to experience all games myself. That's the reason I buy so many games. Anyway, in this game you get to kill enemies and chop wood, things like that. With the resources you build things like houses and weapons. That's the basic idea. There are no objectives, there's no mission. You just create a world of your own, fighting enemies along the way. I salute those who do have the patience to put hundreds of hours into these games, creating the best worlds possible, exploring everything, doing everything, but next time I see a game like this I'll just simply ignore it. Period. [Rating: 66/100]",
  '( I played this game before so dont look at my hourse ;3 ) This game is AWESOME!!! You should

In [16]:
# more than one as maybe after cleaning, the documents are the same

repr_docs_ids

[[69133, 29213, 28978, 8221, 65124],
 [13283, 4330, 32151, 25802, 66255],
 [26042, 23556, 56004, 13546, 19767],
 [72766, 22245, 27028, 64480, 43599],
 [76234, 76513, 24610, 9937, 81174, 44565, 78625, 71770, 21959],
 [58017, 80060, 75614, 24699, 29527],
 [5612, 21355, 15721, 73722, 76673],
 [66926, 81123, 69346, 39736, 73166],
 [45828, 14808, 14052, 43830, 31786],
 [13954,
  15367,
  3571,
  62552,
  67027,
  74730,
  27664,
  22968,
  45916,
  41352,
  44060,
  77713,
  72729,
  65253,
  41473,
  64035,
  69670,
  73271,
  1740,
  46623,
  26851,
  41517,
  41952,
  81378,
  78216,
  41589,
  48347,
  26682,
  25142,
  69155,
  64631,
  60209,
  10852,
  73386,
  69198,
  26065,
  77440,
  17722,
  21359,
  55854,
  60560,
  47427,
  36040,
  62040,
  42949,
  7516,
  27658,
  72843,
  77500,
  79390,
  67005],
 [66470, 81395, 79254, 8903, 14471],
 [2712, 54219, 17971, 24134, 78942],
 [5188, 840, 37047, 45138, 5042],
 [31321, 51514, 58100, 61773, 27996],
 [10515, 17867, 59988, 63367, 8

In [76]:
for i in repr_docs_ids[-1]:
    print(X[i])

10/10, would IGN
10/6 would IGN again
I like this game so much. 10/10 IGN approves. ^_^
10/10 would terraria again -IGN
There's glowsticks. 10/10 IGN


In [46]:
probs.shape

(81769, 21)

---

Get the documents with the highest probability for each topic when transforming a new set of documents

In [62]:
# how about we use the topics and probs variable to calculate the top N representative docs
top_N = 10

# probs is negated so that the largest probabilities are partitioned to the front;
# along axis 0 (documents) the first top_N rows then hold, for every topic column,
# the indices of the top_N highest-probability documents
# NOTE: argpartition does not sort inside those top_N rows
idx = np.argpartition(-probs, top_N, axis=0)[:top_N]

In [63]:
# row = document, col = topic
idx.shape

(10, 21)

In [65]:
idx[:, -1]

array([66922, 60612, 39721, 41823, 34887, 66124,  5826, 44161, 76701,
       76489])

In [66]:
probs[idx[:, -1], -1]

array([0.8847593 , 0.88933086, 0.89252526, 0.87341017, 0.86458516,
       0.86464214, 0.87115467, 0.86157316, 0.8588035 , 0.8588035 ],
      dtype=float32)

In [75]:
for i in idx[:, -1]:
    print(X[i])

Such a great Game 10/10 -Ign
I LOVE THIS GAME ign 10/10
this game is amazing 10/10 IGN
I LOVE THIS GAME 10/10 BEST GAVE EVER IGN
its a great game 10/10 IGN rating 
This is one of the best games ever. It got 9/10 IGN
this Game is amazing 10/1o ign
Great game 10/10 IGN :)
Great game, IGN 11/10
Great game, IGN 11/10


In [77]:
# full probability rows (all topic columns) of the documents selected for the first topic column
scores = probs[idx[:, 0]]

In [78]:
scores

array([[0.83925736, 0.74457836, 0.80703235, 0.6795908 , 0.4112299 ,
        0.60110843, 0.32747   , 0.4742515 , 0.57148874, 0.11809592,
        0.37909943, 0.57450265, 0.4534629 , 0.4331435 , 0.5107883 ,
        0.52181256, 0.52877164, 0.5952083 , 0.51749295, 0.2519888 ,
        0.3827619 ],
       [0.83819544, 0.6792901 , 0.82225263, 0.59778   , 0.4467274 ,
        0.71488297, 0.4026624 , 0.4975381 , 0.5864483 , 0.17567718,
        0.37321538, 0.6265934 , 0.49665412, 0.45622283, 0.5865581 ,
        0.57748616, 0.50651133, 0.5680938 , 0.5247112 , 0.29756355,
        0.419262  ],
       [0.862828  , 0.71632946, 0.8440254 , 0.6514621 , 0.38810313,
        0.73899895, 0.35407072, 0.51394963, 0.6051907 , 0.10856348,
        0.3591198 , 0.58158875, 0.474846  , 0.46928063, 0.5575608 ,
        0.5806221 , 0.5548263 , 0.5549511 , 0.5808619 , 0.23594311,
        0.4032487 ],
       [0.8423841 , 0.7527422 , 0.79166555, 0.6698292 , 0.42994094,
        0.6526007 , 0.42818356, 0.46310222, 0.551491 

In [79]:
scores.shape

(10, 21)

In [16]:
# # load the embeddings
# embedding_path = Path('00_Terraria_embeddings.pkl')
# embeddings = np.load(embedding_path)

# # inference to get the topics and prob for evaluation
# # hence, we need the probs to get topic-doc-matrix
# topics, probs = topic_model.transform(X, embeddings=embeddings)

In [17]:
probs.shape

(81776, 20)

Extracting Topics

In [28]:
# look at the most frequent topics 

freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,30807,-1_game_fun_10_like,"[game, fun, 10, like, minecraft, hours, play, ...",
1,0,19379,0_terraria_game_minecraft_like,"[terraria, game, minecraft, like, bosses, sand...",
2,1,15848,1_game_fun_great_friends,"[game, fun, great, friends, play, best, 10, go...",
3,2,4146,2_minecraft_2d_better_like,"[minecraft, 2d, better, like, game, 10, fun, g...",
4,3,2342,3_good_ok_cool_awesome,"[good, ok, cool, awesome, pretty, alright, aws...",


In [29]:
topic_model.get_topic(0)  # Select the most frequent topic

[['terraria', 0.04546136142022207],
 ['game', 0.029715244585525744],
 ['minecraft', 0.022286892174031094],
 ['like', 0.019741095325865838],
 ['bosses', 0.01918579669916258],
 ['sandbox', 0.01830580611353157],
 ['just', 0.016558929897253067],
 ['games', 0.016156287481810104],
 ['play', 0.01580017737860383],
 ['fun', 0.01579072651927058]]

(Copied from the BERTopic Colab notebook)

There are a number of attributes that you can access after having trained your BERTopic model:


| Attribute | Description |
|------------------------|---------------------------------------------------------------------------------------------|
| topics_               | The topics that are generated for each document after training or updating the topic model. |
| probabilities_ | The probabilities that are generated for each document if HDBSCAN is used. |
| topic_sizes_           | The size of each topic                                                                      |
| topic_mapper_          | A class for tracking topics and their mappings anytime they are merged/reduced.             |
| topic_representations_ | The top *n* terms per topic and their respective c-TF-IDF values.                             |
| c_tf_idf_              | The topic-term matrix as calculated through c-TF-IDF.                                       |
| topic_labels_          | The default labels for each topic.                                                          |
| custom_labels_         | Custom labels for each topic as generated through `.set_topic_labels`.                                                               |
| topic_embeddings_      | The embeddings for each topic if `embedding_model` was used.                                                              |
| representative_docs_   | The representative documents for each topic if HDBSCAN is used. (Not restored when a saved model is reloaded, which affects evaluation and the output of `get_topic_info()`; call `transform` on the data to get the topics and probabilities, then re-calculate the representative documents.)                                                |

Save and load BERTopic models and components

Visualization

In [30]:
# visualize topics

topic_model.visualize_topics()

In [48]:
# visualize topic probabilities
# to understand how confident BERTopic is that certain topics are present in the documents

topic_model.visualize_distribution(probs[200], min_probability=0.001)

In [49]:
# visualize how topics are hierarchically reduced

topic_model.visualize_hierarchy(top_n_topics=50)


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [50]:
# visualize selected terms for a few topics
# creating bar charts out of the c-TF-IDF scores for each topic representation.

topic_model.visualize_barchart(top_n_topics=5)

In [51]:
# visualize topic similarity
# Having generated topic embeddings, through both c-TF-IDF and embeddings,
# we can create a similarity matrix by simply applying cosine similarities through those topic embeddings.
# The result will be a matrix indicating how similar certain topics are to each other.

topic_model.visualize_heatmap(n_clusters=10, width=1000, height=1000)

Evaluation

Calculate metrics with octis

Reference

https://www.theanalyticslab.nl/topic-modeling-with-bertopic/

In [52]:
# container for the model output used by the evaluation; filled below with the keys
# 'topics', 'topic-word-matrix' and 'topic-document-matrix'
result_bertopic = {}

# NOTE(review): `top_words` is not referenced by the helper functions defined in this cell
top_words = 10     # the functions will only return that number of top words
def _get_topics(topic_model):
    """Collect the word list of every non-outlier topic of a fitted model.

    Parameters
    ----------
    topic_model : object
        Must expose ``get_topics()`` returning a mapping
        ``topic id -> sequence of (word, score) pairs``.

    Returns
    -------
    tuple
        ``(topic_list, empty_topic_l_idx)``: the words of each kept topic,
        ordered by descending score with blank words removed, and the ids of
        the topics that were dropped because at most one word was left.
    """
    kept_topics = []
    dropped_topic_ids = []

    for topic_id, word_scores in topic_model.get_topics().items():
        # negative ids (the outlier topic) are never evaluated
        if topic_id < 0:
            continue

        ranked = sorted(word_scores, key=lambda pair: pair[1], reverse=True)
        words = [pair[0] for pair in ranked if pair[0].strip() != '']

        # a topic can end up empty, and a single-word topic breaks the NPMI
        # computation, so both cases are recorded and left out
        if len(words) > 1:
            kept_topics.append(words)
        else:
            dropped_topic_ids.append(topic_id)

    return kept_topics, dropped_topic_ids

def _get_topic_word_matrix(topic_model, empty_topic_idxs):
    """Turn the model's c-TF-IDF matrix into a row-normalised topic-word matrix.

    Each row of ``topic_model.c_tf_idf_`` is passed through a softmax so that
    it sums to 1. The c-TF-IDF weights are only a stand-in here: the result is
    not a true word-given-topic probability. A possibly better approach:
    https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-probablities-or-distribution

    Parameters
    ----------
    topic_model : object
        Must expose ``c_tf_idf_`` with a ``todense()`` method.
    empty_topic_idxs : sequence of int
        Row positions to remove from the matrix.
        NOTE(review): rows are removed by position; confirm these positions
        line up with the rows of ``c_tf_idf_``.

    Returns
    -------
    The dense, row-normalised matrix with the listed rows removed.
    """
    dense_ctfidf = topic_model.c_tf_idf_.todense()

    # softmax over each row
    exp_scores = np.exp(dense_ctfidf)
    topic_word_matrix = exp_scores / exp_scores.sum(axis=1)

    # walk the indices back to front so earlier deletions do not shift later ones
    for row in reversed(empty_topic_idxs):
        topic_word_matrix = np.delete(topic_word_matrix, row, axis=0)

    return topic_word_matrix

def _get_topic_document_matrix(probabilities, empty_topic_idxs):
    """Build the topic-document matrix from per-document topic probabilities.

    Parameters
    ----------
    probabilities : numpy.ndarray
        Array of shape (n_documents, n_topics).
    empty_topic_idxs : sequence of int
        Topic positions (columns of ``probabilities``) to leave out.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_topics - n_removed, n_documents).
    """
    topic_document_matrix = probabilities.T

    # After the transpose the topics are the ROWS, so filtered topics must be
    # removed along axis 0 (axis 1 would drop documents instead). All indices
    # are removed in one call so the result does not depend on their order.
    if len(empty_topic_idxs) > 0:
        topic_document_matrix = np.delete(
            topic_document_matrix, list(empty_topic_idxs), axis=0
        )

    return topic_document_matrix

# fill the result dict; the indices of the dropped topics returned by _get_topics
# are passed on so the two matrices leave out the same topics
result_bertopic['topics'], empty_topic_idxs = _get_topics(topic_model)
result_bertopic['topic-word-matrix'] = _get_topic_word_matrix(topic_model, empty_topic_idxs)
result_bertopic['topic-document-matrix'] = _get_topic_document_matrix(probs, empty_topic_idxs)

In [53]:
result_bertopic['topics'], result_bertopic['topic-word-matrix'], result_bertopic['topic-document-matrix']

([['game', 'this', 'it', 'and', 'the', 'to', 'of', 'you', 'is', 'fun'],
  ['terraria', 'the', 'and', 'to', 'you', 'is', 'of', 'it', 'game', 'that'],
  ['minecraft', 'and', 'game', 'this', 'it', 'is', 'of', 'you', 'the', 'to'],
  ['game', 'this', 'best', 'great', 'ever', 'love', 'is', 'good', 'one', 'it'],
  ['10',
   'again',
   'killed',
   'would',
   'the',
   'my',
   'you',
   'and',
   'to',
   'unicorn'],
  ['my', 'it', 'but', 'fix', 'the', 'and', 'game', 'to', 'me', 'this'],
  ['addictive',
   'addicting',
   'fun',
   'very',
   'addicted',
   'game',
   'hours',
   'and',
   'this',
   'it'],
  ['10', 'would', 'again', '11', 'ign', 'play', 'life', 'tunk', 'my', 'good'],
  ['good',
   'ok',
   'its',
   'pretty',
   'alright',
   'it',
   'guess',
   'cool',
   'yeah',
   'okay'],
  ['bye',
   'cool',
   'slit',
   'dink',
   'so',
   'tickle',
   'pickle',
   'zone',
   'it',
   'let'],
  ['review',
   'reviews',
   'badgei',
   'le',
   'this',
   'the',
   'game',
   'badge

In [57]:
topic_freq = topic_model.get_topic_freq()
topic_freq[topic_freq['Topic'] != -1]

Unnamed: 0,Topic,Count
3,0,29399
0,1,14477
5,2,13139
8,3,8073
7,4,2529
12,6,1726
6,7,1547
13,5,1500
17,12,1409
14,9,1353


Evaluation with gensim

(as it gives more freedom to configure gensim's CoherenceModel)

In [59]:
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

# https://stackoverflow.com/questions/70548316/gensim-coherencemodel-gives-valueerror-unable-to-interpret-topic-as-either-a-l

# filter topics that contain only one word from the corpus for calculating npmi
# https://github.com/piskvorky/gensim/issues/3328


# word list of every evaluated topic, plus the ids of the topics that were dropped
topic_words, empty_topic_l_idx = _get_topics(topic_model)

documents = pd.DataFrame({"Document": X,
                          "ID": range(len(X)),
                          "Topic": topics})

# remove documents whose topic was dropped (topics with <= 1 word)
# use the indices computed in THIS cell rather than a variable left over from another cell
documents = documents[~documents['Topic'].isin(empty_topic_l_idx)]

# concatenate all documents of a topic into one long text per topic
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# tokenize with the same analyzer as the model's vectorizer
bertopic_vectorizer = topic_model.vectorizer_model
bertopic_analyzer = bertopic_vectorizer.build_analyzer()

words = bertopic_vectorizer.get_feature_names_out()
tokens = [bertopic_analyzer(doc) for doc in cleaned_docs]
# gensim dictionary and bag-of-words corpus used by CoherenceModel
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

In [60]:
# ~3 min on i714700 with CountVectorizer ~ 6000 words

# compute the C_V coherence first (coherence='c_v' below); NPMI is computed in the next cell

coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                corpus=corpus,
                                dictionary=dictionary,
                                topn=10,
                                coherence='c_v')

# npmi = Coherence(texts=tokens,topk=10, measure='c_npmi')
# nmpi_score = npmi.score(result_bertopic)

cv_score = coherence_model.get_coherence()
cv_score


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.3994560925733617

In [61]:
# same inputs as the C_V model, but scored with the NPMI coherence measure
coherence_model_npmi = CoherenceModel(topics=topic_words,
                                    texts=tokens,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    topn=10,
                                    coherence='c_npmi')

npmi_score = coherence_model_npmi.get_coherence()
npmi_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

-0.0029256775418027474

In [31]:
def get_topic_diversity(topics, topk=10):
    """Topic diversity: share of unique words among the top-k words of all topics.

    Modified from the OCTIS implementation.

    Parameters
    ----------
    topics : list of list of str
        List of topics, where each topic is a list of words.
    topk : int, optional
        Number of leading words taken from every topic (default 10).

    Returns
    -------
    float
        ``len(unique top-k words) / (topk * number of topics)``.
        ``0`` is returned when ``topics`` is ``None`` or empty.

    Raises
    ------
    ValueError
        If ``topk`` is smaller than 1.
    """
    # Explicit length check (not truthiness) so array-like inputs work too.
    if topics is None or len(topics) == 0:
        return 0
    if topk < 1:
        # topk == 0 used to end in ZeroDivisionError, negative values in a
        # meaningless negative score.
        raise ValueError('topk must be a positive integer')

    # Topics shorter than topk simply contribute fewer words; the denominator
    # still assumes topk words per topic, as in the original computation.
    unique_words = {word for topic in topics for word in topic[:topk]}
    return len(unique_words) / (topk * len(topics))

# Topic diversity of the extracted topic words, using the default topk=10.
get_topic_diversity(topic_words)

0.655

In [33]:
import itertools

import sys
sys.path.append('../')

from rbo import rbo

def get_word2index(list1, list2):
    """Assign an integer index to every distinct word of the two lists.

    Parameters
    ----------
    list1 : list of str
    list2 : list of str

    Returns
    -------
    dict
        Maps each word found in either list to a unique index in
        ``range(number of distinct words)``. Indices follow set iteration
        order, so they are consistent within one call only.
    """
    vocabulary = set(list1) | set(list2)
    return dict(zip(vocabulary, range(len(vocabulary))))

def get_inverted_RBO(topics, topk=10, weight=0.9):
    """Inverted Rank-Biased Overlap (iRBO) to measure the diversity of the topics.

    Modified from the OCTIS implementation. Every unordered pair of topics is
    compared with ``rbo`` on their top-k words and the result is
    ``1 - mean(pairwise RBO)``.

    Parameters
    ----------
    topics : list of list of str
        List of topics, where each topic is a list of words.
    topk : int, optional
        Number of leading words of every topic used in the comparison.
    weight : float, optional
        Passed to ``rbo`` as its ``p`` argument.

    Returns
    -------
    float
        ``1 - mean(pairwise RBO)``. ``0`` is returned when ``topics`` is
        ``None``, empty, or holds a single topic (no pair to compare).

    Raises
    ------
    ValueError
        If the first topic has fewer than ``topk`` words.
    """
    # Explicit length check (not truthiness) so array-like inputs work too;
    # an empty list used to fail with IndexError on topics[0].
    if topics is None or len(topics) == 0:
        return 0
    if topk > len(topics[0]):
        # ValueError is still caught by callers handling a generic Exception.
        raise ValueError('Words in topics are less than topk')

    collect = []
    for list1, list2 in itertools.combinations(topics, 2):
        # rbo is fed integer ids instead of the raw words.
        word2index = get_word2index(list1, list2)
        indexed_list1 = [word2index[word] for word in list1]
        indexed_list2 = [word2index[word] for word in list2]
        # NOTE(review): element [2] of the rbo result is presumably the
        # extrapolated RBO value, as in OCTIS -- confirm against ../rbo.py.
        rbo_val = rbo(indexed_list1[:topk], indexed_list2[:topk], p=weight)[2]
        collect.append(rbo_val)

    if not collect:
        # A single topic has no pairs; np.mean([]) would give nan plus a
        # RuntimeWarning, so fall back to the same value as for None.
        return 0
    return 1 - np.mean(collect)
    
# Inverted RBO of the extracted topic words, using the defaults topk=10 and weight=0.9.
get_inverted_RBO(topic_words)

0.9363353717539098

In [34]:
def _KL(P, Q):
    """
    Perform Kullback-Leibler divergence

    Parameters
    ----------
    P : distribution P
    Q : distribution Q

    Returns
    -------
    divergence : divergence from Q to P
    """
    # add epsilon to grant absolute continuity
    epsilon = 0.00001
    P = P+epsilon
    Q = Q+epsilon

    divergence = np.sum(np.multiply(P, np.log(P/Q)))        # changed the operator from * to np.multiply to do element-wise multiplication
    return divergence

def get_kl_divergence(topic_word_metrix):
    """Mean pairwise KL divergence between topic-word distributions.

    Modified from the OCTIS implementation
    https://github.com/MIND-Lab/OCTIS/blob/master/octis/evaluation_metrics/diversity_metrics.py#L209

    ``_KL(row_i, row_j)`` is evaluated for every unordered pair ``i < j`` of
    rows and the values are averaged.

    Parameters
    ----------
    topic_word_metrix : topic-word distribution matrix
        Indexed by row; presumably one row per topic and one column per
        vocabulary word -- confirm against the caller.

    Returns
    -------
    float
        Average of the pairwise divergences. ``0`` is returned when the input
        is ``None`` or has fewer than two rows (no pair to compare).
    """
    # Same convention as the sibling diversity metrics: nothing to score -> 0.
    if topic_word_metrix is None:
        return 0

    beta = topic_word_metrix
    kl_div = 0
    count = 0
    for i, j in itertools.combinations(range(len(beta)), 2):
        kl_div += _KL(beta[i], beta[j])
        count += 1

    if count == 0:
        # Fewer than two rows used to end in ZeroDivisionError.
        return 0
    return kl_div / count

# Mean pairwise KL divergence over the rows of the BERTopic topic-word matrix.
get_kl_divergence(result_bertopic['topic-word-matrix'])

0.00022574783055084367

In [35]:
# Inspect the dimensions of the topic-word matrix scored in the previous cell
# (presumably number of topics x vocabulary size -- confirm).
result_bertopic['topic-word-matrix'].shape

(21, 6968)