Demo ipynb for CTM (hyperparameters grid/random search)

Combined TM

In [1]:
import pandas as pd
import numpy as np


from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
# from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

import nltk
import os

from pathlib import Path
import json
from datetime import datetime

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_path = Path('../../dataset/topic_modelling/top_10_games/00_Terraria.pkl')

dataset = pd.read_pickle(dataset_path)

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 75499 entries, 57735 to 133233
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         75499 non-null  int64 
 1   app_id        75499 non-null  int64 
 2   app_name      75499 non-null  object
 3   review_text   75499 non-null  object
 4   review_score  75499 non-null  int64 
 5   review_votes  75499 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 4.0+ MB


In [3]:
%load_ext autoreload

In [4]:
# data preprocessing

import sys
sys.path.append('../../sa/')

%autoreload 2
import str_cleaning_functions

# copied from lda_demo_gridsearch.ipynb
def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_non_letters(x))
    df[review] = df[review].apply(lambda x: x.lower())
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_stopword(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

# def cleaning_strlist(str_list):
#     str_list = list(map(lambda x: clean(x), str_list))
#     str_list = list(map(lambda x: deEmojify(x), str_list))

#     str_list = list(map(lambda x: x.lower(), str_list))
#     str_list = list(map(lambda x: remove_num(x), str_list))
#     str_list = list(map(lambda x: unify_whitespaces(x), str_list))

#     str_list = list(map(lambda x: _deaccent(x), str_list))
#     str_list = list(map(lambda x: remove_non_alphabets(x), str_list))
#     str_list = list(map(lambda x: remove_stopword(x), str_list))
#     return str_list

# copied from bert_demo_gridsearch.ipynb
def cleaning_little(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))


In [5]:
# create a copy of the dataset, as we need both untouched text and cleaned text

dataset_preprocessed = dataset.copy()

In [6]:
cleaning(dataset_preprocessed, 'review_text')


cleaning_little(dataset, 'review_text')

In [7]:
X_preprocessed = dataset_preprocessed['review_text'].values
X = dataset['review_text'].values

Apply lemmatizing to the preprocessed dataset as well (for BoW)

In [8]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

In [9]:
X_preprocessed = list(map(lambda x: lemmatization(x), X_preprocessed))
X_preprocessed = list(map(lambda x: ' '.join(x), X_preprocessed))

Training

In [10]:
# copy from: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/utils/data_preparation.py#L44
# call bert_embeddings_from_list() to produce embeddings by ourself

import warnings
from sentence_transformers import SentenceTransformer
import torch
import platform


if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series mac machine

print(device)

def bert_embeddings_from_list(
    texts, 
    model_name_or_path, 
    batch_size=32, 
    max_seq_length=None,            # 128 is the default valule in TopicModelDataPreparation() init. Passing none to use the default value of each model
    device='cpu'):
    """
    Creates SBERT Embeddings from a list
    """

    model = SentenceTransformer(model_name_or_path, device=device)

    if max_seq_length is not None:
        model.max_seq_length = max_seq_length
    else:
        max_seq_length = model.max_seq_length

    check_max_local_length(max_seq_length, texts)

    return np.array(model.encode(texts, batch_size=batch_size, show_progress_bar=True))


def check_max_local_length(max_seq_length, texts):
    max_local_length = np.max([len(t.split()) for t in texts])
    if max_local_length > max_seq_length:
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            f"the longest document in your collection has {max_local_length} words, the model instead "
            f"truncates to {max_seq_length} tokens."
        )

cuda


In [11]:
from gensim.models import CoherenceModel
from copy import deepcopy

from sklearn.model_selection import ParameterGrid, ParameterSampler

sys.path.append('../')

from eval_metrics import compute_inverted_rbo, compute_topic_diversity, compute_pairwise_jaccard_similarity, \
                        METRICS, SEARCH_BEHAVIOUR, COHERENCE_MODEL_METRICS

In [12]:
# init params

def _init_count_vectorizer_params(
        max_features=2000,
        ngram_range=(1,1)
):
    params_dict = {}
    params_dict['max_features'] = max_features
    params_dict['ngram_range'] = ngram_range

    return params_dict

def _init_sbert_params(
    model_name_or_path='all-mpnet-base-v2'
):
    params_dict = {}
    params_dict['model_name_or_path'] = model_name_or_path

    return params_dict

# params are copied from source code of CTM: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/models/ctm.py#L131
# commented params are params that has no plan on fine-tuning them (not significant to our project)
def _init_ctm_params(
        # bow_size,
        # contextual_size,
        # inference_type="combined",
        n_components=10,
        # model_type="prodLDA",
        hidden_sizes=(100, 100),
        # activation="softplus",
        dropout=0.2,
        # learn_priors=True,
        # batch_size=64,
        lr=2e-3,
        momentum=0.99,
        solver="adam",
        num_epochs=100,
        # reduce_on_plateau=False,      # only valid if there's a testing data (seems no need to havbe label, just partition a testing dataset with train_test_split()))
        # num_data_loader_workers=mp.cpu_count(),
        # label_size=0,
        # loss_weights=None
):
    params_dict = {}
    # params_dict['bow_size'] = bow_size                        # decided by the count vectorizer params (max_features)
    # params_dict['contextual_size'] = contextual_size          # decided by the sbert model
    # params_dict['inference_type'] = inference_type
    params_dict['n_components'] = n_components
    # params_dict['model_type'] = model_type
    params_dict['hidden_sizes'] = hidden_sizes
    # params_dict['activation'] = activation
    params_dict['dropout'] = dropout
    # params_dict['learn_priors'] = learn_priors
    # params_dict['batch_size'] = batch_size
    params_dict['lr'] = lr
    params_dict['momentum'] = momentum
    params_dict['solver'] = solver
    params_dict['num_epochs'] = num_epochs

    return params_dict

In [13]:
def _init_config_dict(config_path:Path, model_name:str, hyperparameters:dict, search_space_dict:dict, 
                      metrics:list[METRICS], monitor:METRICS,
                      search_behaviour:SEARCH_BEHAVIOUR, search_rs:int, search_n_iter:int):
    
    if not config_path.exists():
        config = {}

        sbert_params = _init_sbert_params(**hyperparameters['sbert_params'])
        countvect_params = _init_count_vectorizer_params(**hyperparameters['countvect_params'])
        ctm_params = _init_ctm_params(**hyperparameters['ctm_params'])

        config['model'] = model_name
        config['sbert_params'] = sbert_params
        config['countvect_params'] = countvect_params
        config['ctm_params'] = ctm_params

        if 'sbert_params' in search_space_dict:
            for k in search_space_dict['sbert_params'].keys():
                sbert_params.pop(k, '')     # add a default value to avoid key error
        if 'countvect_params' in search_space_dict:
            for k in search_space_dict['countvect_params'].keys():
                countvect_params.pop(k, '')
        if 'ctm_params' in search_space_dict:
            for k in search_space_dict['ctm_params'].keys():
                ctm_params.pop(k, '')

        config['search_space'] = search_space_dict
        config['metrics'] = list(map(lambda x: x.value, metrics))
        config['monitor'] = monitor.value

        config['search_behaviour'] = search_behaviour.value
        if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
            config['search_rs'] = search_rs
            config['search_n_iter'] = search_n_iter

        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        print('Created config file at {}'.format(config_path))
    else:
        with open(config_path, 'r') as f:
            config = json.load(f)

        # check whether the input params are consistent with the config file
        assert config['model'] == model_name, 'input model_name is not consistent with the config["model"]'
        assert config['metrics'] == list(map(lambda x: x.value, metrics)), 'input metrics is not consistent with config["metrics"]'
        assert config['monitor'] == monitor.value, 'input monitor is not consistent with config["monitor"]'
        assert config['search_behaviour'] == search_behaviour.value, 'input search_behaviour is not consistent with config["search_behaviour"]'
        if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
            assert config['search_rs'] == search_rs, 'input search_rs is not consistent with config["search_rs"]'
            assert config['search_n_iter'] == search_n_iter, 'input search_n_iter is not consistent with config["search_n_iter"]'

        # check whether the hyperparameters are consistent with the config file
        sbert_params = _init_sbert_params(**hyperparameters['sbert_params'])
        countvect_params = _init_count_vectorizer_params(**hyperparameters['countvect_params'])
        ctm_params = _init_ctm_params(**hyperparameters['ctm_params'])

        assert config['sbert_params'].keys() <= sbert_params.keys(), 'existing config["sbert_params"] contains additional hyperparameters'
        assert config['countvect_params'].keys() <= countvect_params.keys(), 'existing config["countvect_params"] contains additional hyperparameters'
        assert config['ctm_params'].keys() <= ctm_params.keys(), 'existing config["ctm_params"] contains additional hyperparameters'

        for key in sbert_params.keys() & config['sbert_params'].keys():
            assert sbert_params[key] == config['sbert_params'][key], 'existing config["sbert_params"] contains different hyperparameters'
        for key in countvect_params.keys() & config['countvect_params'].keys():
            assert countvect_params[key] == config['countvect_params'][key], 'existing config["countvect_params"] contains different hyperparameters'
        for key in ctm_params.keys() & config['ctm_params'].keys():
            assert ctm_params[key] == config['ctm_params'][key], 'existing config["ctm_params"] contains different hyperparameters'

        # check whether the search_space is consistent with the config file
        if 'sbert_params' in config['search_space']:
            assert config['search_space']['sbert_params'].keys() == search_space_dict['sbert_params'].keys(), 'input search_space_dict["sbert_params"] contains different hyperparameter keys than existing config["search_space"]["sbert_params"]'
            for k in search_space_dict['sbert_params'].keys():
                assert k in config['search_space']['sbert_params'], f'input search_space_dict["sbert_params"]["{key}"] contains value than existing config["search_space"]["sbert_params"]["{key}"]'
        if 'countvect_params' in config['search_space']:
            assert config['search_space']['countvect_params'].keys() == search_space_dict['countvect_params'].keys(), 'input search_space_dict["countvect_params"] contains different hyperparameter keys than existing config["search_space"]["countvect_params"]'
            for k in search_space_dict['countvect_params'].keys():
                assert k in config['search_space']['countvect_params'], f'input search_space_dict["countvect_params"]["{key}"] contains value than existing config["search_space"]["countvect_params"]["{key}"]'
        if 'ctm_params' in config['search_space']:
            assert config['search_space']['ctm_params'].keys() == search_space_dict['ctm_params'].keys(), 'input search_space_dict["ctm_params"] contains different hyperparameter keys than existing config["search_space"]["ctm_params"]'
            for k in search_space_dict['ctm_params'].keys():
                assert k in config['search_space']['ctm_params'], f'input search_space_dict["ctm_params"]["{key}"] contains value than existing config["search_space"]["ctm_params"]["{key}"]'
        
        print('Loaded existing config file from {}'.format(config_path))
        print('Hyperparameters and search space are consistent with the input parameters')

    return config


In [14]:
def _init_result_dict(result_path:Path, monitor_type:str):
    if not result_path.exists():
        result = {}

        result['best_metric'] = -float('inf')
        result['best_model_checkpoint'] = ""
        result['best_hyperparameters'] = dict()
        result["monitor_type"] = monitor_type
        result["log_history"] = list()

    else:
        with open(result_path, 'r') as f:
            result = json.load(f)

        assert result['monitor_type'] == monitor_type

        print('Loaded existing result file from {}'.format(result_path))
    
    return result

In [15]:
def _load_ctm_model(model_checkpoint:Path, ctm_params:dict):

    model_path = [p for p in model_checkpoint.iterdir() if p.is_dir()][-1]        # get the last dir (since there 's only one dir inside) -> get the only dir

    # get the first file in the dir
    epoch_file = [p for p in model_path.iterdir() if p.is_file()][0]
    epoch_num = int(epoch_file.stem.split('_')[-1])

    if 'hidden_sizes' in ctm_params:
        ctm_params['hidden_sizes'] = tuple(ctm_params['hidden_sizes'])

    ctm = CombinedTM(**ctm_params)

    ctm.load(model_path, epoch_num)

    return ctm

In [16]:
def _get_topics(ctm, k=10):
    return ctm.get_topic_lists(k)

def _get_topic_word_metrix(ctm):
    return ctm.get_topic_word_distribution()

# ref: https://contextualized-topic-models.readthedocs.io/en/latest/readme.html (go to the section: Mono-Lingual Topic Modeling)
# testing_dataset = qt.transform(text_for_contextual=testing_text_for_contextual, text_for_bow=testing_text_for_bow)
# # n_sample how many times to sample the distribution (see the doc)
# ctm.get_doc_topic_distribution(testing_dataset, n_samples=20) # returns a (n_documents, n_topics) matrix with the topic distribution of each document
def _get_topic_document_metrix(ctm, dataset, n_samples=20):
    return ctm.get_doc_topic_distribution(dataset, n_samples=n_samples).T

In [17]:
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer

def model_search(text_for_contextual, text_for_bow, hyperparameters:dict, search_space:dict, save_folder:Path,
                 metrics:list[METRICS]=[METRICS.C_NPMI], monitor:METRICS=METRICS.C_NPMI, 
                 save_each_models=True, run_from_checkpoints=False,
                 search_behaviour=SEARCH_BEHAVIOUR.GRID_SEARCH, search_rs=42, search_n_iter=10):
    
    config_json_path = save_folder.joinpath('config.json')
    result_json_path = save_folder.joinpath('result.json')

    if monitor not in metrics:
        raise Exception('monitor is not in metrics. Please modify the metrics passed in.')

    if run_from_checkpoints:
        if not save_folder.exists():
            print('Save folder:' + str(save_folder.resolve()) + ' does not exist. Function terminates.')
            raise Exception('No checkpoints found. Function terminates.')
        
        # check for existing configs
        if not config_json_path.exists():
            raise Exception('No config.json found. Function terminates.')
        
        # check for existing results
        if not result_json_path.exists():
            print('no result.json is found. Assuming no existing checkpoints.')
    else:
        if save_folder.exists():
            raise Exception('Checkpoints found. Please delete the checkpoints or set run_from_checkpoints=True. Function terminates.')

    if not save_folder.exists():
        save_folder.mkdir()

    config = _init_config_dict(config_json_path, 'ctm', hyperparameters, search_space,
                               metrics, monitor, search_behaviour, search_rs, search_n_iter)
    result = _init_result_dict(result_json_path, monitor.value)

    print('Search folder: {}'.format(save_folder))

    # init
    best_model_path = result['best_model_checkpoint']
    best_metric_score = result['best_metric']
    best_model = _load_ctm_model(Path(best_model_path),
                                 result['best_hyperparameters']['ctm_params']) if best_model_path != "" else None
    best_hyperparameters = result['best_hyperparameters']

    print(f'Best model checkpoint: {best_model_path}')
    print(f'Best metric score: {best_metric_score}')
    print(f'Best model: {best_model}')

    # search
    # like bertopic, we create a temp dict for initiating the search space
    # then we apply sklearn parameter sampler / parameter grid to get the search space
    temp_search_space = {}
    for k, v in search_space.items():
        for kk, vv in v.items():
            temp_search_space[k + '__' + kk] = vv

    if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
        search_iterator = ParameterSampler(temp_search_space, search_n_iter, random_state=search_rs)
    elif search_behaviour == SEARCH_BEHAVIOUR.GRID_SEARCH:
        search_iterator = ParameterGrid(temp_search_space)

    print('\n')

    for search_space_dict in search_iterator:

        # unwrap the search space dict

        model_name = ''

        _sbert_params = {}
        _countvect_params = {}
        _ctm_params = {}

        for k, v in search_space_dict.items():
            if k.startswith('sbert_params'):
                _sbert_params[k.split('__')[1]] = v
                model_name += 'sb_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('countvect_params'):
                _countvect_params[k.split('__')[1]] = v
                model_name += 'cvect_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('ctm_params'):
                _ctm_params[k.split('__')[1]] = v
                model_name += 'ctm_' + k.split('__')[1] + '_' + str(v) + '_'

        model_name = model_name[:-1]     # remove the last '_'

        model_path = save_folder.joinpath(config['model'] + '_' + model_name)

        # check whether the model exists
        if model_path.exists():
            print('Skipping current search space: {}'.format(search_space_dict))
            continue
    
        ##########
        # Training starts
        ##########

        print('Current search space: {}'.format(search_space_dict))

        sbert_params = deepcopy(config['sbert_params'])     # deepcopy just for safety (not messing up with the original config)
        countvect_params = deepcopy(config['countvect_params'])
        ctm_params = deepcopy(config['ctm_params'])

        sbert_params.update(_sbert_params)
        countvect_params.update(_countvect_params)
        ctm_params.update(_ctm_params)

        countvect_params['ngram_range'] = tuple(countvect_params['ngram_range'])     # convert list to tuple

        # create bow
        vectorizer = CountVectorizer(**countvect_params)
        vectorizer.fit_transform(text_for_bow)
        temp_vocabulary = set(vectorizer.get_feature_names_out())

        preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in temp_vocabulary])
                            for doc in text_for_bow]
        text_for_bow = preprocessed_docs_tmp
    
        # create sbert embeddings
        if platform.system() == 'Linux' or platform.system() == 'Windows':
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            device = torch.device('mps')        # m-series machine
        
        tp = TopicModelDataPreparation()

        # check existing embeddings
        # reuse them if found
        embeddings_path = save_folder.joinpath(f'embeddings_{sbert_params["model_name_or_path"]}.pkl')
        if embeddings_path.exists():
            with open(embeddings_path, 'rb') as f:
                embeddings = np.load(f)

            print(f'Found existing sbert embeddings at {embeddings_path}. Reusing them.')
        else:
            embeddings = bert_embeddings_from_list(text_for_contextual, **sbert_params, device=device)

            with open(embeddings_path, 'wb') as f:
                np.save(f, embeddings)
         
        training_dataset = tp.fit(text_for_contextual=text_for_contextual, text_for_bow=text_for_bow, custom_embeddings=embeddings)

        # ctm

        ctm_params['bow_size'] = len(tp.vocab)
        ctm_params['contextual_size'] = embeddings.shape[1]
        ctm_params['hidden_sizes'] = tuple(ctm_params['hidden_sizes'])     # convert list to tuple

        ctm = CombinedTM(**ctm_params)
        ctm.device = device
        ctm.fit(training_dataset, verbose=True)

        ##########
        # Training ends
        ##########

        ##########
        # Evaluation starts
        ##########

        # init data for gensim coherence model
        topic_words = _get_topics(ctm, k=10)
        topics = ctm.get_predicted_topics(training_dataset, n_samples=20)

        documents = pd.DataFrame({"Document": X,
                                "ID": range(len(X)),
                                "Topic": topics})
        
        docs_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
        texts = [doc.split() for doc in docs_per_topic.Document.values]
        
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # init octis format result for convenience
        result_octis = {}
        result_octis['topics'] = topic_words
        result_octis['topic-word-matrix'] = _get_topic_word_metrix(ctm)
        result_octis['topic-document-matrix'] = _get_topic_document_metrix(ctm, training_dataset, n_samples=20)

        print('Compute evaluation metrics')

        metrics_score = dict()

        for metric in metrics:
            if metric in COHERENCE_MODEL_METRICS:
                coherencemodel = CoherenceModel(topics=topic_words, texts=texts, corpus=corpus, dictionary=dictionary, topn=10, coherence=metric.value)
                score = coherencemodel.get_coherence()
            elif metric == METRICS.TOPIC_DIVERSITY:
                score = compute_topic_diversity(result_octis, topk=10)
            elif metric == METRICS.INVERTED_RBO:
                score = compute_inverted_rbo(result_octis, topk=10)
            elif metric == METRICS.PAIRWISE_JACCARD_SIMILARITY:
                score = compute_pairwise_jaccard_similarity(result_octis, topk=10)
            else:
                raise Exception('Unknown metric: {}'.format(metric.value))

            metrics_score[metric.value] = score

            print(f'Evaluation metric ({metric.value}): {score}')

        monitor_score = metrics_score[monitor.value]

        ##########
        # Evaluation ends
        ##########

        ##########
        # Save models
        ##########

        if not model_path.exists():
            model_path.mkdir()
        
        if save_each_models:
            ctm.save(models_dir=model_path)

        ##########
        # Save models ends
        ##########

        ###########
        # Update result dict and json file
        ###########
            
        model_hyperparameters = {
            'sbert_params': sbert_params,
            'countvect_params': countvect_params,
            'ctm_params': ctm_params
        }

        if monitor_score > best_metric_score:
            best_metric_score = monitor_score
            best_model_path = model_path
            best_model = ctm
            best_hyperparameters = model_hyperparameters

        model_log_history = dict()
        model_log_history.update(metrics_score)
        model_log_history['model_name'] = model_name
        model_log_history['hyperparameters']  = model_hyperparameters

        result['best_metric'] = best_metric_score
        result['best_model_checkpoint'] = str(best_model_path)
        result['best_hyperparameters'] = best_hyperparameters
        result['log_history'].append(model_log_history)

        # save result
        with open(result_json_path, 'w') as f:
            json.dump(result, f, indent=2)
        
        print('Saved result.json at:', result_json_path)
        print('\n\n')
    
    print('Search ends')
    return best_model, best_model_path, best_hyperparameters


In [18]:
# grid search / random search

# hyperparameters
sbert_params = _init_sbert_params(model_name_or_path='all-mpnet-base-v2')
countvect_params = _init_count_vectorizer_params(max_features=2000, ngram_range=(1,1))
ctm_params = _init_ctm_params(n_components=10, hidden_sizes=(100, 100), dropout=0.2, lr=2e-3, momentum=0.99, solver="adam", num_epochs=20)

search_space_dict = {
    'sbert_params': {
        'model_name_or_path': ['all-mpnet-base-v2', 'all-roberta-large-v1']
    },
    'countvect_params': {
        'max_features' : [1500, 2000, 2500],
        'ngram_range': [[1, 1], [1, 2]]     # datatype is list as json does not support tuple
    },
    'ctm_params':{
        'n_components': [10, 20],
        'hidden_sizes': [(100, 100), (200, 200), (100, 100, 100), (200, 200, 200)],
        'num_epochs':[20, 50, 100]
    }
}

# search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
search_behaviour = SEARCH_BEHAVIOUR.RANDOM_SEARCH

# training_datetime = datetime.now()
training_datetime = datetime(2024, 1, 23, 0, 21, 11)
training_folder = Path(f'ctm_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

best_model, best_model_path, best_hyperparameters = model_search(
    X,
    X_preprocessed,
    hyperparameters={
        'sbert_params': sbert_params,
        'countvect_params': countvect_params,
        'ctm_params': ctm_params
    },
    search_space=search_space_dict,
    save_folder=training_folder,
    metrics=[METRICS.C_NPMI, METRICS.C_V, METRICS.UMASS, METRICS.C_UCI, METRICS.TOPIC_DIVERSITY, METRICS.INVERTED_RBO, METRICS.PAIRWISE_JACCARD_SIMILARITY],
    monitor=METRICS.C_NPMI,
    save_each_models=True,
    run_from_checkpoints=True,
    search_behaviour=search_behaviour,
    search_rs=42,
    search_n_iter=50
)

Loaded existing config file from ctm_random_search_20240123_002111/config.json
Hyperparameters and search space are consistent with the input parameters
Loaded existing result file from ctm_random_search_20240123_002111/result.json
Search folder: ctm_random_search_20240123_002111




Best model checkpoint: ctm_random_search_20240123_002111/ctmsb_model_name_or_path_all-mpnet-base-v2_ctm_num_epochs_100_ctm_n_components_10_ctm_hidden_sizes_(200, 200)_cvect_ngram_range_[1, 2]_cvect_max_features_2500
Best metric score: -0.01634058093071984
Best model: <contextualized_topic_models.models.ctm.CombinedTM object at 0x7fc678128970>


Skipping current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 1500}
Skipping current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}
Skipping current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_para

Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 126.15193300514125	Time: 0:00:04.192255: : 20it [01:23,  4.18s/it]
100%|██████████| 1180/1180 [00:02<00:00, 532.38it/s]
100%|██████████| 1180/1180 [00:02<00:00, 517.09it/s]
100%|██████████| 1180/1180 [00:02<00:00, 467.01it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.022405069062131307
Evaluation metric (c_v): 0.4419939284615909
Evaluation metric (u_mass): -0.06146510052604809
Evaluation metric (c_uci): -1.4714216072126418
Evaluation metric (topic_diversity): 0.65
Evaluation metric (inverted_rbo): 0.9430718514006015
Evaluation metric (pairwise_jaccard_similarity): 0.04013869913332188
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 117.76040749715688	Time: 0:00:04.184945: : 20it [01:25,  4.27s/it]
100%|██████████| 1180/1180 [00:02<00:00, 529.85it/s]
100%|██████████| 1180/1180 [00:02<00:00, 490.01it/s]
100%|██████████| 1180/1180 [00:02<00:00, 458.58it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.04352667736457535
Evaluation metric (c_v): 0.42423206005821656
Evaluation metric (u_mass): -0.019025505205554096
Evaluation metric (c_uci): -1.7152795169878814
Evaluation metric (topic_diversity): 0.78
Evaluation metric (inverted_rbo): 0.9480674231792063
Evaluation metric (pairwise_jaccard_similarity): 0.03350915414898903
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 119.77782767621818	Time: 0:00:04.175240: : 50it [03:29,  4.19s/it]
100%|██████████| 1180/1180 [00:02<00:00, 504.48it/s]
100%|██████████| 1180/1180 [00:02<00:00, 495.38it/s]
100%|██████████| 1180/1180 [00:02<00:00, 458.60it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.0038959708568421765
Evaluation metric (c_v): 0.4592628621951394
Evaluation metric (u_mass): -0.05179622466306812
Evaluation metric (c_uci): -0.8297934674912829
Evaluation metric (topic_diversity): 0.66
Evaluation metric (inverted_rbo): 0.9418689357324812
Evaluation metric (pairwise_jaccard_similarity): 0.04193753216891557
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 119.68527478581268	Time: 0:00:04.169416: : 100it [06:54,  4.15s/it]
100%|██████████| 1180/1180 [00:02<00:00, 547.51it/s]
100%|██████████| 1180/1180 [00:02<00:00, 504.95it/s]
100%|██████████| 1180/1180 [00:02<00:00, 441.48it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.0013471877673410717
Evaluation metric (c_v): 0.4523692709936279
Evaluation metric (u_mass): -0.05910723233979739
Evaluation metric (c_uci): -0.9773590596372734
Evaluation metric (topic_diversity): 0.695
Evaluation metric (inverted_rbo): 0.9539896136411654
Evaluation metric (pairwise_jaccard_similarity): 0.030547408252312915
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 117.36041986467072	Time: 0:00:04.110979: : 100it [06:50,  4.10s/it]
100%|██████████| 1180/1180 [00:02<00:00, 541.79it/s]
100%|██████████| 1180/1180 [00:02<00:00, 509.30it/s]
100%|██████████| 1180/1180 [00:02<00:00, 463.43it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.0192228713780478
Evaluation metric (c_v): 0.4450774459410975
Evaluation metric (u_mass): -0.013657520976097764
Evaluation metric (c_uci): -1.3314279432494813
Evaluation metric (topic_diversity): 0.78
Evaluation metric (inverted_rbo): 0.9562870317692064
Evaluation metric (pairwise_jaccard_similarity): 0.036043267209417876
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 111.2839107950225	Time: 0:00:04.163022: : 100it [06:52,  4.12s/it]
100%|██████████| 1180/1180 [00:02<00:00, 527.07it/s]
100%|██████████| 1180/1180 [00:02<00:00, 495.09it/s]
100%|██████████| 1180/1180 [00:02<00:00, 437.61it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.001214901179241406
Evaluation metric (c_v): 0.4508387659963097
Evaluation metric (u_mass): -0.01881238888919292
Evaluation metric (c_uci): -0.7483445329354885
Evaluation metric (topic_diversity): 0.84
Evaluation metric (inverted_rbo): 0.9665107370471429
Evaluation metric (pairwise_jaccard_similarity): 0.02231968810916179
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.33553580170066	Time: 0:00:04.067451: : 50it [03:20,  4.01s/it]
100%|██████████| 1180/1180 [00:02<00:00, 549.32it/s]
100%|██████████| 1180/1180 [00:02<00:00, 521.81it/s]
100%|██████████| 1180/1180 [00:02<00:00, 454.66it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.0069130114568306065
Evaluation metric (c_v): 0.44536760339716785
Evaluation metric (u_mass): -0.004576176553609037
Evaluation metric (c_uci): -0.5714438586053755
Evaluation metric (topic_diversity): 0.87
Evaluation metric (inverted_rbo): 0.9722331117749207
Evaluation metric (pairwise_jaccard_similarity): 0.01572449642625081
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 114.14193633696708	Time: 0:00:04.055095: : 20it [01:21,  4.08s/it]
100%|██████████| 1180/1180 [00:02<00:00, 533.23it/s]
100%|██████████| 1180/1180 [00:02<00:00, 529.94it/s]
100%|██████████| 1180/1180 [00:02<00:00, 461.10it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.016800113313361124
Evaluation metric (c_v): 0.4387913385990032
Evaluation metric (u_mass): -0.043650357204819454
Evaluation metric (c_uci): -1.1782658869876705
Evaluation metric (topic_diversity): 0.675
Evaluation metric (inverted_rbo): 0.9350703618463534
Evaluation metric (pairwise_jaccard_similarity): 0.04087657223554241
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 113.77651998893602	Time: 0:00:04.103952: : 50it [03:25,  4.10s/it]
100%|██████████| 1180/1180 [00:02<00:00, 519.71it/s]
100%|██████████| 1180/1180 [00:02<00:00, 498.58it/s]
100%|██████████| 1180/1180 [00:02<00:00, 463.01it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.015737983468166548
Evaluation metric (c_v): 0.4315667794612296
Evaluation metric (u_mass): -0.055752366846366086
Evaluation metric (c_uci): -1.1907935052385212
Evaluation metric (topic_diversity): 0.685
Evaluation metric (inverted_rbo): 0.9378375655442481
Evaluation metric (pairwise_jaccard_similarity): 0.03683596245515749
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 113.32729080433154	Time: 0:00:03.972438: : 50it [03:19,  3.99s/it]
100%|██████████| 1180/1180 [00:02<00:00, 549.91it/s]
100%|██████████| 1180/1180 [00:02<00:00, 535.61it/s]
100%|██████████| 1180/1180 [00:02<00:00, 461.27it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.0018944893303838076
Evaluation metric (c_v): 0.45132304291340297
Evaluation metric (u_mass): -0.042937292247730505
Evaluation metric (c_uci): -0.8496342541925961
Evaluation metric (topic_diversity): 0.725
Evaluation metric (inverted_rbo): 0.9550105533607894
Evaluation metric (pairwise_jaccard_similarity): 0.026016149765538715
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.33361884999618	Time: 0:00:04.191064: : 50it [03:27,  4.15s/it]
100%|██████████| 1180/1180 [00:02<00:00, 532.25it/s]
100%|██████████| 1180/1180 [00:02<00:00, 498.75it/s]
100%|██████████| 1180/1180 [00:02<00:00, 450.03it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.00013579196930781635
Evaluation metric (c_v): 0.44945925055420144
Evaluation metric (u_mass): -0.02229281110424119
Evaluation metric (c_uci): -0.6195682868039065
Evaluation metric (topic_diversity): 0.83
Evaluation metric (inverted_rbo): 0.9579776837163492
Evaluation metric (pairwise_jaccard_similarity): 0.02769368956159462
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 113.47237584548445	Time: 0:00:04.021507: : 50it [03:20,  4.00s/it]
100%|██████████| 1180/1180 [00:02<00:00, 533.01it/s]
100%|██████████| 1180/1180 [00:02<00:00, 518.14it/s]
100%|██████████| 1180/1180 [00:02<00:00, 459.35it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.011210547256574265
Evaluation metric (c_v): 0.4619663153040821
Evaluation metric (u_mass): -0.03668040412604955
Evaluation metric (c_uci): -0.6285594072616103
Evaluation metric (topic_diversity): 0.73
Evaluation metric (inverted_rbo): 0.9495193197523685
Evaluation metric (pairwise_jaccard_similarity): 0.02783661941230785
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 111.28803524635322	Time: 0:00:04.045146: : 20it [01:22,  4.11s/it]
100%|██████████| 1180/1180 [00:02<00:00, 541.35it/s]
100%|██████████| 1180/1180 [00:02<00:00, 528.13it/s]
100%|██████████| 1180/1180 [00:02<00:00, 569.35it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.018666471353272217
Evaluation metric (c_v): 0.444103763632757
Evaluation metric (u_mass): -0.024832325876313777
Evaluation metric (c_uci): -1.1985277567980381
Evaluation metric (topic_diversity): 0.83
Evaluation metric (inverted_rbo): 0.969971255183492
Evaluation metric (pairwise_jaccard_similarity): 0.023720521346940332
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 111.08086212455872	Time: 0:00:03.970094: : 100it [06:42,  4.03s/it]
100%|██████████| 1180/1180 [00:02<00:00, 550.97it/s]
100%|██████████| 1180/1180 [00:02<00:00, 515.44it/s]
100%|██████████| 1180/1180 [00:02<00:00, 464.91it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.0161712746142729
Evaluation metric (c_v): 0.42029838281860277
Evaluation metric (u_mass): -0.006917521346000735
Evaluation metric (c_uci): -0.9305043320777239
Evaluation metric (topic_diversity): 0.83
Evaluation metric (inverted_rbo): 0.9650282144271428
Evaluation metric (pairwise_jaccard_similarity): 0.023284791499445786
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 113.55763553299067	Time: 0:00:04.124515: : 50it [03:24,  4.08s/it]
100%|██████████| 1180/1180 [00:02<00:00, 539.27it/s]
100%|██████████| 1180/1180 [00:02<00:00, 523.97it/s]
100%|██████████| 1180/1180 [00:02<00:00, 468.89it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.0031875715066945927
Evaluation metric (c_v): 0.45575823146739414
Evaluation metric (u_mass): -0.05099697190384954
Evaluation metric (c_uci): -0.7797572280000715
Evaluation metric (topic_diversity): 0.695
Evaluation metric (inverted_rbo): 0.9476642395323308
Evaluation metric (pairwise_jaccard_similarity): 0.03179801930005611
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 113.2379905823634	Time: 0:00:04.291360: : 50it [03:27,  4.15s/it] 
100%|██████████| 1180/1180 [00:02<00:00, 528.37it/s]
100%|██████████| 1180/1180 [00:02<00:00, 501.19it/s]
100%|██████████| 1180/1180 [00:02<00:00, 576.02it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.005789096531604772
Evaluation metric (c_v): 0.44299375250622897
Evaluation metric (u_mass): -0.028211090371161374
Evaluation metric (c_uci): -0.6535050869151682
Evaluation metric (topic_diversity): 0.695
Evaluation metric (inverted_rbo): 0.9518340208389474
Evaluation metric (pairwise_jaccard_similarity): 0.03113989824923505
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.3273397903507	Time: 0:00:04.116747: : 50it [03:25,  4.11s/it] 
100%|██████████| 1180/1180 [00:02<00:00, 525.24it/s]
100%|██████████| 1180/1180 [00:02<00:00, 499.34it/s]
100%|██████████| 1180/1180 [00:02<00:00, 505.43it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.015165863818045577
Evaluation metric (c_v): 0.4314557890093694
Evaluation metric (u_mass): -0.025535564791045285
Evaluation metric (c_uci): -1.0552583752442448
Evaluation metric (topic_diversity): 0.83
Evaluation metric (inverted_rbo): 0.9595603114292064
Evaluation metric (pairwise_jaccard_similarity): 0.02665405343423919
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.14263188990624	Time: 0:00:04.164387: : 50it [03:27,  4.14s/it]
100%|██████████| 1180/1180 [00:02<00:00, 525.48it/s]
100%|██████████| 1180/1180 [00:02<00:00, 499.31it/s]
100%|██████████| 1180/1180 [00:02<00:00, 455.02it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.01656921340935488
Evaluation metric (c_v): 0.4375067561647626
Evaluation metric (u_mass): -0.022840284186458996
Evaluation metric (c_uci): -1.1264110050692921
Evaluation metric (topic_diversity): 0.81
Evaluation metric (inverted_rbo): 0.9559098492963493
Evaluation metric (pairwise_jaccard_similarity): 0.02974047318732561
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 111.4479065459092	Time: 0:00:04.070344: : 20it [01:24,  4.24s/it] 
100%|██████████| 1180/1180 [00:02<00:00, 532.09it/s]
100%|██████████| 1180/1180 [00:02<00:00, 512.10it/s]
100%|██████████| 1180/1180 [00:02<00:00, 436.88it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.02139352754301036
Evaluation metric (c_v): 0.43796317859183437
Evaluation metric (u_mass): -0.02759162162084256
Evaluation metric (c_uci): -1.1933083732458594
Evaluation metric (topic_diversity): 0.77
Evaluation metric (inverted_rbo): 0.9428515141353968
Evaluation metric (pairwise_jaccard_similarity): 0.036437771989887575
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 110.91769171691325	Time: 0:00:04.152519: : 100it [06:43,  4.04s/it]
100%|██████████| 1180/1180 [00:02<00:00, 517.06it/s]
100%|██████████| 1180/1180 [00:02<00:00, 504.26it/s]
100%|██████████| 1180/1180 [00:02<00:00, 439.31it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.023362142308020507
Evaluation metric (c_v): 0.4243728398973595
Evaluation metric (u_mass): -0.02869039176249688
Evaluation metric (c_uci): -1.3099561360388934
Evaluation metric (topic_diversity): 0.84
Evaluation metric (inverted_rbo): 0.9663859486120635
Evaluation metric (pairwise_jaccard_similarity): 0.024236517218973356
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 113.09362611131611	Time: 0:00:04.155997: : 100it [06:56,  4.17s/it]
100%|██████████| 1180/1180 [00:02<00:00, 518.48it/s]
100%|██████████| 1180/1180 [00:02<00:00, 513.37it/s]
100%|██████████| 1180/1180 [00:02<00:00, 448.31it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.008683610296622141
Evaluation metric (c_v): 0.4481908934759118
Evaluation metric (u_mass): -0.06271357803975869
Evaluation metric (c_uci): -1.110290614224168
Evaluation metric (topic_diversity): 0.695
Evaluation metric (inverted_rbo): 0.9431117823273308
Evaluation metric (pairwise_jaccard_similarity): 0.03353475277460937
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 111.27209011915885	Time: 0:00:04.058103: : 20it [01:22,  4.11s/it]
100%|██████████| 1180/1180 [00:02<00:00, 518.53it/s]
100%|██████████| 1180/1180 [00:02<00:00, 501.29it/s]
100%|██████████| 1180/1180 [00:02<00:00, 447.95it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.033667088022001086
Evaluation metric (c_v): 0.41472769598233683
Evaluation metric (u_mass): -0.0479807676554168
Evaluation metric (c_uci): -1.3417423817895242
Evaluation metric (topic_diversity): 0.81
Evaluation metric (inverted_rbo): 0.953935299950635
Evaluation metric (pairwise_jaccard_similarity): 0.03412643809960631
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 113.26561937663796	Time: 0:00:04.007417: : 100it [06:42,  4.03s/it]
100%|██████████| 1180/1180 [00:02<00:00, 531.20it/s]
100%|██████████| 1180/1180 [00:02<00:00, 510.88it/s]
100%|██████████| 1180/1180 [00:02<00:00, 460.30it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.0007882726820865948
Evaluation metric (c_v): 0.44965085303056956
Evaluation metric (u_mass): -0.03385625905630996
Evaluation metric (c_uci): -0.8895163721623959
Evaluation metric (topic_diversity): 0.7
Evaluation metric (inverted_rbo): 0.9550669426948872
Evaluation metric (pairwise_jaccard_similarity): 0.029244849442014168
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 113.39269967115563	Time: 0:00:04.373333: : 50it [03:28,  4.18s/it]
100%|██████████| 1180/1180 [00:02<00:00, 528.48it/s]
100%|██████████| 1180/1180 [00:02<00:00, 515.79it/s]
100%|██████████| 1180/1180 [00:02<00:00, 455.35it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.014847039855361576
Evaluation metric (c_v): 0.45603220048038046
Evaluation metric (u_mass): -0.05065805084029875
Evaluation metric (c_uci): -0.47085914305878046
Evaluation metric (topic_diversity): 0.67
Evaluation metric (inverted_rbo): 0.9418049665372557
Evaluation metric (pairwise_jaccard_similarity): 0.035002625242155944
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.06883618303837	Time: 0:00:04.037958: : 50it [03:23,  4.06s/it]
100%|██████████| 1180/1180 [00:02<00:00, 512.47it/s]
100%|██████████| 1180/1180 [00:02<00:00, 501.43it/s]
100%|██████████| 1180/1180 [00:02<00:00, 451.91it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.024995489289709016
Evaluation metric (c_v): 0.42694906694498613
Evaluation metric (u_mass): -0.0467620040398521
Evaluation metric (c_uci): -1.2724980375429022
Evaluation metric (topic_diversity): 0.85
Evaluation metric (inverted_rbo): 0.9713725201806349
Evaluation metric (pairwise_jaccard_similarity): 0.021020142949967506
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.00241719149653	Time: 0:00:04.028245: : 50it [03:21,  4.03s/it]
100%|██████████| 1180/1180 [00:02<00:00, 523.43it/s]
100%|██████████| 1180/1180 [00:02<00:00, 514.06it/s]
100%|██████████| 1180/1180 [00:02<00:00, 448.36it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.0020837911113456425
Evaluation metric (c_v): 0.4658087600485712
Evaluation metric (u_mass): -0.025566932790549207
Evaluation metric (c_uci): -0.7662948795469878
Evaluation metric (topic_diversity): 0.87
Evaluation metric (inverted_rbo): 0.9726540724992063
Evaluation metric (pairwise_jaccard_similarity): 0.018499407560295073
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 111.15640762238506	Time: 0:00:03.975164: : 20it [01:20,  4.04s/it]
100%|██████████| 1180/1180 [00:02<00:00, 527.43it/s]
100%|██████████| 1180/1180 [00:02<00:00, 512.76it/s]
100%|██████████| 1180/1180 [00:02<00:00, 449.29it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.019778167212297092
Evaluation metric (c_v): 0.4446891916490384
Evaluation metric (u_mass): -0.02157180705509547
Evaluation metric (c_uci): -1.158705397171771
Evaluation metric (topic_diversity): 0.86
Evaluation metric (inverted_rbo): 0.9661600463806349
Evaluation metric (pairwise_jaccard_similarity): 0.021975690861139777
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 111.15944916884509	Time: 0:00:04.165466: : 100it [06:48,  4.08s/it]
100%|██████████| 1180/1180 [00:02<00:00, 497.29it/s]
100%|██████████| 1180/1180 [00:02<00:00, 490.70it/s]
100%|██████████| 1180/1180 [00:02<00:00, 417.33it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.011382675764187427
Evaluation metric (c_v): 0.43942880658097766
Evaluation metric (u_mass): -0.004106981862058852
Evaluation metric (c_uci): -0.9612329666000606
Evaluation metric (topic_diversity): 0.79
Evaluation metric (inverted_rbo): 0.9575633490592064
Evaluation metric (pairwise_jaccard_similarity): 0.034393644042766854
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 111.02212079204271	Time: 0:00:04.211622: : 100it [07:08,  4.28s/it]
100%|██████████| 1180/1180 [00:02<00:00, 515.80it/s]
100%|██████████| 1180/1180 [00:02<00:00, 496.44it/s]
100%|██████████| 1180/1180 [00:02<00:00, 446.28it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.03618487887589
Evaluation metric (c_v): 0.4292095359200562
Evaluation metric (u_mass): -0.0353391678546612
Evaluation metric (c_uci): -1.6627652767764478
Evaluation metric (topic_diversity): 0.84
Evaluation metric (inverted_rbo): 0.963482413043492
Evaluation metric (pairwise_jaccard_similarity): 0.028357658853014885
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 113.32740183822982	Time: 0:00:04.245882: : 100it [06:52,  4.12s/it]
100%|██████████| 1180/1180 [00:02<00:00, 507.80it/s]
100%|██████████| 1180/1180 [00:02<00:00, 509.86it/s]
100%|██████████| 1180/1180 [00:02<00:00, 566.46it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.0119485099451722
Evaluation metric (c_v): 0.4360222856552113
Evaluation metric (u_mass): -0.06913261207922486
Evaluation metric (c_uci): -1.1360586728930935
Evaluation metric (topic_diversity): 0.665
Evaluation metric (inverted_rbo): 0.9417348163224812
Evaluation metric (pairwise_jaccard_similarity): 0.038035530996260954
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 113.50896870594494	Time: 0:00:04.604682: : 50it [03:32,  4.26s/it]
100%|██████████| 1180/1180 [00:02<00:00, 516.13it/s]
100%|██████████| 1180/1180 [00:02<00:00, 505.99it/s]
100%|██████████| 1180/1180 [00:02<00:00, 433.16it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.0008543689740492539
Evaluation metric (c_v): 0.43594910447325363
Evaluation metric (u_mass): -0.04147507298165241
Evaluation metric (c_uci): -0.7805076860376771
Evaluation metric (topic_diversity): 0.665
Evaluation metric (inverted_rbo): 0.9413739278497368
Evaluation metric (pairwise_jaccard_similarity): 0.03617855267684173
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.17671617761924	Time: 0:00:04.251412: : 50it [03:31,  4.24s/it]
100%|██████████| 1180/1180 [00:02<00:00, 505.73it/s]
100%|██████████| 1180/1180 [00:02<00:00, 438.04it/s]
100%|██████████| 1180/1180 [00:02<00:00, 447.15it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.024900778529041958
Evaluation metric (c_v): 0.42354619492206497
Evaluation metric (u_mass): -0.032610879085398586
Evaluation metric (c_uci): -1.1229607797220156
Evaluation metric (topic_diversity): 0.8
Evaluation metric (inverted_rbo): 0.9579812121477778
Evaluation metric (pairwise_jaccard_similarity): 0.03132286052822689
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 111.23390872039666	Time: 0:00:04.241204: : 20it [01:23,  4.18s/it]
100%|██████████| 1180/1180 [00:02<00:00, 518.18it/s]
100%|██████████| 1180/1180 [00:02<00:00, 506.27it/s]
100%|██████████| 1180/1180 [00:02<00:00, 450.30it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.01582852242098282
Evaluation metric (c_v): 0.4299724451643674
Evaluation metric (u_mass): -0.025581771002452236
Evaluation metric (c_uci): -0.9970560482768305
Evaluation metric (topic_diversity): 0.84
Evaluation metric (inverted_rbo): 0.9611506545749207
Evaluation metric (pairwise_jaccard_similarity): 0.022527997553797353
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.14201096137568	Time: 0:00:04.139123: : 50it [03:31,  4.23s/it]
100%|██████████| 1180/1180 [00:02<00:00, 516.27it/s]
100%|██████████| 1180/1180 [00:02<00:00, 498.06it/s]
100%|██████████| 1180/1180 [00:02<00:00, 420.38it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.005296554119026634
Evaluation metric (c_v): 0.44053318198163094
Evaluation metric (u_mass): -0.016771171647793837
Evaluation metric (c_uci): -0.8217342930067375
Evaluation metric (topic_diversity): 0.83
Evaluation metric (inverted_rbo): 0.962464734793492
Evaluation metric (pairwise_jaccard_similarity): 0.026770630279402206
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 113.24775143171186	Time: 0:00:04.228096: : 100it [07:01,  4.22s/it]
100%|██████████| 1180/1180 [00:02<00:00, 516.51it/s]
100%|██████████| 1180/1180 [00:02<00:00, 492.86it/s]
100%|██████████| 1180/1180 [00:02<00:00, 431.96it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.013643061901091168
Evaluation metric (c_v): 0.4411398566738045
Evaluation metric (u_mass): -0.050770121564050266
Evaluation metric (c_uci): -1.2554657063042132
Evaluation metric (topic_diversity): 0.73
Evaluation metric (inverted_rbo): 0.9535994983431579
Evaluation metric (pairwise_jaccard_similarity): 0.027554179566563454
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 113.30175669300445	Time: 0:00:03.996523: : 100it [06:42,  4.02s/it]
100%|██████████| 1180/1180 [00:02<00:00, 522.48it/s]
100%|██████████| 1180/1180 [00:02<00:00, 526.65it/s]
100%|██████████| 1180/1180 [00:02<00:00, 458.25it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.004025450633727128
Evaluation metric (c_v): 0.4519716532098581
Evaluation metric (u_mass): -0.052269215945936046
Evaluation metric (c_uci): -0.7581013176115248
Evaluation metric (topic_diversity): 0.695
Evaluation metric (inverted_rbo): 0.9509793937574436
Evaluation metric (pairwise_jaccard_similarity): 0.03024987749201533
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 113.09048830765198	Time: 0:00:04.116330: : 50it [03:24,  4.08s/it]
100%|██████████| 1180/1180 [00:02<00:00, 523.78it/s]
100%|██████████| 1180/1180 [00:02<00:00, 500.73it/s]
100%|██████████| 1180/1180 [00:02<00:00, 452.95it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.002517379442514077
Evaluation metric (c_v): 0.44350202397185406
Evaluation metric (u_mass): -0.045635050597720314
Evaluation metric (c_uci): -0.7750037648537296
Evaluation metric (topic_diversity): 0.69
Evaluation metric (inverted_rbo): 0.9459110708032706
Evaluation metric (pairwise_jaccard_similarity): 0.03524432857168718
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 113.052032192447	Time: 0:00:04.026962: : 50it [03:22,  4.04s/it]  
100%|██████████| 1180/1180 [00:02<00:00, 522.76it/s]
100%|██████████| 1180/1180 [00:02<00:00, 506.72it/s]
100%|██████████| 1180/1180 [00:02<00:00, 436.03it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.00888411651333724
Evaluation metric (c_v): 0.4599606751703931
Evaluation metric (u_mass): -0.04630400562683048
Evaluation metric (c_uci): -0.671891483030875
Evaluation metric (topic_diversity): 0.73
Evaluation metric (inverted_rbo): 0.9541273685609775
Evaluation metric (pairwise_jaccard_similarity): 0.027604873897850928
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.10146308814802	Time: 0:00:04.049318: : 50it [03:24,  4.09s/it]
100%|██████████| 1180/1180 [00:02<00:00, 532.05it/s]
100%|██████████| 1180/1180 [00:02<00:00, 508.88it/s]
100%|██████████| 1180/1180 [00:02<00:00, 572.52it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.006579957678364714
Evaluation metric (c_v): 0.44612087880452556
Evaluation metric (u_mass): -0.014379631270424877
Evaluation metric (c_uci): -0.6642249422064144
Evaluation metric (topic_diversity): 0.87
Evaluation metric (inverted_rbo): 0.9733929346057143
Evaluation metric (pairwise_jaccard_similarity): 0.017046974735313226
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 113.20137333243049	Time: 0:00:04.112963: : 100it [06:48,  4.09s/it]
100%|██████████| 1180/1180 [00:02<00:00, 516.07it/s]
100%|██████████| 1180/1180 [00:02<00:00, 504.16it/s]
100%|██████████| 1180/1180 [00:02<00:00, 434.26it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.0008797719700542224
Evaluation metric (c_v): 0.46416128305194365
Evaluation metric (u_mass): -0.05589096071060153
Evaluation metric (c_uci): -0.9790589704979864
Evaluation metric (topic_diversity): 0.745
Evaluation metric (inverted_rbo): 0.9620784800721804
Evaluation metric (pairwise_jaccard_similarity): 0.023910071877319725
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 111.27972256156526	Time: 0:00:04.247542: : 20it [01:23,  4.16s/it]
100%|██████████| 1180/1180 [00:02<00:00, 524.70it/s]
100%|██████████| 1180/1180 [00:02<00:00, 501.12it/s]
100%|██████████| 1180/1180 [00:02<00:00, 440.74it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.003139562747977248
Evaluation metric (c_v): 0.43973717433523374
Evaluation metric (u_mass): -0.01585013590048491
Evaluation metric (c_uci): -0.46399260198878095
Evaluation metric (topic_diversity): 0.8
Evaluation metric (inverted_rbo): 0.950922654147619
Evaluation metric (pairwise_jaccard_similarity): 0.03141077093605474
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 113.73916451257604	Time: 0:00:04.128098: : 20it [01:22,  4.14s/it]
100%|██████████| 1180/1180 [00:02<00:00, 516.68it/s]
100%|██████████| 1180/1180 [00:02<00:00, 499.33it/s]
100%|██████████| 1180/1180 [00:02<00:00, 450.94it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.016652439399946654
Evaluation metric (c_v): 0.43360865706880025
Evaluation metric (u_mass): -0.054498632486339224
Evaluation metric (c_uci): -1.145853102346401
Evaluation metric (topic_diversity): 0.645
Evaluation metric (inverted_rbo): 0.9325892246204135
Evaluation metric (pairwise_jaccard_similarity): 0.04199908957119319
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 112.86371176722092	Time: 0:00:04.158859: : 100it [06:44,  4.04s/it]
100%|██████████| 1180/1180 [00:02<00:00, 530.10it/s]
100%|██████████| 1180/1180 [00:02<00:00, 520.99it/s]
100%|██████████| 1180/1180 [00:02<00:00, 455.65it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.024055018223118688
Evaluation metric (c_v): 0.4412536736182925
Evaluation metric (u_mass): -0.1147810306482672
Evaluation metric (c_uci): -1.5641806037982606
Evaluation metric (topic_diversity): 0.7
Evaluation metric (inverted_rbo): 0.9587390073157519
Evaluation metric (pairwise_jaccard_similarity): 0.0304858508500353
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100, 100), 'countvect_params__ngram_range': [1, 1], 'countvect_params__max_features': 1500}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 111.36397520781979	Time: 0:00:04.185548: : 20it [01:22,  4.14s/it]
100%|██████████| 1180/1180 [00:02<00:00, 513.28it/s]
100%|██████████| 1180/1180 [00:02<00:00, 505.90it/s]
100%|██████████| 1180/1180 [00:02<00:00, 457.07it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.029446820934948898
Evaluation metric (c_v): 0.41376924044215524
Evaluation metric (u_mass): -0.023800787725279954
Evaluation metric (c_uci): -1.3212243579927894
Evaluation metric (topic_diversity): 0.81
Evaluation metric (inverted_rbo): 0.9542839264392063
Evaluation metric (pairwise_jaccard_similarity): 0.0307055765776096
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 50, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [3772800/3774950]	Train Loss: 111.00010600263937	Time: 0:00:03.976478: : 50it [03:21,  4.03s/it]
100%|██████████| 1180/1180 [00:02<00:00, 525.28it/s]
100%|██████████| 1180/1180 [00:02<00:00, 507.58it/s]
100%|██████████| 1180/1180 [00:02<00:00, 453.80it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.007796926318669045
Evaluation metric (c_v): 0.4346345721731887
Evaluation metric (u_mass): -0.055355372647268565
Evaluation metric (c_uci): -0.9687860259198329
Evaluation metric (topic_diversity): 0.84
Evaluation metric (inverted_rbo): 0.9663677034992063
Evaluation metric (pairwise_jaccard_similarity): 0.02491877842755036
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-mpnet-base-v2', 'ctm_params__num_epochs': 20, 'ctm_params__n_components': 10, 'ctm_params__hidden_sizes': (100, 100), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [20/20]	 Seen Samples: [1509120/1509980]	Train Loss: 111.31449293180276	Time: 0:00:03.990766: : 20it [01:21,  4.09s/it]
100%|██████████| 1180/1180 [00:02<00:00, 533.59it/s]
100%|██████████| 1180/1180 [00:02<00:00, 516.20it/s]
100%|██████████| 1180/1180 [00:02<00:00, 440.21it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): -0.010799783589159314
Evaluation metric (c_v): 0.436333377235598
Evaluation metric (u_mass): -0.01311349848400469
Evaluation metric (c_uci): -0.8855430237827646
Evaluation metric (topic_diversity): 0.81
Evaluation metric (inverted_rbo): 0.9527252665606349
Evaluation metric (pairwise_jaccard_similarity): 0.02945763100561862
Saved result.json at: ctm_random_search_20240123_002111/result.json



Current search space: {'sbert_params__model_name_or_path': 'all-roberta-large-v1', 'ctm_params__num_epochs': 100, 'ctm_params__n_components': 20, 'ctm_params__hidden_sizes': (200, 200), 'countvect_params__ngram_range': [1, 2], 'countvect_params__max_features': 2000}




Found existing sbert embeddings at ctm_random_search_20240123_002111/embeddings_all-roberta-large-v1.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (200, 200)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [100/100]	 Seen Samples: [7545600/7549900]	Train Loss: 112.92632984164612	Time: 0:00:03.994332: : 100it [06:46,  4.07s/it]
100%|██████████| 1180/1180 [00:02<00:00, 517.85it/s]
100%|██████████| 1180/1180 [00:02<00:00, 497.93it/s]
100%|██████████| 1180/1180 [00:02<00:00, 564.28it/s]


Compute evaluation metrics
Evaluation metric (c_npmi): 0.0033628898156659497
Evaluation metric (c_v): 0.464137135987051
Evaluation metric (u_mass): -0.04618440021873823
Evaluation metric (c_uci): -0.9267832903431461
Evaluation metric (topic_diversity): 0.76
Evaluation metric (inverted_rbo): 0.9659142649526692
Evaluation metric (pairwise_jaccard_similarity): 0.020351058244165637
Saved result.json at: ctm_random_search_20240123_002111/result.json



Search ends




In [69]:
# load the best model from the checkpoints

search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
training_datetime = datetime(2024, 1, 22, 16, 6, 40)
training_folder = Path(f'ctm_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)


# load the embeddings
model_name_or_path = training_result['best_hyperparameters']['sbert_params']['model_name_or_path']
embeddings_path = training_folder.joinpath(f'embeddings_{model_name_or_path}.pkl')
with open(embeddings_path, 'rb') as f:
    embeddings = np.load(f)

best_model_path = training_result['best_model_checkpoint']
ctm_hyperparameters = training_result['best_hyperparameters']['ctm_params']

ctm_hyperparameters['bow_size'] = 2000
ctm_hyperparameters['contextual_size'] = 768

# best_model_path = [p for p in Path(best_model_path).iterdir() if p.is_dir()][-1]        # get the last dir (since there 's only one dir inside) -> get the only dir

best_model = _load_ctm_model(Path(best_model_path), ctm_hyperparameters, epoch=99)
topic_lists = best_model.get_topic_lists(k=10)



In [72]:
topic_lists[8]

['content',
 'update',
 'game',
 'hour',
 'new',
 'one',
 'time',
 'developer',
 'play',
 'still']

inference / evaluation

In [62]:
# create bow

countvect_params = training_result['best_hyperparameters']['countvect_params']
countvect_params['ngram_range'] = tuple(countvect_params['ngram_range'])     # convert list to tuple

vectorizer = CountVectorizer(**countvect_params, max_features=2000)
vectorizer.fit_transform(X_preprocessed)
temp_vocabulary = set(vectorizer.get_feature_names_out())

preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in temp_vocabulary])
                    for doc in X_preprocessed]
text_for_bow = preprocessed_docs_tmp

tp = TopicModelDataPreparation()

training_dataset = tp.fit(text_for_contextual=X, text_for_bow=text_for_bow, custom_embeddings=embeddings)

In [63]:
training_dataset.X_bow.todense().shape

(75499, 2000)

In [73]:
doc_topic_distribution = best_model.get_doc_topic_distribution(training_dataset, n_samples=20)

top_docs = best_model.get_top_documents_per_topic_id(X, doc_topic_distribution, 8, k=10)

  0%|          | 0/1180 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [76]:
for tt in [t[0] for t in top_docs]:
    print(tt)

LEGIT THE BEST GAME EVER BETTER THAN MINECRAFT, SO MANY THINGS TO DO!!!!!!! BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!
Just realized I've had this game for years and never reviewed it. Which is just horrible of me. Of all the games in my steam library this deserves a review. I've enjoyed this 

In [50]:
# within the topic lists (the words)
# find out common words between topics

from itertools import combinations

topic_list = best_model.get_topic_lists(k=10)

common_words = set()
for topic1, topic2 in combinations(topic_list, 2):
    common_words.update(set(topic1).intersection(set(topic2)))

common_words = list(common_words)
common_words.sort()
common_words

['boss',
 'buy',
 'check',
 'content',
 'course',
 'explore',
 'felt',
 'friend',
 'fun',
 'game',
 'get',
 'great',
 'hour',
 'item',
 'like',
 'list',
 'love',
 'mention',
 'minecraft',
 'new',
 'number',
 'one',
 'play',
 'recommend',
 'say',
 'special',
 'spoil',
 'still',
 'stuff',
 'terrarium',
 'time',
 'update',
 'well',
 'world']