Demo ipynb for CTM (hyperparameters grid/random search)

Combined TM

In [1]:
import pandas as pd
import numpy as np


from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
# from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

import nltk
import os

from pathlib import Path
import json
from datetime import datetime

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset_path = Path('../../dataset/topic_modelling/top_10_games/00_Terraria.pkl')
dataset_path = Path('../../dataset/topic_modelling/top_11_genres/01_Indie.pkl')

dataset = pd.read_pickle(dataset_path)

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 741913 entries, 25636 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         741913 non-null  int64 
 1   app_id        741913 non-null  int64 
 2   app_name      741913 non-null  object
 3   review_text   741913 non-null  object
 4   review_score  741913 non-null  int64 
 5   review_votes  741913 non-null  int64 
 6   genre_id      741913 non-null  object
 7   category_id   741913 non-null  object
dtypes: int64(4), object(4)
memory usage: 50.9+ MB


In [3]:
%load_ext autoreload

In [4]:
# data preprocessing

import sys
sys.path.append('../../sa/')

%autoreload 2
import str_cleaning_functions

# copied from lda_demo_gridsearch.ipynb
def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_non_letters(x))
    df[review] = df[review].apply(lambda x: x.lower())
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_stopword(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

# def cleaning_strlist(str_list):
#     str_list = list(map(lambda x: clean(x), str_list))
#     str_list = list(map(lambda x: deEmojify(x), str_list))

#     str_list = list(map(lambda x: x.lower(), str_list))
#     str_list = list(map(lambda x: remove_num(x), str_list))
#     str_list = list(map(lambda x: unify_whitespaces(x), str_list))

#     str_list = list(map(lambda x: _deaccent(x), str_list))
#     str_list = list(map(lambda x: remove_non_alphabets(x), str_list))
#     str_list = list(map(lambda x: remove_stopword(x), str_list))
#     return str_list

# copied from bert_demo_gridsearch.ipynb
def cleaning_little(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))


In [5]:
# create a copy of the dataset, as we need both untouched text and cleaned text

dataset_preprocessed = dataset.copy()

In [6]:
cleaning(dataset_preprocessed, 'review_text')


cleaning_little(dataset, 'review_text')

In [7]:
X_preprocessed_temp = dataset_preprocessed['review_text'].values
X_temp = dataset['review_text'].values

In [8]:
assert X_temp.shape, X_preprocessed_temp.shape

In [9]:
assert len(X_temp), len(X_preprocessed_temp)

In [10]:
# remove docs with 0 len

X, X_preprocessed = [], []

for i, (doc, doc_preprocessed) in enumerate(zip(list(X_temp), list(X_preprocessed_temp))):
    if len(doc) == 0 or len(doc_preprocessed) == 0:
        continue

    X.append(doc)
    X_preprocessed.append(doc_preprocessed)

In [11]:
len(X), len(X_preprocessed)

(737139, 737139)

Apply lemmatizing to the preprocessed dataset as well (for BoW)

In [12]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

In [13]:
X_preprocessed = list(map(lambda x: lemmatization(x), X_preprocessed))
X_preprocessed = list(map(lambda x: ' '.join(x), X_preprocessed))

Training

In [14]:
# copy from: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/utils/data_preparation.py#L44
# call bert_embeddings_from_list() to produce embeddings by ourself

import warnings
from sentence_transformers import SentenceTransformer
import torch
import platform


if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series mac machine

print(device)

def bert_embeddings_from_list(
    texts, 
    model_name_or_path, 
    batch_size=32, 
    max_seq_length=None,            # 128 is the default valule in TopicModelDataPreparation() init. Passing none to use the default value of each model
    device='cpu'):
    """
    Creates SBERT Embeddings from a list
    """

    model = SentenceTransformer(model_name_or_path, device=device)

    if max_seq_length is not None:
        model.max_seq_length = max_seq_length
    else:
        max_seq_length = model.max_seq_length

    check_max_local_length(max_seq_length, texts)

    return np.array(model.encode(texts, batch_size=batch_size, show_progress_bar=True))


def check_max_local_length(max_seq_length, texts):
    max_local_length = np.max([len(t.split()) for t in texts])
    if max_local_length > max_seq_length:
        warnings.simplefilter("always", DeprecationWarning)
        warnings.warn(
            f"the longest document in your collection has {max_local_length} words, the model instead "
            f"truncates to {max_seq_length} tokens."
        )

cuda


In [15]:
from gensim.models import CoherenceModel
from copy import deepcopy

from sklearn.model_selection import ParameterGrid, ParameterSampler

sys.path.append('../')

from eval_metrics import compute_inverted_rbo, compute_topic_diversity, compute_pairwise_jaccard_similarity, \
                        METRICS, SEARCH_BEHAVIOUR, COHERENCE_MODEL_METRICS

In [16]:
def _print_message(message):
    '''Print message with a timestamp in front of it

    Timestamp format: YYYY-MM-DD HH:MM:SS,mmm
    '''
    print(f'{datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]} - {message}')

In [17]:
# init params

def _init_count_vectorizer_params(
        max_features=2000,
        ngram_range=(1,1)
):
    params_dict = {}
    params_dict['max_features'] = max_features
    params_dict['ngram_range'] = ngram_range

    return params_dict

def _init_sbert_params(
    model_name_or_path='all-mpnet-base-v2'
):
    params_dict = {}
    params_dict['model_name_or_path'] = model_name_or_path

    return params_dict

# params are copied from source code of CTM: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/models/ctm.py#L131
# commented params are params that has no plan on fine-tuning them (not significant to our project)
def _init_ctm_params(
        # bow_size,
        # contextual_size,
        # inference_type="combined",
        n_components=10,
        # model_type="prodLDA",
        hidden_sizes=[100, 100],        # pass as list as json does not support tuple
        # activation="softplus",
        dropout=0.2,
        # learn_priors=True,
        # batch_size=64,
        lr=2e-3,
        momentum=0.99,
        solver="adam",
        num_epochs=100,
        # reduce_on_plateau=False,      # only valid if there's a testing data (seems no need to havbe label, just partition a testing dataset with train_test_split()))
        # num_data_loader_workers=mp.cpu_count(),
        # label_size=0,
        # loss_weights=None
):
    params_dict = {}
    # params_dict['bow_size'] = bow_size                        # decided by the count vectorizer params (max_features)
    # params_dict['contextual_size'] = contextual_size          # decided by the sbert model
    # params_dict['inference_type'] = inference_type
    params_dict['n_components'] = n_components
    # params_dict['model_type'] = model_type
    params_dict['hidden_sizes'] = hidden_sizes
    # params_dict['activation'] = activation
    params_dict['dropout'] = dropout
    # params_dict['learn_priors'] = learn_priors
    # params_dict['batch_size'] = batch_size
    params_dict['lr'] = lr
    params_dict['momentum'] = momentum
    params_dict['solver'] = solver
    params_dict['num_epochs'] = num_epochs

    return params_dict

In [18]:
def _init_config_dict(config_path:Path, model_name:str, hyperparameters:dict, search_space_dict:dict, 
                      metrics:list[METRICS], monitor:METRICS,
                      search_behaviour:SEARCH_BEHAVIOUR, search_rs:int, search_n_iter:int):
    
    if not config_path.exists():
        config = {}

        sbert_params = _init_sbert_params(**hyperparameters['sbert_params'])
        countvect_params = _init_count_vectorizer_params(**hyperparameters['countvect_params'])
        ctm_params = _init_ctm_params(**hyperparameters['ctm_params'])

        config['model'] = model_name
        config['sbert_params'] = sbert_params
        config['countvect_params'] = countvect_params
        config['ctm_params'] = ctm_params

        if 'sbert_params' in search_space_dict:
            for k in search_space_dict['sbert_params'].keys():
                sbert_params.pop(k, '')     # add a default value to avoid key error
        if 'countvect_params' in search_space_dict:
            for k in search_space_dict['countvect_params'].keys():
                countvect_params.pop(k, '')
        if 'ctm_params' in search_space_dict:
            for k in search_space_dict['ctm_params'].keys():
                ctm_params.pop(k, '')

        config['search_space'] = search_space_dict
        config['metrics'] = list(map(lambda x: x.value, metrics))
        config['monitor'] = monitor.value

        config['search_behaviour'] = search_behaviour.value
        if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
            config['search_rs'] = search_rs
            config['search_n_iter'] = search_n_iter

        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        _print_message('Created config file at {}'.format(config_path))
        # print('Created config file at {}'.format(config_path))
    else:
        with open(config_path, 'r') as f:
            config = json.load(f)

        # check whether the input params are consistent with the config file
        assert config['model'] == model_name, 'input model_name is not consistent with the config["model"]'
        assert config['metrics'] == list(map(lambda x: x.value, metrics)), 'input metrics is not consistent with config["metrics"]'
        assert config['monitor'] == monitor.value, 'input monitor is not consistent with config["monitor"]'
        assert config['search_behaviour'] == search_behaviour.value, 'input search_behaviour is not consistent with config["search_behaviour"]'
        if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
            assert config['search_rs'] == search_rs, 'input search_rs is not consistent with config["search_rs"]'
            assert config['search_n_iter'] == search_n_iter, 'input search_n_iter is not consistent with config["search_n_iter"]'

        # check whether the hyperparameters are consistent with the config file
        sbert_params = _init_sbert_params(**hyperparameters['sbert_params'])
        countvect_params = _init_count_vectorizer_params(**hyperparameters['countvect_params'])
        ctm_params = _init_ctm_params(**hyperparameters['ctm_params'])

        assert config['sbert_params'].keys() <= sbert_params.keys(), 'existing config["sbert_params"] contains additional hyperparameters'
        assert config['countvect_params'].keys() <= countvect_params.keys(), 'existing config["countvect_params"] contains additional hyperparameters'
        assert config['ctm_params'].keys() <= ctm_params.keys(), 'existing config["ctm_params"] contains additional hyperparameters'

        for key in sbert_params.keys() & config['sbert_params'].keys():
            assert sbert_params[key] == config['sbert_params'][key], 'existing config["sbert_params"] contains different hyperparameters'
        for key in countvect_params.keys() & config['countvect_params'].keys():
            assert countvect_params[key] == config['countvect_params'][key], 'existing config["countvect_params"] contains different hyperparameters'
        for key in ctm_params.keys() & config['ctm_params'].keys():
            assert ctm_params[key] == config['ctm_params'][key], 'existing config["ctm_params"] contains different hyperparameters'

        # check whether the search_space is consistent with the config file
        if 'sbert_params' in config['search_space']:
            assert config['search_space']['sbert_params'].keys() == search_space_dict['sbert_params'].keys(), 'input search_space_dict["sbert_params"] contains different hyperparameter keys than existing config["search_space"]["sbert_params"]'
            for k in search_space_dict['sbert_params'].keys():
                assert k in config['search_space']['sbert_params'], f'input search_space_dict["sbert_params"]["{key}"] contains value than existing config["search_space"]["sbert_params"]["{key}"]'
        if 'countvect_params' in config['search_space']:
            assert config['search_space']['countvect_params'].keys() == search_space_dict['countvect_params'].keys(), 'input search_space_dict["countvect_params"] contains different hyperparameter keys than existing config["search_space"]["countvect_params"]'
            for k in search_space_dict['countvect_params'].keys():
                assert k in config['search_space']['countvect_params'], f'input search_space_dict["countvect_params"]["{key}"] contains value than existing config["search_space"]["countvect_params"]["{key}"]'
        if 'ctm_params' in config['search_space']:
            assert config['search_space']['ctm_params'].keys() == search_space_dict['ctm_params'].keys(), 'input search_space_dict["ctm_params"] contains different hyperparameter keys than existing config["search_space"]["ctm_params"]'
            for k in search_space_dict['ctm_params'].keys():
                assert k in config['search_space']['ctm_params'], f'input search_space_dict["ctm_params"]["{key}"] contains value than existing config["search_space"]["ctm_params"]["{key}"]'
        
        _print_message('Loaded existing config file from {}'.format(config_path))
        _print_message('Hyperparameters and search space are consistent with the input parameters')
        # print('Loaded existing config file from {}'.format(config_path))
        # print('Hyperparameters and search space are consistent with the input parameters')

    return config


In [19]:
def _init_result_dict(result_path:Path, monitor_type:str):
    if not result_path.exists():
        result = {}

        result['best_metric'] = -float('inf')
        result['best_model_checkpoint'] = ""
        result['best_hyperparameters'] = dict()
        result["monitor_type"] = monitor_type
        result["log_history"] = list()

    else:
        with open(result_path, 'r') as f:
            result = json.load(f)

        assert result['monitor_type'] == monitor_type

        _print_message('Loaded existing result file from {}'.format(result_path))
        # print('Loaded existing result file from {}'.format(result_path))
    
    return result

In [20]:
def _load_ctm_model(model_checkpoint:Path, ctm_params:dict):

    model_path = [p for p in model_checkpoint.iterdir() if p.is_dir()][-1]        # get the last dir (since there 's only one dir inside) -> get the only dir

    # get the first file in the dir
    epoch_file = [p for p in model_path.iterdir() if p.is_file()][0]
    epoch_num = int(epoch_file.stem.split('_')[-1])

    if 'hidden_sizes' in ctm_params:
        ctm_params['hidden_sizes'] = tuple(ctm_params['hidden_sizes'])

    ctm = CombinedTM(**ctm_params)

    ctm.load(model_path, epoch_num)

    return ctm

In [21]:
def _get_topics(ctm, k=10):
    return ctm.get_topic_lists(k)

def _get_topic_word_metrix(ctm):
    return ctm.get_topic_word_distribution()

# ref: https://contextualized-topic-models.readthedocs.io/en/latest/readme.html (go to the section: Mono-Lingual Topic Modeling)
# testing_dataset = qt.transform(text_for_contextual=testing_text_for_contextual, text_for_bow=testing_text_for_bow)
# # n_sample how many times to sample the distribution (see the doc)
# ctm.get_doc_topic_distribution(testing_dataset, n_samples=20) # returns a (n_documents, n_topics) matrix with the topic distribution of each document
def _get_topic_document_metrix(ctm, dataset, n_samples=20):
    return ctm.get_doc_topic_distribution(dataset, n_samples=n_samples).T

In [25]:
import pickle
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from contextualized_topic_models.datasets.dataset import CTMDataset

def model_search(X_contextual, X_bow, hyperparameters:dict, search_space:dict, save_folder:Path,
                 additional_stopwords:list[str]=None,
                 metrics:list[METRICS]=[METRICS.C_NPMI], monitor:METRICS=METRICS.C_NPMI, 
                 save_each_models=True, run_from_checkpoints=False,
                 search_behaviour=SEARCH_BEHAVIOUR.GRID_SEARCH, search_rs=42, search_n_iter=10):
    
    config_json_path = save_folder.joinpath('config.json')
    result_json_path = save_folder.joinpath('result.json')

    if monitor not in metrics:
        raise Exception('monitor is not in metrics. Please modify the metrics passed in.')

    if run_from_checkpoints:
        if not save_folder.exists():
            _print_message('Save folder:' + str(save_folder.resolve()) + ' does not exist. Function terminates.')
            # print('Save folder:' + str(save_folder.resolve()) + ' does not exist. Function terminates.')
            raise Exception('No checkpoints found. Function terminates.')
        
        # check for existing configs
        if not config_json_path.exists():
            raise Exception('No config.json found. Function terminates.')
        
        # check for existing results
        if not result_json_path.exists():
            _print_message('No result.json is found. Assuming no existing checkpoints.')
            # print('No result.json is found. Assuming no existing checkpoints.')
    else:
        if save_folder.exists():
            raise Exception('Checkpoints found. Please delete the checkpoints or set run_from_checkpoints=True. Function terminates.')

    if not save_folder.exists():
        save_folder.mkdir()

    config = _init_config_dict(config_json_path, 'ctm', hyperparameters, search_space,
                               metrics, monitor, search_behaviour, search_rs, search_n_iter)
    result = _init_result_dict(result_json_path, monitor.value)

    _print_message('Search folder: {}'.format(save_folder))
    # print('Search folder: {}'.format(save_folder))

    # init
    best_model_path = result['best_model_checkpoint']
    best_metric_score = result['best_metric']
    best_model = _load_ctm_model(Path(best_model_path),
                                 result['best_hyperparameters']['ctm_params']) if best_model_path != "" else None
    best_hyperparameters = result['best_hyperparameters']


    _print_message('Best model checkpoint: {}'.format(best_model_path))
    _print_message('Best metric score: {}'.format(best_metric_score))
    _print_message('Best model: {}'.format(best_model))
    # print(f'Best model checkpoint: {best_model_path}')
    # print(f'Best metric score: {best_metric_score}')
    # print(f'Best model: {best_model}')

    # search
    # like bertopic, we create a temp dict for initiating the search space
    # then we apply sklearn parameter sampler / parameter grid to get the search space
    temp_search_space = {}
    for k, v in search_space.items():
        for kk, vv in v.items():
            temp_search_space[k + '__' + kk] = vv

    if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
        search_iterator = ParameterSampler(temp_search_space, search_n_iter, random_state=search_rs)
    elif search_behaviour == SEARCH_BEHAVIOUR.GRID_SEARCH:
        search_iterator = ParameterGrid(temp_search_space)

    print('\n')

    for search_space_dict in search_iterator:

        # unwrap the search space dict

        model_name = ''

        _sbert_params = {}
        _countvect_params = {}
        _ctm_params = {}

        for k, v in search_space_dict.items():
            if k.startswith('sbert_params'):
                _sbert_params[k.split('__')[1]] = v
                model_name += 'sb_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('countvect_params'):
                _countvect_params[k.split('__')[1]] = v
                model_name += 'cvect_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('ctm_params'):
                _ctm_params[k.split('__')[1]] = v
                model_name += 'ctm_' + k.split('__')[1] + '_' + str(v) + '_'

        model_name = model_name[:-1]     # remove the last '_'

        model_path = save_folder.joinpath(config['model'] + '_' + model_name)

        # check whether the model exists
        if model_path.exists():
            _print_message('Skipping current search space: {}'.format(search_space_dict))
            # print('Skipping current search space: {}'.format(search_space_dict))
            continue

    
        ##########
        # Training starts
        ##########

        _print_message('Current search space: {}'.format(search_space_dict))
        # print('Current search space: {}'.format(search_space_dict))

        sbert_params = deepcopy(config['sbert_params'])     # deepcopy just for safety (not messing up with the original config)
        countvect_params = deepcopy(config['countvect_params'])
        ctm_params = deepcopy(config['ctm_params'])

        sbert_params.update(_sbert_params)
        countvect_params.update(_countvect_params)
        ctm_params.update(_ctm_params)

        countvect_params['ngram_range'] = tuple(countvect_params['ngram_range'])     # convert list to tuple

        ##########
        # Preprocessing
        ##########

        # for re-producting the result (and inferencing)
        # we need to load the vectorizer, do the exact steps in preprocessing for creating bow
        # then create a CTMDataset for inferencing

        # create bow
        vectorizer = CountVectorizer(
            stop_words="english" if additional_stopwords is None else list(ENGLISH_STOP_WORDS.union(additional_stopwords)),
            analyzer='word',
            **countvect_params)
        
        vectorizer = vectorizer.fit(X_bow)
        vocab = vectorizer.get_feature_names_out()
        vocab_set = set(vocab)

        preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in vocab_set])
                            for doc in X_bow]
        
        text_for_contextual, text_for_bow = [], []
        X_tmp = []

        
        assert len(X_contextual) == len(preprocessed_docs_tmp), f'len(text_for_contextual): {len(X_contextual)}, len(preprocessed_docs_tmp): {len(preprocessed_docs_tmp)}'
        assert len(X) == len(X_contextual), f'len(X): {len(X)}, len(text_for_contextual): {len(X_contextual)}'
        
        # remove empty docs
        for i, (tfc, tfb) in enumerate(zip(X_contextual, preprocessed_docs_tmp)):
            if len(tfb) == 0 or len(tfc) == 0:
                continue
                
            text_for_contextual.append(tfc)
            text_for_bow.append(tfb)
            X_tmp.append(X[i])

        assert len(text_for_contextual) == len(text_for_bow), f'len(text_for_contextual_tmp): {len(text_for_contextual)}, len(text_for_bow_tmp): {len(text_for_bow)}'
        assert len(X_tmp) == len(text_for_contextual), f'len(X_tmp): {len(X_tmp)}, len(text_for_contextual_tmp): {len(text_for_contextual)}'


        train_bow_embeddings = vectorizer.transform(text_for_bow)

        
        # isntead of using default TopicModelDataPreparation(), build the dataset by referencing the source code of it
        # source code: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/utils/data_preparation.py
        # according to the source code, we only need to create the idx2token, then use the countvectorizer above to build the dataset
        idx2token = {k: v for k, v in zip(range(0, len(vocab)), vocab)}

        
        # create sbert embeddings
        if platform.system() == 'Linux' or platform.system() == 'Windows':
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            device = torch.device('mps')        # m-series machine
        

        # check existing embeddings
        # reuse them if found
        embeddings_path = save_folder.joinpath(f'embeddings_{sbert_params["model_name_or_path"]}.pkl')
        if embeddings_path.exists():
            with open(embeddings_path, 'rb') as f:
                embeddings = np.load(f)

            assert embeddings.shape[0] == len(text_for_contextual), f'embeddings.shape[0]: {embeddings.shape[0]}, len(text_for_contextual): {len(text_for_contextual)}'

            _print_message(f'Found existing sbert embeddings at {embeddings_path}. Reusing them.')
            # print(f'Found existing sbert embeddings at {embeddings_path}. Reusing them.')
        else:
            embeddings = bert_embeddings_from_list(text_for_contextual, **sbert_params, device=device)

            with open(embeddings_path, 'wb') as f:
                np.save(f, embeddings)
         


        # tp = TopicModelDataPreparation()
        # training_dataset = tp.fit(text_for_contextual=text_for_contextual, text_for_bow=text_for_bow, custom_embeddings=embeddings)
        training_dataset = CTMDataset(
            X_contextual=embeddings,
            X_bow=train_bow_embeddings,
            idx2token=idx2token,
            labels=None
        )
        
        # ctm

        ctm_params['bow_size'] = len(vocab)
        ctm_params['contextual_size'] = embeddings.shape[1]
        ctm_params['hidden_sizes'] = tuple(ctm_params['hidden_sizes'])     # convert list to tuple

        ctm = CombinedTM(**ctm_params)
        ctm.device = device
        ctm.fit(training_dataset, verbose=True)

        ##########
        # Training ends
        ##########

        ##########
        # Evaluation starts
        ##########

        # init data for gensim coherence model
        topic_words = _get_topics(ctm, k=10)
        topics = ctm.get_predicted_topics(training_dataset, n_samples=20)

        documents = pd.DataFrame({"Document": X_tmp,
                                "ID": range(len(X_tmp)),
                                "Topic": topics})
        
        docs_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
        texts = [doc.split() for doc in docs_per_topic.Document.values]
        
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # init octis format result for convenience
        result_octis = {}
        result_octis['topics'] = topic_words
        result_octis['topic-word-matrix'] = _get_topic_word_metrix(ctm)
        result_octis['topic-document-matrix'] = _get_topic_document_metrix(ctm, training_dataset, n_samples=20)

        _print_message('Compute evaluation metrics')
        # print('Compute evaluation metrics')

        metrics_score = dict()

        for metric in metrics:
            if metric in COHERENCE_MODEL_METRICS:
                coherencemodel = CoherenceModel(topics=topic_words, texts=texts, corpus=corpus, dictionary=dictionary, topn=10, coherence=metric.value)
                score = coherencemodel.get_coherence()
            elif metric == METRICS.TOPIC_DIVERSITY:
                score = compute_topic_diversity(result_octis, topk=10)
            elif metric == METRICS.INVERTED_RBO:
                score = compute_inverted_rbo(result_octis, topk=10)
            elif metric == METRICS.PAIRWISE_JACCARD_SIMILARITY:
                score = compute_pairwise_jaccard_similarity(result_octis, topk=10)
            else:
                raise Exception('Unknown metric: {}'.format(metric.value))

            metrics_score[metric.value] = score

            _print_message('Evaluation metric ({}): {}'.format(metric.value, score))
            # print(f'Evaluation metric ({metric.value}): {score}')

        monitor_score = metrics_score[monitor.value]

        ##########
        # Evaluation ends
        ##########

        ##########
        # Save models
        ##########

        if not model_path.exists():
            model_path.mkdir()
        
        if save_each_models:
            ctm.save(models_dir=model_path)

        # save the vectorizer, and the CV in TopicModelDataPreparation
        # then we can reproduce the result better
        vectorizer_path = model_path.joinpath('count_vectorizer.pkl')
        pickle.dump(vectorizer, open(vectorizer_path, 'wb'))
        

        ##########
        # Save models ends
        ##########

        ###########
        # Update result dict and json file
        ###########
            
        model_hyperparameters = {
            'sbert_params': sbert_params,
            'countvect_params': countvect_params,
            'ctm_params': ctm_params
        }

        if monitor_score > best_metric_score:
            best_metric_score = monitor_score
            best_model_path = model_path
            best_model = ctm
            best_hyperparameters = model_hyperparameters

        model_log_history = dict()
        model_log_history.update(metrics_score)
        model_log_history['model_name'] = model_name
        model_log_history['hyperparameters']  = model_hyperparameters

        result['best_metric'] = best_metric_score
        result['best_model_checkpoint'] = str(best_model_path)
        result['best_hyperparameters'] = best_hyperparameters
        result['log_history'].append(model_log_history)

        # save result
        with open(result_json_path, 'w') as f:
            json.dump(result, f, indent=2)

        _print_message('Saved result.json at: {}'.format(result_json_path))        
        # print('Saved result.json at:', result_json_path)
        print('\n\n')
    
    _print_message('Search ends')
    # print('Search ends')
    return best_model, best_model_path, best_hyperparameters


In [23]:
# load/create custom stopwords stored in a txt from dataset folder
from pathlib import Path

custom_stopwords_path = Path('../../dataset/topic_modelling/stopwords.txt')
custom_stowords_games_path = Path('../../dataset/topic_modelling/stopwords_games.txt')
game_name_list_path = Path('../../dataset/topic_modelling/game_name_list.txt')

with open(custom_stopwords_path, 'r') as f:
    custom_stopwords = f.read().splitlines()

with open(custom_stowords_games_path, 'r') as f:
    custom_stowords_games = f.read().splitlines()

with open(game_name_list_path, 'r') as f:
    game_name_list = f.read().splitlines()

# also include the stopword list from nltk
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')

custom_stopwords = custom_stopwords + custom_stowords_games + game_name_list + nltk_stopwords
custom_stopwords = set(custom_stopwords)

print(custom_stopwords)
print(len(custom_stopwords))


155928


In [28]:
# grid search / random search

# hyperparameters
sbert_params = _init_sbert_params(model_name_or_path='all-mpnet-base-v2')
countvect_params = _init_count_vectorizer_params(max_features=2000, ngram_range=[1,1])
ctm_params = _init_ctm_params(n_components=10, hidden_sizes=[100, 100], dropout=0.2, lr=2e-3, momentum=0.99, solver="adam", num_epochs=1)

search_space_dict = {
    'sbert_params': {
        'model_name_or_path': ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
    },
    # 'countvect_params': {
    #     'max_features' : [1500, 2000, 2500],
    #     'ngram_range': [[1, 1], [1, 2]]     # datatype is list as json does not support tuple
    # },
    'ctm_params':{
        'n_components': [50, 60],
        # 'hidden_sizes': [(100, 100), (200, 200), (100, 100, 100), (200, 200, 200)],
        # 'num_epochs':[50]
    }
}

search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
# search_behaviour = SEARCH_BEHAVIOUR.RANDOM_SEARCH

# training_datetime = datetime.now()
training_datetime = datetime(2024, 1, 29, 21, 29, 10)
training_folder = Path(f'ctm_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

best_model, best_model_path, best_hyperparameters = model_search(
    X,
    X_preprocessed,
    hyperparameters={
        'sbert_params': sbert_params,
        'countvect_params': countvect_params,
        'ctm_params': ctm_params
    },
    search_space=search_space_dict,
    save_folder=training_folder,
    metrics=[METRICS.C_NPMI, METRICS.C_V, METRICS.UMASS, METRICS.C_UCI, METRICS.TOPIC_DIVERSITY, METRICS.INVERTED_RBO, METRICS.PAIRWISE_JACCARD_SIMILARITY],
    monitor=METRICS.C_NPMI,
    save_each_models=True,
    run_from_checkpoints=True,
    search_behaviour=search_behaviour,
    # search_rs=42,
    # search_n_iter=50
)

2024-01-29 23:33:55,587 - Loaded existing config file from ctm_grid_search_20240129_212910/config.json
2024-01-29 23:33:55,587 - Hyperparameters and search space are consistent with the input parameters
2024-01-29 23:33:55,587 - Loaded existing result file from ctm_grid_search_20240129_212910/result.json
2024-01-29 23:33:55,587 - Search folder: ctm_grid_search_20240129_212910




2024-01-29 23:33:56,192 - Best model checkpoint: ctm_grid_search_20240129_212910/ctm_ctm_n_components_50_sb_model_name_or_path_all-mpnet-base-v2
2024-01-29 23:33:56,192 - Best metric score: -0.005837587367076017
2024-01-29 23:33:56,192 - Best model: <contextualized_topic_models.models.ctm.CombinedTM object at 0x7fed3a691370>


2024-01-29 23:33:56,193 - Current search space: {'ctm_params__n_components': 50, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}


Batches: 100%|██████████| 22822/22822 [04:02<00:00, 94.22it/s] 


Settings: 
                   N Components: 50
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.98
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [1/1]	 Seen Samples: [730240/730288]	Train Loss: 171.71782992711513	Time: 0:00:40.584913: : 1it [00:40, 40.59s/it]
100%|██████████| 11411/11411 [00:21<00:00, 540.95it/s]
100%|██████████| 11411/11411 [00:21<00:00, 541.97it/s]
100%|██████████| 11411/11411 [00:20<00:00, 560.56it/s]


2024-01-29 23:40:25,910 - Compute evaluation metrics
2024-01-29 23:42:43,473 - Evaluation metric (c_npmi): -0.002532868284855686
2024-01-29 23:47:23,436 - Evaluation metric (c_v): 0.44107600089532073
2024-01-29 23:47:23,661 - Evaluation metric (u_mass): -0.05721015061399102
2024-01-29 23:49:42,436 - Evaluation metric (c_uci): -0.5983010530625318
2024-01-29 23:49:42,436 - Evaluation metric (topic_diversity): 0.536
2024-01-29 23:49:42,477 - Evaluation metric (inverted_rbo): 0.9423969696176443
2024-01-29 23:49:42,477 - Evaluation metric (pairwise_jaccard_similarity): 0.03583501185320881


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-01-29 23:49:43,240 - Saved result.json at: ctm_grid_search_20240129_212910/result.json



2024-01-29 23:49:43,240 - Skipping current search space: {'ctm_params__n_components': 50, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-29 23:49:43,240 - Current search space: {'ctm_params__n_components': 60, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-29 23:49:57,940 - Found existing sbert embeddings at ctm_grid_search_20240129_212910/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 60
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9833333333333333
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Sav

Epoch: [1/1]	 Seen Samples: [730240/730288]	Train Loss: 176.373023384355	Time: 0:00:41.577665: : 1it [00:41, 41.58s/it]
100%|██████████| 11411/11411 [00:22<00:00, 513.48it/s]
100%|██████████| 11411/11411 [00:22<00:00, 511.03it/s]
100%|██████████| 11411/11411 [00:22<00:00, 511.45it/s]


2024-01-29 23:52:06,723 - Compute evaluation metrics
2024-01-29 23:54:29,865 - Evaluation metric (c_npmi): -0.004316873403745715
2024-01-29 23:59:30,886 - Evaluation metric (c_v): 0.42994207306751464
2024-01-29 23:59:31,155 - Evaluation metric (u_mass): -0.06738535198833447
2024-01-30 00:01:53,213 - Evaluation metric (c_uci): -0.5297482102067792
2024-01-30 00:01:53,214 - Evaluation metric (topic_diversity): 0.465
2024-01-30 00:01:53,274 - Evaluation metric (inverted_rbo): 0.9327779475363519
2024-01-30 00:01:53,275 - Evaluation metric (pairwise_jaccard_similarity): 0.046312033737914225


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-01-30 00:01:54,149 - Saved result.json at: ctm_grid_search_20240129_212910/result.json



2024-01-30 00:01:54,150 - Current search space: {'ctm_params__n_components': 60, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-30 00:02:10,954 - Found existing sbert embeddings at ctm_grid_search_20240129_212910/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 60
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9833333333333333
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [1/1]	 Seen Samples: [730240/730288]	Train Loss: 175.18746339562273	Time: 0:00:43.667483: : 1it [00:43, 43.67s/it]
100%|██████████| 11411/11411 [00:23<00:00, 484.01it/s]
100%|██████████| 11411/11411 [00:22<00:00, 502.75it/s]
100%|██████████| 11411/11411 [00:24<00:00, 473.92it/s]


2024-01-30 00:04:25,904 - Compute evaluation metrics
2024-01-30 00:06:49,082 - Evaluation metric (c_npmi): -0.006314201989779476
2024-01-30 00:11:40,496 - Evaluation metric (c_v): 0.4377658726008981
2024-01-30 00:11:40,904 - Evaluation metric (u_mass): -0.04553349820826513
2024-01-30 00:14:01,857 - Evaluation metric (c_uci): -0.6885305190639257
2024-01-30 00:14:01,858 - Evaluation metric (topic_diversity): 0.48
2024-01-30 00:14:01,915 - Evaluation metric (inverted_rbo): 0.9379828954741969
2024-01-30 00:14:01,916 - Evaluation metric (pairwise_jaccard_similarity): 0.03900584838035676


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-01-30 00:14:02,807 - Saved result.json at: ctm_grid_search_20240129_212910/result.json



2024-01-30 00:14:02,807 - Search ends


In [33]:
# load the model from disk to compare the results

search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
training_datetime = datetime(2024, 1, 29, 21, 29, 10)
training_folder = Path(f'ctm_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)


# load the embeddings
model_name_or_path = training_result['best_hyperparameters']['sbert_params']['model_name_or_path']
embeddings_path = training_folder.joinpath(f'embeddings_{model_name_or_path}.pkl')
with open(embeddings_path, 'rb') as f:
    embeddings = np.load(f)

best_model_path = training_result['best_model_checkpoint']
ctm_hyperparameters = training_result['best_hyperparameters']['ctm_params']
sbert_params = training_result['best_hyperparameters']['sbert_params']

# ctm_hyperparameters['bow_size'] = 2000
# ctm_hyperparameters['contextual_size'] = 768

best_model_loaded = _load_ctm_model(Path(best_model_path), ctm_hyperparameters)

# create the dataset on the fly
vectorizer = pickle.load(open(Path(best_model_path).joinpath('count_vectorizer.pkl'), 'rb'))


def create_ctm_dataset(X_contextual, X_bow, vectorizer, sbert_params):

    vocab = vectorizer.get_feature_names_out()
    vocab_set = set(vocab)

    preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in vocab_set])
                        for doc in X_bow]

    text_for_contextual, text_for_bow = [], []
    X_tmp = []


    assert len(X_contextual) == len(preprocessed_docs_tmp), f'len(text_for_contextual): {len(X_contextual)}, len(preprocessed_docs_tmp): {len(preprocessed_docs_tmp)}'
    assert len(X) == len(X_contextual), f'len(X): {len(X)}, len(text_for_contextual): {len(X_contextual)}'

    # remove empty docs
    for i, (tfc, tfb) in enumerate(zip(X_contextual, preprocessed_docs_tmp)):
        if len(tfb) == 0 or len(tfc) == 0:
            continue
            
        text_for_contextual.append(tfc)
        text_for_bow.append(tfb)
        X_tmp.append(X[i])

    assert len(text_for_contextual) == len(text_for_bow), f'len(text_for_contextual_tmp): {len(text_for_contextual)}, len(text_for_bow_tmp): {len(text_for_bow)}'
    assert len(X_tmp) == len(text_for_contextual), f'len(X_tmp): {len(X_tmp)}, len(text_for_contextual_tmp): {len(text_for_contextual)}'


    train_bow_embeddings = vectorizer.transform(text_for_bow)


    # isntead of using default TopicModelDataPreparation(), build the dataset by referencing the source code of it
    # source code: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/utils/data_preparation.py
    # according to the source code, we only need to create the idx2token, then use the countvectorizer above to build the dataset
    idx2token = {k: v for k, v in zip(range(0, len(vocab)), vocab)}

    # create sbert embeddings
    if platform.system() == 'Linux' or platform.system() == 'Windows':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        device = torch.device('mps')        # m-series machine
    

    # check existing embeddings
    # reuse them if found
    embeddings_path = training_folder.joinpath(f'embeddings_{sbert_params["model_name_or_path"]}.pkl')
    if embeddings_path.exists():
        with open(embeddings_path, 'rb') as f:
            embeddings = np.load(f)

        assert embeddings.shape[0] == len(text_for_contextual)

        _print_message(f'Found existing sbert embeddings at {embeddings_path}. Reusing them.')
        # print(f'Found existing sbert embeddings at {embeddings_path}. Reusing them.')
    else:
        embeddings = bert_embeddings_from_list(text_for_contextual, **sbert_params, device=device)

        with open(embeddings_path, 'wb') as f:
            np.save(f, embeddings)
        
    # embeddings = bert_embeddings_from_list(text_for_contextual, **sbert_params, device=device)


    # tp = TopicModelDataPreparation()
    # training_dataset = tp.fit(text_for_contextual=text_for_contextual, text_for_bow=text_for_bow, custom_embeddings=embeddings)
    dataset = CTMDataset(
        X_contextual=embeddings,
        X_bow=train_bow_embeddings,
        idx2token=idx2token,
        labels=None
    )

    return dataset

training_dataset = create_ctm_dataset(X, X_preprocessed, vectorizer, sbert_params)


doc_topic_dist_1 = best_model.get_doc_topic_distribution(training_dataset, n_samples=20)
doc_topic_dist_2 = best_model_loaded.get_doc_topic_distribution(training_dataset, n_samples=20)

  vectorizer = pickle.load(open(Path(best_model_path).joinpath('count_vectorizer.pkl'), 'rb'))


2024-01-30 00:31:25,572 - Found existing sbert embeddings at ctm_grid_search_20240129_212910/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.


100%|██████████| 11411/11411 [00:23<00:00, 492.96it/s]
100%|██████████| 11411/11411 [00:23<00:00, 489.78it/s]


In [34]:
most_likely_topic_1 = np.argmax(doc_topic_dist_1, axis=1)
most_likely_topic_2 = np.argmax(doc_topic_dist_2, axis=1)

np.testing.assert_allclose(most_likely_topic_1, most_likely_topic_2, rtol=1e-1, atol=1e-1)
np.testing.assert_allclose(doc_topic_dist_1, doc_topic_dist_2, rtol=1e-1, atol=1e-1)

# the difference is topic is because of sampling
# source code: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/models/ctm.py#L576

AssertionError: 
Not equal to tolerance rtol=0.1, atol=0.1

Mismatched elements: 333159 / 730288 (45.6%)
Max absolute difference: 49
Max relative difference: 48.
 x: array([ 5, 32,  8, ..., 17, 48, 13])
 y: array([ 5,  7,  8, ..., 47, 25, 48])

In [21]:
# grid search / random search

# hyperparameters
sbert_params = _init_sbert_params(model_name_or_path='all-mpnet-base-v2')
countvect_params = _init_count_vectorizer_params(max_features=2000, ngram_range=(1,1))
ctm_params = _init_ctm_params(n_components=10, hidden_sizes=(100, 100), dropout=0.2, lr=2e-3, momentum=0.99, solver="adam", num_epochs=50)

search_space_dict = {
    'sbert_params': {
        'model_name_or_path': ['all-MiniLM-L6-v2', 'all-mpnet-base-v2']
    },
    # 'countvect_params': {
    #     'max_features' : [1500, 2000, 2500],
    #     'ngram_range': [[1, 1], [1, 2]]     # datatype is list as json does not support tuple
    # },
    'ctm_params':{
        'n_components': [200, 190, 180, 170, 160, 150, 140, 130, 120, 110, 100, 90, 80, 70, 60, 50, 40 ,30, 20, 10],
        # 'hidden_sizes': [(100, 100), (200, 200), (100, 100, 100), (200, 200, 200)],
        # 'num_epochs':[ 50]
    }
}

search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
# search_behaviour = SEARCH_BEHAVIOUR.RANDOM_SEARCH

training_datetime = datetime.now()
# training_datetime = datetime(2024, 1, 23, 0, 21, 11)
training_folder = Path(f'ctm_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

best_model, best_model_path, best_hyperparameters = model_search(
    X,
    X_preprocessed,
    hyperparameters={
        'sbert_params': sbert_params,
        'countvect_params': countvect_params,
        'ctm_params': ctm_params
    },
    search_space=search_space_dict,
    save_folder=training_folder,
    metrics=[METRICS.C_NPMI, METRICS.C_V, METRICS.UMASS, METRICS.C_UCI, METRICS.TOPIC_DIVERSITY, METRICS.INVERTED_RBO, METRICS.PAIRWISE_JACCARD_SIMILARITY],
    monitor=METRICS.C_NPMI,
    save_each_models=True,
    run_from_checkpoints=False,
    search_behaviour=search_behaviour,
    # search_rs=42,
    # search_n_iter=50
)

2024-01-27 11:03:52,427 - Created config file at ctm_grid_search_20240127_110352/config.json
2024-01-27 11:03:52,427 - Search folder: ctm_grid_search_20240127_110352
2024-01-27 11:03:52,427 - Best model checkpoint: 
2024-01-27 11:03:52,427 - Best metric score: -inf
2024-01-27 11:03:52,427 - Best model: None


2024-01-27 11:03:52,428 - Current search space: {'ctm_params__n_components': 200, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}


Batches: 100%|██████████| 23185/23185 [04:06<00:00, 94.05it/s] 


Settings: 
                   N Components: 200
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.995
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 220.17320254229116	Time: 0:00:38.677430: : 50it [33:11, 39.83s/it]
100%|██████████| 11593/11593 [00:22<00:00, 526.83it/s]
100%|██████████| 11593/11593 [00:23<00:00, 497.64it/s]
100%|██████████| 11593/11593 [00:22<00:00, 520.13it/s]


2024-01-27 11:43:12,520 - Compute evaluation metrics
2024-01-27 11:44:14,004 - Evaluation metric (c_npmi): 0.003249066359927702
2024-01-27 11:46:26,692 - Evaluation metric (c_v): 0.4412687475824011
2024-01-27 11:46:27,054 - Evaluation metric (u_mass): -0.21001490136906856
2024-01-27 11:47:25,927 - Evaluation metric (c_uci): -0.5939265213358271
2024-01-27 11:47:25,928 - Evaluation metric (topic_diversity): 0.224
2024-01-27 11:47:26,538 - Evaluation metric (inverted_rbo): 0.9280700260201536
2024-01-27 11:47:26,549 - Evaluation metric (pairwise_jaccard_similarity): 0.04773621100615902




2024-01-27 11:47:29,377 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 11:47:29,378 - Current search space: {'ctm_params__n_components': 200, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}


Batches: 100%|██████████| 23185/23185 [10:09<00:00, 38.02it/s] 


Settings: 
                   N Components: 200
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.995
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 219.96361032858482	Time: 0:00:39.969078: : 50it [33:24, 40.08s/it]
100%|██████████| 11593/11593 [00:23<00:00, 496.42it/s]
100%|██████████| 11593/11593 [00:24<00:00, 476.21it/s]
100%|██████████| 11593/11593 [00:23<00:00, 496.22it/s]


2024-01-27 12:33:00,034 - Compute evaluation metrics
2024-01-27 12:34:00,557 - Evaluation metric (c_npmi): 0.00585487548970081
2024-01-27 12:36:00,833 - Evaluation metric (c_v): 0.44233981163612013
2024-01-27 12:36:01,267 - Evaluation metric (u_mass): -0.1777667596522363
2024-01-27 12:36:55,490 - Evaluation metric (c_uci): -0.5512625549240123
2024-01-27 12:36:55,491 - Evaluation metric (topic_diversity): 0.2295
2024-01-27 12:36:56,151 - Evaluation metric (inverted_rbo): 0.9290841284207559
2024-01-27 12:36:56,162 - Evaluation metric (pairwise_jaccard_similarity): 0.04727921719244396




2024-01-27 12:36:59,092 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 12:36:59,092 - Current search space: {'ctm_params__n_components': 190, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-27 12:37:05,885 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 190
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9947368421052631
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 216.32933130520965	Time: 0:00:43.164329: : 50it [34:20, 41.21s/it]
100%|██████████| 11593/11593 [00:24<00:00, 466.08it/s]
100%|██████████| 11593/11593 [00:24<00:00, 472.59it/s]
100%|██████████| 11593/11593 [00:24<00:00, 474.34it/s]


2024-01-27 13:13:06,795 - Compute evaluation metrics
2024-01-27 13:14:18,254 - Evaluation metric (c_npmi): 0.0028147078832305283
2024-01-27 13:16:42,041 - Evaluation metric (c_v): 0.43879147655731154
2024-01-27 13:16:42,559 - Evaluation metric (u_mass): -0.2273640292899888
2024-01-27 13:17:45,431 - Evaluation metric (c_uci): -0.6005368364053062
2024-01-27 13:17:45,432 - Evaluation metric (topic_diversity): 0.23894736842105263
2024-01-27 13:17:45,989 - Evaluation metric (inverted_rbo): 0.9272909389857624
2024-01-27 13:17:45,999 - Evaluation metric (pairwise_jaccard_similarity): 0.047077822946156374




2024-01-27 13:17:48,587 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 13:17:48,588 - Current search space: {'ctm_params__n_components': 190, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-27 13:17:56,188 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 190
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9947368421052631
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 216.80830972749172	Time: 0:00:45.669810: : 50it [38:04, 45.69s/it]
100%|██████████| 11593/11593 [00:24<00:00, 471.50it/s]
100%|██████████| 11593/11593 [00:24<00:00, 464.55it/s]
100%|██████████| 11593/11593 [00:24<00:00, 466.03it/s]


2024-01-27 13:57:40,290 - Compute evaluation metrics
2024-01-27 13:58:41,410 - Evaluation metric (c_npmi): 0.0026959935927890187
2024-01-27 14:00:47,130 - Evaluation metric (c_v): 0.4462237823951088
2024-01-27 14:00:47,535 - Evaluation metric (u_mass): -0.24630192005470894
2024-01-27 14:01:44,608 - Evaluation metric (c_uci): -0.6480796410380808
2024-01-27 14:01:44,608 - Evaluation metric (topic_diversity): 0.23578947368421052
2024-01-27 14:01:45,176 - Evaluation metric (inverted_rbo): 0.9272317022848479
2024-01-27 14:01:45,185 - Evaluation metric (pairwise_jaccard_similarity): 0.0495862074007638




2024-01-27 14:01:48,060 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 14:01:48,061 - Current search space: {'ctm_params__n_components': 180, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-27 14:01:54,712 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 180
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9944444444444445
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 212.75230029404287	Time: 0:00:45.096180: : 50it [37:25, 44.90s/it]
100%|██████████| 11593/11593 [00:25<00:00, 458.26it/s]
100%|██████████| 11593/11593 [00:25<00:00, 455.05it/s]
100%|██████████| 11593/11593 [00:24<00:00, 466.21it/s]


2024-01-27 14:41:02,525 - Compute evaluation metrics
2024-01-27 14:42:05,546 - Evaluation metric (c_npmi): -0.0002685750881433731
2024-01-27 14:44:20,360 - Evaluation metric (c_v): 0.4429744824053779
2024-01-27 14:44:20,840 - Evaluation metric (u_mass): -0.19099363772863023
2024-01-27 14:45:19,574 - Evaluation metric (c_uci): -0.6871458690018398
2024-01-27 14:45:19,575 - Evaluation metric (topic_diversity): 0.24444444444444444
2024-01-27 14:45:20,095 - Evaluation metric (inverted_rbo): 0.927681614062223
2024-01-27 14:45:20,105 - Evaluation metric (pairwise_jaccard_similarity): 0.04877385633146518




2024-01-27 14:45:22,699 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 14:45:22,699 - Current search space: {'ctm_params__n_components': 180, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-27 14:45:30,598 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 180
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9944444444444445
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 213.14562467808062	Time: 0:00:45.299070: : 50it [38:01, 45.64s/it]
100%|██████████| 11593/11593 [00:26<00:00, 443.03it/s]
100%|██████████| 11593/11593 [00:26<00:00, 440.22it/s]
100%|██████████| 11593/11593 [00:25<00:00, 452.24it/s]


2024-01-27 15:25:18,674 - Compute evaluation metrics
2024-01-27 15:26:22,239 - Evaluation metric (c_npmi): 0.0126109700743842
2024-01-27 15:28:39,699 - Evaluation metric (c_v): 0.45170013267521164
2024-01-27 15:28:40,077 - Evaluation metric (u_mass): -0.1720349236325532
2024-01-27 15:29:39,803 - Evaluation metric (c_uci): -0.39925659749925174
2024-01-27 15:29:39,804 - Evaluation metric (topic_diversity): 0.25
2024-01-27 15:29:40,331 - Evaluation metric (inverted_rbo): 0.9329783633180961
2024-01-27 15:29:40,340 - Evaluation metric (pairwise_jaccard_similarity): 0.043026209247791665




2024-01-27 15:29:43,012 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 15:29:43,012 - Current search space: {'ctm_params__n_components': 170, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-27 15:29:50,555 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 170
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9941176470588236
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 209.00021957544888	Time: 0:00:43.998562: : 50it [36:55, 44.31s/it]
100%|██████████| 11593/11593 [00:25<00:00, 454.36it/s]
100%|██████████| 11593/11593 [00:25<00:00, 460.98it/s]
100%|██████████| 11593/11593 [00:25<00:00, 451.00it/s]


2024-01-27 16:08:29,211 - Compute evaluation metrics
2024-01-27 16:09:33,983 - Evaluation metric (c_npmi): 0.003138402771408203
2024-01-27 16:11:52,874 - Evaluation metric (c_v): 0.4355960507233355
2024-01-27 16:11:53,162 - Evaluation metric (u_mass): -0.27783156169037154
2024-01-27 16:12:54,084 - Evaluation metric (c_uci): -0.5939228513103504
2024-01-27 16:12:54,085 - Evaluation metric (topic_diversity): 0.2570588235294118
2024-01-27 16:12:54,539 - Evaluation metric (inverted_rbo): 0.9313441858791865
2024-01-27 16:12:54,547 - Evaluation metric (pairwise_jaccard_similarity): 0.044773921628352725




2024-01-27 16:12:57,043 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 16:12:57,043 - Current search space: {'ctm_params__n_components': 170, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-27 16:13:06,391 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 170
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9941176470588236
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 209.32712364460042	Time: 0:00:44.574925: : 50it [37:35, 45.11s/it]
100%|██████████| 11593/11593 [00:25<00:00, 452.00it/s]
100%|██████████| 11593/11593 [00:25<00:00, 446.49it/s]
100%|██████████| 11593/11593 [00:24<00:00, 482.76it/s]


2024-01-27 16:52:25,191 - Compute evaluation metrics
2024-01-27 16:53:29,775 - Evaluation metric (c_npmi): 0.007661369975939954
2024-01-27 16:55:42,043 - Evaluation metric (c_v): 0.44599108011817434
2024-01-27 16:55:42,617 - Evaluation metric (u_mass): -0.199714907937403
2024-01-27 16:56:41,917 - Evaluation metric (c_uci): -0.4841503056906246
2024-01-27 16:56:41,918 - Evaluation metric (topic_diversity): 0.24294117647058824
2024-01-27 16:56:42,381 - Evaluation metric (inverted_rbo): 0.9233948684342022
2024-01-27 16:56:42,390 - Evaluation metric (pairwise_jaccard_similarity): 0.05084556036883425




2024-01-27 16:56:44,805 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 16:56:44,805 - Current search space: {'ctm_params__n_components': 160, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-27 16:56:52,124 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 160
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.99375
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 205.22765205546526	Time: 0:00:44.043418: : 50it [37:55, 45.51s/it]
100%|██████████| 11593/11593 [00:25<00:00, 448.27it/s]
100%|██████████| 11593/11593 [00:25<00:00, 448.66it/s]
100%|██████████| 11593/11593 [00:25<00:00, 458.62it/s]


2024-01-27 17:36:31,789 - Compute evaluation metrics
2024-01-27 17:37:41,058 - Evaluation metric (c_npmi): 0.006657078236035977
2024-01-27 17:40:10,913 - Evaluation metric (c_v): 0.44482231993038235
2024-01-27 17:40:11,365 - Evaluation metric (u_mass): -0.14664880654506457
2024-01-27 17:41:18,692 - Evaluation metric (c_uci): -0.49733576896274406
2024-01-27 17:41:18,693 - Evaluation metric (topic_diversity): 0.27125
2024-01-27 17:41:19,101 - Evaluation metric (inverted_rbo): 0.9316980771116655
2024-01-27 17:41:19,109 - Evaluation metric (pairwise_jaccard_similarity): 0.045665780842519745




2024-01-27 17:41:21,392 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 17:41:21,393 - Current search space: {'ctm_params__n_components': 160, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-27 17:41:28,598 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 160
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.99375
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 205.48428097653175	Time: 0:00:45.091466: : 50it [37:58, 45.57s/it]
100%|██████████| 11593/11593 [00:26<00:00, 437.98it/s]
100%|██████████| 11593/11593 [00:28<00:00, 407.36it/s]
100%|██████████| 11593/11593 [00:26<00:00, 442.23it/s]


2024-01-27 18:21:15,906 - Compute evaluation metrics
2024-01-27 18:22:21,414 - Evaluation metric (c_npmi): 0.004409391581641898
2024-01-27 18:24:38,913 - Evaluation metric (c_v): 0.4517322116286371
2024-01-27 18:24:39,409 - Evaluation metric (u_mass): -0.13661992121313768
2024-01-27 18:25:41,335 - Evaluation metric (c_uci): -0.5797123313766954
2024-01-27 18:25:41,335 - Evaluation metric (topic_diversity): 0.265625
2024-01-27 18:25:41,750 - Evaluation metric (inverted_rbo): 0.9327270162801612
2024-01-27 18:25:41,757 - Evaluation metric (pairwise_jaccard_similarity): 0.04543654143732815




2024-01-27 18:25:43,993 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 18:25:43,994 - Current search space: {'ctm_params__n_components': 150, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-27 18:25:50,875 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 150
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9933333333333333
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 201.6491088294588	Time: 0:00:44.153146: : 50it [37:06, 44.52s/it] 
100%|██████████| 11593/11593 [00:29<00:00, 392.00it/s]
100%|██████████| 11593/11593 [00:25<00:00, 449.21it/s]
100%|██████████| 11593/11593 [00:25<00:00, 452.20it/s]


2024-01-27 19:04:46,282 - Compute evaluation metrics
2024-01-27 19:06:01,551 - Evaluation metric (c_npmi): 0.009944474686864995
2024-01-27 19:08:36,389 - Evaluation metric (c_v): 0.452119644053273
2024-01-27 19:08:36,749 - Evaluation metric (u_mass): -0.13510186139657648
2024-01-27 19:09:47,629 - Evaluation metric (c_uci): -0.4391700508727239
2024-01-27 19:09:47,629 - Evaluation metric (topic_diversity): 0.2866666666666667
2024-01-27 19:09:47,989 - Evaluation metric (inverted_rbo): 0.93532089499921
2024-01-27 19:09:47,996 - Evaluation metric (pairwise_jaccard_similarity): 0.04292962101468555




2024-01-27 19:09:50,166 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 19:09:50,166 - Current search space: {'ctm_params__n_components': 150, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-27 19:09:59,114 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 150
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9933333333333333
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 201.61835552463043	Time: 0:00:44.820154: : 50it [37:38, 45.18s/it]
100%|██████████| 11593/11593 [00:26<00:00, 443.21it/s]
100%|██████████| 11593/11593 [00:25<00:00, 447.77it/s]
100%|██████████| 11593/11593 [00:25<00:00, 456.91it/s]


2024-01-27 19:49:23,676 - Compute evaluation metrics
2024-01-27 19:50:43,732 - Evaluation metric (c_npmi): 0.00291236376743721
2024-01-27 19:53:33,775 - Evaluation metric (c_v): 0.4390209908295437
2024-01-27 19:53:34,169 - Evaluation metric (u_mass): -0.1456518102665069
2024-01-27 19:54:49,110 - Evaluation metric (c_uci): -0.6004061299767649
2024-01-27 19:54:49,111 - Evaluation metric (topic_diversity): 0.2886666666666667
2024-01-27 19:54:49,477 - Evaluation metric (inverted_rbo): 0.93406394417589
2024-01-27 19:54:49,484 - Evaluation metric (pairwise_jaccard_similarity): 0.04316561152058663




2024-01-27 19:54:51,804 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 19:54:51,804 - Current search space: {'ctm_params__n_components': 140, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-27 19:54:59,497 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 140
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9928571428571429
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 197.96573887239907	Time: 0:00:44.575554: : 50it [37:03, 44.46s/it]
100%|██████████| 11593/11593 [00:25<00:00, 451.51it/s]
100%|██████████| 11593/11593 [00:25<00:00, 451.33it/s]
100%|██████████| 11593/11593 [00:25<00:00, 461.86it/s]


2024-01-27 20:33:47,244 - Compute evaluation metrics
2024-01-27 20:35:10,605 - Evaluation metric (c_npmi): 0.007552109322970551
2024-01-27 20:38:11,449 - Evaluation metric (c_v): 0.44628150076103096
2024-01-27 20:38:11,818 - Evaluation metric (u_mass): -0.14105084444498986
2024-01-27 20:39:32,918 - Evaluation metric (c_uci): -0.5147381047228741
2024-01-27 20:39:32,918 - Evaluation metric (topic_diversity): 0.29642857142857143
2024-01-27 20:39:33,238 - Evaluation metric (inverted_rbo): 0.9349411716870584
2024-01-27 20:39:33,244 - Evaluation metric (pairwise_jaccard_similarity): 0.042428262041088016




2024-01-27 20:39:35,394 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 20:39:35,395 - Current search space: {'ctm_params__n_components': 140, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-27 20:39:44,093 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 140
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9928571428571429
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 198.1317318626072	Time: 0:00:44.463487: : 50it [37:35, 45.11s/it] 
100%|██████████| 11593/11593 [00:25<00:00, 449.62it/s]
100%|██████████| 11593/11593 [00:25<00:00, 450.75it/s]
100%|██████████| 11593/11593 [00:25<00:00, 448.84it/s]


2024-01-27 21:19:05,036 - Compute evaluation metrics
2024-01-27 21:20:22,367 - Evaluation metric (c_npmi): 0.004503848946243294
2024-01-27 21:23:08,338 - Evaluation metric (c_v): 0.4466287047190621
2024-01-27 21:23:08,698 - Evaluation metric (u_mass): -0.10531100157447167
2024-01-27 21:24:21,351 - Evaluation metric (c_uci): -0.5464665117829534
2024-01-27 21:24:21,352 - Evaluation metric (topic_diversity): 0.30214285714285716
2024-01-27 21:24:21,667 - Evaluation metric (inverted_rbo): 0.9377920648328352
2024-01-27 21:24:21,673 - Evaluation metric (pairwise_jaccard_similarity): 0.04110381810371165




2024-01-27 21:24:23,764 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 21:24:23,764 - Current search space: {'ctm_params__n_components': 130, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-27 21:24:31,310 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 130
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9923076923076923
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 194.20576647909203	Time: 0:00:45.105224: : 50it [37:50, 45.41s/it]
100%|██████████| 11593/11593 [00:25<00:00, 450.98it/s]
100%|██████████| 11593/11593 [00:25<00:00, 451.09it/s]
100%|██████████| 11593/11593 [00:25<00:00, 452.42it/s]


2024-01-27 22:04:06,856 - Compute evaluation metrics
2024-01-27 22:05:26,887 - Evaluation metric (c_npmi): 0.011838775302480984
2024-01-27 22:08:16,437 - Evaluation metric (c_v): 0.4566583078299743
2024-01-27 22:08:16,893 - Evaluation metric (u_mass): -0.10376791726502635
2024-01-27 22:09:33,586 - Evaluation metric (c_uci): -0.3968864331032356
2024-01-27 22:09:33,586 - Evaluation metric (topic_diversity): 0.3253846153846154
2024-01-27 22:09:33,862 - Evaluation metric (inverted_rbo): 0.9390511299885884
2024-01-27 22:09:33,866 - Evaluation metric (pairwise_jaccard_similarity): 0.040610593960504655




2024-01-27 22:09:35,764 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 22:09:35,764 - Current search space: {'ctm_params__n_components': 130, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-27 22:09:44,668 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 130
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9923076923076923
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 194.21720055566317	Time: 0:00:46.961741: : 50it [39:24, 47.28s/it]
100%|██████████| 11593/11593 [00:25<00:00, 446.75it/s]
100%|██████████| 11593/11593 [00:26<00:00, 441.97it/s]
100%|██████████| 11593/11593 [00:25<00:00, 448.19it/s]


2024-01-27 22:50:53,692 - Compute evaluation metrics
2024-01-27 22:52:11,812 - Evaluation metric (c_npmi): 0.008752352701725374
2024-01-27 22:54:59,100 - Evaluation metric (c_v): 0.452429401835685
2024-01-27 22:54:59,465 - Evaluation metric (u_mass): -0.1054279957035347
2024-01-27 22:56:15,784 - Evaluation metric (c_uci): -0.46951352823845166
2024-01-27 22:56:15,784 - Evaluation metric (topic_diversity): 0.30615384615384617
2024-01-27 22:56:16,062 - Evaluation metric (inverted_rbo): 0.934559879013479
2024-01-27 22:56:16,067 - Evaluation metric (pairwise_jaccard_similarity): 0.04351479387811302




2024-01-27 22:56:18,117 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 22:56:18,118 - Current search space: {'ctm_params__n_components': 120, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-27 22:56:24,917 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 120
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9916666666666667
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 190.43876021117322	Time: 0:00:45.399940: : 50it [37:45, 45.32s/it]
100%|██████████| 11593/11593 [00:25<00:00, 452.01it/s]
100%|██████████| 11593/11593 [00:25<00:00, 449.12it/s]
100%|██████████| 11593/11593 [00:25<00:00, 449.71it/s]


2024-01-27 23:35:54,608 - Compute evaluation metrics
2024-01-27 23:37:15,208 - Evaluation metric (c_npmi): 0.009947005022972376
2024-01-27 23:40:12,223 - Evaluation metric (c_v): 0.4554942365012313
2024-01-27 23:40:12,605 - Evaluation metric (u_mass): -0.09362488060127704
2024-01-27 23:41:31,505 - Evaluation metric (c_uci): -0.45590104291781747
2024-01-27 23:41:31,505 - Evaluation metric (topic_diversity): 0.3425
2024-01-27 23:41:31,746 - Evaluation metric (inverted_rbo): 0.9473482160305422
2024-01-27 23:41:31,750 - Evaluation metric (pairwise_jaccard_similarity): 0.035896845898602654




2024-01-27 23:41:33,531 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-27 23:41:33,532 - Current search space: {'ctm_params__n_components': 120, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-27 23:41:40,500 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 120
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9916666666666667
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 190.5304427953158	Time: 0:00:44.835153: : 50it [37:55, 45.52s/it] 
100%|██████████| 11593/11593 [00:26<00:00, 439.65it/s]
100%|██████████| 11593/11593 [00:26<00:00, 439.89it/s]
100%|██████████| 11593/11593 [00:26<00:00, 441.73it/s]


2024-01-28 00:21:21,365 - Compute evaluation metrics
2024-01-28 00:22:51,538 - Evaluation metric (c_npmi): 0.008649406727014594
2024-01-28 00:26:05,309 - Evaluation metric (c_v): 0.4528259565137237
2024-01-28 00:26:05,657 - Evaluation metric (u_mass): -0.1758025083729229
2024-01-28 00:27:32,363 - Evaluation metric (c_uci): -0.4909286717708376
2024-01-28 00:27:32,363 - Evaluation metric (topic_diversity): 0.335
2024-01-28 00:27:32,593 - Evaluation metric (inverted_rbo): 0.9389529610892827
2024-01-28 00:27:32,597 - Evaluation metric (pairwise_jaccard_similarity): 0.04157833656266111




2024-01-28 00:27:34,338 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 00:27:34,338 - Current search space: {'ctm_params__n_components': 110, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 00:27:41,408 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 110
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.990909090909091
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 186.90725705309356	Time: 0:00:44.545167: : 50it [37:38, 45.17s/it]
100%|██████████| 11593/11593 [00:26<00:00, 445.09it/s]
100%|██████████| 11593/11593 [00:25<00:00, 448.56it/s]
100%|██████████| 11593/11593 [00:25<00:00, 451.64it/s]


2024-01-28 01:07:04,671 - Compute evaluation metrics
2024-01-28 01:08:38,136 - Evaluation metric (c_npmi): 0.010079173896185874
2024-01-28 01:11:56,777 - Evaluation metric (c_v): 0.45567028712677343
2024-01-28 01:11:57,152 - Evaluation metric (u_mass): -0.09685304752149364
2024-01-28 01:13:25,924 - Evaluation metric (c_uci): -0.45620729384331593
2024-01-28 01:13:25,925 - Evaluation metric (topic_diversity): 0.3709090909090909
2024-01-28 01:13:26,121 - Evaluation metric (inverted_rbo): 0.9464656486188717
2024-01-28 01:13:26,125 - Evaluation metric (pairwise_jaccard_similarity): 0.03589837462491976




2024-01-28 01:13:27,679 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 01:13:27,679 - Current search space: {'ctm_params__n_components': 110, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 01:13:36,501 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 110
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.990909090909091
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 186.9646589879417	Time: 0:00:46.133843: : 50it [39:12, 47.05s/it] 
100%|██████████| 11593/11593 [00:26<00:00, 434.91it/s]
100%|██████████| 11593/11593 [00:26<00:00, 434.29it/s]
100%|██████████| 11593/11593 [00:26<00:00, 438.17it/s]


2024-01-28 01:54:36,720 - Compute evaluation metrics
2024-01-28 01:56:09,311 - Evaluation metric (c_npmi): 0.007435766615815168
2024-01-28 01:59:25,943 - Evaluation metric (c_v): 0.4552384848869664
2024-01-28 01:59:26,275 - Evaluation metric (u_mass): -0.10733032748297443
2024-01-28 02:00:55,217 - Evaluation metric (c_uci): -0.5370328655194282
2024-01-28 02:00:55,217 - Evaluation metric (topic_diversity): 0.3654545454545455
2024-01-28 02:00:55,413 - Evaluation metric (inverted_rbo): 0.9432422605738425
2024-01-28 02:00:55,416 - Evaluation metric (pairwise_jaccard_similarity): 0.03682480062797048




2024-01-28 02:00:57,075 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 02:00:57,075 - Current search space: {'ctm_params__n_components': 100, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 02:01:04,767 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 100
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.99
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 183.37435316595233	Time: 0:00:45.336885: : 50it [37:35, 45.12s/it]
100%|██████████| 11593/11593 [00:25<00:00, 446.07it/s]
100%|██████████| 11593/11593 [00:26<00:00, 443.98it/s]
100%|██████████| 11593/11593 [00:25<00:00, 453.14it/s]


2024-01-28 02:40:25,258 - Compute evaluation metrics
2024-01-28 02:42:08,659 - Evaluation metric (c_npmi): 0.008259462005277371
2024-01-28 02:45:54,399 - Evaluation metric (c_v): 0.45291684060037324
2024-01-28 02:45:54,707 - Evaluation metric (u_mass): -0.09871513978296088
2024-01-28 02:47:36,906 - Evaluation metric (c_uci): -0.4734779621932831
2024-01-28 02:47:36,907 - Evaluation metric (topic_diversity): 0.384
2024-01-28 02:47:37,063 - Evaluation metric (inverted_rbo): 0.9500593220883752
2024-01-28 02:47:37,066 - Evaluation metric (pairwise_jaccard_similarity): 0.03313038946681923




2024-01-28 02:47:38,500 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 02:47:38,500 - Current search space: {'ctm_params__n_components': 100, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 02:47:47,841 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 100
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.99
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 183.68306913369273	Time: 0:00:46.947237: : 50it [39:04, 46.88s/it]
100%|██████████| 11593/11593 [00:26<00:00, 432.39it/s]
100%|██████████| 11593/11593 [00:26<00:00, 436.76it/s]
100%|██████████| 11593/11593 [00:26<00:00, 445.75it/s]


2024-01-28 03:28:37,015 - Compute evaluation metrics
2024-01-28 03:30:17,610 - Evaluation metric (c_npmi): 0.0030865823824956706
2024-01-28 03:33:52,314 - Evaluation metric (c_v): 0.449709964928254
2024-01-28 03:33:52,791 - Evaluation metric (u_mass): -0.09119564577133078
2024-01-28 03:35:31,801 - Evaluation metric (c_uci): -0.622975468003582
2024-01-28 03:35:31,801 - Evaluation metric (topic_diversity): 0.367
2024-01-28 03:35:31,958 - Evaluation metric (inverted_rbo): 0.9448858994171919
2024-01-28 03:35:31,961 - Evaluation metric (pairwise_jaccard_similarity): 0.038971204616199906




2024-01-28 03:35:33,411 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 03:35:33,412 - Current search space: {'ctm_params__n_components': 90, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 03:35:40,352 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 90
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9888888888888889
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 179.7223285059011	Time: 0:00:45.285765: : 50it [37:40, 45.20s/it] 
100%|██████████| 11593/11593 [00:25<00:00, 452.18it/s]
100%|██████████| 11593/11593 [00:25<00:00, 447.53it/s]
100%|██████████| 11593/11593 [00:25<00:00, 458.26it/s]


2024-01-28 04:15:03,971 - Compute evaluation metrics
2024-01-28 04:16:48,808 - Evaluation metric (c_npmi): 0.014787606676714008
2024-01-28 04:20:33,545 - Evaluation metric (c_v): 0.4629881639552706
2024-01-28 04:20:33,842 - Evaluation metric (u_mass): -0.07502465852678028
2024-01-28 04:22:16,064 - Evaluation metric (c_uci): -0.3699743965406955
2024-01-28 04:22:16,064 - Evaluation metric (topic_diversity): 0.4222222222222222
2024-01-28 04:22:16,193 - Evaluation metric (inverted_rbo): 0.9495886999993882
2024-01-28 04:22:16,195 - Evaluation metric (pairwise_jaccard_similarity): 0.033196206367899946




2024-01-28 04:22:17,469 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 04:22:17,470 - Current search space: {'ctm_params__n_components': 90, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 04:22:25,338 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 90
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9888888888888889
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 179.7098732889727	Time: 0:00:45.753550: : 50it [38:13, 45.86s/it] 
100%|██████████| 11593/11593 [00:26<00:00, 437.65it/s]
100%|██████████| 11593/11593 [00:26<00:00, 440.51it/s]
100%|██████████| 11593/11593 [00:25<00:00, 446.69it/s]


2024-01-28 05:02:24,238 - Compute evaluation metrics
2024-01-28 05:04:13,606 - Evaluation metric (c_npmi): 0.013248819737738136
2024-01-28 05:08:03,971 - Evaluation metric (c_v): 0.4625382892553399
2024-01-28 05:08:04,289 - Evaluation metric (u_mass): -0.08938634546967163
2024-01-28 05:09:49,241 - Evaluation metric (c_uci): -0.40816060487963873
2024-01-28 05:09:49,241 - Evaluation metric (topic_diversity): 0.4122222222222222
2024-01-28 05:09:49,369 - Evaluation metric (inverted_rbo): 0.9515586838413983
2024-01-28 05:09:49,371 - Evaluation metric (pairwise_jaccard_similarity): 0.0322489499684804




2024-01-28 05:09:50,637 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 05:09:50,638 - Current search space: {'ctm_params__n_components': 80, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 05:09:58,187 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 80
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9875
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 176.00692573031367	Time: 0:00:45.137561: : 50it [37:22, 44.85s/it]
100%|██████████| 11593/11593 [00:25<00:00, 452.65it/s]
100%|██████████| 11593/11593 [00:25<00:00, 447.17it/s]
100%|██████████| 11593/11593 [00:25<00:00, 452.99it/s]


2024-01-28 05:49:04,611 - Compute evaluation metrics
2024-01-28 05:51:15,488 - Evaluation metric (c_npmi): 0.015689721963306612
2024-01-28 05:55:51,151 - Evaluation metric (c_v): 0.4722795852329897
2024-01-28 05:55:51,461 - Evaluation metric (u_mass): -0.06299091575592557
2024-01-28 05:57:57,983 - Evaluation metric (c_uci): -0.3589707154069258
2024-01-28 05:57:57,983 - Evaluation metric (topic_diversity): 0.4675
2024-01-28 05:57:58,084 - Evaluation metric (inverted_rbo): 0.9606923867943693
2024-01-28 05:57:58,086 - Evaluation metric (pairwise_jaccard_similarity): 0.026649720009539232




2024-01-28 05:57:59,259 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 05:57:59,260 - Current search space: {'ctm_params__n_components': 80, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 05:58:08,732 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 80
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9875
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 176.18192584865088	Time: 0:00:46.356713: : 50it [39:51, 47.82s/it]
100%|██████████| 11593/11593 [00:26<00:00, 441.60it/s]
100%|██████████| 11593/11593 [00:26<00:00, 440.74it/s]
100%|██████████| 11593/11593 [00:26<00:00, 445.11it/s]


2024-01-28 06:39:43,509 - Compute evaluation metrics
2024-01-28 06:41:44,683 - Evaluation metric (c_npmi): 0.016601746034899473
2024-01-28 06:46:05,465 - Evaluation metric (c_v): 0.4648063043467001
2024-01-28 06:46:05,742 - Evaluation metric (u_mass): -0.07981585153644297
2024-01-28 06:48:07,397 - Evaluation metric (c_uci): -0.34885392181388014
2024-01-28 06:48:07,398 - Evaluation metric (topic_diversity): 0.4625
2024-01-28 06:48:07,501 - Evaluation metric (inverted_rbo): 0.9584132616169394
2024-01-28 06:48:07,502 - Evaluation metric (pairwise_jaccard_similarity): 0.02914696392402459




2024-01-28 06:48:08,620 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 06:48:08,620 - Current search space: {'ctm_params__n_components': 70, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 06:48:15,677 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 70
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9857142857142858
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 172.57163110579023	Time: 0:00:47.283663: : 50it [38:25, 46.11s/it]
100%|██████████| 11593/11593 [00:25<00:00, 447.18it/s]
100%|██████████| 11593/11593 [00:25<00:00, 447.80it/s]
100%|██████████| 11593/11593 [00:25<00:00, 454.60it/s]


2024-01-28 07:28:22,754 - Compute evaluation metrics
2024-01-28 07:30:37,994 - Evaluation metric (c_npmi): 0.012788925365824195
2024-01-28 07:35:32,067 - Evaluation metric (c_v): 0.4656832973562144
2024-01-28 07:35:32,391 - Evaluation metric (u_mass): -0.06174982972116871
2024-01-28 07:37:46,052 - Evaluation metric (c_uci): -0.4682526232982201
2024-01-28 07:37:46,053 - Evaluation metric (topic_diversity): 0.5
2024-01-28 07:37:46,131 - Evaluation metric (inverted_rbo): 0.964952863204333
2024-01-28 07:37:46,132 - Evaluation metric (pairwise_jaccard_similarity): 0.025488458600010144




2024-01-28 07:37:47,115 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 07:37:47,115 - Current search space: {'ctm_params__n_components': 70, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 07:37:53,910 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 70
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9857142857142858
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 173.26729311656754	Time: 0:00:46.850668: : 50it [38:11, 45.82s/it]
100%|██████████| 11593/11593 [00:26<00:00, 443.06it/s]
100%|██████████| 11593/11593 [00:25<00:00, 446.41it/s]
100%|██████████| 11593/11593 [00:25<00:00, 455.16it/s]


2024-01-28 08:17:48,795 - Compute evaluation metrics
2024-01-28 08:20:02,463 - Evaluation metric (c_npmi): 0.012141050392125461
2024-01-28 08:24:38,500 - Evaluation metric (c_v): 0.4583426082482775
2024-01-28 08:24:38,751 - Evaluation metric (u_mass): -0.07642289787139823
2024-01-28 08:26:49,954 - Evaluation metric (c_uci): -0.3624580474342351
2024-01-28 08:26:49,954 - Evaluation metric (topic_diversity): 0.45285714285714285
2024-01-28 08:26:50,032 - Evaluation metric (inverted_rbo): 0.9541271344466135
2024-01-28 08:26:50,034 - Evaluation metric (pairwise_jaccard_similarity): 0.03272634681207296




2024-01-28 08:26:51,016 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 08:26:51,017 - Current search space: {'ctm_params__n_components': 60, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 08:26:57,991 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 60
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9833333333333333
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 169.18969624152754	Time: 0:00:44.470281: : 50it [37:27, 44.95s/it]
100%|██████████| 11593/11593 [00:25<00:00, 446.93it/s]
100%|██████████| 11593/11593 [00:25<00:00, 450.87it/s]
100%|██████████| 11593/11593 [00:25<00:00, 454.34it/s]


2024-01-28 09:06:09,005 - Compute evaluation metrics
2024-01-28 09:08:37,955 - Evaluation metric (c_npmi): 0.021325348784404455
2024-01-28 09:13:49,018 - Evaluation metric (c_v): 0.4798417017768547
2024-01-28 09:13:49,282 - Evaluation metric (u_mass): -0.044537571907959796
2024-01-28 09:16:13,547 - Evaluation metric (c_uci): -0.26535297086947834
2024-01-28 09:16:13,547 - Evaluation metric (topic_diversity): 0.5583333333333333
2024-01-28 09:16:13,605 - Evaluation metric (inverted_rbo): 0.9693226438508313
2024-01-28 09:16:13,606 - Evaluation metric (pairwise_jaccard_similarity): 0.02191251223496605




2024-01-28 09:16:14,477 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 09:16:14,477 - Current search space: {'ctm_params__n_components': 60, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 09:16:23,335 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 60
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9833333333333333
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 169.26694841101877	Time: 0:00:45.564327: : 50it [38:42, 46.45s/it]
100%|██████████| 11593/11593 [00:26<00:00, 436.25it/s]
100%|██████████| 11593/11593 [00:26<00:00, 434.99it/s]
100%|██████████| 11593/11593 [00:25<00:00, 446.31it/s]


2024-01-28 09:56:51,115 - Compute evaluation metrics
2024-01-28 09:59:18,347 - Evaluation metric (c_npmi): 0.01757924387616224
2024-01-28 10:04:21,748 - Evaluation metric (c_v): 0.47811465886427484
2024-01-28 10:04:21,999 - Evaluation metric (u_mass): -0.06613303351478839
2024-01-28 10:06:46,302 - Evaluation metric (c_uci): -0.3593112616349833
2024-01-28 10:06:46,302 - Evaluation metric (topic_diversity): 0.5683333333333334
2024-01-28 10:06:46,359 - Evaluation metric (inverted_rbo): 0.9689217480188378
2024-01-28 10:06:46,360 - Evaluation metric (pairwise_jaccard_similarity): 0.023518638661368517




2024-01-28 10:06:47,197 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 10:06:47,197 - Current search space: {'ctm_params__n_components': 50, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 10:06:55,175 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 50
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.98
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 165.51378765000896	Time: 0:00:44.041330: : 50it [37:55, 45.51s/it]
100%|██████████| 11593/11593 [00:25<00:00, 448.26it/s]
100%|██████████| 11593/11593 [00:25<00:00, 448.92it/s]
100%|██████████| 11593/11593 [00:25<00:00, 448.17it/s]


2024-01-28 10:46:34,207 - Compute evaluation metrics
2024-01-28 10:48:58,666 - Evaluation metric (c_npmi): 0.021287658633403157
2024-01-28 10:53:57,122 - Evaluation metric (c_v): 0.48816348424490985
2024-01-28 10:53:57,355 - Evaluation metric (u_mass): -0.06144843729323909
2024-01-28 10:56:18,878 - Evaluation metric (c_uci): -0.3452859459334087
2024-01-28 10:56:18,878 - Evaluation metric (topic_diversity): 0.65
2024-01-28 10:56:18,919 - Evaluation metric (inverted_rbo): 0.977740078307895
2024-01-28 10:56:18,919 - Evaluation metric (pairwise_jaccard_similarity): 0.01700826423073218




2024-01-28 10:56:19,612 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 10:56:19,613 - Current search space: {'ctm_params__n_components': 50, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 10:56:28,726 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 50
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.98
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 165.62203979887084	Time: 0:00:48.570740: : 50it [39:37, 47.56s/it]
100%|██████████| 11593/11593 [00:30<00:00, 384.23it/s]
100%|██████████| 11593/11593 [00:26<00:00, 437.80it/s]
100%|██████████| 11593/11593 [00:26<00:00, 442.66it/s]


2024-01-28 11:37:55,441 - Compute evaluation metrics
2024-01-28 11:40:18,398 - Evaluation metric (c_npmi): 0.02020645838461285
2024-01-28 11:45:11,664 - Evaluation metric (c_v): 0.48678744784219935
2024-01-28 11:45:11,928 - Evaluation metric (u_mass): -0.06063438811638329
2024-01-28 11:47:31,643 - Evaluation metric (c_uci): -0.36012905052221383
2024-01-28 11:47:31,644 - Evaluation metric (topic_diversity): 0.62
2024-01-28 11:47:31,684 - Evaluation metric (inverted_rbo): 0.9757485393385481
2024-01-28 11:47:31,684 - Evaluation metric (pairwise_jaccard_similarity): 0.01857894378460501




2024-01-28 11:47:32,387 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 11:47:32,388 - Current search space: {'ctm_params__n_components': 40, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 11:47:40,025 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 40
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.975
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 162.1715994374846	Time: 0:00:43.950842: : 50it [38:40, 46.40s/it] 
100%|██████████| 11593/11593 [00:25<00:00, 453.77it/s]
100%|██████████| 11593/11593 [00:25<00:00, 448.05it/s]
100%|██████████| 11593/11593 [00:25<00:00, 455.62it/s]


2024-01-28 12:28:01,583 - Compute evaluation metrics
2024-01-28 12:30:19,645 - Evaluation metric (c_npmi): 0.01923167289102349
2024-01-28 12:34:52,783 - Evaluation metric (c_v): 0.4925892641664767
2024-01-28 12:34:53,020 - Evaluation metric (u_mass): -0.06600760036995149
2024-01-28 12:37:10,105 - Evaluation metric (c_uci): -0.4163977340560979
2024-01-28 12:37:10,106 - Evaluation metric (topic_diversity): 0.7725
2024-01-28 12:37:10,131 - Evaluation metric (inverted_rbo): 0.9849644050015202
2024-01-28 12:37:10,132 - Evaluation metric (pairwise_jaccard_similarity): 0.011265027828495302




2024-01-28 12:37:10,702 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 12:37:10,702 - Current search space: {'ctm_params__n_components': 40, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 12:37:19,658 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 40
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.975
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 162.30783833200312	Time: 0:00:46.967971: : 50it [38:30, 46.21s/it]
100%|██████████| 11593/11593 [00:25<00:00, 449.16it/s]
100%|██████████| 11593/11593 [00:26<00:00, 443.96it/s]
100%|██████████| 11593/11593 [00:25<00:00, 448.90it/s]


2024-01-28 13:17:32,840 - Compute evaluation metrics
2024-01-28 13:19:52,076 - Evaluation metric (c_npmi): 0.021814815808285298
2024-01-28 13:24:24,359 - Evaluation metric (c_v): 0.48831057361807345
2024-01-28 13:24:24,599 - Evaluation metric (u_mass): -0.047960227596935426
2024-01-28 13:26:42,979 - Evaluation metric (c_uci): -0.3668609830701309
2024-01-28 13:26:42,979 - Evaluation metric (topic_diversity): 0.735
2024-01-28 13:26:43,005 - Evaluation metric (inverted_rbo): 0.9787119553117399
2024-01-28 13:26:43,006 - Evaluation metric (pairwise_jaccard_similarity): 0.014987836579286906




2024-01-28 13:26:43,599 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 13:26:43,600 - Current search space: {'ctm_params__n_components': 30, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 13:26:51,052 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 30
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9666666666666667
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 158.98490022905455	Time: 0:00:44.528774: : 50it [37:27, 44.96s/it]
100%|██████████| 11593/11593 [00:25<00:00, 456.27it/s]
100%|██████████| 11593/11593 [00:25<00:00, 450.97it/s]
100%|██████████| 11593/11593 [00:25<00:00, 449.98it/s]


2024-01-28 14:06:00,857 - Compute evaluation metrics
2024-01-28 14:08:13,369 - Evaluation metric (c_npmi): 0.009324197722328896
2024-01-28 14:12:20,836 - Evaluation metric (c_v): 0.47989809491092605
2024-01-28 14:12:21,060 - Evaluation metric (u_mass): -0.04281504024144053
2024-01-28 14:14:29,145 - Evaluation metric (c_uci): -0.6675739439661913
2024-01-28 14:14:29,145 - Evaluation metric (topic_diversity): 0.7533333333333333
2024-01-28 14:14:29,160 - Evaluation metric (inverted_rbo): 0.9776998343207718
2024-01-28 14:14:29,160 - Evaluation metric (pairwise_jaccard_similarity): 0.016996927053508658




2024-01-28 14:14:29,620 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 14:14:29,621 - Current search space: {'ctm_params__n_components': 30, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 14:14:38,426 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 30
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9666666666666667
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 159.15694896297836	Time: 0:00:53.188012: : 50it [39:50, 47.81s/it]
100%|██████████| 11593/11593 [00:25<00:00, 445.97it/s]
100%|██████████| 11593/11593 [00:25<00:00, 448.12it/s]
100%|██████████| 11593/11593 [00:25<00:00, 450.23it/s]


2024-01-28 14:56:10,998 - Compute evaluation metrics
2024-01-28 14:58:21,136 - Evaluation metric (c_npmi): 0.005082233041581769
2024-01-28 15:02:30,746 - Evaluation metric (c_v): 0.47966558224263384
2024-01-28 15:02:31,097 - Evaluation metric (u_mass): -0.041009462077637025
2024-01-28 15:04:40,432 - Evaluation metric (c_uci): -0.707289573193784
2024-01-28 15:04:40,433 - Evaluation metric (topic_diversity): 0.7833333333333333
2024-01-28 15:04:40,447 - Evaluation metric (inverted_rbo): 0.9800083707568638
2024-01-28 15:04:40,447 - Evaluation metric (pairwise_jaccard_similarity): 0.013656386593292744




2024-01-28 15:04:40,906 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 15:04:40,907 - Current search space: {'ctm_params__n_components': 20, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 15:04:48,127 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 156.2635471134699	Time: 0:00:44.355721: : 50it [37:57, 45.55s/it] 
100%|██████████| 11593/11593 [00:25<00:00, 447.34it/s]
100%|██████████| 11593/11593 [00:25<00:00, 453.25it/s]
100%|██████████| 11593/11593 [00:26<00:00, 437.76it/s]


2024-01-28 15:44:27,850 - Compute evaluation metrics
2024-01-28 15:46:27,813 - Evaluation metric (c_npmi): 0.0075043854126487924
2024-01-28 15:50:02,927 - Evaluation metric (c_v): 0.4890566852285483
2024-01-28 15:50:03,134 - Evaluation metric (u_mass): -0.023278045022765925
2024-01-28 15:51:58,768 - Evaluation metric (c_uci): -0.5506559266165285
2024-01-28 15:51:58,768 - Evaluation metric (topic_diversity): 0.825
2024-01-28 15:51:58,774 - Evaluation metric (inverted_rbo): 0.9753498791097744
2024-01-28 15:51:58,775 - Evaluation metric (pairwise_jaccard_similarity): 0.01778340429147012




2024-01-28 15:51:59,085 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 15:51:59,086 - Current search space: {'ctm_params__n_components': 20, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
2024-01-28 15:52:07,705 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 156.24237632224936	Time: 0:00:52.082210: : 50it [39:22, 47.25s/it]
100%|██████████| 11593/11593 [00:26<00:00, 441.36it/s]
100%|██████████| 11593/11593 [00:26<00:00, 435.69it/s]
100%|██████████| 11593/11593 [00:26<00:00, 440.91it/s]


2024-01-28 16:33:15,168 - Compute evaluation metrics
2024-01-28 16:35:20,322 - Evaluation metric (c_npmi): -0.008396958506467772
2024-01-28 16:39:06,582 - Evaluation metric (c_v): 0.4615869721208029
2024-01-28 16:39:06,790 - Evaluation metric (u_mass): -0.04097863135991818
2024-01-28 16:41:08,925 - Evaluation metric (c_uci): -0.9781041441952185
2024-01-28 16:41:08,926 - Evaluation metric (topic_diversity): 0.8
2024-01-28 16:41:08,933 - Evaluation metric (inverted_rbo): 0.9756071903160151
2024-01-28 16:41:08,933 - Evaluation metric (pairwise_jaccard_similarity): 0.020163217641627294




2024-01-28 16:41:09,263 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 16:41:09,263 - Current search space: {'ctm_params__n_components': 10, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
2024-01-28 16:41:17,069 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 154.47202147081688	Time: 0:00:59.508436: : 50it [45:11, 54.22s/it]
100%|██████████| 11593/11593 [00:31<00:00, 366.44it/s]
100%|██████████| 11593/11593 [00:31<00:00, 362.75it/s]
100%|██████████| 11593/11593 [00:30<00:00, 381.30it/s]


2024-01-28 17:28:35,102 - Compute evaluation metrics
2024-01-28 17:30:39,995 - Evaluation metric (c_npmi): -0.0037132237654912206
2024-01-28 17:34:11,099 - Evaluation metric (c_v): 0.46088252809920444
2024-01-28 17:34:11,260 - Evaluation metric (u_mass): 1.0467537947527848e-12
2024-01-28 17:36:10,350 - Evaluation metric (c_uci): -0.5561309502903538
2024-01-28 17:36:10,351 - Evaluation metric (topic_diversity): 0.82
2024-01-28 17:36:10,352 - Evaluation metric (inverted_rbo): 0.963981010104127
2024-01-28 17:36:10,352 - Evaluation metric (pairwise_jaccard_similarity): 0.03378635659337414
2024-01-28 17:36:10,530 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 17:36:10,530 - Current search space: {'ctm_params__n_components': 10, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}




2024-01-28 17:36:19,833 - Found existing sbert embeddings at ctm_grid_search_20240127_110352/embeddings_all-mpnet-base-v2.pkl. Reusing them.
Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [50/50]	 Seen Samples: [37094400/37095650]	Train Loss: 154.46821695660788	Time: 0:00:55.603949: : 50it [45:13, 54.27s/it]
100%|██████████| 11593/11593 [00:27<00:00, 417.41it/s]
100%|██████████| 11593/11593 [00:27<00:00, 425.56it/s]
100%|██████████| 11593/11593 [00:26<00:00, 429.85it/s]


2024-01-28 18:23:24,548 - Compute evaluation metrics
2024-01-28 18:25:26,946 - Evaluation metric (c_npmi): -0.004019146639864242
2024-01-28 18:28:52,267 - Evaluation metric (c_v): 0.4466766404567052
2024-01-28 18:28:52,487 - Evaluation metric (u_mass): 1.0300871267071605e-12
2024-01-28 18:30:48,488 - Evaluation metric (c_uci): -0.5064477273098839
2024-01-28 18:30:48,488 - Evaluation metric (topic_diversity): 0.85
2024-01-28 18:30:48,497 - Evaluation metric (inverted_rbo): 0.9734065552092064
2024-01-28 18:30:48,498 - Evaluation metric (pairwise_jaccard_similarity): 0.025351127105513068




2024-01-28 18:30:48,709 - Saved result.json at: ctm_grid_search_20240127_110352/result.json



2024-01-28 18:30:48,709 - Search ends


In [20]:
# load the best model from the checkpoints

search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
training_datetime = datetime(2024, 1, 27, 11, 3, 52)
training_folder = Path(f'ctm_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)


# load the embeddings
model_name_or_path = training_result['best_hyperparameters']['sbert_params']['model_name_or_path']
embeddings_path = training_folder.joinpath(f'embeddings_{model_name_or_path}.pkl')
with open(embeddings_path, 'rb') as f:
    embeddings = np.load(f)

best_model_path = training_result['best_model_checkpoint']
ctm_hyperparameters = training_result['best_hyperparameters']['ctm_params']

ctm_hyperparameters['bow_size'] = 2000
ctm_hyperparameters['contextual_size'] = 768

# best_model_path = [p for p in Path(best_model_path).iterdir() if p.is_dir()][-1]        # get the last dir (since there 's only one dir inside) -> get the only dir

best_model = _load_ctm_model(Path(best_model_path), ctm_hyperparameters, epoch=99)
topic_lists = best_model.get_topic_lists(k=10)

TypeError: _load_ctm_model() got an unexpected keyword argument 'epoch'

In [72]:
topic_lists[8]

['content',
 'update',
 'game',
 'hour',
 'new',
 'one',
 'time',
 'developer',
 'play',
 'still']

inference / evaluation

In [62]:
# create bow

countvect_params = training_result['best_hyperparameters']['countvect_params']
countvect_params['ngram_range'] = tuple(countvect_params['ngram_range'])     # convert list to tuple

vectorizer = CountVectorizer(**countvect_params, max_features=2000)
vectorizer.fit_transform(X_preprocessed)
temp_vocabulary = set(vectorizer.get_feature_names_out())

preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in temp_vocabulary])
                    for doc in X_preprocessed]
text_for_bow = preprocessed_docs_tmp

tp = TopicModelDataPreparation()

training_dataset = tp.fit(text_for_contextual=X, text_for_bow=text_for_bow, custom_embeddings=embeddings)

In [63]:
training_dataset.X_bow.todense().shape

(75499, 2000)

In [73]:
doc_topic_distribution = best_model.get_doc_topic_distribution(training_dataset, n_samples=20)

top_docs = best_model.get_top_documents_per_topic_id(X, doc_topic_distribution, 8, k=10)

  0%|          | 0/1180 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [76]:
for tt in [t[0] for t in top_docs]:
    print(tt)

LEGIT THE BEST GAME EVER BETTER THAN MINECRAFT, SO MANY THINGS TO DO!!!!!!! BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!BEST GAME EVER!!!
Just realized I've had this game for years and never reviewed it. Which is just horrible of me. Of all the games in my steam library this deserves a review. I've enjoyed this 

In [50]:
# within the topic lists (the words)
# find out common words between topics

from itertools import combinations

topic_list = best_model.get_topic_lists(k=10)

common_words = set()
for topic1, topic2 in combinations(topic_list, 2):
    common_words.update(set(topic1).intersection(set(topic2)))

common_words = list(common_words)
common_words.sort()
common_words

['boss',
 'buy',
 'check',
 'content',
 'course',
 'explore',
 'felt',
 'friend',
 'fun',
 'game',
 'get',
 'great',
 'hour',
 'item',
 'like',
 'list',
 'love',
 'mention',
 'minecraft',
 'new',
 'number',
 'one',
 'play',
 'recommend',
 'say',
 'special',
 'spoil',
 'still',
 'stuff',
 'terrarium',
 'time',
 'update',
 'well',
 'world']