CTM Training (hyperparameters grid/random search)

Combined TM

In [1]:
import pandas as pd
import numpy as np


from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
# from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

import nltk
import os

from pathlib import Path
import json
from datetime import datetime

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload

In [3]:
import sys

sys.path.append('../')

In [4]:
# load the dataset

%autoreload 2
from dataset_loader import GENRES, load_dataset

genre = GENRES.INDIE
dataset = load_dataset(genre)

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 725737 entries, 25636 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         725737 non-null  int64 
 1   app_id        725737 non-null  int64 
 2   app_name      725737 non-null  object
 3   review_text   725737 non-null  object
 4   review_score  725737 non-null  int64 
 5   review_votes  725737 non-null  int64 
 6   genre_id      725737 non-null  object
 7   category_id   725737 non-null  object
dtypes: int64(4), object(4)
memory usage: 49.8+ MB


In [5]:
# data preprocessing

import sys
sys.path.append('../../sa/')

%autoreload 2
import str_cleaning_functions

# copied from lda_demo_gridsearch.ipynb
def cleaning(df, review):
    """Run the full cleaning pipeline on column ``review`` of ``df`` (in place).

    The steps are applied one after another, each overwriting ``df[review]``:
    remove_links, remove_links2, clean, deEmojify, remove_non_letters,
    lower-casing, unify_whitespaces, remove_stopword, unify_whitespaces.
    All helpers come from ``str_cleaning_functions``. Nothing is returned.

    copied from lda_demo_gridsearch.ipynb
    """
    pipeline = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.remove_non_letters,
        lambda text: text.lower(),
        str_cleaning_functions.unify_whitespaces,
        str_cleaning_functions.remove_stopword,
        # whitespace is unified a second time, after the stopword removal
        str_cleaning_functions.unify_whitespaces,
    )

    for step in pipeline:
        df[review] = df[review].apply(step)

# def cleaning_strlist(str_list):
#     str_list = list(map(lambda x: clean(x), str_list))
#     str_list = list(map(lambda x: deEmojify(x), str_list))

#     str_list = list(map(lambda x: x.lower(), str_list))
#     str_list = list(map(lambda x: remove_num(x), str_list))
#     str_list = list(map(lambda x: unify_whitespaces(x), str_list))

#     str_list = list(map(lambda x: _deaccent(x), str_list))
#     str_list = list(map(lambda x: remove_non_alphabets(x), str_list))
#     str_list = list(map(lambda x: remove_stopword(x), str_list))
#     return str_list

# copied from bert_demo_gridsearch.ipynb
def cleaning_little(df, review):
    """Run the light cleaning pipeline on column ``review`` of ``df`` (in place).

    Applies, in order: remove_links, remove_links2, clean, deEmojify and
    unify_whitespaces from ``str_cleaning_functions``. Unlike ``cleaning`` it
    does no lower-casing, non-letter removal or stopword removal.
    Nothing is returned.

    copied from bert_demo_gridsearch.ipynb
    """
    pipeline = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.unify_whitespaces,
    )

    for step in pipeline:
        df[review] = df[review].apply(step)


In [6]:
# create a copy of the dataset, as we need both untouched text and cleaned text

dataset_preprocessed = dataset.copy()

In [7]:
cleaning(dataset_preprocessed, 'review_text')
cleaning_little(dataset, 'review_text')

In [8]:
X_preprocessed_temp = dataset_preprocessed['review_text'].values
X_temp = dataset['review_text'].values

In [9]:
assert X_temp.shape == X_preprocessed_temp.shape, "X_temp and X_preprocessed_temp should have the same shape. Found: {} and {}".format(X_temp.shape, X_preprocessed_temp.shape)
assert len(X_temp) == len(X_preprocessed_temp), "X_temp and X_preprocessed_temp should have the same length. Found: {} and {}".format(len(X_temp), len(X_preprocessed_temp))

In [10]:
# remove docs with 0 len

# keep a (raw, preprocessed) pair only when BOTH versions are non-empty,
# so the two lists stay aligned index by index
kept_pairs = [
    (raw_doc, cleaned_doc)
    for raw_doc, cleaned_doc in zip(list(X_temp), list(X_preprocessed_temp))
    if len(raw_doc) != 0 and len(cleaned_doc) != 0
]

X = [raw_doc for raw_doc, _cleaned in kept_pairs]
X_preprocessed = [cleaned_doc for _raw, cleaned_doc in kept_pairs]

In [11]:
assert len(X) == len(X_preprocessed), "X and X_preprocessed should have the same length. Found: {} and {}".format(len(X), len(X_preprocessed))

Apply lemmatizing to the preprocessed dataset as well (for BoW)

The lemmatization function is identical to the one used for LDA.

In [12]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    """Map a PoS tag to the matching ``wordnet`` PoS constant by its first letter.

    Tags starting with 'J' -> wordnet.ADJ, 'V' -> wordnet.VERB,
    'N' -> wordnet.NOUN, 'R' -> wordnet.ADV. Any other tag returns None.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )

    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)

    # if none -> treated as noun by the caller
    return None
    
def lemmatization(text):
    """Tokenize ``text``, PoS-tag it with nltk and lemmatize every token.

    Each token is lemmatized by the module-level ``lemma`` object using the
    wordnet PoS returned by ``get_wordnet_pos``; tokens whose tag has no
    wordnet counterpart are lemmatized as nouns.

    Returns the list of lemmatized tokens, in their original order.
    """
    tokens = nltk.word_tokenize(text)

    lemmatized = []
    for word, treebank_tag in nltk.pos_tag(tokens):
        # convert from nltk Penn Treebank tag to wordnet tag
        wn_pos = get_wordnet_pos(treebank_tag)
        if not wn_pos:
            wn_pos = wordnet.NOUN
        lemmatized.append(lemma.lemmatize(word, pos=wn_pos))

    return lemmatized

In [13]:
# lemmatize every preprocessed document and glue its tokens back into one string
X_preprocessed = [' '.join(lemmatization(doc)) for doc in X_preprocessed]

Training

In [14]:
# copy from: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/utils/data_preparation.py#L44
# call bert_embeddings_from_list() to produce embeddings by ourself

import warnings
from sentence_transformers import SentenceTransformer
import torch
import platform


# Pick the torch device used for the embeddings and for CTM training.
if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    # m-series mac machine -> 'mps'. Other machines that reach this branch
    # (or torch builds without the MPS backend) must fall back to the CPU,
    # otherwise the first tensor moved to the device fails.
    mps_backend = getattr(torch.backends, 'mps', None)
    mps_usable = mps_backend is not None and mps_backend.is_available()
    device = torch.device('mps' if mps_usable else 'cpu')

print(device)


# the functions were moved to ctm_dataset_creation.py
from ctm_dataset_creation import bert_embeddings_from_list

cuda


In [15]:
from gensim.models import CoherenceModel
from copy import deepcopy

from sklearn.model_selection import ParameterGrid, ParameterSampler

sys.path.append('../')

from eval_metrics import compute_inverted_rbo, compute_topic_diversity, compute_pairwise_jaccard_similarity, \
                        METRICS, SEARCH_BEHAVIOUR, COHERENCE_MODEL_METRICS

In [16]:
def _print_message(message):
    '''Print message with a timestamp in front of it

    Timestamp format: YYYY-MM-DD HH:MM:SS,mmm
    '''
    print(f'{datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]} - {message}')

In [17]:
# init params

def _init_count_vectorizer_params(
        max_features=2000,
        ngram_range=(1,1)
):
    params_dict = {}
    params_dict['max_features'] = max_features
    params_dict['ngram_range'] = ngram_range

    return params_dict

def _init_sbert_params(
    model_name_or_path='all-mpnet-base-v2'
):
    params_dict = {}
    params_dict['model_name_or_path'] = model_name_or_path

    return params_dict

# params are copied from source code of CTM: https://github.com/MilaNLProc/contextualized-topic-models/blob/master/contextualized_topic_models/models/ctm.py#L131
# commented-out params are ones we do not plan to fine-tune (not significant to our project)
def _init_ctm_params(
        # bow_size,
        # contextual_size,
        # inference_type="combined",
        n_components=10,
        # model_type="prodLDA",
        hidden_sizes=[100, 100],        # pass as list as json does not support tuple
        # activation="softplus",
        dropout=0.2,
        # learn_priors=True,
        # batch_size=64,
        lr=2e-3,
        momentum=0.99,
        solver="adam",
        num_epochs=100,
        # reduce_on_plateau=False,      # only valid if there's a testing data (seems no need to havbe label, just partition a testing dataset with train_test_split()))
        # num_data_loader_workers=mp.cpu_count(),
        # label_size=0,
        # loss_weights=None
):
    params_dict = {}
    # params_dict['bow_size'] = bow_size                        # decided by the count vectorizer params (max_features)
    # params_dict['contextual_size'] = contextual_size          # decided by the sbert model
    # params_dict['inference_type'] = inference_type
    params_dict['n_components'] = n_components
    # params_dict['model_type'] = model_type
    params_dict['hidden_sizes'] = hidden_sizes
    # params_dict['activation'] = activation
    params_dict['dropout'] = dropout
    # params_dict['learn_priors'] = learn_priors
    # params_dict['batch_size'] = batch_size
    params_dict['lr'] = lr
    params_dict['momentum'] = momentum
    params_dict['solver'] = solver
    params_dict['num_epochs'] = num_epochs

    return params_dict

In [18]:
def _json_normalise(value):
    '''Return ``value`` as it looks after a JSON save/load round trip (e.g. tuples become lists).'''
    return json.loads(json.dumps(value))


def _init_config_dict(config_path:Path, model_name:str, hyperparameters:dict, search_space_dict:dict, 
                      metrics:list[METRICS], monitor:METRICS,
                      search_behaviour:SEARCH_BEHAVIOUR, search_rs:int, search_n_iter:int):
    '''Create the search config file, or load an existing one and validate it.

    If ``config_path`` does not exist, a config dict is built from the inputs,
    written to ``config_path`` as JSON and returned. Hyperparameters that appear
    in ``search_space_dict`` are removed from the fixed parameter groups, since
    they are being searched rather than fixed.

    If ``config_path`` exists, it is loaded and checked against the inputs
    (model name, metrics, monitor, search behaviour, random-search settings,
    fixed hyperparameters and search-space keys). Any mismatch raises
    ``AssertionError``.

    Parameters
    ----------
    config_path : path of the config JSON file
    model_name : stored under config['model']
    hyperparameters : dict with the keys 'sbert_params', 'countvect_params'
        and 'ctm_params'; each value is passed as keyword arguments to the
        matching ``_init_*_params`` function
    search_space_dict : dict mapping a parameter group name to a dict of
        {hyperparameter name: candidate values}; groups may be omitted
    metrics, monitor : enum members; their ``.value`` is stored
    search_behaviour : enum member; ``search_rs`` and ``search_n_iter`` are only
        stored / checked for SEARCH_BEHAVIOUR.RANDOM_SEARCH

    Returns
    -------
    dict : the newly created or the loaded config
    '''
    param_groups = ('sbert_params', 'countvect_params', 'ctm_params')

    fixed_params = {
        'sbert_params': _init_sbert_params(**hyperparameters['sbert_params']),
        'countvect_params': _init_count_vectorizer_params(**hyperparameters['countvect_params']),
        'ctm_params': _init_ctm_params(**hyperparameters['ctm_params']),
    }
    metric_values = [m.value for m in metrics]

    if not config_path.exists():
        config = {}
        config['model'] = model_name
        for group in param_groups:
            config[group] = fixed_params[group]

        # hyperparameters that are searched over are not "fixed": drop them
        for group in param_groups:
            for k in search_space_dict.get(group, {}).keys():
                fixed_params[group].pop(k, '')     # add a default value to avoid key error

        config['search_space'] = search_space_dict
        config['metrics'] = metric_values
        config['monitor'] = monitor.value

        config['search_behaviour'] = search_behaviour.value
        if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
            config['search_rs'] = search_rs
            config['search_n_iter'] = search_n_iter

        # serialise first, so a non-serialisable value cannot leave a
        # truncated config.json behind on disk
        serialised = json.dumps(config, indent=2)
        with open(config_path, 'w') as f:
            f.write(serialised)

        _print_message('Created config file at {}'.format(config_path))
        return config

    with open(config_path, 'r') as f:
        config = json.load(f)

    # check whether the input params are consistent with the config file
    assert config['model'] == model_name, 'input model_name is not consistent with the config["model"]'
    assert config['metrics'] == metric_values, 'input metrics is not consistent with config["metrics"]'
    assert config['monitor'] == monitor.value, 'input monitor is not consistent with config["monitor"]'
    assert config['search_behaviour'] == search_behaviour.value, 'input search_behaviour is not consistent with config["search_behaviour"]'
    if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
        assert config['search_rs'] == search_rs, 'input search_rs is not consistent with config["search_rs"]'
        assert config['search_n_iter'] == search_n_iter, 'input search_n_iter is not consistent with config["search_n_iter"]'

    # check whether the hyperparameters are consistent with the config file
    for group in param_groups:
        given = fixed_params[group]
        stored = config[group]

        assert stored.keys() <= given.keys(), f'existing config["{group}"] contains additional hyperparameters'

        for key in given.keys() & stored.keys():
            # the stored value went through JSON (tuple -> list), so normalise
            # the input the same way before comparing
            assert _json_normalise(given[key]) == stored[key], f'existing config["{group}"] contains different hyperparameters'

    # check whether the search_space is consistent with the config file
    # NOTE: only the hyperparameter names are compared, not their candidate values
    for group in param_groups:
        if group in config['search_space']:
            stored_space = config['search_space'][group]
            # a group missing from the input counts as "no keys", so a
            # mismatch is reported by the assertion instead of a KeyError
            given_space = search_space_dict.get(group, {})
            assert stored_space.keys() == given_space.keys(), f'input search_space_dict["{group}"] contains different hyperparameter keys than existing config["search_space"]["{group}"]'

    _print_message('Loaded existing config file from {}'.format(config_path))
    _print_message('Hyperparameters and search space are consistent with the input parameters')

    return config


In [19]:
def _init_result_dict(result_path:Path, monitor_type:str):
    if not result_path.exists():
        result = {}

        result['best_metric'] = -float('inf')
        result['best_model_checkpoint'] = ""
        result['best_hyperparameters'] = dict()
        result["monitor_type"] = monitor_type
        result["log_history"] = list()

    else:
        with open(result_path, 'r') as f:
            result = json.load(f)

        assert result['monitor_type'] == monitor_type

        _print_message('Loaded existing result file from {}'.format(result_path))
        # print('Loaded existing result file from {}'.format(result_path))
    
    return result

In [20]:
from ctm_utils import _load_ctm_model

# their implementation is moved to utils script as it is also used in eval script.

In [21]:
from ctm_utils import _get_topics, _get_topic_word_metrix, _get_topic_document_metrix

# their implementation is moved to utils script as it may be used in eval script.

In [None]:
import pickle
from gensim import corpora
# from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
# from contextualized_topic_models.datasets.dataset import CTMDataset

%autoreload 2
from ctm_dataset_creation import create_ctm_dataset

def model_search(X_contextual, X_bow, hyperparameters:dict, search_space:dict, save_folder:Path,
                 additional_stopwords:list[str]=None,
                 metrics:list[METRICS]=[METRICS.C_NPMI], monitor:METRICS=METRICS.C_NPMI, 
                 save_each_models=True, run_from_checkpoints=False,
                 search_behaviour=SEARCH_BEHAVIOUR.GRID_SEARCH, search_rs=42, search_n_iter=10):
    '''Grid / random search over CTM hyperparameters, with results kept on disk.

    For every hyperparameter combination drawn from ``search_space`` the function
    builds a dataset with ``create_ctm_dataset``, trains a ``CombinedTM``, computes
    the requested metrics, pickles the count vectorizer into the model folder and
    rewrites ``result.json`` in ``save_folder``. A combination whose model folder
    already exists is skipped, which is what makes resuming possible.

    Parameters
    ----------
    X_contextual, X_bow : documents forwarded to ``create_ctm_dataset``
    hyperparameters : fixed hyperparameters, see ``_init_config_dict``
    search_space : {group name: {hyperparameter name: candidate values}}
    save_folder : folder holding config.json, result.json and one sub-folder per model
    additional_stopwords : forwarded to ``create_ctm_dataset``
    metrics : metrics computed for every model; must contain ``monitor``
    monitor : metric used to pick the best model (a larger score wins)
    save_each_models : when true, every trained model is saved in its model folder
    run_from_checkpoints : when true, ``save_folder`` and its config.json must
        already exist; when false, ``save_folder`` must not exist yet
    search_behaviour : SEARCH_BEHAVIOUR.GRID_SEARCH or SEARCH_BEHAVIOUR.RANDOM_SEARCH
    search_rs, search_n_iter : random state / number of samples for random search

    Returns
    -------
    (best_model, best_model_path, best_hyperparameters)
        ``best_model_path`` is a ``Path`` when the best model was trained during
        this call, and the string stored in result.json otherwise.

    Raises
    ------
    Exception : inconsistent arguments or unexpected checkpoint state
    ValueError : unsupported ``search_behaviour``

    NOTE(review): the body also reads the notebook-level globals ``X`` (passed
    to ``create_ctm_dataset``) and ``device`` (assigned to the model); both
    must be defined before this function is called.
    '''
    config_json_path = save_folder.joinpath('config.json')
    result_json_path = save_folder.joinpath('result.json')

    if monitor not in metrics:
        raise Exception('monitor is not in metrics. Please modify the metrics passed in.')

    if run_from_checkpoints:
        if not save_folder.exists():
            _print_message('Save folder:' + str(save_folder.resolve()) + ' does not exist. Function terminates.')
            raise Exception('No checkpoints found. Function terminates.')

        # check for existing configs
        if not config_json_path.exists():
            raise Exception('No config.json found. Function terminates.')

        # check for existing results
        if not result_json_path.exists():
            _print_message('No result.json is found. Assuming no existing checkpoints.')
    else:
        if save_folder.exists():
            raise Exception('Checkpoints found. Please delete the checkpoints or set run_from_checkpoints=True. Function terminates.')

    if not save_folder.exists():
        save_folder.mkdir(parents=True)

    config = _init_config_dict(config_json_path, 'ctm', hyperparameters, search_space,
                               metrics, monitor, search_behaviour, search_rs, search_n_iter)
    result = _init_result_dict(result_json_path, monitor.value)

    _print_message('Search folder: {}'.format(save_folder))

    # init
    best_model_path = result['best_model_checkpoint']
    best_metric_score = result['best_metric']
    best_model = _load_ctm_model(Path(best_model_path),
                                 result['best_hyperparameters']['ctm_params']) if best_model_path != "" else None
    best_hyperparameters = result['best_hyperparameters']

    _print_message('Best model checkpoint: {}'.format(best_model_path))
    _print_message('Best metric score: {}'.format(best_metric_score))
    _print_message('Best model: {}'.format(best_model))

    # search
    # like bertopic, we create a temp dict for initiating the search space
    # then we apply sklearn parameter sampler / parameter grid to get the search space
    temp_search_space = {}
    for k, v in search_space.items():
        for kk, vv in v.items():
            temp_search_space[k + '__' + kk] = vv

    if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
        search_iterator = ParameterSampler(temp_search_space, search_n_iter, random_state=search_rs)
    elif search_behaviour == SEARCH_BEHAVIOUR.GRID_SEARCH:
        search_iterator = ParameterGrid(temp_search_space)
    else:
        # without this branch the loop below fails with an UnboundLocalError
        raise ValueError('Unsupported search_behaviour: {}'.format(search_behaviour))

    print('\n')

    for search_space_dict in search_iterator:

        # unwrap the search space dict ("<group>__<param>" -> per-group dicts)
        # and derive the model folder name from the sampled values

        model_name = ''

        _sbert_params = {}
        _countvect_params = {}
        _ctm_params = {}

        for k, v in search_space_dict.items():
            if k.startswith('sbert_params'):
                _sbert_params[k.split('__')[1]] = v
                model_name += 'sb_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('countvect_params'):
                _countvect_params[k.split('__')[1]] = v
                model_name += 'cvect_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('ctm_params'):
                _ctm_params[k.split('__')[1]] = v
                model_name += 'ctm_' + k.split('__')[1] + '_' + str(v) + '_'

        model_name = model_name[:-1]     # remove the last '_'

        model_path = save_folder.joinpath(config['model'] + '_' + model_name)

        # check whether the model exists
        if model_path.exists():
            _print_message('Skipping current search space: {}'.format(search_space_dict))
            continue

        ##########
        # Training starts
        ##########

        _print_message('Current search space: {}'.format(search_space_dict))

        sbert_params = deepcopy(config['sbert_params'])     # deepcopy just for safety (not messing up with the original config)
        countvect_params = deepcopy(config['countvect_params'])
        ctm_params = deepcopy(config['ctm_params'])

        sbert_params.update(_sbert_params)
        countvect_params.update(_countvect_params)
        ctm_params.update(_ctm_params)

        countvect_params['ngram_range'] = tuple(countvect_params['ngram_range'])     # convert list to tuple

        ##########
        # Preprocessing
        ##########

        # for reproducing the result (and inferencing)
        # we need to load the vectorizer, do the exact steps in preprocessing for creating bow
        # then create a CTMDataset for inferencing
        # (the implementation lives in ctm_dataset_creation.create_ctm_dataset)

        # NOTE(review): ``X`` is the notebook-level global, not a parameter
        training_dataset, vectorizer, embeddings, X_tmp = create_ctm_dataset(
            X_contextual, X_bow, X,
            sbert_params, save_folder,
            vectorizer=None,            # pass None to ask the function to create a new sklearn CountVectorizer
            countvect_params=countvect_params,
            additional_stopwords=additional_stopwords)

        vocab = vectorizer.get_feature_names_out()

        # ctm

        ctm_params['bow_size'] = len(vocab)
        ctm_params['contextual_size'] = embeddings.shape[1]
        ctm_params['hidden_sizes'] = tuple(ctm_params['hidden_sizes'])     # convert list to tuple

        ctm = CombinedTM(**ctm_params)
        ctm.device = device     # NOTE(review): notebook-level global
        ctm.fit(training_dataset, verbose=True)

        ##########
        # Training ends
        ##########

        ##########
        # Evaluation starts
        ##########

        # init data for gensim coherence model
        topic_words = _get_topics(ctm, k=10)
        topics = ctm.get_predicted_topics(training_dataset, n_samples=20)

        documents = pd.DataFrame({"Document": X_tmp,
                                "ID": range(len(X_tmp)),
                                "Topic": topics})

        # one joined pseudo-document per predicted topic
        docs_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
        texts = [doc.split() for doc in docs_per_topic.Document.values]

        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # init octis format result for convenience
        result_octis = {}
        result_octis['topics'] = topic_words
        result_octis['topic-word-matrix'] = _get_topic_word_metrix(ctm)
        result_octis['topic-document-matrix'] = _get_topic_document_metrix(ctm, training_dataset, n_samples=20)

        _print_message('Compute evaluation metrics')

        metrics_score = dict()

        for metric in metrics:
            if metric in COHERENCE_MODEL_METRICS:
                coherencemodel = CoherenceModel(
                    topics=topic_words, 
                    texts=texts, 
                    corpus=corpus, 
                    dictionary=dictionary, 
                    coherence=metric.value, 
                    topn=10,
                    processes=3
                )
                score = coherencemodel.get_coherence()
            elif metric == METRICS.TOPIC_DIVERSITY:
                score = compute_topic_diversity(result_octis, topk=10)
            elif metric == METRICS.INVERTED_RBO:
                score = compute_inverted_rbo(result_octis, topk=10)
            elif metric == METRICS.PAIRWISE_JACCARD_SIMILARITY:
                score = compute_pairwise_jaccard_similarity(result_octis, topk=10)
            else:
                raise Exception('Unknown metric: {}'.format(metric.value))

            metrics_score[metric.value] = score

            _print_message('Evaluation metric ({}): {}'.format(metric.value, score))

        monitor_score = metrics_score[monitor.value]

        ##########
        # Evaluation ends
        ##########

        ##########
        # Save models
        ##########

        # the folder is created even when the model itself is not saved:
        # its existence is what marks this combination as done on resume
        if not model_path.exists():
            model_path.mkdir()

        if save_each_models:
            ctm.save(models_dir=model_path)

        # save the vectorizer
        # then we can reproduce the result better
        vectorizer_path = model_path.joinpath('count_vectorizer.pkl')
        with open(vectorizer_path, 'wb') as f:
            pickle.dump(vectorizer, f)

        ##########
        # Save models ends
        ##########

        ###########
        # Update result dict and json file
        ###########

        model_hyperparameters = {
            'sbert_params': sbert_params,
            'countvect_params': countvect_params,
            'ctm_params': ctm_params
        }

        if monitor_score > best_metric_score:
            best_metric_score = monitor_score
            best_model_path = model_path
            best_model = ctm
            best_hyperparameters = model_hyperparameters

        model_log_history = dict()
        model_log_history.update(metrics_score)
        model_log_history['model_name'] = model_name
        model_log_history['hyperparameters']  = model_hyperparameters

        result['best_metric'] = best_metric_score
        result['best_model_checkpoint'] = str(best_model_path)
        result['best_hyperparameters'] = best_hyperparameters
        result['log_history'].append(model_log_history)

        # save result (rewritten after every model so an interrupted search can resume)
        with open(result_json_path, 'w') as f:
            json.dump(result, f, indent=2)

        _print_message('Saved result.json at: {}'.format(result_json_path))        
        print('\n\n')

    _print_message('Search ends')
    return best_model, best_model_path, best_hyperparameters


In [23]:
# load/create custom stopwords stored in a txt from dataset folder
from pathlib import Path

custom_stopwords_path = Path('../../dataset/topic_modelling/stopwords.txt')
custom_stowords_games_path = Path('../../dataset/topic_modelling/stopwords_games.txt')
game_name_list_path = Path('../../dataset/topic_modelling/game_name_list.txt')

# every file is read as a list of lines
with open(custom_stopwords_path, 'r') as f:
    custom_stopwords = f.read().splitlines()

with open(custom_stowords_games_path, 'r') as f:
    custom_stowords_games = f.read().splitlines()

with open(game_name_list_path, 'r') as f:
    game_name_list = f.read().splitlines()

# also include the stopword list from nltk
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')

# merge all sources, drop empty strings and de-duplicate
custom_stopwords = {
    word
    for word in custom_stopwords + custom_stowords_games + game_name_list + nltk_stopwords
    if len(word) > 0
}

# the merged set is very large, so show only a small sample instead of
# dumping every entry into the cell output
print(sorted(custom_stopwords)[:20])
print(len(custom_stopwords))


155929


In [28]:
# Hyperparameter search (grid search / random search) for the CTM pipeline.

# Starting hyperparameters for the three groups handed to model_search.
sbert_params = _init_sbert_params(model_name_or_path='all-mpnet-base-v2')
countvect_params = _init_count_vectorizer_params(
    max_features=2000,
    ngram_range=[1, 1],
)
ctm_params = _init_ctm_params(
    n_components=10,
    hidden_sizes=[100, 100],
    dropout=0.2,
    lr=2e-3,
    momentum=0.99,
    solver="adam",
    # the original default value is 100 (in LDAProd); 50 was also tested
    num_epochs=25,
)

# Candidate values per hyperparameter, grouped like the hyperparameters above.
# Options that are currently switched off:
#   countvect_params -> max_features: [1500, 2000, 2500]
#                       ngram_range:  [[1, 1], [1, 2]]  (lists, as json does not support tuple)
#   ctm_params       -> hidden_sizes: [(100, 100), (200, 200), (100, 100, 100), (200, 200, 200)]
#                       num_epochs:   [50]
search_space_dict = {
    'sbert_params': {
        'model_name_or_path': ['all-MiniLM-L6-v2', 'all-mpnet-base-v2'],
    },
    'ctm_params': {
        'n_components': list(range(10, 101, 10)),   # 10, 20, ..., 100
    },
}

# Use SEARCH_BEHAVIOUR.RANDOM_SEARCH here to run a random search instead.
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH

# The output folder name carries the genre, the search behaviour and a timestamp.
# A fixed value such as datetime(2024, 1, 29, 21, 29, 10) can replace now()
# (presumably to reuse the folder of an earlier run -- confirm).
training_datetime = datetime.now()
training_folder = Path(f'ctm_genre_{str(genre)}_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

best_model, best_model_path, best_hyperparameters = model_search(
    X,
    X_preprocessed,
    hyperparameters=dict(
        sbert_params=sbert_params,
        countvect_params=countvect_params,
        ctm_params=ctm_params,
    ),
    search_space=search_space_dict,
    save_folder=training_folder,
    metrics=[
        METRICS.C_NPMI,
        METRICS.C_V,
        METRICS.UMASS,
        METRICS.C_UCI,
        METRICS.TOPIC_DIVERSITY,
        METRICS.INVERTED_RBO,
        METRICS.PAIRWISE_JACCARD_SIMILARITY,
    ],
    monitor=METRICS.C_NPMI,
    save_each_models=True,
    run_from_checkpoints=False,
    search_behaviour=search_behaviour,
    # for random search, search_rs=42 and search_n_iter=50 were used as extra arguments
)

2024-01-31 23:08:04,612 - Created config file at ctm_genreindie_grid_search_20240131_230804/config.json
2024-01-31 23:08:04,612 - Search folder: ctm_genreindie_grid_search_20240131_230804
2024-01-31 23:08:04,613 - Best model checkpoint: 
2024-01-31 23:08:04,613 - Best metric score: -inf
2024-01-31 23:08:04,613 - Best model: None


2024-01-31 23:08:04,613 - Current search space: {'ctm_params__n_components': 10, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}


Batches: 100%|██████████| 22415/22415 [04:05<00:00, 91.45it/s] 


Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 159.38840246894097	Time: 0:00:38.854616: : 25it [16:18, 39.12s/it]
100%|██████████| 11208/11208 [00:20<00:00, 539.21it/s]
100%|██████████| 11208/11208 [00:19<00:00, 568.63it/s]
100%|██████████| 11208/11208 [00:21<00:00, 524.78it/s]


2024-01-31 23:30:13,626 - Compute evaluation metrics
2024-01-31 23:32:11,251 - Evaluation metric (c_npmi): -0.020748932031732947
2024-01-31 23:35:31,671 - Evaluation metric (c_v): 0.4113026625260341
2024-01-31 23:35:31,881 - Evaluation metric (u_mass): -0.005675840263053142
2024-01-31 23:37:20,930 - Evaluation metric (c_uci): -0.8829632251181634
2024-01-31 23:37:20,930 - Evaluation metric (topic_diversity): 0.88
2024-01-31 23:37:20,932 - Evaluation metric (inverted_rbo): 0.9731156832928571
2024-01-31 23:37:20,932 - Evaluation metric (pairwise_jaccard_similarity): 0.020722831042748486


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-01-31 23:37:21,165 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-01-31 23:37:21,165 - Current search space: {'ctm_params__n_components': 10, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}


Batches: 100%|██████████| 22415/22415 [10:23<00:00, 35.94it/s] 


Settings: 
                   N Components: 10
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 159.42013623191744	Time: 0:00:42.193941: : 25it [17:44, 42.60s/it]
100%|██████████| 11208/11208 [00:22<00:00, 500.47it/s]
100%|██████████| 11208/11208 [00:23<00:00, 482.32it/s]
100%|██████████| 11208/11208 [00:21<00:00, 518.87it/s]


2024-02-01 00:07:24,999 - Compute evaluation metrics
2024-02-01 00:09:14,966 - Evaluation metric (c_npmi): -0.0274623803551875
2024-02-01 00:12:37,179 - Evaluation metric (c_v): 0.42826329246168504
2024-02-01 00:12:37,355 - Evaluation metric (u_mass): -0.0026754951196775486
2024-02-01 00:14:27,151 - Evaluation metric (c_uci): -1.0578310132026263
2024-02-01 00:14:27,152 - Evaluation metric (topic_diversity): 0.84
2024-02-01 00:14:27,153 - Evaluation metric (inverted_rbo): 0.9651195714414286
2024-02-01 00:14:27,153 - Evaluation metric (pairwise_jaccard_similarity): 0.025531148131767326


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 00:14:27,378 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 00:14:27,378 - Current search space: {'ctm_params__n_components': 20, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 161.15779817073494	Time: 0:00:41.658707: : 25it [18:06, 43.47s/it]
100%|██████████| 11208/11208 [00:24<00:00, 456.51it/s]
100%|██████████| 11208/11208 [00:22<00:00, 493.12it/s]
100%|██████████| 11208/11208 [00:23<00:00, 484.48it/s]


2024-02-01 00:34:19,564 - Compute evaluation metrics
2024-02-01 00:36:18,633 - Evaluation metric (c_npmi): 0.010069649625965519
2024-02-01 00:39:56,140 - Evaluation metric (c_v): 0.4746736597589559
2024-02-01 00:39:56,320 - Evaluation metric (u_mass): -0.04263420395219954
2024-02-01 00:41:52,913 - Evaluation metric (c_uci): -0.44996093865822884
2024-02-01 00:41:52,914 - Evaluation metric (topic_diversity): 0.825
2024-02-01 00:41:52,920 - Evaluation metric (inverted_rbo): 0.9720843123096616
2024-02-01 00:41:52,920 - Evaluation metric (pairwise_jaccard_similarity): 0.018925384629311637


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 00:41:53,274 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 00:41:53,274 - Current search space: {'ctm_params__n_components': 20, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
Settings: 
                   N Components: 20
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.95
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 161.2063967358516	Time: 0:00:41.906031: : 25it [17:32, 42.09s/it] 
100%|██████████| 11208/11208 [00:23<00:00, 485.83it/s]
100%|██████████| 11208/11208 [00:23<00:00, 480.78it/s]
100%|██████████| 11208/11208 [00:21<00:00, 514.78it/s]


2024-02-01 01:01:08,887 - Compute evaluation metrics
2024-02-01 01:03:10,919 - Evaluation metric (c_npmi): -0.0020398050785616463
2024-02-01 01:06:50,898 - Evaluation metric (c_v): 0.4675385269507025
2024-02-01 01:06:51,187 - Evaluation metric (u_mass): -0.05555377430175483
2024-02-01 01:08:49,517 - Evaluation metric (c_uci): -0.7737073459408664
2024-02-01 01:08:49,517 - Evaluation metric (topic_diversity): 0.82
2024-02-01 01:08:49,524 - Evaluation metric (inverted_rbo): 0.9736959642243609
2024-02-01 01:08:49,524 - Evaluation metric (pairwise_jaccard_similarity): 0.01787800353467618


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 01:08:49,878 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 01:08:49,878 - Current search space: {'ctm_params__n_components': 30, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
Settings: 
                   N Components: 30
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9666666666666667
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 163.92843399599275	Time: 0:00:41.664200: : 25it [17:23, 41.74s/it]
100%|██████████| 11208/11208 [00:24<00:00, 454.20it/s]
100%|██████████| 11208/11208 [00:22<00:00, 488.14it/s]
100%|██████████| 11208/11208 [00:22<00:00, 499.03it/s]


2024-02-01 01:27:58,304 - Compute evaluation metrics
2024-02-01 01:30:10,244 - Evaluation metric (c_npmi): 0.020888462061465614
2024-02-01 01:34:20,490 - Evaluation metric (c_v): 0.4921906338794756
2024-02-01 01:34:20,694 - Evaluation metric (u_mass): -0.040919939728378964
2024-02-01 01:36:30,040 - Evaluation metric (c_uci): -0.3521100708174178
2024-02-01 01:36:30,040 - Evaluation metric (topic_diversity): 0.79
2024-02-01 01:36:30,055 - Evaluation metric (inverted_rbo): 0.9771954049377504
2024-02-01 01:36:30,055 - Evaluation metric (pairwise_jaccard_similarity): 0.01393509736095408


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 01:36:30,526 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 01:36:30,527 - Current search space: {'ctm_params__n_components': 30, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
Settings: 
                   N Components: 30
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9666666666666667
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 164.07849144648407	Time: 0:00:41.894322: : 25it [17:33, 42.14s/it]
100%|██████████| 11208/11208 [00:23<00:00, 484.49it/s]
100%|██████████| 11208/11208 [00:24<00:00, 450.50it/s]
100%|██████████| 11208/11208 [00:24<00:00, 452.80it/s]


2024-02-01 01:55:52,411 - Compute evaluation metrics
2024-02-01 01:58:08,031 - Evaluation metric (c_npmi): 0.027264879801399022
2024-02-01 02:02:23,309 - Evaluation metric (c_v): 0.47848430667401504
2024-02-01 02:02:23,647 - Evaluation metric (u_mass): -0.04065385299922604
2024-02-01 02:04:34,453 - Evaluation metric (c_uci): -0.16487881440485458
2024-02-01 02:04:34,454 - Evaluation metric (topic_diversity): 0.77
2024-02-01 02:04:34,468 - Evaluation metric (inverted_rbo): 0.9761387200365189
2024-02-01 02:04:34,468 - Evaluation metric (pairwise_jaccard_similarity): 0.017264358083190136


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 02:04:34,956 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 02:04:34,956 - Current search space: {'ctm_params__n_components': 40, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
Settings: 
                   N Components: 40
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.975
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 167.17041181392608	Time: 0:00:42.933512: : 25it [17:28, 41.96s/it]
100%|██████████| 11208/11208 [00:24<00:00, 450.59it/s]
100%|██████████| 11208/11208 [00:23<00:00, 481.98it/s]
100%|██████████| 11208/11208 [00:22<00:00, 489.83it/s]


2024-02-01 02:23:50,738 - Compute evaluation metrics
2024-02-01 02:26:06,641 - Evaluation metric (c_npmi): 0.023920222741354982
2024-02-01 02:30:34,253 - Evaluation metric (c_v): 0.49558322387656306
2024-02-01 02:30:34,518 - Evaluation metric (u_mass): -0.045025651286997516
2024-02-01 02:32:47,723 - Evaluation metric (c_uci): -0.29413143719800533
2024-02-01 02:32:47,723 - Evaluation metric (topic_diversity): 0.725
2024-02-01 02:32:47,748 - Evaluation metric (inverted_rbo): 0.9798282643273261
2024-02-01 02:32:47,749 - Evaluation metric (pairwise_jaccard_similarity): 0.01419288882375329


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 02:32:48,349 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 02:32:48,349 - Current search space: {'ctm_params__n_components': 40, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
Settings: 
                   N Components: 40
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.975
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 167.35518194412882	Time: 0:00:43.083119: : 25it [18:00, 43.22s/it]
100%|██████████| 11208/11208 [00:24<00:00, 455.61it/s]
100%|██████████| 11208/11208 [00:23<00:00, 472.79it/s]
100%|██████████| 11208/11208 [00:23<00:00, 478.99it/s]


2024-02-01 02:52:35,934 - Compute evaluation metrics
2024-02-01 02:54:51,505 - Evaluation metric (c_npmi): 0.015132275019761924
2024-02-01 02:59:24,237 - Evaluation metric (c_v): 0.48400107915887813
2024-02-01 02:59:24,501 - Evaluation metric (u_mass): -0.049969851718406455
2024-02-01 03:01:39,088 - Evaluation metric (c_uci): -0.4693644624752412
2024-02-01 03:01:39,088 - Evaluation metric (topic_diversity): 0.6875
2024-02-01 03:01:39,114 - Evaluation metric (inverted_rbo): 0.975694025373141
2024-02-01 03:01:39,114 - Evaluation metric (pairwise_jaccard_similarity): 0.017803495667272722


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 03:01:39,731 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 03:01:39,731 - Current search space: {'ctm_params__n_components': 50, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
Settings: 
                   N Components: 50
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.98
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 170.48983068288337	Time: 0:00:42.776412: : 25it [17:29, 41.96s/it]
100%|██████████| 11208/11208 [00:23<00:00, 484.52it/s]
100%|██████████| 11208/11208 [00:25<00:00, 446.64it/s]
100%|██████████| 11208/11208 [00:22<00:00, 493.99it/s]


2024-02-01 03:20:55,650 - Compute evaluation metrics
2024-02-01 03:23:18,810 - Evaluation metric (c_npmi): 0.013462374400363726
2024-02-01 03:28:16,442 - Evaluation metric (c_v): 0.4688151520820071
2024-02-01 03:28:16,770 - Evaluation metric (u_mass): -0.06445191116928506
2024-02-01 03:30:37,501 - Evaluation metric (c_uci): -0.46596075920479335
2024-02-01 03:30:37,501 - Evaluation metric (topic_diversity): 0.646
2024-02-01 03:30:37,541 - Evaluation metric (inverted_rbo): 0.9744235943251953
2024-02-01 03:30:37,542 - Evaluation metric (pairwise_jaccard_similarity): 0.018273280393707514


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 03:30:38,280 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 03:30:38,281 - Current search space: {'ctm_params__n_components': 50, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
Settings: 
                   N Components: 50
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.98
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 170.72111055726955	Time: 0:00:41.937266: : 25it [17:35, 42.20s/it]
100%|██████████| 11208/11208 [00:24<00:00, 450.32it/s]
100%|██████████| 11208/11208 [00:25<00:00, 446.05it/s]
100%|██████████| 11208/11208 [00:24<00:00, 465.41it/s]


2024-02-01 03:50:04,873 - Compute evaluation metrics
2024-02-01 03:52:27,903 - Evaluation metric (c_npmi): 0.014382219630977872
2024-02-01 03:57:18,384 - Evaluation metric (c_v): 0.47081757142134883
2024-02-01 03:57:18,708 - Evaluation metric (u_mass): -0.04567255774104491
2024-02-01 03:59:37,862 - Evaluation metric (c_uci): -0.4135832972912013
2024-02-01 03:59:37,862 - Evaluation metric (topic_diversity): 0.602
2024-02-01 03:59:37,902 - Evaluation metric (inverted_rbo): 0.9657835077795627
2024-02-01 03:59:37,902 - Evaluation metric (pairwise_jaccard_similarity): 0.023316741387127485


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 03:59:38,650 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 03:59:38,650 - Current search space: {'ctm_params__n_components': 60, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
Settings: 
                   N Components: 60
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9833333333333333
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 174.1116179146881	Time: 0:00:41.562178: : 25it [17:32, 42.10s/it] 
100%|██████████| 11208/11208 [00:23<00:00, 480.22it/s]
100%|██████████| 11208/11208 [00:22<00:00, 506.17it/s]
100%|██████████| 11208/11208 [00:24<00:00, 459.21it/s]


2024-02-01 04:18:57,703 - Compute evaluation metrics
2024-02-01 04:21:24,372 - Evaluation metric (c_npmi): 0.012780104018603456
2024-02-01 04:26:28,305 - Evaluation metric (c_v): 0.465191405750643
2024-02-01 04:26:28,662 - Evaluation metric (u_mass): -0.06283067802330772
2024-02-01 04:28:50,080 - Evaluation metric (c_uci): -0.42369751993244975
2024-02-01 04:28:50,080 - Evaluation metric (topic_diversity): 0.545
2024-02-01 04:28:50,138 - Evaluation metric (inverted_rbo): 0.9652566498955972
2024-02-01 04:28:50,139 - Evaluation metric (pairwise_jaccard_similarity): 0.023389501503842853


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 04:28:50,985 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 04:28:50,986 - Current search space: {'ctm_params__n_components': 60, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
Settings: 
                   N Components: 60
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9833333333333333
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 174.10985773024768	Time: 0:00:41.767747: : 25it [17:30, 42.03s/it]
100%|██████████| 11208/11208 [00:23<00:00, 483.04it/s]
100%|██████████| 11208/11208 [00:24<00:00, 450.67it/s]
100%|██████████| 11208/11208 [00:24<00:00, 452.49it/s]


2024-02-01 04:48:12,102 - Compute evaluation metrics
2024-02-01 04:50:31,383 - Evaluation metric (c_npmi): 0.014480461299094933
2024-02-01 04:55:18,914 - Evaluation metric (c_v): 0.47423479897280496
2024-02-01 04:55:19,320 - Evaluation metric (u_mass): -0.08101301090066942
2024-02-01 04:57:36,036 - Evaluation metric (c_uci): -0.4788828573018827
2024-02-01 04:57:36,036 - Evaluation metric (topic_diversity): 0.51
2024-02-01 04:57:36,112 - Evaluation metric (inverted_rbo): 0.9635091986381195
2024-02-01 04:57:36,113 - Evaluation metric (pairwise_jaccard_similarity): 0.02673896033553926


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 04:57:37,010 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 04:57:37,010 - Current search space: {'ctm_params__n_components': 70, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
Settings: 
                   N Components: 70
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9857142857142858
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 177.64951579751812	Time: 0:00:41.299888: : 25it [17:29, 41.97s/it]
100%|██████████| 11208/11208 [00:24<00:00, 457.08it/s]
100%|██████████| 11208/11208 [00:24<00:00, 465.77it/s]
100%|██████████| 11208/11208 [00:22<00:00, 502.95it/s]


2024-02-01 05:16:53,582 - Compute evaluation metrics
2024-02-01 05:19:04,474 - Evaluation metric (c_npmi): 0.010283030491802174
2024-02-01 05:23:50,311 - Evaluation metric (c_v): 0.4638512359024996
2024-02-01 05:23:50,771 - Evaluation metric (u_mass): -0.07978685321721611
2024-02-01 05:26:00,812 - Evaluation metric (c_uci): -0.5227071516809422
2024-02-01 05:26:00,813 - Evaluation metric (topic_diversity): 0.48428571428571426
2024-02-01 05:26:00,890 - Evaluation metric (inverted_rbo): 0.9591162689107394
2024-02-01 05:26:00,891 - Evaluation metric (pairwise_jaccard_similarity): 0.026629480814816548


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 05:26:01,871 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 05:26:01,871 - Current search space: {'ctm_params__n_components': 70, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
Settings: 
                   N Components: 70
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9857142857142858
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 177.48719294515612	Time: 0:00:43.663294: : 25it [17:52, 42.92s/it]
100%|██████████| 11208/11208 [00:25<00:00, 442.16it/s]
100%|██████████| 11208/11208 [00:23<00:00, 482.10it/s]
100%|██████████| 11208/11208 [00:25<00:00, 440.47it/s]


2024-02-01 05:45:45,418 - Compute evaluation metrics
2024-02-01 05:48:02,637 - Evaluation metric (c_npmi): 0.01722747633101672
2024-02-01 05:52:55,027 - Evaluation metric (c_v): 0.4751070813236471
2024-02-01 05:52:55,345 - Evaluation metric (u_mass): -0.07168521605570384
2024-02-01 05:55:09,918 - Evaluation metric (c_uci): -0.321425960049019
2024-02-01 05:55:09,919 - Evaluation metric (topic_diversity): 0.4942857142857143
2024-02-01 05:55:09,996 - Evaluation metric (inverted_rbo): 0.9619567245702839
2024-02-01 05:55:09,998 - Evaluation metric (pairwise_jaccard_similarity): 0.028027971931208602


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 05:55:10,976 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 05:55:10,976 - Current search space: {'ctm_params__n_components': 80, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
Settings: 
                   N Components: 80
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9875
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 181.30135078392735	Time: 0:00:42.736852: : 25it [17:37, 42.32s/it]
100%|██████████| 11208/11208 [00:23<00:00, 476.18it/s]
100%|██████████| 11208/11208 [00:23<00:00, 475.55it/s]
100%|██████████| 11208/11208 [00:23<00:00, 478.44it/s]


2024-02-01 06:14:36,616 - Compute evaluation metrics
2024-02-01 06:16:35,627 - Evaluation metric (c_npmi): 0.015600202522405002
2024-02-01 06:20:54,351 - Evaluation metric (c_v): 0.46475073867334454
2024-02-01 06:20:54,755 - Evaluation metric (u_mass): -0.06980938295241171
2024-02-01 06:22:51,381 - Evaluation metric (c_uci): -0.32135929678045516
2024-02-01 06:22:51,382 - Evaluation metric (topic_diversity): 0.45625
2024-02-01 06:22:51,483 - Evaluation metric (inverted_rbo): 0.9501332283764805
2024-02-01 06:22:51,485 - Evaluation metric (pairwise_jaccard_similarity): 0.033185661006574795


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 06:22:52,642 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 06:22:52,643 - Current search space: {'ctm_params__n_components': 80, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
Settings: 
                   N Components: 80
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9875
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 181.14944637663137	Time: 0:00:43.063733: : 25it [17:52, 42.90s/it]
100%|██████████| 11208/11208 [00:23<00:00, 483.87it/s]
100%|██████████| 11208/11208 [00:25<00:00, 436.38it/s]
100%|██████████| 11208/11208 [00:25<00:00, 444.94it/s]


2024-02-01 06:42:37,579 - Compute evaluation metrics
2024-02-01 06:44:41,854 - Evaluation metric (c_npmi): 0.016223864343005375
2024-02-01 06:49:02,420 - Evaluation metric (c_v): 0.4738096761434713
2024-02-01 06:49:02,744 - Evaluation metric (u_mass): -0.06546226126521512
2024-02-01 06:51:02,773 - Evaluation metric (c_uci): -0.3709814886996553
2024-02-01 06:51:02,773 - Evaluation metric (topic_diversity): 0.45625
2024-02-01 06:51:02,876 - Evaluation metric (inverted_rbo): 0.9562121474116501
2024-02-01 06:51:02,878 - Evaluation metric (pairwise_jaccard_similarity): 0.030803459129428393


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 06:51:03,996 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 06:51:03,996 - Current search space: {'ctm_params__n_components': 90, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
Settings: 
                   N Components: 90
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9888888888888889
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 184.79693627614898	Time: 0:00:43.804489: : 25it [17:56, 43.07s/it]
100%|██████████| 11208/11208 [00:24<00:00, 448.50it/s]
100%|██████████| 11208/11208 [00:23<00:00, 480.19it/s]
100%|██████████| 11208/11208 [00:23<00:00, 474.32it/s]


2024-02-01 07:10:50,260 - Compute evaluation metrics
2024-02-01 07:12:36,149 - Evaluation metric (c_npmi): 0.017140561731823017
2024-02-01 07:16:24,834 - Evaluation metric (c_v): 0.46453967537678853
2024-02-01 07:16:25,162 - Evaluation metric (u_mass): -0.0792799489155701
2024-02-01 07:18:11,026 - Evaluation metric (c_uci): -0.29779326012690643
2024-02-01 07:18:11,026 - Evaluation metric (topic_diversity): 0.41444444444444445
2024-02-01 07:18:11,157 - Evaluation metric (inverted_rbo): 0.947883638292224
2024-02-01 07:18:11,159 - Evaluation metric (pairwise_jaccard_similarity): 0.03501886911576069


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 07:18:12,421 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 07:18:12,422 - Current search space: {'ctm_params__n_components': 90, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
Settings: 
                   N Components: 90
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.9888888888888889
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 184.89476762507638	Time: 0:00:45.000287: : 25it [18:20, 44.02s/it]
100%|██████████| 11208/11208 [00:25<00:00, 439.33it/s]
100%|██████████| 11208/11208 [00:23<00:00, 469.91it/s]
100%|██████████| 11208/11208 [00:24<00:00, 460.70it/s]


2024-02-01 07:38:23,669 - Compute evaluation metrics
2024-02-01 07:40:12,369 - Evaluation metric (c_npmi): 0.008228353596806822
2024-02-01 07:44:03,655 - Evaluation metric (c_v): 0.45492190561092893
2024-02-01 07:44:04,076 - Evaluation metric (u_mass): -0.09280631157977934
2024-02-01 07:45:50,648 - Evaluation metric (c_uci): -0.491434150598683
2024-02-01 07:45:50,649 - Evaluation metric (topic_diversity): 0.3977777777777778
2024-02-01 07:45:50,780 - Evaluation metric (inverted_rbo): 0.9445902903323792
2024-02-01 07:45:50,782 - Evaluation metric (pairwise_jaccard_similarity): 0.0355092798060763


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 07:45:52,046 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 07:45:52,047 - Current search space: {'ctm_params__n_components': 100, 'sbert_params__model_name_or_path': 'all-MiniLM-L6-v2'}
Settings: 
                   N Components: 100
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.99
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 188.2360387287972	Time: 0:00:43.151638: : 25it [18:10, 43.62s/it] 
100%|██████████| 11208/11208 [00:24<00:00, 448.38it/s]
100%|██████████| 11208/11208 [00:23<00:00, 477.54it/s]
100%|██████████| 11208/11208 [00:25<00:00, 440.62it/s]


2024-02-01 08:05:52,838 - Compute evaluation metrics
2024-02-01 08:07:30,426 - Evaluation metric (c_npmi): 0.008263856581880523
2024-02-01 08:11:00,752 - Evaluation metric (c_v): 0.45335317949768816
2024-02-01 08:11:01,055 - Evaluation metric (u_mass): -0.10431769799351176
2024-02-01 08:12:37,740 - Evaluation metric (c_uci): -0.49032930424862115
2024-02-01 08:12:37,741 - Evaluation metric (topic_diversity): 0.362
2024-02-01 08:12:37,899 - Evaluation metric (inverted_rbo): 0.9421385571619509
2024-02-01 08:12:37,902 - Evaluation metric (pairwise_jaccard_similarity): 0.03560509971242736


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 08:12:39,298 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 08:12:39,298 - Current search space: {'ctm_params__n_components': 100, 'sbert_params__model_name_or_path': 'all-mpnet-base-v2'}
Settings: 
                   N Components: 100
                   Topic Prior Mean: 0.0
                   Topic Prior Variance: 0.99
                   Model Type: prodLDA
                   Hidden Sizes: (100, 100)
                   Activation: softplus
                   Dropout: 0.2
                   Learn Priors: True
                   Learning Rate: 0.002
                   Momentum: 0.99
                   Reduce On Plateau: False
                   Save Dir: None


Epoch: [25/25]	 Seen Samples: [17931200/17931725]	Train Loss: 188.38193633003877	Time: 0:00:43.679320: : 25it [18:07, 43.52s/it]
100%|██████████| 11208/11208 [00:25<00:00, 445.35it/s]
100%|██████████| 11208/11208 [00:25<00:00, 435.59it/s]
100%|██████████| 11208/11208 [00:23<00:00, 472.74it/s]


2024-02-01 08:32:38,837 - Compute evaluation metrics
2024-02-01 08:34:15,142 - Evaluation metric (c_npmi): 0.01120428015255344
2024-02-01 08:37:42,386 - Evaluation metric (c_v): 0.45469835973798567
2024-02-01 08:37:42,725 - Evaluation metric (u_mass): -0.09307418845967824
2024-02-01 08:39:17,299 - Evaluation metric (c_uci): -0.418213826773553
2024-02-01 08:39:17,300 - Evaluation metric (topic_diversity): 0.38
2024-02-01 08:39:17,459 - Evaluation metric (inverted_rbo): 0.9433708181148889
2024-02-01 08:39:17,462 - Evaluation metric (pairwise_jaccard_similarity): 0.038724487109420516


  pickle.dump(vectorizer, open(vectorizer_path, 'wb'))


2024-02-01 08:39:18,828 - Saved result.json at: ctm_genreindie_grid_search_20240131_230804/result.json



2024-02-01 08:39:18,829 - Search ends


In [33]:
# Reload the best model of a finished search run from disk and compute
# document-topic distributions with both the in-memory model and the reloaded
# one, so the two results can be compared.

# The run folder is identified by the search behaviour and a timestamp.
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
training_datetime = datetime(2024, 1, 29, 21, 29, 10)
training_folder = Path(f'ctm_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

# result.json holds the best hyperparameters and the best checkpoint path.
training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

best_hyperparameters = training_result['best_hyperparameters']
ctm_hyperparameters = best_hyperparameters['ctm_params']
sbert_params = best_hyperparameters['sbert_params']
best_model_path = training_result['best_model_checkpoint']

# load the embeddings stored for the best run's sbert model
model_name_or_path = sbert_params['model_name_or_path']
embeddings_path = training_folder.joinpath(f'embeddings_{model_name_or_path}.pkl')
with open(embeddings_path, 'rb') as f:
    embeddings = np.load(f)

# ctm_hyperparameters['bow_size'] = 2000
# ctm_hyperparameters['contextual_size'] = 768

best_model_loaded = _load_ctm_model(Path(best_model_path), ctm_hyperparameters)

# create the dataset on the fly, reusing the vectorizer saved with the checkpoint
# NOTE(review): pickle.load can execute arbitrary code; only load files this
# notebook produced itself.
vectorizer_path = Path(best_model_path).joinpath('count_vectorizer.pkl')
with open(vectorizer_path, 'rb') as f:     # context manager closes the handle
    vectorizer = pickle.load(f)

training_dataset, _, _, _ = create_ctm_dataset(
    X, X_preprocessed,
    sbert_params, training_folder,
    vectorizer=vectorizer)

# `best_model` is expected to still be in the kernel from an earlier cell
# (not defined in this cell); `best_model_loaded` is the copy read from disk.
doc_topic_dist_1 = best_model.get_doc_topic_distribution(training_dataset, n_samples=20)
doc_topic_dist_2 = best_model_loaded.get_doc_topic_distribution(training_dataset, n_samples=20)

  vectorizer = pickle.load(open(Path(best_model_path).joinpath('count_vectorizer.pkl'), 'rb'))


2024-01-30 00:31:25,572 - Found existing sbert embeddings at ctm_grid_search_20240129_212910/embeddings_all-MiniLM-L6-v2.pkl. Reusing them.


100%|██████████| 11411/11411 [00:23<00:00, 492.96it/s]
100%|██████████| 11411/11411 [00:23<00:00, 489.78it/s]


In [63]:
# Shape of the bag-of-words matrix. Read it from the matrix object directly;
# converting to a dense matrix first only costs memory and gives the same shape.
training_dataset.X_bow.shape

(75499, 2000)