LDA training script

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
import pickle
from datetime import datetime
import itertools

import gensim
import spacy
import nltk

import pyLDAvis

In [2]:
# download nltk stopwords
# import nltk
# nltk.download('stopwords')

# download spacy stopwords
# ...

In [3]:
%load_ext autoreload

In [4]:
import sys

sys.path.append('../')

In [5]:
# load the dataset

%autoreload 2
from dataset_loader import GENRES, load_dataset

genre = GENRES.INDIE
unique_list = ['review_text']

dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]')
dataset, dataset_path = load_dataset(genre, dataset_folder)

dataset.info(verbose=True)

Load dataset from: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[review_text]/01_indie.pkl



<class 'pandas.core.frame.DataFrame'>
Index: 725737 entries, 25636 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         725737 non-null  int64 
 1   app_id        725737 non-null  int64 
 2   app_name      725737 non-null  object
 3   review_text   725737 non-null  object
 4   review_score  725737 non-null  int64 
 5   review_votes  725737 non-null  int64 
 6   genre_id      725737 non-null  object
 7   category_id   725737 non-null  object
dtypes: int64(4), object(4)
memory usage: 49.8+ MB


In [6]:
# data preprocessing

%autoreload 2
sys.path.append('../../sa')
import str_cleaning_functions

def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_non_letters(x))
    df[review] = df[review].apply(lambda x: x.lower())
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_stopword(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

def cleaning_strlist(str_list):
    str_list = list(map(lambda x: str_cleaning_functions.remove_links(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.remove_links2(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.clean(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.deEmojify(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.remove_non_letters(x), str_list))
    str_list = list(map(lambda x: x.lower(), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.remove_stopword(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
    return str_list

In [7]:
# apply data preprocessing

cleaning(dataset, 'review_text')

In [8]:
# remove reviews with too many punctuations

def calculate_nonalphabet_ratio(review: str) -> float:
    count = 0
    for char in review:
        if not char.isalpha():
            count += 1
    return count / (len(review) + 1e-5)

dataset['alphabet_ratio'] = dataset['review_text'].apply(calculate_nonalphabet_ratio)

dataset['alphabet_ratio'].describe([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

count    725737.000000
mean          0.138562
std           0.026648
min           0.000000
25%           0.130435
50%           0.142857
75%           0.152695
90%           0.162162
95%           0.168421
99%           0.184210
max           0.497382
Name: alphabet_ratio, dtype: float64

In [9]:
# remove reviews with too many punctuations
# ratio = ~99 percentile

# this further remove ~7.4K reviews

dataset = dataset[dataset['alphabet_ratio'] < 0.19]

In [10]:
X = dataset['review_text'].values

In [13]:
print(X)
print(len(X))

['take one part faerie solitaire two parts puzzle quest mix little poker yahtzee good measure get something like runespell overture changeling sort fight monsters take quests exchange coin buffs come form power cards story strongest element game like puzzle quest games battles determined playing mini game instead match though game card game similar poker making certain combinations cards pairs kind full house flush straight certain amount damage opponent trying ability steal cards opponent plus limited number moves get per turn move cards play power ups adds enough strategy game keep interesting admittedly game get bit repetitive found dialogue options bit tedious fortunately game allows skip want easy game learn entertaining casual game play seems pretty short achievements seem difficult collect thing finding little gems like reason buy bundles'
 'make games like simple card playing mechanic fun addicting vaguely interesting storyline character make way like puzzle quest st one love g

In [14]:
# nltk.download('averaged_perceptron_tagger')
# t = nltk.word_tokenize(X[0])
# tt = nltk.pos_tag(t)
# tt

In [15]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

In [16]:
# save the lematized data, as separate pickle file
# for evaluation purpose

X_lemmatized_file = Path(f'lemmatized_data/{genre.value:02}_{str(genre)}.pkl')

if not X_lemmatized_file.parent.exists():
    X_lemmatized_file.parent.mkdir()

if not X_lemmatized_file.exists():
        # lemmatize the data

    X_lemmatized = list(map(lambda x: lemmatization(x), X))

    # filter empty list of strings in X_lemmatized, as they are not useful for topic modelling
    X_lemmatized = list(filter(lambda x: len(x) > 0, X_lemmatized))

    with open(X_lemmatized_file, "wb") as f:
        pickle.dump(X_lemmatized, f)
else:
    with open(X_lemmatized_file, "rb") as f:
        X_lemmatized = pickle.load(f)

In [17]:
# load the lematized data, as separate pickle file
# for convenient hyperparameter selection

# X_lemmatized_file = dataset_path.parent.joinpath('cleaned_lemmatized', dataset_path.stem + '_cleaned_lemmatized.pkl')

# with open(X_lemmatized_file, "rb") as f:
#     X_lemmatized = pickle.load(f)

# X_lemmatized[0]

In [18]:
# check the length when loading in a evaluation script

print(len(X_lemmatized))
print(X_lemmatized[0])

719448
['take', 'one', 'part', 'faerie', 'solitaire', 'two', 'part', 'puzzle', 'quest', 'mix', 'little', 'poker', 'yahtzee', 'good', 'measure', 'get', 'something', 'like', 'runespell', 'overture', 'changeling', 'sort', 'fight', 'monster', 'take', 'quest', 'exchange', 'coin', 'buff', 'come', 'form', 'power', 'card', 'story', 'strong', 'element', 'game', 'like', 'puzzle', 'quest', 'game', 'battle', 'determine', 'play', 'mini', 'game', 'instead', 'match', 'though', 'game', 'card', 'game', 'similar', 'poker', 'make', 'certain', 'combination', 'card', 'pair', 'kind', 'full', 'house', 'flush', 'straight', 'certain', 'amount', 'damage', 'opponent', 'try', 'ability', 'steal', 'card', 'opponent', 'plus', 'limited', 'number', 'move', 'get', 'per', 'turn', 'move', 'card', 'play', 'power', 'ups', 'add', 'enough', 'strategy', 'game', 'keep', 'interest', 'admittedly', 'game', 'get', 'bit', 'repetitive', 'find', 'dialogue', 'option', 'bite', 'tedious', 'fortunately', 'game', 'allow', 'skip', 'want', 

In [19]:
# check whether any empty list of strings in X_lemmatized
# as it will cause error in topic modelling and evaluating the model

def _get_empty_idxs(X):
    empty_idxs = []
    for i, text in enumerate(X):
        if len(text) == 0:
            empty_idxs.append(i)
    return sorted(empty_idxs, reverse=True)

empty_idxs = _get_empty_idxs(X_lemmatized)
empty_idxs

[]

In [20]:
# load/create custom stopwords stored in a txt from dataset folder
from pathlib import Path

custom_stopwords_path = Path('../../dataset/topic_modelling/stopwords.txt')
custom_stowords_games_path = Path('../../dataset/topic_modelling/stopwords_games.txt')
game_name_list_path = Path('../../dataset/topic_modelling/game_name_list.txt')

with open(custom_stopwords_path, 'r') as f:
    custom_stopwords = f.read().splitlines()

with open(custom_stowords_games_path, 'r') as f:
    custom_stowords_games = f.read().splitlines()

with open(game_name_list_path, 'r') as f:
    game_name_list = f.read().splitlines()

# also include the stopword list from nltk
# does not include the stopword list from Gensim
# as it is identical with the english stopword list from sklearn
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')

custom_stopwords = custom_stopwords + custom_stowords_games + game_name_list + nltk_stopwords
custom_stopwords = list(filter(lambda x: len(x) > 0, custom_stopwords))     # remove empty string

custom_stopwords = set(custom_stopwords)

print(custom_stopwords)
print(len(custom_stopwords))

USE_CUSTOM_STOPWORDS = True


155930


Grid Search

In [21]:
def _print_message(message):
    '''Print message with a timestamp in front of it

    Timestamp format: YYYY-MM-DD HH:MM:SS,mmm
    '''
    print(f'{datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]} - {message}')

In [22]:
from typing import Tuple
def _init_count_vectorizer_params(n_frequency:int = 0, ngram_range:list[int, int] = [1, 1]):

    params_dict = {}
    params_dict['n_frequency'] = n_frequency
    params_dict['ngram_range'] = ngram_range

    return params_dict

def _init_LdaMulticore_params(corpus=None, num_topics=100, id2word=None, workers=None, chunksize=2000, 
        passes=1, batch=False, alpha='symmetric', eta=None, decay=0.5, offset=1.0, 
        eval_every=10, iterations=50, gamma_threshold=0.001, random_state=None, 
        minimum_probability=0.01, minimum_phi_value=0.01, per_word_topics=False, dtype=np.float32):
    
    hyperparameters = dict()
    hyperparameters['corpus'] = corpus
    hyperparameters["num_topics"] = num_topics
    hyperparameters['id2word'] = id2word
    hyperparameters["workers"] = workers
    hyperparameters["chunksize"] = chunksize
    hyperparameters["passes"] = passes
    hyperparameters["alpha"] = alpha
    hyperparameters["eta"] = eta
    hyperparameters["decay"] = decay
    hyperparameters["offset"] = offset
    hyperparameters["eval_every"] = eval_every
    hyperparameters["iterations"] = iterations
    hyperparameters["gamma_threshold"] = gamma_threshold
    hyperparameters['minimum_probability'] = minimum_probability
    hyperparameters["random_state"] = random_state
    hyperparameters['minimum_phi_value'] = minimum_phi_value
    hyperparameters['per_word_topics'] = per_word_topics
    hyperparameters['dtype'] = dtype

    if "alpha" in hyperparameters:
        if isinstance(hyperparameters["alpha"], float):
            hyperparameters["alpha"] = [
                hyperparameters["alpha"]
            ] * hyperparameters["num_topics"]

    return hyperparameters

In [23]:
sys.path.append('../')

from eval_metrics import compute_inverted_rbo, compute_topic_diversity, compute_pairwise_jaccard_similarity, \
                        METRICS, SEARCH_BEHAVIOUR, COHERENCE_MODEL_METRICS

In [24]:
def _init_config_dict(config_path:Path, model_name:str, dataset_path:Path, hyperparameters:dict, search_space_dict:dict, 
                      metrics:list[METRICS], monitor:METRICS,
                      search_behaviour:SEARCH_BEHAVIOUR, search_rs:int, search_n_iter:int):
    # init dict for config.json

    _hyperparameters = hyperparameters
    _search_space_dict = search_space_dict

    if not config_path.exists():
        config = {}

        config['model'] = model_name
        config['dataset_path'] = str(dataset_path)

        countvect_params = _init_count_vectorizer_params(**hyperparameters['countvect_params'])
        lda_params = _init_LdaMulticore_params(**hyperparameters['lda_params'])

        # store the countvectorizer and lda params
        config['countvect_params'] = countvect_params


        # b4 storing, remove corpus and id2word
        # and convert dtype to str
        lda_params.pop('corpus', '')
        lda_params.pop('id2word', '')
        lda_params['dtype'] = str(lda_params['dtype'])      # datatype is not json serializable, so convert to str

        config['lda_params'] = lda_params

        # config.update(hyperparameters)
        
        # remove hyperparameters that are in the search space dict
        if 'countvect_params' in search_space_dict:
            for key in search_space_dict['countvect_params'].keys():
                countvect_params.pop(key, '')
        if 'lda_params' in search_space_dict:
            for key in search_space_dict['lda_params'].keys():
                lda_params.pop(key, '')

        # store the search space
        config['search_space'] = search_space_dict

        # store the metrics types
        config['metrics'] = list(map(lambda x: x.value, metrics))

        # store the monitor metric
        config['monitor'] = monitor.value

        # store the search behaviour
        config['search_behaviour'] = search_behaviour.value

        if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
            config['search_rs'] = search_rs
            config['search_n_iter'] = search_n_iter


        config['gensim_version'] = str(gensim.__version__)

        # save the file
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        _print_message('Created config.json at: ' + str(config_path))
    else:
        with open(config_path, 'r') as f:
            config = json.load(f)

        assert config['model'] == model_name, 'input model_name is not consistent with config["model"]'
        assert config['dataset_path'] == str(dataset_path), 'input dataset_path is not consistent with config["dataset_path"]'
        assert config['metrics'] == list(map(lambda x: x.value, metrics)), 'input metrics is not consistent with config["metrics"]'
        assert config['monitor'] == monitor.value, 'input monitor is not consistent with config["monitor"]'
        assert config['search_behaviour'] == search_behaviour.value, 'input search_behaviour is not consistent with config["search_behaviour"]'
        if search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
            assert config['search_rs'] == search_rs, 'input search_rs is not consistent with config["search_rs"]'
            assert config['search_n_iter'] == search_n_iter, 'input search_n_iter is not consistent with config["search_n_iter"]'

        # check whether hyperparameters in _hyperparameters are same in _config
        countvect_params = _init_count_vectorizer_params(**hyperparameters['countvect_params'])
        lda_params = _init_LdaMulticore_params(**hyperparameters['lda_params'])

        # remove corpus and id2word
        # and convert dtype to str
        lda_params.pop('corpus', '')
        lda_params.pop('id2word', '')
        lda_params['dtype'] = str(lda_params['dtype'])      # datatype is not json serializable, so convert to str

        assert config['countvect_params'].keys() <= countvect_params.keys(), 'existing config["countvect_params"] contains additional hyperparameters'
        assert config['lda_params'].keys() <= lda_params.keys(), 'existing config["lda_params"] contains additional hyperparameters'

        for key in config['countvect_params'].keys() & countvect_params.keys():
            assert config['countvect_params'][key] == countvect_params[key], f'config["countvect_params"][{key}] is not consistent with input hyperparameters'
        for key in config['lda_params'].keys() & lda_params.keys():
            assert config['lda_params'][key] == lda_params[key], f'config["lda_params"][{key}] is not consistent with input hyperparameters'
        


        # check whetehr the config file contains all the search space
        if 'countvect_params' in config['search_space']:
            assert config['search_space']['countvect_params'].keys() == search_space_dict['countvect_params'].keys(), 'existing config["search_space"]["countvect_params"] is not consistent with input search_space_dict["countvect_params"]'
            for key in  search_space_dict['countvect_params'].keys():
                assert config['search_space']['countvect_params'][key] == search_space_dict['countvect_params'][key], f'config["search_space"]["countvect_params"][{key}] is not consistent with input search_space_dict["countvect_params"]'
        if 'lda_params' in config['search_space']:
            assert config['search_space']['lda_params'].keys() == search_space_dict['lda_params'].keys(), 'existing config["search_space"]["lda_params"] is not consistent with input search_space_dict["lda_params"]'
            for key in  search_space_dict['lda_params'].keys():
                assert config['search_space']['lda_params'][key] == search_space_dict['lda_params'][key], f'config["search_space"]["lda_params"][{key}] is not consistent with input search_space_dict["lda_params"]'

        _print_message('Loaded existing config.json from: ' + str(config_path))
        _print_message('Hyperparameters and search space are checked to be consistent with config.json')

    return config

In [25]:
def _init_result_dict(result_path:Path, monitor_type:str):
    # init dict for result.json

    if not result_path.exists():
        result = {}
        result['best_metric'] = -float('inf')
        result['best_model_checkpoint'] = ""
        result['best_hyperparameters'] = dict()
        result["monitor_type"] = monitor_type
        result["log_history"] = list()
    else:
        with open(result_path, 'r') as f:
            result = json.load(f)

        # check whether metric_type in result.json is same as metric_type passed in
        if 'monitor_type' not in result.keys():
            raise Exception('metric_type is not found in result.json. Please modify the metric_type passed in.')
        elif result['monitor_type'] != monitor_type:
            raise Exception(f'metric_type is different in result.json. Please modify the metric_type passed in.')

        _print_message('Loaded existing result.json from: ' + str(result_path))
        _print_message('metric_type is checked to be consistent with result.json')
        # print('Loaded existing result.json from:', result_path)
        # print('metric_type is checked to be consistent with result.json')

    return result

In [26]:
# referencing octis to calculate topics, topic-document-matrix and topic-word-matrix

def _get_topic_word_matrix(model):
    return model.get_topics()

def _get_topics(model, id2word, result, top_words=10):
    if top_words > 0:
        topics_output = []
        for topic in result["topic-word-matrix"]:
            top_k = np.argsort(topic)[-top_words:]
            top_k_words = list(reversed([id2word[i] for i in top_k]))
            topics_output.append(top_k_words)

        return topics_output
    
def _get_topic_document_matrix(model, corpus, num_topics):
    """
    Return the topic representation of the
    corpus
    """
    doc_topic_tuples = []
    for document in corpus:
        doc_topic_tuples.append(
            model.get_document_topics(document, minimum_probability=0))

    topic_document = np.zeros((
        num_topics,
        len(doc_topic_tuples)))

    for ndoc in range(len(doc_topic_tuples)):
        document = doc_topic_tuples[ndoc]
        for topic_tuple in document:
            topic_document[topic_tuple[0]][ndoc] = topic_tuple[1]
    return topic_document

In [27]:
def _load_lda_model(model_path:Path):
    if not model_path.exists():
        raise Exception(f'Cannot find model checkpoint at {model_path}')
    else:
        lda_model = gensim.models.ldamodel.LdaModel.load(str(model_path.joinpath('lda_multicore')))
        return lda_model

In [28]:
# from itertools import product
import collections
import os
from gensim.models import CoherenceModel
from copy import deepcopy

from sklearn.model_selection import ParameterGrid, ParameterSampler
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from tqdm import tqdm

def model_search(X, hyperparameters:dict, search_space:dict, save_folder:Path, dataset_path:Path,
                 additional_stopwords:list[str]=None,
                metrics:list[METRICS]=[METRICS.C_NPMI], monitor:METRICS=METRICS.C_NPMI, 
                save_each_models=True, run_from_checkpoints=False,
                search_behaviour=SEARCH_BEHAVIOUR.GRID_SEARCH, search_rs=42, search_n_iter=10):
    """
    Perform grid search for LDA model hyperparameter selection

    Parameters:
    ----------
    X : List of input texts (after preprocessing like lemmatization)
    hyperparameters : dict of hyperparameters
    search_space : dict of search space for hyperparameters
    save_each_models : save each model or not
    save_path : folder to save the model
    run_from_checkpoints : whether to run from checkpoints or not

    Returns:
    -------
    best_model : best model
    best_model_path : path to the best model
    best_hyperparameters : best hyperparameters
    """

    config_json_path = save_folder.joinpath('config.json')
    result_json_path = save_folder.joinpath('result.json')

    if monitor not in metrics:
        raise Exception('monitor is not in metrics. Please modify the metrics passed in.')

    if run_from_checkpoints:
        if not save_folder.exists():
            _print_message('Save folder:' + str(save_folder.resolve()) + ' does not exist. Function terminates.')
            # print('Save folder:' + str(save_folder.resolve()) + ' does not exist. Function terminates.')
            raise Exception('No checkpoints found. Function terminates.')
        
        # check for existing configs
        if not config_json_path.exists():
            raise Exception('No config.json found. Function terminates.')
        
        # check for existing results
        if not result_json_path.exists():
            _print_message('No result.json is found. Assuming no existing checkpoints.')
            # print('No result.json is found. Assuming no existing checkpoints.')
    else:
        if save_folder.exists():
            raise Exception('Checkpoints found. Please delete the checkpoints or set run_from_checkpoints=True. Function terminates.')
        
    if not save_folder.exists():
        save_folder.mkdir()

    # init / load existing json files
    # also doing consistency checks for hyperparameters and search space
    config = _init_config_dict(config_json_path, 'lda_multicore', dataset_path, hyperparameters, search_space, metrics, monitor,
                               search_behaviour, search_rs, search_n_iter)

    result = _init_result_dict(result_json_path, monitor.value)
    
    
    _print_message(f'Search folder: {save_folder}')
    # print(f'Search folder: {save_folder}')


    # init
    best_model_path = result['best_model_checkpoint']
    best_metric_score = result['best_metric']
    best_model = _load_lda_model(Path(best_model_path)) if best_model_path != "" else None
    best_hyperparameters = result['best_hyperparameters']


    _print_message(f'Best model checkpoint: {best_model_path}')
    _print_message(f'Best metric score: {best_metric_score}')
    _print_message(f'Best model: {best_model}')
    # print(f'Best model checkpoint: {best_model_path}')
    # print(f'Best metric score: {best_metric_score}')
    # print(f'Best model: {best_model}')

    # create search space
    temp_search_space = {}
    for k, v in search_space.items():
        for kk, vv in v.items():
            temp_search_space[k + '__' + kk] = vv


    # use sklearn to generate the search space instead of generating my myself
    if search_behaviour == SEARCH_BEHAVIOUR.GRID_SEARCH:
        search_iterator = ParameterGrid(temp_search_space)
    elif search_behaviour == SEARCH_BEHAVIOUR.RANDOM_SEARCH:
        search_iterator = ParameterSampler(temp_search_space, n_iter=search_n_iter, random_state=search_rs)

    
    print('\n')       

    for search_space_dict in search_iterator:

        model_name = ''

        _countvect_params = {}
        _lda_params = {}

        for k, v in search_space_dict.items():
            if k.startswith('countvect_params'):
                _countvect_params[k.split('__')[1]] = v
                model_name += 'cv_' + k.split('__')[1] + '_' + str(v) + '_'
            elif k.startswith('lda_params'):
                _lda_params[k.split('__')[1]] = v
                model_name += 'lda_' + k.split('__')[1] + '_' + str(v) + '_'
            else:
                raise Exception(f'Unknown key: {k}')
        
        model_name = model_name[:-1]     # remove the last underscore

        model_path = save_folder.joinpath(
            'lda_multicore_' + model_name
        )

        # check whether the current search space is already trained
        # by comparing the folder name

        if model_path.exists():
            print(f'Skipping current search space: {search_space_dict}')
            continue


        ##########
        # Training starts
        ##########

        _print_message(f'Training with current search space: {search_space_dict}')
        # print(f'Training with current search space: {search_space_dict}')

        # create the id2word and the corpus using sklearn count vectorizer
        # to apply custom stopwords same as other topic models

        # although can be done outside the loop, but it's better to do it inside the loop
        # for consistency to other model (as CTM is done inside the loop)

        countvect_params = deepcopy(config['countvect_params'])
        lda_params = deepcopy(config['lda_params'])


        countvect_params.update(_countvect_params)
        lda_params.update(_lda_params)

        # remove empty rows in X
        X = list(filter(lambda x: len(x) > 0, X))
        X_listofstr = list(map(lambda x: ' '.join(x), X))

        # the params of CountVectorizer shd be consistent with other topic models

        vocab = collections.Counter()
        tokenizer = CountVectorizer().build_tokenizer()
        for doc in tqdm(X_listofstr):
            # print(doc)
            vocab.update(tokenizer(doc))
        vocab = [word for word, frequency in vocab.items() if frequency >= countvect_params['n_frequency']]       # set the minimum frequency to reduce the vocabulary size
        _print_message('Number of vocabulary: {}'.format(len(vocab)))

        del countvect_params['n_frequency']       # not used in the vectorizer model for training
        countvect_params['ngram_range'] = tuple(countvect_params['ngram_range'])       # convert list to tuple

        vect = CountVectorizer(
            vocabulary=vocab,
            stop_words="english" if additional_stopwords is None else list(ENGLISH_STOP_WORDS.union(additional_stopwords)), 
            analyzer='word',
            # max_features=2000000        # default value of max_features in gensim.corpora.Dictionary is 2M
            **countvect_params
        )

        # need to join as fit_transform expects list of strings, not list of list of strings
        corpus_vect = vect.fit_transform(X_listofstr)

        corpus = gensim.matutils.Sparse2Corpus(corpus_vect, documents_columns=False)

        # transform scikit vocabulary into gensim dictionary
        id2word = gensim.corpora.Dictionary.from_corpus(
            corpus,
            id2word=dict((id, word) for word, id in vect.vocabulary_.items())
        )

        # id2word = dict((v, k) for k, v in vect.vocabulary_.items())


        # no need to save the id2word, as it's saved by ldamodel

        corpus_filepath = save_folder.joinpath('temp_corpus.mm')
        gensim.corpora.MmCorpus.serialize(str(corpus_filepath), corpus)
        _print_message(f'Temporarily saved the corpus to {corpus_filepath}')

        # load the corpus as mmcorpus for more efficient training
        corpus = gensim.corpora.MmCorpus(str(corpus_filepath))

        # update the corpus and id2word in the hyperparameter
        lda_params['corpus'] = corpus
        lda_params['id2word'] = id2word
        # update the dtype to the datatype from hyperparameter args
        lda_params['dtype'] = hyperparameters['lda_params']['dtype']
        

        # train the model
        model = gensim.models.ldamulticore.LdaMulticore(**lda_params)

        ##########
        # Training ends
        ##########

        ##########
        # Evaluation starts
        ##########

        result_octis = {}
        result_octis['topic-word-matrix'] = _get_topic_word_matrix(model)
        result_octis['topics'] = _get_topics(model, lda_params['id2word'], result_octis, top_words=10)
        result_octis['topic-document-matrix'] = _get_topic_document_matrix(model, corpus, lda_params['num_topics'])

        _print_message('Computing evaluation metrics')
        # print('Computing evaluation metrics')

        metrics_score = dict()

        # compute various metrics
        for metric in metrics:
            if metric in COHERENCE_MODEL_METRICS:
                # compute the coherence
                coherencemodel = CoherenceModel(
                    model=model, 
                    texts=X, 
                    dictionary=id2word,
                    coherence=metric.value,
                    topn=10,
                    processes=3
                )
                score = coherencemodel.get_coherence()                

            elif metric == METRICS.TOPIC_DIVERSITY:
                # compute the coherence
                score = compute_topic_diversity(result_octis, topk=10)

            elif metric == METRICS.INVERTED_RBO:
                # compute the coherence
                score = compute_inverted_rbo(result_octis, topk=10)

            elif metric == METRICS.PAIRWISE_JACCARD_SIMILARITY:
                # compute the coherence
                score = compute_pairwise_jaccard_similarity(result_octis, topk=10)

            else:
                raise Exception(f'Unknown metric: {metric.value}')
            
            metrics_score[metric.value] = score

            _print_message(f'Evaluation metric ({metric.value}): {score}')
            # print(f'Evaluation metric ({metric.value}): {score}')
            
        # get the monitor score
        monitor_score = metrics_score[monitor.value]

        ##########
        # Evaluation ends
        ##########

        ##########
        # Save models
        ##########

        if not model_path.exists():
            model_path.mkdir(parents=True)

        # save the model
        if save_each_models:
            model.save(str(model_path.joinpath('lda_multicore')))

            _print_message('Model saved at: ' + str(model_path))
            # print('Model saved at:', model_path)


            # save the corpus as well
            corpus_filepath = Path(model_path.joinpath(model_path.stem + '_corpus.mm'))
            gensim.corpora.MmCorpus.serialize(str(corpus_filepath), corpus)
            _print_message(f'Saved the corpus to {corpus_filepath}')

        ##########
        # Save models ends
        ##########
            
        ###########
        # Update result dict and json file
        ###########
            
        # init

        # model_hyperparameters = deepcopy(hyperparameters)
        model_hyperparameters = {
            'countvect_params': countvect_params,
            'lda_params': lda_params
        }

        model_hyperparameters['lda_params'].pop('corpus', '')     # pop as it is not json serializable
        model_hyperparameters['lda_params'].pop('id2word', '')    # pop as it is not json serializable
        model_hyperparameters['lda_params']['dtype'] = str(model_hyperparameters['lda_params']['dtype'])      # convert dtype to str
            
        if monitor_score > best_metric_score:
            best_metric_score = monitor_score
            best_model = model
            best_model_path = model_path
            best_hyperparameters = model_hyperparameters
            
        # update
            
        model_log_history = dict()
        model_log_history.update(metrics_score)         # add the metrics score values to the log history
        model_log_history['model_name'] = model_name
        model_log_history['hyperparameters'] = model_hyperparameters

        result['best_metric'] = best_metric_score
        result['best_model_checkpoint'] = str(best_model_path)      # relative path
        result['best_hyperparameters'] = best_hyperparameters
        result["log_history"].append(model_log_history)

        # print(result)

        # save result
        with open(result_json_path, 'w') as f:
            json.dump(result, f, indent=2)

        _print_message('Saved result.json at: ' + str(result_json_path))
        # print("Saved result.json at:", result_json_path)
        print('\n\n')

    _print_message('Search ends')

    # remove the temporary corpus
    corpus_tmp_filepath = save_folder.joinpath('temp_corpus.mm').resolve()
    corpus_index_tmp_filepath = save_folder.joinpath('temp_corpus.mm.index').resolve()
    if corpus_tmp_filepath.exists():
        os.remove(corpus_tmp_filepath)
    if corpus_index_tmp_filepath.exists():
        os.remove(corpus_index_tmp_filepath)

    # print('Search ends')
    return best_model, best_model_path, best_hyperparameters


In [29]:
# grid search / random search

countvect_params = _init_count_vectorizer_params(n_frequency=70, ngram_range=[1, 1])

# corpus and id2word will be generated on the fly
lda_params = _init_LdaMulticore_params(
    corpus=None, num_topics=20, id2word=None, 
    workers=3, chunksize=2024, random_state=42, passes=5)

# create search_space dict
search_space = {
    'lda_params': {
        'num_topics': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]     # do parameter search on number of topics only
        # 'decay':[0.7, 0.8, 0.9],
        # 'offset':[16, 64, 128]
    }
}

dataset_path_config = dataset_path.relative_to(dataset_path.parent.parent.parent.parent)

search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
# search_behaviour = SEARCH_BEHAVIOUR.RANDOM_SEARCH


training_datetime = datetime.now()
# training_datetime = datetime(2024, 2, 3, 22, 37, 32)

training_folder = Path(f'lda_multicore_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')

best_model, best_model_path, best_hyperparameters = model_search(
    X_lemmatized, 
    hyperparameters = {
        'countvect_params':countvect_params,
        'lda_params':lda_params
    }, 
    search_space=search_space, 
    save_folder=training_folder,
    dataset_path=dataset_path_config,
    additional_stopwords=custom_stopwords,
    metrics=[METRICS.C_NPMI, METRICS.C_V, METRICS.UMASS, METRICS.C_UCI, METRICS.TOPIC_DIVERSITY, METRICS.INVERTED_RBO, METRICS.PAIRWISE_JACCARD_SIMILARITY],
    monitor=METRICS.C_NPMI,
    search_behaviour=search_behaviour, 
    # search_rs=42, 
    # search_n_iter=10,
    run_from_checkpoints=False)

2024-02-07 18:59:39,269 - Created config.json at: lda_multicore_grid_search_20240207_185939/config.json
2024-02-07 18:59:39,269 - Search folder: lda_multicore_grid_search_20240207_185939
2024-02-07 18:59:39,269 - Best model checkpoint: 
2024-02-07 18:59:39,269 - Best metric score: -inf
2024-02-07 18:59:39,269 - Best model: None


2024-02-07 18:59:39,269 - Training with current search space: {'lda_params__num_topics': 10}


100%|██████████| 719448/719448 [00:05<00:00, 127063.12it/s]


2024-02-07 18:59:45,496 - Number of vocabulary: 10417




2024-02-07 19:00:23,997 - Temporarily saved the corpus to lda_multicore_grid_search_20240207_185939/temp_corpus.mm
2024-02-07 19:03:26,647 - Computing evaluation metrics
2024-02-07 19:03:57,942 - Evaluation metric (c_npmi): 0.0512752226282208
2024-02-07 19:04:21,845 - Evaluation metric (c_v): 0.5026157025585205
2024-02-07 19:04:32,494 - Evaluation metric (u_mass): -2.1923964338356705
2024-02-07 19:05:04,117 - Evaluation metric (c_uci): 0.358247497503031
2024-02-07 19:05:04,118 - Evaluation metric (topic_diversity): 0.78
2024-02-07 19:05:04,120 - Evaluation metric (inverted_rbo): 0.9229553535969841
2024-02-07 19:05:04,120 - Evaluation metric (pairwise_jaccard_similarity): 0.06039062798608724
2024-02-07 19:05:04,133 - Model saved at: lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_topics_10
2024-02-07 19:05:13,536 - Saved the corpus to lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_topics_10/lda_multicore_lda_num_topics_10_corpus.mm
2024-02-07 19:05:13,53

100%|██████████| 719448/719448 [00:05<00:00, 133771.62it/s]


2024-02-07 19:05:19,466 - Number of vocabulary: 10417




2024-02-07 19:05:57,590 - Temporarily saved the corpus to lda_multicore_grid_search_20240207_185939/temp_corpus.mm
2024-02-07 19:08:56,038 - Computing evaluation metrics
2024-02-07 19:09:33,725 - Evaluation metric (c_npmi): 0.056258962156916115
2024-02-07 19:10:13,749 - Evaluation metric (c_v): 0.500430029723298
2024-02-07 19:10:23,859 - Evaluation metric (u_mass): -2.6216152198177314
2024-02-07 19:11:01,912 - Evaluation metric (c_uci): 0.4094221984580776
2024-02-07 19:11:01,912 - Evaluation metric (topic_diversity): 0.79
2024-02-07 19:11:01,922 - Evaluation metric (inverted_rbo): 0.9656794119308271
2024-02-07 19:11:01,922 - Evaluation metric (pairwise_jaccard_similarity): 0.023534390672243045
2024-02-07 19:11:01,935 - Model saved at: lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_topics_20
2024-02-07 19:11:11,189 - Saved the corpus to lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_topics_20/lda_multicore_lda_num_topics_20_corpus.mm
2024-02-07 19:11:11

100%|██████████| 719448/719448 [00:05<00:00, 135445.29it/s]


2024-02-07 19:11:17,053 - Number of vocabulary: 10417
2024-02-07 19:11:54,887 - Temporarily saved the corpus to lda_multicore_grid_search_20240207_185939/temp_corpus.mm
2024-02-07 19:15:15,350 - Computing evaluation metrics
2024-02-07 19:15:57,417 - Evaluation metric (c_npmi): 0.05128064892197273
2024-02-07 19:16:51,138 - Evaluation metric (c_v): 0.48732621616642885
2024-02-07 19:17:01,542 - Evaluation metric (u_mass): -2.9237253598375146
2024-02-07 19:17:43,819 - Evaluation metric (c_uci): 0.32269991068250964
2024-02-07 19:17:43,820 - Evaluation metric (topic_diversity): 0.8133333333333334
2024-02-07 19:17:43,841 - Evaluation metric (inverted_rbo): 0.9773852554438095
2024-02-07 19:17:43,841 - Evaluation metric (pairwise_jaccard_similarity): 0.01925297241300229
2024-02-07 19:17:43,857 - Model saved at: lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_topics_30
2024-02-07 19:17:53,151 - Saved the corpus to lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_to

100%|██████████| 719448/719448 [00:05<00:00, 132873.78it/s]


2024-02-07 19:17:59,121 - Number of vocabulary: 10417




2024-02-07 19:18:37,377 - Temporarily saved the corpus to lda_multicore_grid_search_20240207_185939/temp_corpus.mm
2024-02-07 19:58:58,816 - Computing evaluation metrics
2024-02-07 19:59:45,966 - Evaluation metric (c_npmi): 0.04531897277171383
2024-02-07 20:00:56,791 - Evaluation metric (c_v): 0.4742796439622238
2024-02-07 20:01:07,392 - Evaluation metric (u_mass): -3.2441696983048063
2024-02-07 20:01:54,708 - Evaluation metric (c_uci): 0.22506432447340652
2024-02-07 20:01:54,708 - Evaluation metric (topic_diversity): 0.84
2024-02-07 20:01:54,745 - Evaluation metric (inverted_rbo): 0.987022484105174
2024-02-07 20:01:54,746 - Evaluation metric (pairwise_jaccard_similarity): 0.010213520855935703
2024-02-07 20:01:54,762 - Model saved at: lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_topics_40
2024-02-07 20:02:04,128 - Saved the corpus to lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_topics_40/lda_multicore_lda_num_topics_40_corpus.mm
2024-02-07 20:02:04

100%|██████████| 719448/719448 [00:05<00:00, 133521.40it/s]


2024-02-07 20:02:10,068 - Number of vocabulary: 10417
2024-02-07 20:02:48,361 - Temporarily saved the corpus to lda_multicore_grid_search_20240207_185939/temp_corpus.mm
2024-02-07 22:05:42,149 - Computing evaluation metrics
2024-02-07 22:06:33,549 - Evaluation metric (c_npmi): 0.037045367136969064
2024-02-07 22:08:02,841 - Evaluation metric (c_v): 0.46049491491897093
2024-02-07 22:08:13,598 - Evaluation metric (u_mass): -3.2428863449322534
2024-02-07 22:09:04,987 - Evaluation metric (c_uci): 0.16318599494004754
2024-02-07 22:09:04,987 - Evaluation metric (topic_diversity): 0.802
2024-02-07 22:09:05,048 - Evaluation metric (inverted_rbo): 0.9813369745797901
2024-02-07 22:09:05,049 - Evaluation metric (pairwise_jaccard_similarity): 0.017331002576469218
2024-02-07 22:09:05,066 - Model saved at: lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_topics_50
2024-02-07 22:09:14,694 - Saved the corpus to lda_multicore_grid_search_20240207_185939/lda_multicore_lda_num_topics_50/lda

100%|██████████| 719448/719448 [00:05<00:00, 124276.78it/s]


2024-02-07 22:09:21,070 - Number of vocabulary: 10417




2024-02-07 22:10:00,323 - Temporarily saved the corpus to lda_multicore_grid_search_20240207_185939/temp_corpus.mm


KeyboardInterrupt: 

---

load the best model from grid search

but also load the id2word object and the corpus corresponding to the model (for separate evaluation and inference)

https://stackoverflow.com/questions/60840809/gensim-how-to-load-corpus-from-saved-lda-model

In [25]:
# load the best model from training folder

training_folder = Path(f'lda_multicore_grid_search_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

best_model_checkpoint_path = Path(training_result['best_model_checkpoint'])

best_id2word = gensim.corpora.Dictionary.load(str(best_model_checkpoint_path.joinpath('lda_multicore.id2word')))
# best_corpus = [best_id2word.doc2bow(text) for text in X_lemmatized]      # recreate the corpus given the id2word (gensim Dictionary) (this is for new data)
best_corpus = gensim.corpora.MmCorpus(str(best_model_checkpoint_path.joinpath(f'{best_model_checkpoint_path.stem}_corpus.mm')))
best_model = gensim.models.ldamulticore.LdaMulticore.load(str(best_model_checkpoint_path.joinpath('lda_multicore')))

print('Best model checkpoint path:', best_model_checkpoint_path)

lda_model = best_model
id2word = best_id2word
corpus = best_corpus


Best model checkpoint path: lda_multicore_grid_search_20240130_235851/lda_multicore_num_topics_20


visualize the data

In [26]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis



In [27]:
# recreate the result?
# yes
# given the same X_lemmatized, corpus, id2word (dictionary) and model, the result should be the same

coherencemodel = CoherenceModel(
    texts=X_lemmatized,
    corpus=corpus, dictionary=id2word, model=lda_model, coherence='c_npmi', processes=3
)
print('c_npmi coherence:', coherencemodel.get_coherence())

coherencemodel = CoherenceModel(
    texts=X_lemmatized,
    corpus=corpus, dictionary=id2word, model=lda_model, coherence='c_v', processes=3
)
print('c_v coherence:', coherencemodel.get_coherence())

coherencemodel = CoherenceModel(
    texts=X_lemmatized,
    corpus=corpus, dictionary=id2word, model=lda_model, coherence='u_mass', processes=3
)
print('u_mass coherence:', coherencemodel.get_coherence())

coherencemodel = CoherenceModel(
    texts=X_lemmatized,
    corpus=corpus, dictionary=id2word, model=lda_model, coherence='c_uci', processes=3
)
print('c_uci coherence:', coherencemodel.get_coherence())

c_npmi coherence: 0.0015355651331122177
c_v coherence: 0.5321776032659485
u_mass coherence: -3.3458121760126383
c_uci coherence: -0.5886526217343326


save model

we need to save the corpora.Dictionary and the LDA model

In [None]:
# save the LDA multicore model (and the corpora.Dictionary object) automatically

# lda_save_folder = Path(f'lda_model_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
# if not lda_save_folder.exists():
#     lda_save_folder.mkdir()

# lda_model.save(str(lda_save_folder.joinpath('lda_model')))     # no need to add file extension

Evaluation

gensim provide functions to calculate, so we don't need to install octis (as the evaluation backend of octis also relies on gensim)

octis seems awesome for simple development, but it installs many packages ;(

In [None]:
# corpus = lemmatized words (?) (list of list of str)

# create a result object from the LDAMulticore model for octis evaluation
# referencing from https://github.com/MIND-Lab/OCTIS/blob/master/octis/models/LDA.py
# and guideline in README: https://github.com/MIND-Lab/OCTIS/tree/master
result_lda_online = {}
result_lda_online['topic-word-matrix'] = lda_model.get_topics()

top_words = 10
topics_output = []
for topic in result_lda_online["topic-word-matrix"]:
    top_k = np.argsort(topic)[-top_words:]
    top_k_words = list(reversed([id2word[i] for i in top_k]))
    topics_output.append(top_k_words)
result_lda_online["topics"] = topics_output

def _get_topic_document_matrix(lda_model, corpus, num_topics=10):
    """
    Return the topic representation of the
    corpus
    """

    id_corpus = corpus

    doc_topic_tuples = []
    for document in id_corpus:
        doc_topic_tuples.append(
            lda_model.get_document_topics(document, minimum_probability=0))

    topic_document = np.zeros((num_topics, len(doc_topic_tuples)))

    for ndoc in range(len(doc_topic_tuples)):
        document = doc_topic_tuples[ndoc]
        for topic_tuple in document:
            topic_document[topic_tuple[0]][ndoc] = topic_tuple[1]
    return topic_document

result_lda_online['topic-document-matrix'] = _get_topic_document_matrix(lda_model, corpus, num_topics=N_TOPICS)

In [None]:
lda_model.show_topics(num_topics=N_TOPICS, num_words=10, formatted=True, log=False)

In [None]:
# setup: get the model's topics in their native ordering...
all_topics = lda_model.print_topics()
# ...then create a empty list per topic to collect the docs:
docs_per_topic = [[] for _ in all_topics]

# now, for every doc...
for doc_id, doc_bow in enumerate(corpus):
    # ...get its topics...
    doc_topics = lda_model.get_document_topics(doc_bow)
    # ...& for each of its topics...
    for topic_id, score in doc_topics:
        # ...add the doc_id & its score to the topic's doc list
        docs_per_topic[topic_id].append((doc_id, score))

In [None]:
# If you're interested in the top docs per topic, you can further sort each list's pairs by their score

for doc_list in docs_per_topic:
    doc_list.sort(key=lambda id_and_score: id_and_score[1], reverse=True)

In [None]:
print(docs_per_topic[0][:10])

In [None]:
# show top 10 documents for each topic, also the name of the game
for topic_id, docs in enumerate(docs_per_topic):
    print(f'Topic {topic_id + 1}:')
    for doc_id, score in docs[:10]:
        print(f'Game: {dataset.iloc[doc_id]["app_name"]}')
        print(f'Doc ID: {doc_id}')
        print(f'Score: {score}')
        print(f'Doc: {dataset.iloc[doc_id]["review_text"]}')
        print()
    print('\n\n\n\n\n')

In [None]:
dataset.iloc[1655473]

In [None]:
X[1655473]

In [None]:
result_lda_online['topic-document-matrix'][0]

In [None]:
lda_model.get_topics().shape

In [None]:
np.sum(result_lda_online['topic-document-matrix'], axis=0)

INFERENCE

inference test

In [30]:
# inference test

inference_test = ["well its been fun guys, but that's it, no more updates, that one was the last one, there is no longer going to be anymore content for this game anymore, there is no way to replay it as there won't be any updates, nope, that was it, the last update, nothing more, this game has no new ways to experience it as there is no more content updates, nothing new to freshen up the experience, its such a shame that this game has no replay-ability, once you beat the game there is like no point to playing again, as they said guys 1.2 will be they final update. nothing more after 1.2, there is no chance they will make another final update right? several years and final updates later: alright, thats it, no more updates we wont be getting anymore, thats it, nothing more, no more updates, for real this time... oh god, redigit made another tweet.",
                  "keeps forcing me to play it",
'''I will leave the cat here, so that everybody who passes by can pet it and give it a thumbs up and awards
　　　 　　／＞　　フ
　　　 　　| 　_　 _ l
　 　　 　／` ミ＿xノ
　　 　 /　　　 　 |
　　　 /　 ヽ　　 ﾉ
　 　 │　　|　|　|
　／￣|　　 |　|　|
　| (￣ヽ＿_ヽ_)__)
　＼二つ''']

inference_test = cleaning_strlist(inference_test)

inference_test = list(map(lambda x: lemmatization(x), inference_test))

corpus_test = [id2word.doc2bow(text) for text in inference_test]

test_output = lda_model[corpus_test]

test_output

<gensim.interfaces.TransformedCorpus at 0x2d01d19a0>

In [31]:
inference_test[-1]

['leave',
 'cat',
 'everybody',
 'pass',
 'pet',
 'give',
 'thumb',
 'award',
 'l',
 'x']

In [32]:
# test inference

corpus_test = [id2word.doc2bow(text) for text in inference_test]

output_test = lda_model[corpus_test]

for i in range(len(output_test)):
    # print(sorted(test_output[i], key=lambda x: x[1], reverse=True))
    print(sorted(output_test[i], key=lambda x: x[1], reverse=True))

[(19, 0.49894378), (10, 0.27795982), (5, 0.14360103), (15, 0.032877725), (8, 0.027817974)]
[(2, 0.68325394), (11, 0.016670862), (7, 0.016670847), (0, 0.016670845), (1, 0.016670845), (3, 0.016670845), (4, 0.016670845), (5, 0.016670845), (6, 0.016670845), (8, 0.016670845), (9, 0.016670845), (10, 0.016670845), (12, 0.016670845), (13, 0.016670845), (14, 0.016670845), (15, 0.016670845), (16, 0.016670845), (17, 0.016670845), (18, 0.016670845), (19, 0.016670845)]
[(2, 0.40191895), (12, 0.32662132), (8, 0.15000892)]


load model (both corpora Dictionary and the LDA model)

In [None]:
# del id2word
# del lda_model

# model_datetime = datetime(2024, 1, 15, 0, 21, 57)
# lda_save_folder = Path(f'lda_model_{model_datetime.strftime("%Y%m%d_%H%M%S")}')

# # id2word_load = gensim.corpora.Dictionary.load('lda_model.id2word')
# id2word_l = gensim.corpora.Dictionary.load(str(lda_save_folder.joinpath('lda_model.id2word')))

# lda_model_l = gensim.models.ldamulticore.LdaMulticore.load(str(lda_save_folder.joinpath('lda_model')))

In [None]:
# corpus_test2 = [id2word_l.doc2bow(text) for text in inference_test]

# output_test2 = lda_model_l[corpus_test2]

# for i in range(len(output_test2)):
#     print(sorted(output_test2[i], key=lambda x: x[1], reverse=True))