Demo ipynb for LDA

Creating training pipeline on different situations (15/01/2024)

- For all games (using 0.1 and 0.5 of the whole dataset for pipeline developments)
- For top 11 genres

- (if possible): per game, focus on large and small (indie) games

Do the same thing for all three model architectures

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

import gensim

import spacy

import nltk

import pyLDAvis
import pickle
from datetime import datetime

In [2]:
# download nltk stopwords
# import nltk
# nltk.download('stopwords')

# download spacy stopwords
# ...

In [3]:
# training conditions

from enum import Enum

class TRAINING_CONDS(Enum):
    ALL_GAMES = 1
    ALL_GAMES_LARGE = 2
    BY_GENRE = 3
    ALL_GAMES_TINY = 4

# training condition
training_cond = TRAINING_CONDS.ALL_GAMES_TINY

In [4]:
# load a dataset

GENRES = ['Action', 'Indie', 'Adventure', 'RPG', 'Strategy', 'Simulation', 'Free to Play', 'Causal', 'Massively Multiplayer', 'Racing', 'Sports']

training_genre_id = 2

if training_cond == TRAINING_CONDS.ALL_GAMES:
    dataset_path = Path('../dataset_cleaned_heartless_sampled_for_demo.pkl')
elif training_cond == TRAINING_CONDS.ALL_GAMES_LARGE:
    dataset_path = Path('../dataset_cleaned_heartless_sampled_for_demo_large.pkl')
elif training_cond == TRAINING_CONDS.ALL_GAMES_TINY:
    dataset_path = Path('../dataset_cleaned_heartless_sampled_for_demo_tiny.pkl')
elif training_cond == TRAINING_CONDS.BY_GENRE:
    dataset_path = Path(f'../../dataset/topic_modelling/top_11_genres/{training_genre_id:02}_{GENRES[training_genre_id]}.pkl')

# dataset_path = Path('../../dataset/topic_modelling/top_10_games/00_Terraria.pkl')

# dataset_path = Path('../dataset_cleaned_heartless_sampled_for_demo.pkl')


dataset = pd.read_pickle(dataset_path)

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 41801 entries, 2265566 to 1363240
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         41801 non-null  int64 
 1   app_id        41801 non-null  int64 
 2   app_name      41801 non-null  object
 3   review_text   41801 non-null  object
 4   review_score  41801 non-null  int64 
 5   review_votes  41801 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 2.2+ MB


In [5]:
%load_ext autoreload

In [6]:
# data preprocessing

import re

import sys
sys.path.append('../../sa')

%autoreload 2
import str_cleaning_functions

def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_non_letters(x))
    df[review] = df[review].apply(lambda x: x.lower())
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_stopword(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

def cleaning_strlist(str_list):
    str_list = list(map(lambda x: str_cleaning_functions.remove_links(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.remove_links2(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.clean(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.deEmojify(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.remove_non_letters(x), str_list))
    str_list = list(map(lambda x: x.lower(), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.remove_stopword(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
    return str_list

In [7]:
# apply data preprocessing

cleaning(dataset, 'review_text')

In [8]:
X = dataset['review_text'].values

In [9]:
X

array(['actually already wrote huge review short great game rage inducing play play minutes time hopelessly frustrating said funny fun',
       'far thrilling game ever played fan soccer soccer games one ever play awesome game play highly recommend game anyone age even infants jks',
       'might opinon find game boring uninteresting hard get controlls really weird graphics decent tho yet play assasins creed games besides one yeah',
       ...,
       'funny game alot geek culture built swearing side thoroughly enjoyed every part although think classic school adventure clickers easy time navigating game rusty get sale think enjoy',
       'ftl faster light perhaps one greatest games time ever',
       'okay saw game expected high quality remake like bionic commando rearmed duck tales remastered know great classic games uplifted polished graphics sound honestly whole point get dose retro nostalgia care might well download rom image play old version emulator free right long time assumed 

In [10]:
# nltk.download('averaged_perceptron_tagger')
# t = nltk.word_tokenize(X[0])
# tt = nltk.pos_tag(t)
# tt

In [11]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

In [12]:
# lemmatize the data

X_lemmatized = list(map(lambda x: lemmatization(x), X))

In [13]:
X_lemmatized[0]

['actually',
 'already',
 'write',
 'huge',
 'review',
 'short',
 'great',
 'game',
 'rage',
 'induce',
 'play',
 'play',
 'minute',
 'time',
 'hopelessly',
 'frustrate',
 'say',
 'funny',
 'fun']

In [14]:
# save the lematized data, as separate pickle file

X_lemmatized_file = dataset_path.parent.joinpath('cleaned_lemmatized', dataset_path.stem + '_cleaned_lemmatized.pkl')

if not X_lemmatized_file.parent.exists():
    X_lemmatized_file.parent.mkdir()

with open(X_lemmatized_file, "wb") as f:
    pickle.dump(X_lemmatized, f)

In [15]:
# load the lematized data, as separate pickle file
# for convenient hyperparameter selection

# X_lemmatized_file = dataset_path.parent.joinpath('cleaned_lemmatized', dataset_path.stem + '_cleaned_lemmatized.pkl')

# with open(X_lemmatized_file, "rb") as f:
#     X_lemmatized = pickle.load(f)

# X_lemmatized[0]

In [16]:
# use gensim to build a dictionary and train our LDAModel

id2word = gensim.corpora.Dictionary(X_lemmatized)

corpus = [id2word.doc2bow(text) for text in X_lemmatized]

In [17]:
# build a grid search for hyperparameter selection

from gensim.models import CoherenceModel

# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     """
#     Compute c_v coherence for various number of topics

#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
#     texts : List of input texts
#     limit : Max num of topics

#     Returns:
#     -------
#     model_list : List of LDA topic models
#     coherence_values : Coherence values corresponding to the LDA model with respective number of topics

#     From: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
#     """
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, limit, step):
#         print(f'num_topics: {num_topics}')
#         model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                                 id2word=id2word,
#                                                 num_topics=num_topics, 
#                                                 random_state=100,
#                                                 update_every=1,
#                                                 chunksize=100,
#                                                 passes=10,
#                                                 alpha='auto',
#                                                 per_word_topics=True)
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         print(f'coherence: {coherencemodel.get_coherence()}')
#         coherence_values.append(coherencemodel.get_coherence())

#     return model_list, coherence_values

In [24]:
def _init_LdaMulticore_params(corpus=None, num_topics=100, id2word=None, workers=None, chunksize=2000, 
        passes=1, batch=False, alpha='symmetric', eta=None, decay=0.5, offset=1.0, 
        eval_every=10, iterations=50, gamma_threshold=0.001, random_state=None, 
        minimum_probability=0.01, minimum_phi_value=0.01, per_word_topics=False, dtype=np.float32):
    
    hyperparameters = dict()
    hyperparameters['corpus'] = corpus
    hyperparameters["num_topics"] = num_topics
    hyperparameters['id2word'] = id2word
    hyperparameters["workers"] = workers
    hyperparameters["chunksize"] = chunksize
    hyperparameters["passes"] = passes
    hyperparameters["alpha"] = alpha
    hyperparameters["eta"] = eta
    hyperparameters["decay"] = decay
    hyperparameters["offset"] = offset
    hyperparameters["eval_every"] = eval_every
    hyperparameters["iterations"] = iterations
    hyperparameters["gamma_threshold"] = gamma_threshold
    hyperparameters['minimum_probability'] = minimum_probability
    hyperparameters["random_state"] = random_state
    hyperparameters['minimum_phi_value'] = minimum_phi_value
    hyperparameters['per_word_topics'] = per_word_topics
    hyperparameters['dtype'] = dtype

    if "alpha" in hyperparameters:
        if isinstance(hyperparameters["alpha"], float):
            hyperparameters["alpha"] = [
                hyperparameters["alpha"]
            ] * hyperparameters["num_topics"]

    return hyperparameters

def _init_config_dict(model_name:str, hyperparameters:dict, search_space_dict:dict):
    # init dict for config.json

    config = {}
    config['model'] = model_name
    config.update(hyperparameters)

    config.pop('corpus', '')
    config.pop('id2word', '')
    
    # remove hyperparameters that are in the search space
    for key in search_space_dict.keys():
        config.pop(key, '')

    config['dtype'] = str(config['dtype'])

    # store the search space
    config['search_space'] = search_space_dict

    config['gensim_version'] = str(gensim.__version__)

    return config

def _init_result_dict():
    # init dict for result.json

    result = {}
    result['best_metric'] = -float('inf')
    result['best_model_checkpoint'] = ""
    result['best_hyperparameters'] = dict()
    result["metric_type"] = ""
    result["log_history"] = list()

    return result


hyperparameters = _init_LdaMulticore_params(
    corpus=corpus, num_topics=100, id2word=id2word, 
    workers=3, chunksize=2000, random_state=42, passes=10)

# create search_space dict
search_space = dict()

search_space['num_topics'] = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
search_space['decay'] = [0.7, 0.8, 0.9]
search_space['offset'] = [16, 64, 128]

from itertools import product
import json
from gensim.models import CoherenceModel
from copy import deepcopy

def grid_search(X, hyperparameters:dict, search_space:dict, save_folder:Path, save_each_models=True):
    """
    Perform grid search for LDA model hyperparameter selection

    Parameters:
    ----------
    X : List of input texts
    hyperparameters : dict of hyperparameters
    search_space : dict of search space for hyperparameters
    save_each_models : save each model or not
    save_path : path to save the model

    Returns:
    -------
    best_model : best model
    best_model_path : path to the best model
    best_hyperparameters : best hyperparameters
    """

    if not save_folder.exists():
        save_folder.mkdir()

    print(f'Grid Search folder: {save_folder}')

    # TODO: load the josn files to contiunue the search

    # init some json files for saving the results
    config = _init_config_dict('lda_multicore', hyperparameters, search_space)
    # save config
    with open(save_folder.joinpath('config.json'), 'w') as f:
        json.dump(config, f, indent=2)

    print('Created config.json at:', save_folder.joinpath('config.json'))

    result = _init_result_dict()
    coherence_str = 'c_npmi'
    result['metric_type'] = coherence_str

    # init
    best_model = None
    best_model_path = None
    best_metric_score = -float('inf')

    # get the search space keys
    keys = list(search_space.keys())

    # iterate through the search space by using combination func
    for values in product(*search_space.values()):

        # create a dict of search space hyperparams
        search_space_dict = dict(zip(keys, values))

        print(f'Training with current search space: {search_space_dict}')

        # update existing hyperparams dict
        hyperparameters.update(search_space_dict)

        # train the model
        model = gensim.models.ldamulticore.LdaMulticore(**hyperparameters)

        ##########
        # Evaluation starts
        ##########

        print('Computing evaluation metric')

        # modify the evaluation part to include any evaluations you want


        # compute the coherence
        coherencemodel = CoherenceModel(model=model, texts=X, dictionary=id2word, coherence=coherence_str)
        metric_score = coherencemodel.get_coherence()

        print(f'Evaluation metric ({coherence_str}): {metric_score}')

        ##########
        # Evaluation ends
        ##########

        ##########
        # Save models
        ##########

        # create the folder by using search_space keys and current hyperparams values
        model_path = save_folder.joinpath(
            'lda_multicore_' + '_'.join([f'{key}_{value}' for key, value in search_space_dict.items()])
        )

        if not model_path.exists():
            model_path.mkdir(parents=True)

        # save the model
        if save_each_models:
            model.save(str(model_path.joinpath('lda_multicore')))

        ##########
        # Save models ends
        ##########
            
        model_hyperparameters = deepcopy(hyperparameters)
        model_hyperparameters.pop('corpus', '')
        model_hyperparameters.pop('id2word', '')
        model_hyperparameters['dtype'] = str(model_hyperparameters['dtype'])
            
        if metric_score > best_metric_score:
            best_metric_score = metric_score
            best_model = model
            best_model_path = model_path
            best_hyperparameters = model_hyperparameters
            
        ###########
        # Update result dict and json file
        ###########

        model_log_history = dict()
        model_log_history['metric'] = metric_score
        model_log_history['hyperparameters'] = model_hyperparameters

        result['best_metric'] = best_metric_score
        result['best_model_checkpoint'] = str(best_model_path)      # relative path
        result['best_hyperparameters'] = model_hyperparameters
        result["log_history"].append(model_log_history)

        print(result)

        # save result
        with open(save_folder.joinpath('result.json'), 'w') as f:
            json.dump(result, f, indent=2)

        print("Saved result.json at:", save_folder.joinpath('result.json'))
        print('\n\n')

    print('Grid Search ends')
    return best_model, best_model_path, best_hyperparameters


best_model, best_model_path, best_hyperparameters = grid_search(
    X_lemmatized, hyperparameters, search_space, 
    Path(f'lda_multicore_grid_search_{datetime.now().strftime("%Y%m%d_%H%M%S")}'))

Grid Search folder: lda_multicore_grid_search_20240118_001906
Created config.json at: lda_multicore_grid_search_20240118_001906/config.json
Training with current search space: {'num_topics': 10, 'decay': 0.7, 'offset': 16}


Computing evaluation metric
Evaluation metric (c_npmi): -0.13285839262271096
{'best_metric': -0.13285839262271096, 'best_model_checkpoint': 'lda_multicore_grid_search_20240118_001906/lda_multicore_num_topics_10_decay_0.7_offset_16', 'best_hyperparameters': {'num_topics': 10, 'workers': 3, 'chunksize': 2000, 'passes': 10, 'alpha': 'symmetric', 'eta': None, 'decay': 0.7, 'offset': 16, 'eval_every': 10, 'iterations': 50, 'gamma_threshold': 0.001, 'minimum_probability': 0.01, 'random_state': 42, 'minimum_phi_value': 0.01, 'per_word_topics': False, 'dtype': "<class 'numpy.float32'>"}, 'metric_type': 'c_npmi', 'log_history': [{'metric': -0.13285839262271096, 'hyperparameters': {'num_topics': 10, 'workers': 3, 'chunksize': 2000, 'passes': 10, 'alpha': 'symmetric', 'eta': None, 'decay': 0.7, 'offset': 16, 'eval_every': 10, 'iterations': 50, 'gamma_threshold': 0.001, 'minimum_probability': 0.01, 'random_state': 42, 'minimum_phi_value': 0.01, 'per_word_topics': False, 'dtype': "<class 'numpy.flo

KeyboardInterrupt: 

In [None]:
N_TOPICS = 20

# Online LDA, how to effective train LDA models
# https://papers.nips.cc/paper_files/paper/2010/hash/71f6278d140af599e06ad9bf1ba03cb0-Abstract.html

lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                             id2word=id2word,
                                             num_topics=N_TOPICS,         # later can use grid search to find the best number of topics
                                             random_state=42,
                                             chunksize=2048,                # chunk size affects memory consumption, and updating speed (like DL batch_size). https://groups.google.com/g/gensim/c/FE7_FYSconA
                                             passes=10,                     # no. of passes over the whole corpus. If larger chunksize, then the passes should be larger too.
                                            #  alpha='auto',
                                             workers=3)     # workers = no. of cores (physical cores, but not logical threads)

visualize the data

In [None]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis

save model

we need to save the corpora.Dictionary and the LDA model

In [None]:
# save the LDA multicore model (and the corpora.Dictionary object) automatically

lda_save_folder = Path(f'lda_model_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
if not lda_save_folder.exists():
    lda_save_folder.mkdir()

lda_model.save(str(lda_save_folder.joinpath('lda_model')))     # no need to add file extension

Evaluation

gensim provide functions to calculate, so we don't need to install octis (as the evaluation backend of octis also relies on gensim)

octis seems awesome for simple development, but it installs many packages ;(

In [None]:
# corpus = lemmatized words (?) (list of list of str)

# create a result object from the LDAMulticore model for octis evaluation
# referencing from https://github.com/MIND-Lab/OCTIS/blob/master/octis/models/LDA.py
# and guideline in README: https://github.com/MIND-Lab/OCTIS/tree/master
result_lda_online = {}
result_lda_online['topic-word-matrix'] = lda_model.get_topics()

top_words = 10
topics_output = []
for topic in result_lda_online["topic-word-matrix"]:
    top_k = np.argsort(topic)[-top_words:]
    top_k_words = list(reversed([id2word[i] for i in top_k]))
    topics_output.append(top_k_words)
result_lda_online["topics"] = topics_output

def _get_topic_document_matrix(lda_model, corpus, num_topics=10):
    """
    Return the topic representation of the
    corpus
    """

    id_corpus = corpus

    doc_topic_tuples = []
    for document in id_corpus:
        doc_topic_tuples.append(
            lda_model.get_document_topics(document, minimum_probability=0))

    topic_document = np.zeros((num_topics, len(doc_topic_tuples)))

    for ndoc in range(len(doc_topic_tuples)):
        document = doc_topic_tuples[ndoc]
        for topic_tuple in document:
            topic_document[topic_tuple[0]][ndoc] = topic_tuple[1]
    return topic_document

result_lda_online['topic-document-matrix'] = _get_topic_document_matrix(lda_model, corpus, num_topics=N_TOPICS)

In [None]:
lda_model.show_topics(num_topics=N_TOPICS, num_words=10, formatted=True, log=False)

In [None]:
# setup: get the model's topics in their native ordering...
all_topics = lda_model.print_topics()
# ...then create a empty list per topic to collect the docs:
docs_per_topic = [[] for _ in all_topics]

# now, for every doc...
for doc_id, doc_bow in enumerate(corpus):
    # ...get its topics...
    doc_topics = lda_model.get_document_topics(doc_bow)
    # ...& for each of its topics...
    for topic_id, score in doc_topics:
        # ...add the doc_id & its score to the topic's doc list
        docs_per_topic[topic_id].append((doc_id, score))

In [None]:
# If you're interested in the top docs per topic, you can further sort each list's pairs by their score

for doc_list in docs_per_topic:
    doc_list.sort(key=lambda id_and_score: id_and_score[1], reverse=True)

In [None]:
print(docs_per_topic[0][:10])

In [None]:
# show top 10 documents for each topic, also the name of the game
for topic_id, docs in enumerate(docs_per_topic):
    print(f'Topic {topic_id + 1}:')
    for doc_id, score in docs[:10]:
        print(f'Game: {dataset.iloc[doc_id]["app_name"]}')
        print(f'Doc ID: {doc_id}')
        print(f'Score: {score}')
        print(f'Doc: {dataset.iloc[doc_id]["review_text"]}')
        print()
    print('\n\n\n\n\n')

In [None]:
dataset.iloc[1655473]

In [None]:
X[1655473]

In [None]:
result_lda_online['topic-document-matrix'][0]

In [None]:
lda_model.get_topics().shape

In [None]:
np.sum(result_lda_online['topic-document-matrix'], axis=0)

Evaluation

instead of using octis, we use gensim provided CoherenceModel object,  
as octis also uses this module for calculating the coherence scores

In [None]:
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=X_lemmatized, dictionary=id2word, coherence='c_v')
coherence_cv = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_cv)

In [None]:
# Compute Coherence Score using c_npmi
coherence_model_lda = CoherenceModel(model=lda_model, texts=X_lemmatized, dictionary=id2word, coherence='c_npmi')
coherence_npmi = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_npmi)

INFERENCE

inference test

In [None]:
# inference test

inference_test = ["well its been fun guys, but that's it, no more updates, that one was the last one, there is no longer going to be anymore content for this game anymore, there is no way to replay it as there won't be any updates, nope, that was it, the last update, nothing more, this game has no new ways to experience it as there is no more content updates, nothing new to freshen up the experience, its such a shame that this game has no replay-ability, once you beat the game there is like no point to playing again, as they said guys 1.2 will be they final update. nothing more after 1.2, there is no chance they will make another final update right? several years and final updates later: alright, thats it, no more updates we wont be getting anymore, thats it, nothing more, no more updates, for real this time... oh god, redigit made another tweet.",
                  "keeps forcing me to play it",
'''I will leave the cat here, so that everybody who passes by can pet it and give it a thumbs up and awards
　　　 　　／＞　　フ
　　　 　　| 　_　 _ l
　 　　 　／` ミ＿xノ
　　 　 /　　　 　 |
　　　 /　 ヽ　　 ﾉ
　 　 │　　|　|　|
　／￣|　　 |　|　|
　| (￣ヽ＿_ヽ_)__)
　＼二つ''']

inference_test = cleaning_strlist(inference_test)

inference_test = list(map(lambda x: lemmatization(x), inference_test))

corpus_test = [id2word.doc2bow(text) for text in inference_test]

test_output = lda_model[corpus_test]

test_output

In [None]:
inference_test[-1]

In [None]:
# test inference

corpus_test = [id2word.doc2bow(text) for text in inference_test]

output_test = lda_model[corpus_test]

for i in range(len(output_test)):
    # print(sorted(test_output[i], key=lambda x: x[1], reverse=True))
    print(sorted(output_test[i], key=lambda x: x[1], reverse=True))

load model (both corpora Dictionary and the LDA model)

In [None]:
del id2word
del lda_model

model_datetime = datetime(2024, 1, 15, 0, 21, 57)
lda_save_folder = Path(f'lda_model_{model_datetime.strftime("%Y%m%d_%H%M%S")}')

# id2word_load = gensim.corpora.Dictionary.load('lda_model.id2word')
id2word_l = gensim.corpora.Dictionary.load(str(lda_save_folder.joinpath('lda_model.id2word')))

lda_model_l = gensim.models.ldamulticore.LdaMulticore.load(str(lda_save_folder.joinpath('lda_model')))

In [None]:
corpus_test2 = [id2word_l.doc2bow(text) for text in inference_test]

output_test2 = lda_model_l[corpus_test2]

for i in range(len(output_test2)):
    print(sorted(output_test2[i], key=lambda x: x[1], reverse=True))