Demo ipynb for LDA

Testing the pipeline for a single game

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

import gensim

import spacy

import nltk

import pyLDAvis
import pickle
from datetime import datetime

In [2]:
# download nltk stopwords
# import nltk
# nltk.download('stopwords')

In [3]:
# Load the review dataset (a pickled pandas DataFrame) that the rest of the
# notebook works on. Only one `dataset_path` assignment should be active.
# NOTE: pd.read_pickle executes pickle code, so only point it at files you trust.

# Alternative input: reviews of a single game
# dataset_path = Path('../../dataset/topic_modelling/top_10_games/00_Terraria.pkl')

# Alternative input: small sampled demo set
# dataset_path = Path('../dataset_cleaned_heartless_sampled_for_demo.pkl')

dataset_path = Path('../../dataset/topic_modelling/top_11_genres/01_Indie.pkl')


dataset = pd.read_pickle(dataset_path)

# Print column names, non-null counts and dtypes as a quick sanity check.
dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 741913 entries, 25636 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         741913 non-null  int64 
 1   app_id        741913 non-null  int64 
 2   app_name      741913 non-null  object
 3   review_text   741913 non-null  object
 4   review_score  741913 non-null  int64 
 5   review_votes  741913 non-null  int64 
 6   genre_id      741913 non-null  object
 7   category_id   741913 non-null  object
dtypes: int64(4), object(4)
memory usage: 50.9+ MB


In [5]:
%load_ext autoreload

In [7]:
# data preprocessing

import re

import sys
sys.path.append('../../sa')

%autoreload 2
import str_cleaning_functions

def _clean_text(text):
    """Run a single review string through the full cleaning pipeline.

    The steps are applied in this fixed order: strip links (two passes with
    different helpers), generic clean-up, emoji removal, non-letter removal,
    lower-casing, whitespace normalisation, stop-word removal and a final
    whitespace normalisation (stop-word removal can leave gaps behind).

    Keeping the pipeline in one place guarantees that training data
    (``cleaning``) and inference data (``cleaning_strlist``) are always
    preprocessed identically.

    Args:
        text: the raw review string.

    Returns:
        The cleaned string.
    """
    text = str_cleaning_functions.remove_links(text)
    text = str_cleaning_functions.remove_links2(text)
    text = str_cleaning_functions.clean(text)
    text = str_cleaning_functions.deEmojify(text)
    text = str_cleaning_functions.remove_non_letters(text)
    text = text.lower()
    text = str_cleaning_functions.unify_whitespaces(text)
    text = str_cleaning_functions.remove_stopword(text)
    return str_cleaning_functions.unify_whitespaces(text)


def cleaning(df, review):
    """Clean the text column ``review`` of ``df`` in place.

    The column is traversed once (instead of once per cleaning step) and is
    only overwritten after every row has been cleaned successfully.

    Args:
        df: DataFrame holding the reviews; it is modified in place.
        review: name of the column that contains the review strings.

    Returns:
        None. The result is written back to ``df[review]``.
    """
    df[review] = df[review].apply(_clean_text)


def cleaning_strlist(str_list):
    """Clean every string of ``str_list`` with the same pipeline as ``cleaning``.

    Args:
        str_list: iterable of raw review strings.

    Returns:
        A new list with the cleaned strings, in the original order.
    """
    return [_clean_text(text) for text in str_list]

In [8]:
# apply data preprocessing

cleaning(dataset, 'review_text')

In [9]:
X = dataset['review_text'].values

In [10]:
X

array(['take one part faerie solitaire two parts puzzle quest mix little poker yahtzee good measure get something like runespell overture changeling sort fight monsters take quests exchange coin buffs come form power cards story strongest element game like puzzle quest games battles determined playing mini game instead match though game card game similar poker making certain combinations cards pairs kind full house flush straight certain amount damage opponent trying ability steal cards opponent plus limited number moves get per turn move cards play power ups adds enough strategy game keep interesting admittedly game get bit repetitive found dialogue options bit tedious fortunately game allows skip want easy game learn entertaining casual game play seems pretty short achievements seem difficult collect thing finding little gems like reason buy bundles',
       'make games like simple card playing mechanic fun addicting vaguely interesting storyline character make way like puzzle quest 

In [None]:
# nltk.download('averaged_perceptron_tagger')
# t = nltk.word_tokenize(X[0])
# tt = nltk.pos_tag(t)
# tt

[('game', 'NN'),
 ('billed', 'VBD'),
 ('scrabble', 'JJ'),
 ('meets', 'NNS'),
 ('sudoku', 'VBP'),
 ('unfortunately', 'RB'),
 ('really', 'RB'),
 ('multiple', 'JJ'),
 ('word', 'NN'),
 ('scramble', 'JJ'),
 ('word', 'NN'),
 ('certain', 'JJ'),
 ('set', 'NN'),
 ('possible', 'JJ'),
 ('letters', 'NNS'),
 ('solution', 'VBP'),
 ('player', 'NN'),
 ('must', 'MD'),
 ('solve', 'VB'),
 ('words', 'NNS'),
 ('cotemporaneously', 'RB'),
 ('approached', 'VBD'),
 ('one', 'CD'),
 ('might', 'MD'),
 ('approach', 'VB'),
 ('sudoku', 'NN'),
 ('really', 'RB'),
 ('possible', 'JJ'),
 ('play', 'NN'),
 ('like', 'IN'),
 ('electronic', 'JJ'),
 ('sudoku', 'NNS'),
 ('make', 'VBP'),
 ('notations', 'NNS'),
 ('possible', 'JJ'),
 ('answers', 'NNS'),
 ('board', 'NN'),
 ('becomes', 'VBZ'),
 ('nigh', 'RB'),
 ('unplayable', 'JJ'),
 ('difficulty', 'NN'),
 ('curve', 'NN'),
 ('extremely', 'RB'),
 ('uneven', 'JJ'),
 ('jumping', 'VBG'),
 ('dropping', 'VBG'),
 ('unexpectedly', 'RB'),
 ('within', 'IN'),
 ('difficulty', 'NN'),
 ('heading'

In [11]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    """Translate a Penn Treebank PoS tag into the matching WordNet PoS constant.

    Only the first letter of ``tag`` is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Returns:
        The WordNet constant, or ``None`` when the tag belongs to none of the
        four classes (the caller then falls back to the noun tag).
    """
    first_letter_to_wordnet = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which simply misses the table -> None
    return first_letter_to_wordnet.get(tag[:1])
    
def lemmatization(text):
    """Tokenize ``text`` and return the PoS-aware lemma of every token.

    Args:
        text: a (cleaned) review string.

    Returns:
        A list of lemma strings, one per token, in token order.
    """
    # nltk tags every token with a Penn Treebank part-of-speech tag
    tokens_with_tags = nltk.pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for token, treebank_tag in tokens_with_tags:
        # WordNet only knows adjective/verb/noun/adverb; any other tag is
        # lemmatized as a noun
        wordnet_pos = get_wordnet_pos(treebank_tag) or wordnet.NOUN
        lemmas.append(lemma.lemmatize(token, pos=wordnet_pos))

    return lemmas

In [12]:
# Lemmatize every cleaned review; each entry of X_lemmatized is the list of
# lemmas of one review, in the same order as X.
X_lemmatized = [lemmatization(review) for review in X]

In [11]:
X_lemmatized[0]

['game',
 'bill',
 'scrabble',
 'meet',
 'sudoku',
 'unfortunately',
 'really',
 'multiple',
 'word',
 'scramble',
 'word',
 'certain',
 'set',
 'possible',
 'letter',
 'solution',
 'player',
 'must',
 'solve',
 'word',
 'cotemporaneously',
 'approach',
 'one',
 'might',
 'approach',
 'sudoku',
 'really',
 'possible',
 'play',
 'like',
 'electronic',
 'sudoku',
 'make',
 'notation',
 'possible',
 'answer',
 'board',
 'become',
 'nigh',
 'unplayable',
 'difficulty',
 'curve',
 'extremely',
 'uneven',
 'jump',
 'drop',
 'unexpectedly',
 'within',
 'difficulty',
 'head',
 'lexica',
 'also',
 'bill',
 'coffee',
 'break',
 'type',
 'game',
 'due',
 'finite',
 'number',
 'puzzle',
 'available',
 'although',
 'large',
 'number',
 'puzzle',
 'can',
 'not',
 'easily',
 'reset',
 'severely',
 'limits',
 'coffee',
 'break',
 'play',
 'random',
 'puzzle',
 'generator',
 'would',
 'improve',
 'limitation',
 'timer',
 'unnecessary',
 'feature',
 'can',
 'not',
 'reset',
 'puzzle',
 'replay',
 'can',

In [12]:
# Save the lemmatized data as a separate pickle file, so the slow
# lemmatization step does not have to be repeated in later sessions.
# NOTE(review): the file name is hard-coded and does not depend on
# `dataset_path` (unlike the dictionary/corpus files further down, which are
# named from `dataset_path.stem`), so switching datasets overwrites / reuses
# the same cache file — confirm this is intended.

import pickle

X_lemmatized_file = Path('dataset_cleaned_heartless_sampled_for_demo_X_lemmatized.pkl')

with open(X_lemmatized_file, "wb") as f:
    pickle.dump(X_lemmatized, f)

In [9]:
# Reload the lemmatized reviews from the pickle cache instead of recomputing
# them.
# NOTE(review): the cache name is hard-coded, so this can silently load
# lemmas that were produced from a different dataset than `dataset_path`.
# Only unpickle files you created yourself (pickle can execute code).
import pickle

X_lemmatized_file = Path('dataset_cleaned_heartless_sampled_for_demo_X_lemmatized.pkl')

with open(X_lemmatized_file, "rb") as f:
    X_lemmatized = pickle.load(f)

# show the lemmas of the first review as a sanity check
X_lemmatized[0]

['game',
 'bill',
 'scrabble',
 'meet',
 'sudoku',
 'unfortunately',
 'really',
 'multiple',
 'word',
 'scramble',
 'word',
 'certain',
 'set',
 'possible',
 'letter',
 'solution',
 'player',
 'must',
 'solve',
 'word',
 'cotemporaneously',
 'approach',
 'one',
 'might',
 'approach',
 'sudoku',
 'really',
 'possible',
 'play',
 'like',
 'electronic',
 'sudoku',
 'make',
 'notation',
 'possible',
 'answer',
 'board',
 'become',
 'nigh',
 'unplayable',
 'difficulty',
 'curve',
 'extremely',
 'uneven',
 'jump',
 'drop',
 'unexpectedly',
 'within',
 'difficulty',
 'head',
 'lexica',
 'also',
 'bill',
 'coffee',
 'break',
 'type',
 'game',
 'due',
 'finite',
 'number',
 'puzzle',
 'available',
 'although',
 'large',
 'number',
 'puzzle',
 'can',
 'not',
 'easily',
 'reset',
 'severely',
 'limits',
 'coffee',
 'break',
 'play',
 'random',
 'puzzle',
 'generator',
 'would',
 'improve',
 'limitation',
 'timer',
 'unnecessary',
 'feature',
 'can',
 'not',
 'reset',
 'puzzle',
 'replay',
 'can',

In [18]:
# Use gensim to build the dictionary (token id <-> token) and the
# bag-of-words corpus that the LDA model is trained on.

id2word = gensim.corpora.Dictionary(X_lemmatized)

# one bag-of-words vector per review
corpus = [id2word.doc2bow(text) for text in X_lemmatized]

# Save the dictionary and corpus for later use (e.g. hyperparameter selection).
# The file names are derived from the dataset file name, written to the
# current working directory.

id2word_file = Path(dataset_path.stem + '_id2word.dict')
id2word.save(str(id2word_file))
corpus_file = Path(dataset_path.stem + '_corpus.mm')
gensim.corpora.MmCorpus.serialize(str(corpus_file), corpus)

# Load the id2word and corpus back from disk; from here on `corpus` is the
# file-backed MmCorpus rather than the in-memory list built above.
id2word = gensim.corpora.Dictionary.load(str(id2word_file))
corpus = gensim.corpora.MmCorpus(str(corpus_file))

In [19]:
list(id2word.items())

[(0, 'ability'),
 (1, 'achievement'),
 (2, 'add'),
 (3, 'admittedly'),
 (4, 'allow'),
 (5, 'amount'),
 (6, 'battle'),
 (7, 'bit'),
 (8, 'bite'),
 (9, 'buff'),
 (10, 'bundle'),
 (11, 'buy'),
 (12, 'card'),
 (13, 'casual'),
 (14, 'certain'),
 (15, 'changeling'),
 (16, 'coin'),
 (17, 'collect'),
 (18, 'combination'),
 (19, 'come'),
 (20, 'damage'),
 (21, 'determine'),
 (22, 'dialogue'),
 (23, 'difficult'),
 (24, 'easy'),
 (25, 'element'),
 (26, 'enough'),
 (27, 'entertain'),
 (28, 'exchange'),
 (29, 'faerie'),
 (30, 'fight'),
 (31, 'find'),
 (32, 'flush'),
 (33, 'form'),
 (34, 'fortunately'),
 (35, 'full'),
 (36, 'game'),
 (37, 'gem'),
 (38, 'get'),
 (39, 'good'),
 (40, 'house'),
 (41, 'instead'),
 (42, 'interest'),
 (43, 'keep'),
 (44, 'kind'),
 (45, 'learn'),
 (46, 'like'),
 (47, 'limited'),
 (48, 'little'),
 (49, 'make'),
 (50, 'match'),
 (51, 'measure'),
 (52, 'mini'),
 (53, 'mix'),
 (54, 'monster'),
 (55, 'move'),
 (56, 'number'),
 (57, 'one'),
 (58, 'opponent'),
 (59, 'option'),
 (6

In [None]:
# test different method to build the corpus for faster training



In [12]:
# Number of topics to fit; also reused by the evaluation cells below.
N_TOPICS = 20

# Online LDA: how to train LDA models efficiently
# https://papers.nips.cc/paper_files/paper/2010/hash/71f6278d140af599e06ad9bf1ba03cb0-Abstract.html

lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                             id2word=id2word,
                                             num_topics=N_TOPICS,         # a grid search can be used later to find the best number of topics
                                             random_state=42,             # fixed seed so that repeated runs are comparable
                                             chunksize=2048,                # chunk size affects memory consumption and update speed (similar to a DL batch size). https://groups.google.com/g/gensim/c/FE7_FYSconA
                                             passes=1,                     # number of passes over the whole corpus; with a larger chunksize, more passes should be used too
                                            #  alpha='auto',               # left disabled. NOTE(review): alpha auto-tuning appears to be unsupported by LdaMulticore (only by LdaModel) — confirm in the gensim docs
                                             workers=3)     # number of worker processes; sized to the physical cores, not the logical threads

visualize the data

In [13]:
import pyLDAvis.gensim_models

# Render the interactive pyLDAvis view of the trained model inline.
pyLDAvis.enable_notebook()
# mds="mmds" picks the projection used for the 2-D topic map and R=10 is
# presumably the number of terms shown per topic — confirm both against the
# pyLDAvis `prepare` documentation.
# NOTE(review): pyLDAvis may number topics differently from gensim's
# zero-based topic ids; verify before comparing with the cells below.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis



save model

we need to save the corpora.Dictionary and the LDA model

In [14]:
# Save the LDA multicore model into a fresh folder named after the current
# timestamp, so earlier runs are never overwritten. The load cell further
# down reads the dictionary back from 'lda_model.id2word' in the same folder.

lda_save_folder = Path(f'lda_model_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
# exist_ok=True replaces the separate exists() check: one call, no
# check-then-create race, and re-running within the same second is harmless
lda_save_folder.mkdir(exist_ok=True)

lda_model.save(str(lda_save_folder.joinpath('lda_model')))     # no need to add file extension

Evaluation

gensim provides functions to calculate the coherence scores, so we don't need to install octis (the evaluation backend of octis also relies on gensim)

octis seems awesome for simple development, but it installs many packages ;(

In [15]:
# NOTE(review): in octis terms "corpus" seems to mean the lemmatized words
# (list of list of str) — unconfirmed.

# Build an octis-style result dictionary from the LdaMulticore model,
# following https://github.com/MIND-Lab/OCTIS/blob/master/octis/models/LDA.py
# and the guideline in the README: https://github.com/MIND-Lab/OCTIS/tree/master
topic_word_matrix = lda_model.get_topics()

top_words = 10
# For every topic: ids of the `top_words` heaviest terms (argsort is
# ascending, so take the tail and flip it), mapped to their tokens.
topics_output = [
    [id2word[term_id] for term_id in np.argsort(term_weights)[-top_words:][::-1]]
    for term_weights in topic_word_matrix
]

result_lda_online = {
    'topic-word-matrix': topic_word_matrix,
    'topics': topics_output,
}

def _get_topic_document_matrix(lda_model, corpus, num_topics=10):
    """
    Return the topic representation of the
    corpus
    """

    id_corpus = corpus

    doc_topic_tuples = []
    for document in id_corpus:
        doc_topic_tuples.append(
            lda_model.get_document_topics(document, minimum_probability=0))

    topic_document = np.zeros((num_topics, len(doc_topic_tuples)))

    for ndoc in range(len(doc_topic_tuples)):
        document = doc_topic_tuples[ndoc]
        for topic_tuple in document:
            topic_document[topic_tuple[0]][ndoc] = topic_tuple[1]
    return topic_document

result_lda_online['topic-document-matrix'] = _get_topic_document_matrix(lda_model, corpus, num_topics=N_TOPICS)

In [16]:
lda_model.show_topics(num_topics=N_TOPICS, num_words=10, formatted=True, log=False)

[(0,
  '0.053*"story" + 0.049*"game" + 0.024*"good" + 0.023*"character" + 0.021*"like" + 0.017*"really" + 0.012*"well" + 0.011*"gameplay" + 0.011*"end" + 0.011*"feel"'),
 (1,
  '0.054*"game" + 0.041*"old" + 0.025*"original" + 0.025*"new" + 0.022*"like" + 0.021*"fan" + 0.021*"year" + 0.020*"play" + 0.019*"still" + 0.017*"classic"'),
 (2,
  '0.069*"buy" + 0.065*"worth" + 0.056*"money" + 0.049*"game" + 0.043*"price" + 0.036*"sale" + 0.035*"pay" + 0.030*"get" + 0.029*"dlc" + 0.021*"free"'),
 (3,
  '0.073*"game" + 0.020*"review" + 0.016*"bug" + 0.016*"update" + 0.014*"release" + 0.013*"still" + 0.012*"make" + 0.012*"new" + 0.010*"developer" + 0.010*"fix"'),
 (4,
  '0.049*"game" + 0.024*"work" + 0.020*"crash" + 0.018*"play" + 0.018*"click" + 0.016*"steam" + 0.014*"get" + 0.013*"screen" + 0.013*"can" + 0.012*"not"'),
 (5,
  '0.070*"game" + 0.027*"puzzle" + 0.024*"story" + 0.016*"great" + 0.016*"music" + 0.016*"gameplay" + 0.014*"art" + 0.013*"beautiful" + 0.011*"well" + 0.010*"style"'),
 (6,


In [20]:
# Collect, for every topic, the documents assigned to it.
#
# One empty bucket per topic, indexed by the model's zero-based topic id.
# The buckets are sized from `lda_model.num_topics`: `print_topics()` returns
# at most 20 topics by default, so sizing the list from its result raised an
# IndexError as soon as the model had more than 20 topics.
docs_per_topic = [[] for _ in range(lda_model.num_topics)]

# now, for every doc...
for doc_id, doc_bow in enumerate(corpus):
    # ...get its topics (called without minimum_probability, so the model's
    # own default threshold applies)...
    for topic_id, score in lda_model.get_document_topics(doc_bow):
        # ...and add the doc_id & its score to that topic's doc list
        docs_per_topic[topic_id].append((doc_id, score))

In [24]:
# If you're interested in the top docs per topic, you can further sort each list's pairs by their score

for doc_list in docs_per_topic:
    doc_list.sort(key=lambda id_and_score: id_and_score[1], reverse=True)

In [25]:
print(docs_per_topic[0][:10])

[(2306384, 0.99681187), (2327985, 0.9914269), (1525134, 0.9806092), (1948157, 0.97839916), (1655473, 0.9756402), (1688446, 0.9756334), (1309651, 0.97359663), (1805370, 0.9712106), (2188923, 0.9693497), (2223045, 0.9693483)]


In [45]:
# Show the first 10 documents of every topic bucket, together with the name
# of the game. The buckets are expected to be sorted by score already.
# NOTE(review): the heading is 1-based (topic id + 1), while the gensim topic
# ids printed elsewhere in this notebook are 0-based.
for topic_number, ranked_docs in enumerate(docs_per_topic, start=1):
    print(f'Topic {topic_number}:')
    for doc_id, score in ranked_docs[:10]:
        # doc_id is the position of the review in the corpus / dataset
        review_row = dataset.iloc[doc_id]
        print(f'Game: {review_row["app_name"]}')
        print(f'Doc ID: {doc_id}')
        print(f'Score: {score}')
        print(f'Doc: {review_row["review_text"]}')
        print()
    print('\n\n\n\n\n')

Topic 1:
Game: Pillars of Eternity
Doc ID: 2306384
Score: 0.9968118667602539
Doc: Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading reading reading reading Reading reading reading reading reading reading readin

In [41]:
dataset.iloc[1655473]

index                                                     6294630
app_id                                                       8190
app_name                                             Just Cause 2
review_text     It's one big action movie action movie action ...
review_score                                                    1
review_votes                                                    0
Name: 4780759, dtype: object

In [32]:
X[1655473]

"It's one big action movie action movie action movie action movie action movie action movie action movie action movie action movie action movie action movie action movie action movie action movie action movie action movie action movie action movie 9/10"

In [None]:
result_lda_online['topic-document-matrix'][0]

array([0.00047761, 0.01666767, 0.00135199, ..., 0.0027796 , 0.01250004,
       0.01000022])

In [17]:
lda_model.get_topics().shape

(20, 410296)

In [None]:
np.sum(result_lda_online['topic-document-matrix'], axis=0)

array([0.99999999, 1.00000001, 0.99999994, ..., 1.00000002, 1.00000001,
       1.00000008])

Evaluation

Instead of using octis, we use the CoherenceModel class provided by gensim,  
as octis also uses this module for calculating the coherence scores.

In [None]:
from gensim.models import CoherenceModel

# Compute the coherence score of the trained model with the 'c_v' measure,
# using the lemmatized reviews as reference texts.
coherence_model_lda = CoherenceModel(model=lda_model, texts=X_lemmatized, dictionary=id2word, coherence='c_v')
coherence_cv = coherence_model_lda.get_coherence()
# NOTE: the c_npmi cell prints the same label, so the two printed outputs
# can only be told apart by the cell they belong to.
print('\nCoherence Score: ', coherence_cv)


Coherence Score:  0.4464862782742937


In [None]:
# Compute the coherence score with the 'c_npmi' measure, on the same model,
# texts and dictionary as the c_v score. `coherence_model_lda` is rebound here.
coherence_model_lda = CoherenceModel(model=lda_model, texts=X_lemmatized, dictionary=id2word, coherence='c_npmi')
coherence_npmi = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_npmi)


Coherence Score:  0.03243114218978375


INFERENCE

inference test

In [None]:
# inference test

inference_test = ["well its been fun guys, but that's it, no more updates, that one was the last one, there is no longer going to be anymore content for this game anymore, there is no way to replay it as there won't be any updates, nope, that was it, the last update, nothing more, this game has no new ways to experience it as there is no more content updates, nothing new to freshen up the experience, its such a shame that this game has no replay-ability, once you beat the game there is like no point to playing again, as they said guys 1.2 will be they final update. nothing more after 1.2, there is no chance they will make another final update right? several years and final updates later: alright, thats it, no more updates we wont be getting anymore, thats it, nothing more, no more updates, for real this time... oh god, redigit made another tweet.",
                  "keeps forcing me to play it",
'''I will leave the cat here, so that everybody who passes by can pet it and give it a thumbs up and awards
　　　 　　／＞　　フ
　　　 　　| 　_　 _ l
　 　　 　／` ミ＿xノ
　　 　 /　　　 　 |
　　　 /　 ヽ　　 ﾉ
　 　 │　　|　|　|
　／￣|　　 |　|　|
　| (￣ヽ＿_ヽ_)__)
　＼二つ''']

# Push the raw test reviews through the same steps as the training data:
# clean -> lemmatize -> bag-of-words -> topic inference.
# After this line `inference_test` holds one list of lemmas per review.
inference_test = [lemmatization(cleaned) for cleaned in cleaning_strlist(inference_test)]

corpus_test = [id2word.doc2bow(tokens) for tokens in inference_test]

# indexing the model with a corpus returns a lazy wrapper; the topics are
# only computed when it is iterated or indexed
test_output = lda_model[corpus_test]

test_output

<gensim.interfaces.TransformedCorpus at 0x7f6a88318460>

In [None]:
inference_test[-1]

['leave',
 'cat',
 'everybody',
 'pass',
 'pet',
 'give',
 'thumb',
 'award',
 'l',
 'x']

In [None]:
# Inference test: convert the lemma lists to bag-of-words vectors and print,
# per test review, its (topic_id, probability) pairs with the most probable
# topic first.

corpus_test = [id2word.doc2bow(tokens) for tokens in inference_test]

output_test = lda_model[corpus_test]

for doc_topics in output_test:
    print(sorted(doc_topics, key=lambda pair: pair[1], reverse=True))

[(3, 0.60644966), (11, 0.15789744), (13, 0.123749346), (10, 0.08829053), (8, 0.013170155)]
[(10, 0.5038039), (6, 0.27117985), (0, 0.012500903), (1, 0.012500903), (2, 0.012500903), (3, 0.012500903), (4, 0.012500903), (5, 0.012500903), (7, 0.012500903), (8, 0.012500903), (9, 0.012500903), (11, 0.012500903), (12, 0.012500903), (13, 0.012500903), (14, 0.012500903), (15, 0.012500903), (16, 0.012500903), (17, 0.012500903), (18, 0.012500903), (19, 0.012500903)]
[(16, 0.50347227), (11, 0.21316677), (1, 0.11489906), (18, 0.095704)]


load model (both corpora Dictionary and the LDA model)

In [None]:
# Drop the in-memory dictionary and model, so the cells below can only work
# with what is loaded back from disk.
# NOTE: because of these `del` statements the cell fails with a NameError
# when it is run a second time without re-creating both objects.
del id2word
del lda_model

# Folder of the saved model to load.
# NOTE(review): this timestamp is hard-coded, while the save cell names its
# folder after datetime.now(); so this loads one specific earlier run, not
# necessarily the model trained above — update it to the run you want.
model_datetime = datetime(2024, 1, 15, 0, 21, 57)
lda_save_folder = Path(f'lda_model_{model_datetime.strftime("%Y%m%d_%H%M%S")}')

# The dictionary is expected in 'lda_model.id2word' next to the model file
# (presumably written by lda_model.save — confirm).
# id2word_load = gensim.corpora.Dictionary.load('lda_model.id2word')
id2word_l = gensim.corpora.Dictionary.load(str(lda_save_folder.joinpath('lda_model.id2word')))

lda_model_l = gensim.models.ldamulticore.LdaMulticore.load(str(lda_save_folder.joinpath('lda_model')))

In [None]:
# Repeat the inference test with the dictionary and model loaded from disk.
corpus_test2 = [id2word_l.doc2bow(tokens) for tokens in inference_test]

output_test2 = lda_model_l[corpus_test2]

# one line per test review: (topic_id, probability) pairs, most probable first
for doc_topics in output_test2:
    print(sorted(doc_topics, key=lambda pair: pair[1], reverse=True))

[(1, 0.32744843), (16, 0.2903514), (15, 0.123242974), (6, 0.09021147), (18, 0.09014924), (5, 0.030303197), (19, 0.023578195), (7, 0.016134478)]
[(1, 0.49804363), (10, 0.2769175), (0, 0.012502159), (2, 0.012502159), (3, 0.012502159), (4, 0.012502159), (5, 0.012502159), (6, 0.012502159), (7, 0.012502159), (8, 0.012502159), (9, 0.012502159), (11, 0.012502159), (12, 0.012502159), (13, 0.012502159), (14, 0.012502159), (15, 0.012502159), (16, 0.012502159), (17, 0.012502159), (18, 0.012502159), (19, 0.012502159)]
[(13, 0.27624068), (7, 0.20502365), (14, 0.13374868), (17, 0.105004296), (11, 0.104988545), (2, 0.10497881)]
