Evaluating the best LDA model from a hyperparemter search

We need
- The text (lemmatized), or any text
- The LDA model
- The corpus
- The id2word (can be generated on the fly)

In [1]:
import pandas as pd
import numpy as np
import pickle

import gensim

from pathlib import Path
from datetime import datetime
import json
import sys

In [2]:
%load_ext autoreload

In [3]:
sys.path.append('../')

%autoreload 2
from dataset_loader import GENRES, load_dataset
from eval_metrics import SEARCH_BEHAVIOUR

In [5]:
# constants

genre = GENRES.ACTION
# genre = -1
unique_list = ['review_text']

---

Load the dataset from raw, and keep track with the avaiilable index during processing

Processing is copied from the training script

In [6]:
if type(genre) == GENRES:
    dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]')
    dataset, dataset_path = load_dataset(genre, dataset_folder)
else:
    dataset_folder = Path(f'../../dataset/topic_modelling/00_dataset_filtered_all_4045065.pkl').resolve()
    dataset, dataset_path = pd.read_pickle(dataset_folder), dataset_folder

# new: create an untouched ver of the dataset for retrieving original text
dataset_untouched = dataset.copy()

dataset.info(verbose=True)

Load dataset from: /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[review_text]/00_action.pkl



<class 'pandas.core.frame.DataFrame'>
Index: 1273475 entries, 0 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   index         1273475 non-null  int64 
 1   app_id        1273475 non-null  int64 
 2   app_name      1273475 non-null  object
 3   review_text   1273475 non-null  object
 4   review_score  1273475 non-null  int64 
 5   review_votes  1273475 non-null  int64 
 6   genre_id      1273475 non-null  object
 7   category_id   1273475 non-null  object
dtypes: int64(4), object(4)
memory usage: 87.4+ MB


In [6]:
%autoreload 2
sys.path.append('../../sa')
import str_cleaning_functions

def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_non_letters(x))
    df[review] = df[review].apply(lambda x: x.lower())
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_stopword(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

In [7]:
cleaning(dataset, 'review_text')

In [8]:
# we do not remv reviews with too many punctuations. This is only for training more consistent topic model
# but not inferencing

In [9]:
# then we lemmatize the text

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from datasets import Dataset

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

def lemmatization_dataset(data):
    return {'review_text2': lemmatization(data['review_text'])}

# X_lemmatized = list(map(lambda x: lemmatization(x), X))
temp_dataset = Dataset.from_dict({'review_text': dataset['review_text'].values})
temp_dataset = temp_dataset.map(lemmatization_dataset, num_proc=4)
dataset['review_text_lemmatized'] = temp_dataset['review_text2']      # assign a new column to the dataset

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=4): 100%|██████████| 4045065/4045065 [17:13<00:00, 3915.14 examples/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [10]:
# filter out the empty reviews

dataset = dataset[dataset['review_text_lemmatized'].apply(lambda x: len(x) > 0)]

In [11]:
dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 4036083 entries, 0 to 4180147
Data columns (total 7 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   index                   int64 
 1   app_id                  int64 
 2   app_name                object
 3   review_text             object
 4   review_score            int64 
 5   review_votes            int64 
 6   review_text_lemmatized  object
dtypes: int64(4), object(3)
memory usage: 246.3+ MB


In [14]:
# save this dataset for evaluation usage
dataset_eval_path = Path(f'category_{str(genre) if type(genre) == GENRES else "all"}_unique_review_text').joinpath(
    Path(f'lemmatized_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl') if type(genre) == GENRES else Path(f'lemmatized_data/category_all_dataset_eval.pkl')
)

if not dataset_eval_path.parent.exists():
    dataset_eval_path.parent.mkdir(parents=True)

dataset.to_pickle(dataset_eval_path)

---

Or we load a lemmatized data (preprocessed data) for simple quick analysis

either this is from training dataset, or a less-processed evaluation dataset

In [20]:
# Load the text

# X_lemmatized_file = Path('category_indie_unique_review_text').joinpath(
#     Path(f'lemmatized_data/{genre.value:02}_{str(genre)}_dataset.pkl')
# )

X_lemmatized_file = Path(f'category_{str(genre) if type(genre) == GENRES else "all"}_unique_review_text').joinpath(
    Path(f'lemmatized_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl') if type(genre) == GENRES \
    else Path(f'lemmatized_data/category_all_dataset_eval.pkl')
)

if X_lemmatized_file.exists():
    with open(X_lemmatized_file, 'rb') as f:
        X_lemmatized_ds = pickle.load(f)
        X_lemmatized = X_lemmatized_ds['review_text_lemmatized'].values
    print(f'Loaded X_lemmatized')
    print("X_lemmatized len:", len(X_lemmatized))
else:
    raise 'X_lemmatized_file does not exist'

Loaded X_lemmatized
X_lemmatized len: 4036083


In [21]:
X_lemmatized[0]

['ruin', 'life']

---

Load the best model from search

In [22]:
# load the best model from training folder

reuse_corpus = False
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
# training_datetime = datetime(2024, 2, 17, 1, 18, 55)
training_datetime = datetime(2024, 2, 20, 1, 4, 8)

if type(genre) == GENRES and genre.value >= 0:
    training_folder_p = Path(f'category_{str(genre)}_unique_review_text')
    training_folder = Path(f'lda_multicore_genre_{str(genre)}_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
elif genre < 0:
    training_folder_p = Path(f'category_all_unique_review_text')
    training_folder = Path(f'lda_multicore_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
training_folder = training_folder_p.joinpath(training_folder)
training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

best_model_checkpoint_path = Path(training_result['best_model_checkpoint'])

# change the path to what we want
best_model_checkpoint_path = best_model_checkpoint_path.parent.joinpath(
    "lda_multicore_lda_num_topics_30"
)
best_model_checkpoint_path

best_id2word = gensim.corpora.Dictionary.load(str(best_model_checkpoint_path.joinpath('lda_multicore.id2word')))
# best_corpus = [best_id2word.doc2bow(text) for text in X_lemmatized]      # recreate the corpus given the id2word (gensim Dictionary) (this is for new data)
if reuse_corpus:
    best_corpus = gensim.corpora.MmCorpus(str(best_model_checkpoint_path.joinpath(f'{best_model_checkpoint_path.stem}_corpus.mm')))
else:
    best_corpus = [best_id2word.doc2bow(text) for text in X_lemmatized]      # recreate the corpus given the id2word (gensim Dictionary) (this is for new data)
    print('create new corpus from new X_lemmatized and existing id2word')
best_model = gensim.models.ldamulticore.LdaMulticore.load(str(best_model_checkpoint_path.joinpath('lda_multicore')))

print('Best model checkpoint path:', best_model_checkpoint_path)

lda_model = best_model
id2word = best_id2word
corpus = best_corpus

create new corpus from new X_lemmatized and existing id2word
Best model checkpoint path: category_all_unique_review_text/lda_multicore_grid_search_20240220_010408/lda_multicore_lda_num_topics_30


In [23]:
best_model_checkpoint_path = best_model_checkpoint_path.parent.joinpath(
    "lda_multicore_lda_num_topics_30"
)

best_model_checkpoint_path

PosixPath('category_all_unique_review_text/lda_multicore_grid_search_20240220_010408/lda_multicore_lda_num_topics_30')

In [None]:
# the block can be copied to load diff models for evaluation

---

Visualize the data

In [24]:
eval_folder_path = Path('../eval_results')
eval_folder_path = eval_folder_path.joinpath(
    best_model_checkpoint_path
)

if not eval_folder_path.exists():
    eval_folder_path.mkdir(parents=True)

In [26]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis



In [27]:
save_html = True
if save_html:
    if type(genre) == GENRES:
        pyLDAvis.save_html(vis, str(eval_folder_path.joinpath(f'pyldavis_{genre.value:02}_{str(genre)}_eval.html')))
    else:
        pyLDAvis.save_html(vis, str(eval_folder_path.joinpath(f'pyldavis_category_all_eval.html')))

Get top 10 keywords for each topics

In [12]:
top_N_words = 10
top_N_keywords = {}

for i, topic in lda_model.show_topics(num_topics=lda_model.num_topics, num_words=top_N_words, formatted=False):
    top_N_keywords[i] = [word for word, _ in topic]

    print(f'Topic {i}:')
    print(', '.join([word for word, _ in topic]))
    print()

Topic 0:
worth, money, sale, buy, pay, hour, definitely, garry, cheap, dollar

Topic 1:
unit, war, ai, drive, car, city, realistic, line, clue, terrify

Topic 2:
like, really, feel, good, play, look, stuff, kind, thing, think

Topic 3:
free, scary, version, pc, download, play, demo, wow, mobile, xbox

Topic 4:
level, play, fast, recommend, die, bore, video, boring, start, action

Topic 5:
character, item, fight, rpg, dungeon, combat, bos, different, weapon, battle

Topic 6:
world, dinosaur, build, hate, open, creative, building, mario, clone, tf

Topic 7:
awesome, cool, addictive, cute, freddy, pretty, crazy, thumb, realy, simple

Topic 8:
good, music, nice, graphic, gameplay, pretty, short, really, style, great

Topic 9:
buy, hour, play, life, spend, real, half, suck, want, played

Topic 10:
best, play, year, simulator, far, indie, date, trine, ex, hand

Topic 11:
weapon, shoot, gun, fps, kill, fly, wall, throw, shot, attack

Topic 12:
day, night, man, water, life, come, leave, world,

---

Get the most representative docs

Ref: https://stackoverflow.com/questions/63777101/topic-wise-document-distribution-in-gensim-lda

In [13]:
topics_json = {}

all_topics = lda_model.print_topics(num_topics=-1)

for topic_id, topic in all_topics:
    topics_json[i] = topic
    
all_topics

[(0,
  '0.168*"worth" + 0.094*"money" + 0.076*"sale" + 0.069*"buy" + 0.039*"pay" + 0.028*"hour" + 0.021*"definitely" + 0.021*"garry" + 0.020*"cheap" + 0.019*"dollar"'),
 (1,
  '0.040*"unit" + 0.039*"war" + 0.037*"ai" + 0.024*"drive" + 0.018*"car" + 0.018*"city" + 0.016*"realistic" + 0.015*"line" + 0.012*"clue" + 0.011*"terrify"'),
 (2,
  '0.389*"like" + 0.133*"really" + 0.083*"feel" + 0.056*"good" + 0.030*"play" + 0.025*"look" + 0.022*"stuff" + 0.020*"kind" + 0.020*"thing" + 0.015*"think"'),
 (3,
  '0.200*"free" + 0.079*"scary" + 0.078*"version" + 0.060*"pc" + 0.050*"download" + 0.049*"play" + 0.029*"demo" + 0.026*"wow" + 0.019*"mobile" + 0.019*"xbox"'),
 (4,
  '0.094*"level" + 0.070*"play" + 0.057*"fast" + 0.052*"recommend" + 0.048*"die" + 0.048*"bore" + 0.041*"video" + 0.040*"boring" + 0.038*"start" + 0.028*"action"'),
 (5,
  '0.044*"character" + 0.043*"item" + 0.043*"fight" + 0.034*"rpg" + 0.032*"dungeon" + 0.030*"combat" + 0.028*"bos" + 0.025*"different" + 0.025*"weapon" + 0.018*"b

In [14]:
len(all_topics)

30

In [38]:
# setup: get the model's topics in their native ordering...
all_topics = lda_model.print_topics(num_topics=-1)
# ...then create a empty list per topic to collect the docs:
docs_per_topic = {topic_id: [] for (topic_id, _) in all_topics}

docs_top1_per_topic = {topic_id: [] for (topic_id, _) in all_topics}

# now, for every doc...
for doc_id, doc_bow in enumerate(corpus):
    # ...get its topics...
    doc_topics = lda_model.get_document_topics(doc_bow)
    # ...& for each of its topics...
        
    topic_id_max = -1; max_score = float('-inf')

    for topic_id, score in doc_topics:
        # ...add the doc_id & its score to the topic's doc list
        docs_per_topic[topic_id].append((doc_id, score))

        if score > max_score:
            max_score = score
            topic_id_max = topic_id
    
    docs_top1_per_topic[topic_id_max].append((doc_id, max_score))

In [39]:
df_eval_topic_freq = pd.DataFrame(
    {
        'topic_id': [topic_id for topic_id in docs_top1_per_topic.keys()],
        'topic_freq': [len(docs) for docs in docs_top1_per_topic.values()]
    }
)

df_eval_topic_freq

Unnamed: 0,topic_id,topic_freq
0,0,37190
1,1,5388
2,2,31336
3,3,8172
4,4,16374
5,5,22553
6,6,8632
7,7,11061
8,8,65845
9,9,28111


In [31]:
docs_per_topic

{0: [(568517, 0.9979649),
  (566430, 0.9967776),
  (569749, 0.9957788),
  (562321, 0.99560744),
  (565426, 0.99018306),
  (722933, 0.94576615),
  (279631, 0.9309488),
  (571208, 0.91212106),
  (434039, 0.91211706),
  (15412, 0.9033321),
  (62292, 0.90333027),
  (610628, 0.8925881),
  (483180, 0.89258736),
  (434976, 0.892565),
  (569539, 0.87916625),
  (28700, 0.8791659),
  (529496, 0.87916553),
  (425715, 0.8791634),
  (570242, 0.86190456),
  (624370, 0.86190367),
  (327688, 0.86190355),
  (569573, 0.8619031),
  (268055, 0.8619028),
  (339296, 0.8619028),
  (18877, 0.86190224),
  (51645, 0.86190224),
  (245290, 0.8619021),
  (88417, 0.861902),
  (652165, 0.861902),
  (168362, 0.8619015),
  (379870, 0.86190116),
  (165585, 0.8619005),
  (278726, 0.8618999),
  (324632, 0.86187917),
  (146116, 0.8618619),
  (74012, 0.85849214),
  (276549, 0.8488367),
  (104398, 0.84871715),
  (69516, 0.8388887),
  (562548, 0.8388887),
  (570489, 0.8388887),
  (675382, 0.83888865),
  (211925, 0.8388886),


In [16]:
print(len(docs_per_topic[1]))

205682


In [17]:
docs_per_topic[0][:10]

[(0, 0.022032209),
 (2, 0.124895185),
 (3, 0.045312826),
 (5, 0.059161633),
 (8, 0.1660312),
 (10, 0.079034284),
 (13, 0.07952131),
 (19, 0.11690636),
 (22, 0.034432586),
 (23, 0.03597497)]

In [18]:
for doc_list in docs_per_topic.values():
    doc_list.sort(key=lambda id_and_score: id_and_score[1], reverse=True)

In [19]:
top_N_docs = 10

for i in range(len(docs_per_topic)):
    print(docs_per_topic[i][:top_N_docs])

[(568517, 0.9979649), (566430, 0.9967776), (569749, 0.9957788), (562321, 0.99560744), (565426, 0.99018306), (722933, 0.94576615), (279631, 0.9309488), (571208, 0.91212106), (434039, 0.91211706), (15412, 0.9033321)]
[(581902, 0.98415273), (432598, 0.8618838), (249798, 0.8066643), (392269, 0.80666405), (386191, 0.80666053), (247291, 0.8066562), (295679, 0.80665374), (562330, 0.8066436), (494871, 0.8066292), (566225, 0.8066104)]
[(380786, 0.99817055), (155154, 0.99619126), (21981, 0.98209876), (75581, 0.9794326), (407595, 0.9785155), (363538, 0.9775209), (286020, 0.9462962), (360688, 0.89259166), (532985, 0.8857377), (684109, 0.87916636)]
[(481078, 0.9964976), (509478, 0.9836155), (362729, 0.975526), (678119, 0.96739656), (675892, 0.85601604), (40533, 0.8388882), (190903, 0.8388881), (680038, 0.8388877), (309414, 0.83888584), (250616, 0.8388852)]
[(438242, 0.95992213), (311642, 0.9208814), (584, 0.8925923), (360473, 0.8925823), (461965, 0.8791664), (178134, 0.8791648), (408346, 0.86190414

In [27]:
dataset.index

Index([  25636,   25637,   25638,   25639,   25640,   25641,   25642,   25643,
         25644,   25645,
       ...
       4179598, 4179599, 4179600, 4179601, 4179602, 4179603, 4179604, 4179605,
       4179607, 4179608],
      dtype='int64', length=723659)

In [28]:
# use the ID to retrieve the top docs, and copy them to a file for inspection

# retrieve the original text
df_original_texts = []
for topic_id in docs_per_topic.keys():
    t = dataset.iloc[[doc_id for doc_id, _ in docs_per_topic[topic_id][:top_N_docs]]]
    # t = dataset_untouched.iloc[[doc_id for doc_id, _ in docs_per_topic[topic_id][:top_N_docs]]]
    t = dataset_untouched.loc[t.index]
    t['lemmatized_text'] = dataset.loc[t.index, 'review_text_lemmatized']
    t['topic_id'] = topic_id        # store the topic id

    df_original_texts.append(t)

df_original_texts = pd.concat(df_original_texts)
df_original_texts

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,lemmatized_text,topic_id
2905749,4528329,346900,AdVenture Capitalist,"Love it, You get to make lots of money money m...",1,1,"[4, 37, 23]","[2, 22, 29, 23]","[love, get, make, lot, money, money, money, mo...",0
2903569,4525993,346900,AdVenture Capitalist,Money Money Money Money Money Money Money Mone...,1,1,"[4, 37, 23]","[2, 22, 29, 23]","[money, money, money, money, money, money, mon...",0
2907026,4529670,346900,AdVenture Capitalist,"Money, Money, Money, Money, Money, Money, Mone...",1,0,"[4, 37, 23]","[2, 22, 29, 23]","[money, money, money, money, money, money, mon...",0
2899259,4521212,346900,AdVenture Capitalist,money money money money money money money mone...,1,1,"[4, 37, 23]","[2, 22, 29, 23]","[money, money, money, money, money, money, mon...",0
2902506,4524766,346900,AdVenture Capitalist,"*Starts game, and gets more then a Million dol...",1,1,"[4, 37, 23]","[2, 22, 29, 23]","[start, game, get, million, dollar, le, hour, ...",0
...,...,...,...,...,...,...,...,...,...,...
1002607,1463524,22230,Rock of Ages,"You get to roll a Giant rock...A GIANT ROCK, i...",1,0,"[1, 23, 9, 2]","[2, 1, 49, 36, 37, 9, 24, 22, 29, 18, 23, 25]","[get, roll, giant, giant, rock, opponent, gate...",29
2943309,4581428,351490,Intergalactic Bubbles,A decent enough casual game where the aim is t...,1,0,"[4, 23]","[2, 22, 29]","[decent, enough, casual, game, aim, fire, ball...",29
2692065,4195435,323060,Tharsis,"Too much random chance, both in die rolls and ...",0,0,"[23, 3, 2]","[2, 22, 29, 23, 25]","[much, random, chance, die, roll, event, show,...",29
207683,250221,113200,The Binding of Isaac,"Very entertaining, sometimes relies on luck to...",1,0,"[1, 25, 23, 3]","[2, 22, 29]","[entertain, sometimes, rely, luck, get, correc...",29


In [29]:
# print out the original texts as a log

for topic_id in docs_per_topic.keys():
    print(f'Topic {topic_id}:')
    print()
    t = dataset.iloc[[doc_id for doc_id, _ in docs_per_topic[topic_id][:top_N_docs]]]
    t = dataset_untouched.loc[t.index]
    for index, row in t.iterrows():
        print(f'Doc {doc_id}:')
        print(row['review_text'])
        print()
    print()

Topic 0:

Doc 723658:
Love it, You get to make lots of money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money moneymoney money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money mone

In [30]:
top_N_keywords

{0: ['worth',
  'money',
  'sale',
  'buy',
  'pay',
  'hour',
  'definitely',
  'garry',
  'cheap',
  'dollar'],
 1: ['unit',
  'war',
  'ai',
  'drive',
  'car',
  'city',
  'realistic',
  'line',
  'clue',
  'terrify'],
 2: ['like',
  'really',
  'feel',
  'good',
  'play',
  'look',
  'stuff',
  'kind',
  'thing',
  'think'],
 3: ['free',
  'scary',
  'version',
  'pc',
  'download',
  'play',
  'demo',
  'wow',
  'mobile',
  'xbox'],
 4: ['level',
  'play',
  'fast',
  'recommend',
  'die',
  'bore',
  'video',
  'boring',
  'start',
  'action'],
 5: ['character',
  'item',
  'fight',
  'rpg',
  'dungeon',
  'combat',
  'bos',
  'different',
  'weapon',
  'battle'],
 6: ['world',
  'dinosaur',
  'build',
  'hate',
  'open',
  'creative',
  'building',
  'mario',
  'clone',
  'tf'],
 7: ['awesome',
  'cool',
  'addictive',
  'cute',
  'freddy',
  'pretty',
  'crazy',
  'thumb',
  'realy',
  'simple'],
 8: ['good',
  'music',
  'nice',
  'graphic',
  'gameplay',
  'pretty',
  'short

In [41]:
# save the top N representative docs

df_original_texts.to_pickle(
    eval_folder_path.joinpath(f'df_eval_top_{top_N_docs}.pkl')
)

# save the topic frequency  (top 1 prob)
df_eval_topic_freq.to_pickle(
    eval_folder_path.joinpath(f'df_eval_topic_freq.pkl')
)


# save the top N (10) keywords
with open(eval_folder_path.joinpath(f'top_N_keywords.json'), 'w') as f:
    json.dump(top_N_keywords, f, indent=2)

Test the capability of LDA with LLM topic naming

But before that, we need to find a way to map the corpus id back to the original document ID in the dataset, so that LLM can refer the document, then pass it to the prompt.