Evaluating the best LDA model from a hyperparameter search

We need
- The text (lemmatized), or any text
- The LDA model
- The corpus
- The id2word (can be generated on the fly)

In [1]:
import pandas as pd
import numpy as np
import pickle

import gensim

from pathlib import Path
from datetime import datetime
import json
import sys

In [2]:
%load_ext autoreload

In [3]:
sys.path.append('../')

%autoreload 2
from dataset_loader import GENRES, load_dataset

In [4]:
# constants

genre = GENRES.INDIE
unique_list = ['review_text']

---

Load the dataset from raw, and keep track of the available index during processing

Processing is copied from the training script

In [5]:
dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]')
dataset, dataset_path = load_dataset(genre, dataset_folder)

# new: create an untouched ver of the dataset for retrieving original text
dataset_untouched = dataset.copy()

dataset.info(verbose=True)

Load dataset from: /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[review_text]/01_indie.pkl



<class 'pandas.core.frame.DataFrame'>
Index: 725737 entries, 25636 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         725737 non-null  int64 
 1   app_id        725737 non-null  int64 
 2   app_name      725737 non-null  object
 3   review_text   725737 non-null  object
 4   review_score  725737 non-null  int64 
 5   review_votes  725737 non-null  int64 
 6   genre_id      725737 non-null  object
 7   category_id   725737 non-null  object
dtypes: int64(4), object(4)
memory usage: 49.8+ MB


In [6]:
%autoreload 2
sys.path.append('../../sa')
import str_cleaning_functions

def cleaning(df, review):
    """Clean the text column ``review`` of ``df`` in place.

    Every step rewrites ``df[review]`` with the result of applying one
    function to each value of the column, so each step sees the output of the
    previous one.

    Args:
        df: DataFrame that holds the reviews; its ``review`` column is overwritten.
        review: name of the column containing the review text.
    """
    # The order of the steps is significant and mirrors the original sequence.
    cleaning_steps = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.remove_non_letters,
        lambda text: text.lower(),
        str_cleaning_functions.unify_whitespaces,
        str_cleaning_functions.remove_stopword,
        # whitespace is unified a second time, after the stopword step
        str_cleaning_functions.unify_whitespaces,
    )

    for step in cleaning_steps:
        df[review] = df[review].apply(step)

In [7]:
cleaning(dataset, 'review_text')

In [8]:
# we do not remove reviews with too many punctuation marks. That filter is only for training a more consistent topic model,
# not for inference

In [9]:
# then we lemmatize the text

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from datasets import Dataset

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Args:
        tag: POS tag string (the caller passes tags produced by ``nltk.pos_tag``).

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        when ``tag`` starts with 'J', 'V', 'N' or 'R' respectively, otherwise
        ``None``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # no match: the caller decides which default POS to use
    return None
    
def lemmatization(text):
    """Tokenize ``text`` and return its tokens as a list of lemmas.

    Args:
        text: the review text to lemmatize.

    Returns:
        A list with one lemma per token, in token order.
    """
    # tokenize, then PoS-tag the tokens with nltk (Penn Treebank tags)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for token, treebank_tag in tagged_tokens:
        # tags other than adj / verb / noun / adv come back as None and are
        # lemmatized as nouns
        wordnet_pos = get_wordnet_pos(treebank_tag) or wordnet.NOUN
        lemmas.append(lemma.lemmatize(token, pos=wordnet_pos))

    return lemmas

def lemmatization_dataset(data):
    """Row-level adapter used with ``Dataset.map``.

    Args:
        data: mapping that holds the review under the ``'review_text'`` key.

    Returns:
        A dict with the lemmatized tokens under the ``'review_text2'`` key.
    """
    lemmatized_tokens = lemmatization(data['review_text'])
    return {'review_text2': lemmatized_tokens}

# X_lemmatized = list(map(lambda x: lemmatization(x), X))
# Wrap the cleaned reviews in a `datasets.Dataset` so that `map` can run the
# lemmatization with 4 worker processes.
temp_dataset = Dataset.from_dict(
    {'review_text': dataset['review_text'].values}
).map(lemmatization_dataset, num_proc=4)

# assign the token lists as a new column of the dataset
dataset['review_text_lemmatized'] = temp_dataset['review_text2']

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=4): 100%|██████████| 725737/725737 [03:08<00:00, 3848.86 examples/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [10]:
# keep only the reviews whose lemmatized token list is not empty
has_tokens = dataset['review_text_lemmatized'].apply(lambda tokens: len(tokens) > 0)
dataset = dataset[has_tokens]

In [11]:
dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 723659 entries, 25636 to 4179608
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   index                   723659 non-null  int64 
 1   app_id                  723659 non-null  int64 
 2   app_name                723659 non-null  object
 3   review_text             723659 non-null  object
 4   review_score            723659 non-null  int64 
 5   review_votes            723659 non-null  int64 
 6   genre_id                723659 non-null  object
 7   category_id             723659 non-null  object
 8   review_text_lemmatized  723659 non-null  object
dtypes: int64(4), object(5)
memory usage: 55.2+ MB


In [12]:
# save this dataset for evaluation usage
dataset_eval_path = Path('category_indie_unique_review_text').joinpath(
    Path(f'lemmatized_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl')
)
# to_pickle does not create missing folders, so make sure the target folder
# exists (first run / fresh checkout) before writing
dataset_eval_path.parent.mkdir(parents=True, exist_ok=True)
dataset.to_pickle(dataset_eval_path)

---

Alternatively, load the lemmatized (preprocessed) data for a quick analysis.

This can be either the training dataset or the less-processed evaluation dataset.

In [17]:
# Reload the raw dataset (no cleaning applied) so that the original review
# text can be retrieved later.
dataset_folder = Path(
    f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]'
)
dataset_untouched, dataset_path = load_dataset(genre, dataset_folder)

Load dataset from: /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[review_text]/01_indie.pkl





In [18]:
# Load the text

# X_lemmatized_file = Path('category_indie_unique_review_text').joinpath(
#     Path(f'lemmatized_data/{genre.value:02}_{str(genre)}_dataset.pkl')
# )

X_lemmatized_file = Path('category_indie_unique_review_text').joinpath(
    Path(f'lemmatized_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl')
)

# Fail fast with a real exception: raising a plain string is itself a
# TypeError in Python 3 and would hide the actual problem.
if not X_lemmatized_file.exists():
    raise FileNotFoundError(f'X_lemmatized_file does not exist: {X_lemmatized_file}')

# NOTE: unpickling can execute arbitrary code; only load files written by this project.
X_lemmatized_ds = pd.read_pickle(X_lemmatized_file)
X_lemmatized = X_lemmatized_ds['review_text_lemmatized'].values

# the following cells index `dataset` by position, so it must be the frame
# that X_lemmatized was taken from
dataset = X_lemmatized_ds

print(f'Loaded X_lemmatized')
print("X_lemmatized len:", len(X_lemmatized))

Loaded X_lemmatized
X_lemmatized len: 723659


In [6]:
X_lemmatized[0]

['take',
 'one',
 'part',
 'faerie',
 'solitaire',
 'two',
 'part',
 'puzzle',
 'quest',
 'mix',
 'little',
 'poker',
 'yahtzee',
 'good',
 'measure',
 'get',
 'something',
 'like',
 'runespell',
 'overture',
 'changeling',
 'sort',
 'fight',
 'monster',
 'take',
 'quest',
 'exchange',
 'coin',
 'buff',
 'come',
 'form',
 'power',
 'card',
 'story',
 'strong',
 'element',
 'game',
 'like',
 'puzzle',
 'quest',
 'game',
 'battle',
 'determine',
 'play',
 'mini',
 'game',
 'instead',
 'match',
 'though',
 'game',
 'card',
 'game',
 'similar',
 'poker',
 'make',
 'certain',
 'combination',
 'card',
 'pair',
 'kind',
 'full',
 'house',
 'flush',
 'straight',
 'certain',
 'amount',
 'damage',
 'opponent',
 'try',
 'ability',
 'steal',
 'card',
 'opponent',
 'plus',
 'limited',
 'number',
 'move',
 'get',
 'per',
 'turn',
 'move',
 'card',
 'play',
 'power',
 'ups',
 'add',
 'enough',
 'strategy',
 'game',
 'keep',
 'interest',
 'admittedly',
 'game',
 'get',
 'bit',
 'repetitive',
 'find',


---

Load the best model from search

In [7]:
# load the best model from training folder

reuse_corpus = False
training_datetime = datetime(2024, 2, 17, 1, 18, 55)

# <genre folder>/<grid-search run folder>/result.json stores the path of the best checkpoint
training_folder_p = Path(f'category_{str(genre)}_unique_review_text')
training_folder = training_folder_p / f'lda_multicore_genre_{str(genre)}_grid_search_{training_datetime.strftime("%Y%m%d_%H%M%S")}'
training_result_json_path = training_folder / 'result.json'
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

best_model_checkpoint_path = Path(training_result['best_model_checkpoint'])

best_id2word = gensim.corpora.Dictionary.load(str(best_model_checkpoint_path / 'lda_multicore.id2word'))

if not reuse_corpus:
    # new data: recreate the corpus from X_lemmatized with the saved id2word (gensim Dictionary)
    best_corpus = [best_id2word.doc2bow(text) for text in X_lemmatized]
    print('create new corpus from new X_lemmatized and existing id2word')
else:
    # reuse the corpus file stored next to the checkpoint
    best_corpus = gensim.corpora.MmCorpus(str(best_model_checkpoint_path / f'{best_model_checkpoint_path.stem}_corpus.mm'))

best_model = gensim.models.ldamulticore.LdaMulticore.load(str(best_model_checkpoint_path / 'lda_multicore'))

print('Best model checkpoint path:', best_model_checkpoint_path)

# short aliases used by the evaluation cells below
lda_model, id2word, corpus = best_model, best_id2word, best_corpus

create new corpus from new X_lemmatized and existing id2word
Best model checkpoint path: category_indie_unique_review_text/lda_multicore_genre_indie_grid_search_20240217_011855/lda_multicore_lda_num_topics_40


In [None]:
# the block above can be copied to load diff models for evaluation

Visualize the data

In [18]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()

# extra keyword options forwarded to pyLDAvis' prepare()
vis_options = {"mds": "mmds", "R": 10}
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, **vis_options)
vis



Get the top 10 keywords for each topic

In [25]:
top_N_words = 10

# topic id -> list of the `top_N_words` words that show_topics returns for it
topic_keywords = {
    topic_id: [word for word, _ in word_weights]
    for topic_id, word_weights in lda_model.show_topics(
        num_topics=lda_model.num_topics, num_words=top_N_words, formatted=False
    )
}

for topic_id, keywords in topic_keywords.items():
    print(f'Topic {topic_id}:')
    print(', '.join(keywords))
    print()

Topic 0:
worth, money, sale, definitely, pay, garry, totally, dollar, hate, buck

Topic 1:
horror, simulator, sam, hack, drive, walk, car, stick, train, realistic

Topic 2:
amazing, absolutely, truly, incredible, wonderful, blow, cheat, forget, tho, immersive

Topic 3:
feel, life, like, real, felt, wow, ask, feeling, sad, question

Topic 4:
character, item, different, new, dungeon, class, quest, shop, loot, choose

Topic 5:
weapon, fight, bos, attack, boss, battle, kill, sword, health, damage

Topic 6:
world, action, rpg, fan, series, combat, genre, classic, night, adventure

Topic 7:
simple, easy, addictive, difficult, master, pick, cat, quick, hard, learn

Topic 8:
good, really, pretty, cool, bad, graphic, think, look, scare, overall

Topic 9:
little, bit, scary, funny, cute, weird, stupid, quite, hilarious, expect

Topic 10:
playing, video, stop, watch, indie, youtube, movie, joke, non, trailer

Topic 11:
water, star, eye, physic, black, universe, brain, freddy, hot, fear

Topic 12:

---

Get the most representative docs

Ref: https://stackoverflow.com/questions/63777101/topic-wise-document-distribution-in-gensim-lda

In [9]:
all_topics = lda_model.print_topics(num_topics=-1)
all_topics

[(0,
  '0.246*"worth" + 0.137*"money" + 0.079*"sale" + 0.050*"definitely" + 0.044*"pay" + 0.031*"garry" + 0.030*"totally" + 0.028*"dollar" + 0.021*"hate" + 0.020*"buck"'),
 (1,
  '0.144*"horror" + 0.057*"simulator" + 0.056*"sam" + 0.046*"hack" + 0.036*"drive" + 0.029*"walk" + 0.027*"car" + 0.022*"stick" + 0.021*"train" + 0.018*"realistic"'),
 (2,
  '0.193*"amazing" + 0.107*"absolutely" + 0.063*"truly" + 0.048*"incredible" + 0.048*"wonderful" + 0.038*"blow" + 0.028*"cheat" + 0.027*"forget" + 0.025*"tho" + 0.024*"immersive"'),
 (3,
  '0.235*"feel" + 0.144*"life" + 0.116*"like" + 0.084*"real" + 0.028*"felt" + 0.027*"wow" + 0.019*"ask" + 0.018*"feeling" + 0.018*"sad" + 0.017*"question"'),
 (4,
  '0.083*"character" + 0.059*"item" + 0.055*"different" + 0.051*"new" + 0.043*"dungeon" + 0.030*"class" + 0.020*"quest" + 0.019*"shop" + 0.017*"loot" + 0.016*"choose"'),
 (5,
  '0.096*"weapon" + 0.083*"fight" + 0.054*"bos" + 0.041*"attack" + 0.033*"boss" + 0.028*"battle" + 0.027*"kill" + 0.021*"sword

In [10]:
len(all_topics)

40

In [11]:
# all topics of the model, as (topic_id, description) pairs
all_topics = lda_model.print_topics(num_topics=-1)

# one (initially empty) list of (doc_id, score) pairs per topic id
docs_per_topic = {topic[0]: [] for topic in all_topics}

# doc_id is the position of the document inside `corpus`
for doc_id, doc_bow in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(doc_bow)
    # register the document under every topic reported for it
    for topic_id, score in doc_topics:
        docs_per_topic[topic_id].append((doc_id, score))

In [12]:
print(len(docs_per_topic[1]))

130932


In [13]:
docs_per_topic[0][:10]

[(2, 0.10495907),
 (3, 0.03382655),
 (5, 0.04202193),
 (13, 0.07121767),
 (22, 0.047833517),
 (23, 0.034463134),
 (24, 0.019113604),
 (37, 0.24553242),
 (40, 0.1713568),
 (44, 0.06130245)]

In [14]:
# rank the documents of every topic by score, highest first (lists are sorted in place)
for topic_id in docs_per_topic:
    docs_per_topic[topic_id].sort(key=lambda doc_and_score: doc_and_score[1], reverse=True)

In [15]:
top_N_docs = 10

# topics are looked up by the integer keys 0 .. len(docs_per_topic) - 1
for topic_idx in range(len(docs_per_topic)):
    top_docs = docs_per_topic[topic_idx][:top_N_docs]
    print(top_docs)

[(568517, 0.99794734), (566430, 0.9966727), (569749, 0.9957423), (562321, 0.9955159), (565426, 0.9886081), (571208, 0.91136354), (569539, 0.8781247), (570242, 0.8607142), (88417, 0.86071295), (569573, 0.86071247)]
[(249714, 0.99640214), (620941, 0.8607128), (249798, 0.8049984), (409989, 0.80498713), (196192, 0.7562489), (643288, 0.7562482), (248387, 0.7562476), (543972, 0.7562475), (267526, 0.75624686), (680537, 0.7562366)]
[(250616, 0.8374988), (656217, 0.80499935), (485925, 0.80499876), (465449, 0.7562496), (343906, 0.7562495), (363930, 0.7562493), (397316, 0.7562493), (659212, 0.7562493), (521858, 0.75624925), (422872, 0.7562492)]
[(509478, 0.9834744), (188563, 0.8781209), (419314, 0.8374995), (650355, 0.83749753), (142833, 0.83749443), (421895, 0.8049999), (422297, 0.8049998), (424875, 0.8049998), (335472, 0.80499977), (376692, 0.8049948)]
[(112638, 0.83749527), (127377, 0.806891), (163230, 0.8049974), (149428, 0.80499715), (226590, 0.8049962), (223674, 0.8049948), (311566, 0.80499

In [19]:
# Collect, for every topic, the original (untouched) text of its top documents
# so that they can be inspected / exported.

df_original_texts = []
for topic_id, ranked_docs in docs_per_topic.items():
    # doc ids are positions in the corpus, i.e. row positions in `dataset`
    top_positions = [doc_id for doc_id, _ in ranked_docs[:top_N_docs]]
    top_index = dataset.iloc[top_positions].index

    # the index labels are shared with the untouched dataset
    t = dataset_untouched.loc[top_index].assign(
        lemmatized_text=dataset.loc[top_index, 'review_text_lemmatized'],
        topic_id=topic_id,        # store the topic id
    )
    df_original_texts.append(t)

df_original_texts = pd.concat(df_original_texts)
df_original_texts

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,lemmatized_text,topic_id
2905749,4528329,346900,AdVenture Capitalist,"Love it, You get to make lots of money money m...",1,1,"[4, 37, 23]","[2, 22, 29, 23]","[love, get, make, lot, money, money, money, mo...",0
2903569,4525993,346900,AdVenture Capitalist,Money Money Money Money Money Money Money Mone...,1,1,"[4, 37, 23]","[2, 22, 29, 23]","[money, money, money, money, money, money, mon...",0
2907026,4529670,346900,AdVenture Capitalist,"Money, Money, Money, Money, Money, Money, Mone...",1,0,"[4, 37, 23]","[2, 22, 29, 23]","[money, money, money, money, money, money, mon...",0
2899259,4521212,346900,AdVenture Capitalist,money money money money money money money mone...,1,1,"[4, 37, 23]","[2, 22, 29, 23]","[money, money, money, money, money, money, mon...",0
2902506,4524766,346900,AdVenture Capitalist,"*Starts game, and gets more then a Million dol...",1,1,"[4, 37, 23]","[2, 22, 29, 23]","[start, game, get, million, dollar, le, hour, ...",0
...,...,...,...,...,...,...,...,...,...,...
2578995,4021716,312530,Duck Game,Quack Quack Quack Quack!,1,1,"[1, 23]","[2, 1, 49, 36, 47, 37, 24, 22, 28, 29, 30, 23,...","[quack, quack, quack, quack]",39
2829260,4397196,337150,Sentinels of the Multiverse,This game faithfully recreates the card game i...,1,0,"[23, 2]","[2, 1, 9, 38, 39, 24, 27, 22, 29, 30, 23, 44]","[game, faithfully, recreate, card, game, satis...",39
2581032,4024639,312530,Duck Game,I have written a haiku to tell you about how a...,1,1,"[1, 23]","[2, 1, 49, 36, 47, 37, 24, 22, 28, 29, 30, 23,...","[write, haiku, tell, awesome, cuck, gmae, quac...",39
2579932,4023000,312530,Duck Game,Quack quack quackety quack! (I love his game!),1,0,"[1, 23]","[2, 1, 49, 36, 47, 37, 24, 22, 28, 29, 30, 23,...","[quack, quack, quackety, quack, love, game]",39


In [39]:
# log the original (untouched) text of the top documents of every topic

for topic_id, ranked_docs in docs_per_topic.items():
    print(f'Topic {topic_id}:')
    print()
    # positions in `dataset` -> index labels -> rows of the untouched dataset
    top_positions = [doc_id for doc_id, _ in ranked_docs[:top_N_docs]]
    t = dataset_untouched.loc[dataset.iloc[top_positions].index]
    for index, review_text in t['review_text'].items():
        print(f'Doc {index}:')
        print(review_text)
        print()
    print()

Topic 0:

Doc 2905749:
Love it, You get to make lots of money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money moneymoney money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money money mon

In [40]:
dataset_untouched.loc[2578962]

index                                                     4021652
app_id                                                     312530
app_name                                                Duck Game
review_text             Quack Quack Quack 10/10 would Quack again
review_score                                                    1
review_votes                                                    0
genre_id                                                  [1, 23]
category_id     [2, 1, 49, 36, 47, 37, 24, 22, 28, 29, 30, 23,...
Name: 2578962, dtype: object

---

Test the capability of LDA with LLM topic naming

In [21]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

# device check

import platform
import torch
if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    # Other platforms (e.g. macOS): use MPS only when this torch build and
    # machine actually support it, instead of assuming an M-series machine.
    mps_backend = getattr(torch.backends, 'mps', None)
    mps_available = mps_backend is not None and mps_backend.is_available()
    device = torch.device('mps' if mps_available else 'cpu')

print(device)

cuda


In [22]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [23]:
llm = Ollama(model="llama2")        # assuming the port is 11434

In [24]:
# prompt engineering
# System message: the role the model is asked to take.
system_message = "You are a player of the game who is reading the reviews about the game."

# Human message template; {topic_keywords} and {topic_reviews} are the two
# placeholders filled in with the values passed to chain.invoke().
human_template = \
'''Create a name for a topic given the topic's keywords and some most representative reviews of the topic. Output a label for the topic in less than 5 words. Do not output other text. 

The top keywords of the topic is: \'\'\'{topic_keywords}\'\'\'. 

The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'.'''

# Build the chat prompt from the (role, message) pairs above.
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_template)
])

# Pipe the prompt into the llm; the resulting chain is what gets invoked per topic.
chain = chat_prompt | llm

In [34]:
import random
import time

N_times = 5                     # number of label proposals requested per topic
MAX_REVIEW_CHARS = 5000         # reviews at or above this length are never sent, so that the llm won't be overloaded
N_REVIEWS_PER_CALL = 2          # number of representative reviews put in each prompt

topic_ids = list(docs_per_topic.keys())           # label every topic of the LDA model

# topic id -> list of the labels returned by the llm (one per call)
new_topic_labels = {}

for topic_id in topic_ids:
    _topic_keywords = topic_keywords[topic_id]

    # Filter by length BEFORE sampling. Re-drawing until every sampled review
    # is short enough never terminates when fewer than N_REVIEWS_PER_CALL short
    # reviews exist; sampling from the pre-filtered pool draws from the same
    # set of reviews without that risk.
    candidate_reviews = [
        review
        for review in df_original_texts[df_original_texts['topic_id'] == topic_id]['review_text'].values
        if len(review) < MAX_REVIEW_CHARS
    ]
    if not candidate_reviews:
        print(f'{topic_id:02}: skipped, no representative review shorter than {MAX_REVIEW_CHARS} characters')
        continue
    # random.sample raises ValueError when asked for more items than available
    n_reviews = min(N_REVIEWS_PER_CALL, len(candidate_reviews))

    time.sleep(2)

    for i in range(N_times):
        # a fresh random subset per call        # TODO: called multiple times to select the best topic label
        topic_reviews = random.sample(candidate_reviews, n_reviews)

        result = chain.invoke(
            {
                "topic_keywords": _topic_keywords,
                "topic_reviews": topic_reviews
            }
        )

        print(f'{topic_id:02}_call{i}: {result}')

        new_topic_labels.setdefault(topic_id, []).append(result)
    print('\n')

00_call0: "Money"
00_call1: "Money maker"
00_call2: "Money Money Money"
00_call3: 
The provided text is a string of lemmas, which are the basic units of language in Generative Grammar. Each lemma is a word or phrase that has a specific grammatical function, such as noun, verb, adjective, etc. The lemmatization process involves breaking down words into their constituent parts and classifying them according to their grammatical function.

In the provided text, there are several lemmas that can be identified:

1. MONEY - This is a common lemma in Generative Grammar, representing the idea of something having a specific value or worth.
2. MONEY!MONEY! - This is an example of a phrase that is a combination of two lemmas (MONEY and MONEY), indicating repetition or emphasis.
3. MONEY!MONEY!MONEY! - This is another instance of a phrase made up of multiple lemmas, indicating further repetition or emphasis.
4. MONEY!MONEY!MONEY!MONEY! - This is an example of a chain of phrases made up of multiple

Then it is up to human inspection to review the connection between the topic names and the comments.

---

Save all the topic names

In [41]:
best_model_checkpoint_path

PosixPath('category_indie_unique_review_text/lda_multicore_genre_indie_grid_search_20240217_011855/lda_multicore_lda_num_topics_40')

In [42]:
new_topic_labels

{0: ['"Money"',
  '"Money maker"',
  '"Money Money Money"',
  '\nThe provided text is a string of lemmas, which are the basic units of language in Generative Grammar. Each lemma is a word or phrase that has a specific grammatical function, such as noun, verb, adjective, etc. The lemmatization process involves breaking down words into their constituent parts and classifying them according to their grammatical function.\n\nIn the provided text, there are several lemmas that can be identified:\n\n1. MONEY - This is a common lemma in Generative Grammar, representing the idea of something having a specific value or worth.\n2. MONEY!MONEY! - This is an example of a phrase that is a combination of two lemmas (MONEY and MONEY), indicating repetition or emphasis.\n3. MONEY!MONEY!MONEY! - This is another instance of a phrase made up of multiple lemmas, indicating further repetition or emphasis.\n4. MONEY!MONEY!MONEY!MONEY! - This is an example of a chain of phrases made up of multiple lemmas, ea

In [44]:
# the topic names are stored next to the model checkpoint
topic_names_path = best_model_checkpoint_path / f'topic_names_{genre.value:02}_{str(genre)}.json'

# serialize first, then write the text out in one go
topic_names_json = json.dumps(new_topic_labels, indent=2)
with open(topic_names_path, 'w') as f:
    f.write(topic_names_json)

# topic_names_path_pkl = best_model_checkpoint_path.joinpath(
#     f'topic_names_{genre.value:02}_{str(genre)}.pkl'
# )

# with open(topic_names_path_pkl, 'wb') as f:
#     pickle.dump(new_topic_labels, f)

In [51]:
# del new_topic_labels

topic_names_path = best_model_checkpoint_path / f'topic_names_{genre.value:02}_{str(genre)}.json'

with open(topic_names_path, 'r') as f:
    new_topic_labels_l = json.load(f)

# JSON object keys are always strings: convert them back to the original int topic ids
new_topic_labels = {}
for topic_key, labels in new_topic_labels_l.items():
    new_topic_labels[int(topic_key)] = labels

In [53]:
print(new_topic_labels[0][3])


The provided text is a string of lemmas, which are the basic units of language in Generative Grammar. Each lemma is a word or phrase that has a specific grammatical function, such as noun, verb, adjective, etc. The lemmatization process involves breaking down words into their constituent parts and classifying them according to their grammatical function.

In the provided text, there are several lemmas that can be identified:

1. MONEY - This is a common lemma in Generative Grammar, representing the idea of something having a specific value or worth.
2. MONEY!MONEY! - This is an example of a phrase that is a combination of two lemmas (MONEY and MONEY), indicating repetition or emphasis.
3. MONEY!MONEY!MONEY! - This is another instance of a phrase made up of multiple lemmas, indicating further repetition or emphasis.
4. MONEY!MONEY!MONEY!MONEY! - This is an example of a chain of phrases made up of multiple lemmas, each one building on the previous one to indicate a growing amount or val