Evaluating the best LDA model from a hyperparameter search

We need
- The text (lemmatized), or any text
- The LDA model
- The corpus
- The id2word (can be generated on the fly)

In [1]:
import pandas as pd
import numpy as np
import pickle

import gensim

from pathlib import Path
from datetime import datetime
import json
import sys

In [2]:
%load_ext autoreload

In [3]:
sys.path.append('../')

%autoreload 2
from dataset_loader import GENRES, load_dataset

In [4]:
# constants for loading the dataset

# `genre` selects what is evaluated: a GENRES member loads that genre's pickle, while any
# non-GENRES value (e.g. -1) makes the loading cells fall back to the combined "all" dataset.
genre = GENRES.ACTION
# genre = -1
# column names that are joined with "," to build the per-genre dataset folder name
unique_list = ['review_text']

---

Load the dataset from raw, and keep track of the available index during processing

Processing is copied from the training script

Load the genre / all dataset

In [5]:
# Choose the data source: the combined pickle when `genre` is not a GENRES member,
# otherwise the per-genre pickle resolved by `load_dataset`.
if type(genre) != GENRES:
    dataset_folder = Path('../../dataset/topic_modelling/00_dataset_filtered_all_4045065.pkl').resolve()
    dataset = pd.read_pickle(dataset_folder)
    dataset_path = dataset_folder
else:
    _unique_cols = ",".join(unique_list)
    dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{_unique_cols}]')
    dataset, dataset_path = load_dataset(genre, dataset_folder)

# Snapshot taken before any cleaning so the original review text can be retrieved later.
dataset_untouched = dataset.copy()

dataset.info(verbose=True)

Load dataset from: /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[review_text]/00_action.pkl



<class 'pandas.core.frame.DataFrame'>
Index: 1273475 entries, 0 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   index         1273475 non-null  int64 
 1   app_id        1273475 non-null  int64 
 2   app_name      1273475 non-null  object
 3   review_text   1273475 non-null  object
 4   review_score  1273475 non-null  int64 
 5   review_votes  1273475 non-null  int64 
 6   genre_id      1273475 non-null  object
 7   category_id   1273475 non-null  object
dtypes: int64(4), object(4)
memory usage: 87.4+ MB


---

Cleaning

In [6]:
%autoreload 2
sys.path.append('../../sa')
import str_cleaning_functions

def cleaning(df, review):
    """Clean the text column ``review`` of ``df`` in place.

    Each step below is applied to the whole column, in order, and the column
    is overwritten after every step. Nothing is returned.

    Args:
        df: DataFrame holding the reviews; it is modified in place.
        review: name of the column that contains the review text.
    """
    steps = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.remove_non_letters,
        lambda text: text.lower(),
        str_cleaning_functions.unify_whitespaces,
        str_cleaning_functions.remove_stopword,
        # whitespace is unified a second time after the stopword removal step
        str_cleaning_functions.unify_whitespaces,
    )
    for step in steps:
        df[review] = df[review].apply(step)

In [7]:
# overwrite dataset['review_text'] with the cleaned text (cleaning() assigns the column in place)
cleaning(dataset, 'review_text')

In [8]:
# We do not remove reviews with too many punctuation marks here. That filter is only used
# when training (to get a more consistent topic model), not when running inference.

In [9]:
# then we lemmatize the text

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from datasets import Dataset

# single module-level lemmatizer instance, used by lemmatization() below
lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    """Translate a PoS tag into the matching WordNet PoS constant.

    Only the first letter of ``tag`` matters: J -> adjective, V -> verb,
    N -> noun, R -> adverb.

    Returns:
        The WordNet constant, or None when the tag starts with any other
        letter (the caller then falls back to noun).
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
    """Tokenize ``text`` and return the list of its lemmatized tokens.

    Tokens are PoS-tagged with NLTK, the tag is mapped to a WordNet PoS via
    ``get_wordnet_pos``, and each token is lemmatized with that PoS. Tokens
    whose tag has no WordNet equivalent are lemmatized as nouns.
    """
    # (token, tag) pairs from NLTK's tagger
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for token, tag in tagged_tokens:
        # noun is the fallback when get_wordnet_pos returns None
        wn_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemmas.append(lemma.lemmatize(token, pos=wn_pos))

    return lemmas

def lemmatization_dataset(data):
    """Mapper for ``Dataset.map``: lemmatize one row's 'review_text'.

    Returns a dict with the single key 'review_text2' holding the lemma list.
    """
    lemmas = lemmatization(data['review_text'])
    return {'review_text2': lemmas}

# X_lemmatized = list(map(lambda x: lemmatization(x), X))
# wrap the cleaned text in a `datasets.Dataset` so the lemmatization can be mapped with 4 worker processes
temp_dataset = Dataset.from_dict({'review_text': dataset['review_text'].values})
temp_dataset = temp_dataset.map(lemmatization_dataset, num_proc=4)
dataset['review_text_lemmatized'] = temp_dataset['review_text2']      # assign a new column to the dataset

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=4): 100%|██████████| 1273475/1273475 [06:31<00:00, 3255.72 examples/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [10]:
# filter out the empty reviews

# True for rows whose lemma list still contains at least one token
_has_tokens = dataset['review_text_lemmatized'].apply(lambda tokens: len(tokens) > 0)
dataset = dataset[_has_tokens]

In [11]:
# summary of the frame after the preprocessing cells above (row count, columns, dtypes)
dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1269558 entries, 0 to 4179608
Data columns (total 9 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   index                   1269558 non-null  int64 
 1   app_id                  1269558 non-null  int64 
 2   app_name                1269558 non-null  object
 3   review_text             1269558 non-null  object
 4   review_score            1269558 non-null  int64 
 5   review_votes            1269558 non-null  int64 
 6   genre_id                1269558 non-null  object
 7   category_id             1269558 non-null  object
 8   review_text_lemmatized  1269558 non-null  object
dtypes: int64(4), object(5)
memory usage: 96.9+ MB


In [12]:
# save this dataset for evaluation usage
# target: category_<genre|all>_unique_review_text/lemmatized_data/<file>.pkl
if type(genre) == GENRES:
    _eval_parent = f'category_{str(genre)}_unique_review_text'
    _eval_file = Path(f'lemmatized_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl')
else:
    _eval_parent = 'category_all_unique_review_text'
    _eval_file = Path('lemmatized_data/category_all_dataset_eval.pkl')
dataset_eval_path = Path(_eval_parent).joinpath(_eval_file)

# create the output folder (and its parents) on first use
_eval_dir = dataset_eval_path.parent
if not _eval_dir.exists():
    _eval_dir.mkdir(parents=True)

dataset.to_pickle(dataset_eval_path)

---

Or we load a lemmatized data (preprocessed data) for simple quick analysis

either this is from training dataset, or a less-processed evaluation dataset

In [6]:
# load a un-touched version of the dataset
if type(genre) == GENRES:
    dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]')
    dataset, dataset_path = load_dataset(genre, dataset_folder)
else:
    dataset_folder = Path(f'../../dataset/topic_modelling/00_dataset_filtered_all_4045065.pkl').resolve()
    dataset, dataset_path = pd.read_pickle(dataset_folder), dataset_folder

# Keep the raw frame under its own name: the next cell rebinds `dataset` to the lemmatized
# pickle, and the later retrieval cells read the original review text from `dataset_untouched`.
# Without this line the quick-load path leaves `dataset_untouched` undefined on a fresh kernel.
dataset_untouched = dataset.copy()

In [13]:
# Load the text

# X_lemmatized_file = Path('category_indie_unique_review_text').joinpath(
#     Path(f'lemmatized_data/{genre.value:02}_{str(genre)}_dataset.pkl')
# )

# same location the preprocessing path writes its evaluation pickle to
X_lemmatized_file = Path(f'category_{str(genre) if type(genre) == GENRES else "all"}_unique_review_text').joinpath(
    Path(f'lemmatized_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl') if type(genre) == GENRES \
        else Path(f'lemmatized_data/category_all_dataset_eval.pkl')
)

if not X_lemmatized_file.exists():
    # Python 3 cannot raise a bare string (that is a TypeError in itself),
    # so raise a real exception that also names the missing path.
    raise FileNotFoundError(f'X_lemmatized_file does not exist: {X_lemmatized_file}')

# NOTE: pickle files can execute code on load; only read files produced by this project.
X_lemmatized_ds = pd.read_pickle(X_lemmatized_file)
# array of token lists, one entry per review, in the row order of the pickled frame
X_lemmatized = X_lemmatized_ds['review_text_lemmatized'].values

# from here on `dataset` is the lemmatized frame
dataset = X_lemmatized_ds
print(f'Loaded X_lemmatized')
print("X_lemmatized len:", len(X_lemmatized))

Loaded X_lemmatized
X_lemmatized len: 1269558


In [14]:
# sanity check: token list of the first review
X_lemmatized[0]

['ruin', 'life']

---

Load the best model from search

In [15]:
# load the best model from training folder
from eval_metrics import SEARCH_BEHAVIOUR

# False: rebuild the bag-of-words corpus from X_lemmatized; True: load the corpus saved next to the model
reuse_corpus = False
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
# training_datetime = datetime(2024, 2, 17, 1, 18, 55)        # genre indie
# training_datetime = datetime(2024, 2, 20, 1, 4, 8)        # all reviews
training_datetime = datetime(2024, 2, 27, 9, 18, 50)        # genre action

# timestamp suffix shared by both folder-name variants
_training_stamp = training_datetime.strftime("%Y%m%d_%H%M%S")

if type(genre) == GENRES and genre.value >= 0:
    training_folder_p = Path(f'category_{str(genre)}_unique_review_text')
    training_folder = Path(f'lda_multicore_genre_{str(genre)}_{search_behaviour.value}_{_training_stamp}')
else:
    # Every other value of `genre` selects the model trained on all reviews.
    # The previous `elif genre < 0` left both folder variables undefined for a
    # non-negative non-GENRES value, and `<` is not defined for plain Enum members.
    training_folder_p = Path(f'category_all_unique_review_text')
    training_folder = Path(f'lda_multicore_{search_behaviour.value}_{_training_stamp}')
training_folder = training_folder_p.joinpath(training_folder)
training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

best_model_checkpoint_path = Path(training_result['best_model_checkpoint'])
# NOTE(review): the checkpoint recorded in result.json is replaced here by a fixed
# sibling folder name; only the parent directory of the recorded path is used.
best_model_checkpoint_path = best_model_checkpoint_path.parent.joinpath(
    "lda_multicore_lda_num_topics_30"
)

best_id2word = gensim.corpora.Dictionary.load(str(best_model_checkpoint_path.joinpath('lda_multicore.id2word')))
# best_corpus = [best_id2word.doc2bow(text) for text in X_lemmatized]      # recreate the corpus given the id2word (gensim Dictionary) (this is for new data)
if reuse_corpus:
    best_corpus = gensim.corpora.MmCorpus(str(best_model_checkpoint_path.joinpath(f'{best_model_checkpoint_path.stem}_corpus.mm')))
else:
    best_corpus = [best_id2word.doc2bow(text) for text in X_lemmatized]      # recreate the corpus given the id2word (gensim Dictionary) (this is for new data)
    print('create new corpus from new X_lemmatized and existing id2word')
best_model = gensim.models.ldamulticore.LdaMulticore.load(str(best_model_checkpoint_path.joinpath('lda_multicore')))

print('Best model checkpoint path:', best_model_checkpoint_path)

# short aliases used by the rest of the notebook
lda_model = best_model
id2word = best_id2word
corpus = best_corpus

create new corpus from new X_lemmatized and existing id2word
Best model checkpoint path: category_action_unique_review_text/lda_multicore_genre_action_grid_search_20240227_091850/lda_multicore_lda_num_topics_30


In [16]:
# the block above can be copied to load diff models for evaluation

Visualize the data

In [19]:
# evaluation artefacts go under ../eval_results/<checkpoint path>
eval_folder_path = Path('../eval_results') / best_model_checkpoint_path

if not eval_folder_path.exists():
    eval_folder_path.mkdir(parents=True)

In [17]:
import pyLDAvis.gensim_models

# render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()
# build the visualisation from the loaded model, the evaluation corpus and the dictionary;
# `mds` and `R` are passed straight through to pyLDAvis (see its `prepare` documentation)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis



In [20]:
# set to False to skip writing the visualisation to disk
save_html = True
if save_html:
    # file name carries the genre id and name, or "category_all" for the combined dataset
    _html_name = (
        f'pyldavis_{genre.value:02}_{str(genre)}_eval.html'
        if type(genre) == GENRES
        else 'pyldavis_category_all_eval.html'
    )
    pyLDAvis.save_html(vis, str(eval_folder_path.joinpath(_html_name)))

Get top 10 keywords for each topic

In [21]:
top_N_words = 10
# topic id -> list of its top_N_words keywords
topic_keywords = {}

_shown_topics = lda_model.show_topics(num_topics=lda_model.num_topics, num_words=top_N_words, formatted=False)
for i, topic in _shown_topics:
    # each entry of `topic` is a (word, weight) pair; only the word is kept
    _words = [word for word, _ in topic]
    topic_keywords[i] = _words

    print(f'Topic {i}:')
    print(', '.join(_words))
    print()

Topic 0:
bad, pretty, cool, good, stuff, boring, guy, bore, car, thing

Topic 1:
work, review, fix, problem, bug, issue, save, play, read, write

Topic 2:
life, kill, shoot, gun, half, blow, shot, explosion, simulator, fly

Topic 3:
fps, valve, funny, video, crazy, hilarious, source, engine, card, xd

Topic 4:
puzzle, test, physic, solve, kid, title, cake, award, challenge, deserve

Topic 5:
recommend, awesome, highly, fantastic, fan, play, excellent, definitely, beautiful, brilliant

Topic 6:
buy, worth, sale, money, dlc, free, play, wait, spend, hour

Topic 7:
old, year, classic, play, modern, man, reccomend, school, miss, day

Topic 8:
pc, version, sonic, play, port, xbox, console, support, keyboard, ps

Topic 9:
play, hour, war, day, world, start, star, finish, open, lose

Topic 10:
mod, hack, community, rate, download, okay, alright, create, slash, frame

Topic 11:
good, really, graphic, play, suck, gameplay, ok, think, look, job

Topic 12:
end, infinite, storyline, city, absolute

---

Get the most representative docs

Ref: https://stackoverflow.com/questions/63777101/topic-wise-document-distribution-in-gensim-lda

In [22]:
# (topic_id, formatted "weight*word + ..." string) pairs, requested with num_topics=-1
all_topics = lda_model.print_topics(num_topics=-1)
all_topics

[(0,
  '0.158*"bad" + 0.156*"pretty" + 0.096*"cool" + 0.085*"good" + 0.045*"stuff" + 0.041*"boring" + 0.040*"guy" + 0.039*"bore" + 0.024*"car" + 0.021*"thing"'),
 (1,
  '0.090*"work" + 0.073*"review" + 0.047*"fix" + 0.041*"problem" + 0.037*"bug" + 0.031*"issue" + 0.026*"save" + 0.024*"play" + 0.020*"read" + 0.019*"write"'),
 (2,
  '0.106*"life" + 0.094*"kill" + 0.080*"shoot" + 0.079*"gun" + 0.065*"half" + 0.024*"blow" + 0.022*"shot" + 0.021*"explosion" + 0.016*"simulator" + 0.015*"fly"'),
 (3,
  '0.296*"fps" + 0.082*"valve" + 0.068*"funny" + 0.053*"video" + 0.041*"crazy" + 0.028*"hilarious" + 0.026*"source" + 0.026*"engine" + 0.022*"card" + 0.020*"xd"'),
 (4,
  '0.245*"puzzle" + 0.054*"test" + 0.052*"physic" + 0.043*"solve" + 0.031*"kid" + 0.028*"title" + 0.022*"cake" + 0.018*"award" + 0.016*"challenge" + 0.014*"deserve"'),
 (5,
  '0.239*"recommend" + 0.231*"awesome" + 0.095*"highly" + 0.050*"fantastic" + 0.041*"fan" + 0.035*"play" + 0.032*"excellent" + 0.030*"definitely" + 0.028*"beau

In [23]:
# number of topics returned by print_topics
len(all_topics)

30

In [24]:
# Topic ids are taken from the model's own topic listing.
all_topics = lda_model.print_topics(num_topics=-1)
_model_topic_ids = [topic_id for (topic_id, _) in all_topics]

# topic id -> [(doc_id, score), ...] for every doc in which the topic is reported
docs_per_topic = {topic_id: [] for topic_id in _model_topic_ids}
# topic id -> [(doc_id, score), ...] only for docs whose highest-scoring topic it is
docs_top1_per_topic = {topic_id: [] for topic_id in _model_topic_ids}

for doc_id, doc_bow in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(doc_bow)

    # running maximum over this document's (topic, score) pairs
    topic_id_max, max_score = -1, float('-inf')
    for topic_id, score in doc_topics:
        docs_per_topic[topic_id].append((doc_id, score))
        # strict comparison: on a tie the topic seen first is kept
        if score > max_score:
            topic_id_max, max_score = topic_id, score

    docs_top1_per_topic[topic_id_max].append((doc_id, max_score))

In [25]:
# number of (doc_id, score) entries collected for topic 1
print(len(docs_per_topic[1]))

466161


In [26]:
# first ten (doc_id, score) entries stored for topic 0
docs_per_topic[0][:10]

[(0, 0.016666692),
 (2, 0.011113807),
 (5, 0.011113791),
 (6, 0.016666839),
 (10, 0.011111306),
 (12, 0.011111246),
 (14, 0.016676847),
 (15, 0.016666923),
 (16, 0.016666692),
 (21, 0.20666602)]

In [27]:
def _score_of(id_and_score):
    """Return the score part of a (doc_id, score) pair."""
    return id_and_score[1]


# sort every topic's document list in place, highest score first
for doc_list in docs_per_topic.values():
    doc_list.sort(key=_score_of, reverse=True)

In [28]:
# how many documents to keep per topic in the cells below
top_N_docs = 10

for i in range(len(docs_per_topic)):
    _top_docs = docs_per_topic[i][:top_N_docs]
    print(_top_docs)

[(48299, 0.99902654), (591841, 0.98728067), (647650, 0.97803026), (327113, 0.9431367), (931796, 0.8791664), (1182908, 0.8791657), (496096, 0.8791523), (926117, 0.8651954), (1238741, 0.8619044), (264092, 0.8619027)]
[(976090, 0.9982425), (506638, 0.9971057), (439773, 0.99447614), (275355, 0.98862696), (764542, 0.98806185), (437935, 0.9846951), (1148477, 0.9769839), (627304, 0.970707), (11739, 0.9539639), (439783, 0.93555444)]
[(672245, 0.9992737), (921457, 0.99837804), (309042, 0.99783254), (308023, 0.9970164), (305029, 0.99672306), (273603, 0.9963795), (306352, 0.99428), (836554, 0.99361557), (896021, 0.9933334), (305228, 0.9928394)]
[(1123435, 0.9981258), (593647, 0.990997), (989770, 0.98821133), (59106, 0.91944426), (670258, 0.8392409), (875810, 0.83888793), (640894, 0.8388875), (647511, 0.80666375), (1008579, 0.7796291), (415070, 0.7583324)]
[(147366, 0.9925042), (876389, 0.9920869), (878907, 0.979861), (885814, 0.979861), (683630, 0.91412586), (778078, 0.8814862), (874797, 0.838888

In [29]:
# use the ID to retrieve the top docs, and copy them to a file for inspection

# retrieve the original text
df_original_texts = []
for topic_id in docs_per_topic.keys():
    # doc ids are row positions in `dataset`; take the first top_N_docs of this topic
    _top_positions = [doc_id for doc_id, _ in docs_per_topic[topic_id][:top_N_docs]]
    # positions -> index labels, then fetch the same rows from the raw frame
    _top_labels = dataset.iloc[_top_positions].index
    t = dataset_untouched.loc[_top_labels]
    t['lemmatized_text'] = dataset.loc[t.index, 'review_text_lemmatized']
    t['topic_id'] = topic_id        # store the topic id

    df_original_texts.append(t)

# one frame holding the selected rows of every topic
df_original_texts = pd.concat(df_original_texts)
df_original_texts

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,lemmatized_text,topic_id
64937,75798,105600,Terraria,This game is so cool so cool so cool so cool s...,1,1,"[1, 25, 23, 3]","[2, 1, 49, 36, 9, 38, 22, 28, 29, 23, 41, 42, 43]","[game, cool, cool, cool, cool, cool, cool, coo...",0
2479429,3864980,304050,Trove,So Cool So Cool So Cool So Cool So Cool So Coo...,1,1,"[1, 25, 4, 29, 3, 37]","[1, 20, 49, 36, 9, 38, 22, 29, 30, 35, 18]","[cool, cool, cool, cool, cool, cool, cool, coo...",0
2558481,3993750,311210,Call of Duty: Black Ops III,cool cool cool cool cool cool cool cool cool c...,1,0,"[1, 25]","[2, 1, 9, 24, 22, 28, 29, 30, 8, 25, 41, 42, 4...","[cool, cool, cool, cool, cool, cool, cool, coo...",0
1590373,2327588,24240,PAYDAY: The Heist,What this game has become. 'HEY ARE YOU OVERD...,1,0,[1],"[2, 1, 9, 22, 29, 23, 15]","[game, become, hey, overdrilling, hey, overdri...",0
3651303,5680221,45760,Ultra Street Fighter IV,"Yeah it's pretty good, but if you take out all...",1,0,[1],"[2, 1, 49, 37, 22, 28, 29, 23, 25, 43, 44]","[yeah, pretty, good, take, good, stuff, pretty...",0
...,...,...,...,...,...,...,...,...,...,...
3929736,6101033,57300,Amnesia: The Dark Descent,I love horror survival games. This is without ...,1,0,"[1, 25, 23]","[2, 22, 28, 13, 17, 14, 41, 42, 43, 44]","[love, horror, survival, game, without, doubt,...",29
1001008,1461672,22200,Zeno Clash,"Unlike the sequel, which is a real shame, orig...",1,0,"[1, 23]","[2, 22, 29, 23]","[unlike, sequel, real, shame, original, zeno, ...",29
29782,36714,102500,Kingdoms of Amalur: Reckoning™,One of my favorite games. More fantasy based w...,1,0,"[1, 3]","[2, 22, 18, 23, 42]","[one, favorite, game, fantasy, base, world, al...",29
2507853,3914959,306130,The Elder Scrolls Online,This is the most immersive MMO on the market. ...,1,0,"[1, 25, 29, 3]","[1, 20, 49, 36, 9, 38, 29, 13, 35, 18]","[immersive, mmo, market, take, feature, previo...",29


In [30]:
# print out the original texts as a log

for topic_id in docs_per_topic.keys():
    print(f'Topic {topic_id}:')
    print()
    # row positions of this topic's first top_N_docs documents
    _top_positions = [doc_id for doc_id, _ in docs_per_topic[topic_id][:top_N_docs]]
    # look the rows up in the raw frame through their index labels
    t = dataset_untouched.loc[dataset.iloc[_top_positions].index]
    for index, row in t.iterrows():
        print(f'Doc {index}:')
        print(row['review_text'])
        print()
    print()

Topic 0:

Doc 64937:
This game is so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so cool so coo

In [31]:
# spot-check one raw review by its index label
dataset_untouched.loc[2578962]

index                                                     4021652
app_id                                                     312530
app_name                                                Duck Game
review_text             Quack Quack Quack 10/10 would Quack again
review_score                                                    1
review_votes                                                    0
genre_id                                                  [1, 23]
category_id     [2, 1, 49, 36, 47, 37, 24, 22, 28, 29, 30, 23,...
Name: 2578962, dtype: object

In [32]:
# per topic: number of documents for which it is the highest-scoring topic
_top1_topic_ids = list(docs_top1_per_topic.keys())
_top1_counts = [len(docs) for docs in docs_top1_per_topic.values()]

df_eval_topic_freq = pd.DataFrame({'topic_id': _top1_topic_ids, 'topic_freq': _top1_counts})

df_eval_topic_freq

Unnamed: 0,topic_id,topic_freq
0,0,49204
1,1,43433
2,2,40076
3,3,8314
4,4,7805
5,5,28668
6,6,53098
7,7,20437
8,8,18883
9,9,47945


In [33]:
# save the results

# results are written under ../eval_results/<checkpoint path>
eval_folder_path = Path('../eval_results') / best_model_checkpoint_path

if not eval_folder_path.exists():
    eval_folder_path.mkdir(parents=True)

In [34]:
# The file name must state how many docs per topic were actually kept, which is
# `top_N_docs` (used when df_original_texts was built), not an independent constant.
top_n = top_N_docs
df_original_texts.to_pickle(eval_folder_path.joinpath(f'df_eval_top_{top_n}.pkl'))

In [35]:
# also need to save the top N keywords for each topic as json
_keywords_json_path = eval_folder_path.joinpath(f'top_{top_N_words}_keywords.json')
with open(_keywords_json_path, 'w') as f:
    json.dump(topic_keywords, f, indent=2)

In [36]:
# save the topic frequency  (top 1 prob)
_topic_freq_path = eval_folder_path.joinpath(f'df_eval_topic_freq.pkl')
df_eval_topic_freq.to_pickle(_topic_freq_path)

---

Test the capability of LDA with LLM topic naming

In [37]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

# device check

import platform
import torch

# Prefer CUDA, then Apple's MPS backend, then CPU. MPS is only selected when torch
# reports it as available; previously every non-Linux/Windows platform got 'mps'
# unconditionally, which fails later on machines without it.
_mps_backend = getattr(torch.backends, 'mps', None)     # absent on older torch builds
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')        # m-series machine
else:
    device = torch.device('cpu')

print(device)

cuda


In [38]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [39]:
# LLM handle used to generate topic names
llm = Ollama(model="llama2")        # assuming the port is 11434

In [40]:
# prompt engineering
# system message: the role the model is asked to play
system_message = "You are a player of the game who is reading the reviews about the game."

# human message with two placeholders that are filled per call:
# {topic_keywords} and {topic_reviews}
human_template = \
'''Create a name for a topic given the topic's keywords and some most representative reviews of the topic. Output a label for the topic in less than 5 words. Output "NA" if the topic is not clear. Do not output other text.

The top keywords of the topic is: \'\'\'{topic_keywords}\'\'\'. 

The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'.'''

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_template)
])

# pipe the filled prompt into the LLM; invoked with a dict holding both placeholders
chain = chat_prompt | llm

In [41]:
# topic id -> {"call_<i>": LLM output}; kept in its own cell so re-running the labelling loop can resume
new_topic_labels = {}
# topic id -> {"call_<i>": {"reviews": [...], "col_index": [...]}} — the reviews sampled for each call
randomed_topic_reviews = {}

In [35]:
# NOTE(review): scratch check. `_reviews_df` and `random` are only defined by the labelling-loop
# cell further down, so this cell cannot run on a fresh kernel before that cell has executed.
# It samples two reviews, prints them and records whether each is shorter than 5000 characters.
ttt = _reviews_df[_reviews_df.index.isin(random.sample(_reviews_df.index.tolist(), 2))]
check_bool = []
for index, row in ttt.iterrows():
    print(f'Doc {index}:')
    print(row['review_text'])
    print()
    check_bool.append(len(row['review_text']) < 5000)
check_bool

Doc 3038645:
Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Click Cli

[False, False]

In [43]:
import random
import time

# number of LLM calls (each with a fresh random pair of reviews) per topic
N_times = 5

topic_ids = list(docs_per_topic.keys())           # every topic collected from the LDA model gets labels

# new_topic_labels = {}
# randomed_topic_reviews = {}

for topic_id in topic_ids:
    _topic_keywords = topic_keywords[topic_id]

    # Both are reset per topic (not per call): once more than 20 over-long samples
    # have been drawn for a topic, the length limit stays off for its remaining calls.
    temp_disable_char_limit = False
    _count = 0

    # time.sleep(1)

    _reviews_df = df_original_texts[df_original_texts['topic_id'] == topic_id]
    for i in range(N_times):
        # resume support: skip calls that already have a stored result
        if new_topic_labels.get(topic_id, {}).get(f"call_{i}", None) is not None:
            print(f'{topic_id:02}_call{i}: {new_topic_labels[topic_id][f"call_{i}"]}')
            continue

        # draw two distinct reviews, re-drawing until both are short enough
        while True:
            if _count > 20:
                temp_disable_char_limit = True

            _sampled_reviews_df = _reviews_df.sample(n=2, replace=False)

            # check the length of the topic reviews so that the llm won't be overloaded
            # 5000 character limits
            check_bool = _sampled_reviews_df.apply(lambda x: len(x['review_text']) < 5000, axis=1)

            if temp_disable_char_limit:
                break

            if all(check_bool):
                break
            else:
                _count += 1

        topic_reviews = _sampled_reviews_df['review_text'].values
        print(topic_reviews)

        result = chain.invoke(
            {
                "topic_keywords": _topic_keywords,
                "topic_reviews": topic_reviews
            }
        )

        print(f'{topic_id:02}_call{i}: {result}')

        # Create each per-topic slot independently. Previously the review store was only
        # initialised when the topic was missing from `new_topic_labels`, so resuming with
        # labels that were loaded without their reviews raised a KeyError below.
        new_topic_labels.setdefault(topic_id, {})[f"call_{i}"] = result
        randomed_topic_reviews.setdefault(topic_id, {})[f"call_{i}"] = {
            'reviews': topic_reviews.tolist(),
            "col_index": _sampled_reviews_df['index'].values.tolist()
        }


    print('\n')

00_call0: Label: Pretty Good Stuff
00_call1: Topic Label: Overdrilling
00_call2: 
Topic: "KRRRAZZZYY MONNEEEY!"
Label: "Good Stuff, Bad Rest"
00_call3: Topic: Fun and Interesting Game
Label: Good thing
00_call4: Topic: "Pretty Cool Stuff"
Label: "Good Dragons"


01_call0: Topic: Fix the Game
Label: FIX IT
01_call1: 
Topic: Game Crashing / Buggy
Label: "Buggy"
01_call2: 
Topic: "TryYearch"
Label: "HAIL TRYEARCH"
01_call3: "Fix it fix it"
01_call4: Topic: Game Issues
Label: "FIX IT"


02_call0: "Half-Life 3: Explosive Simulation"
02_call1: "Half Life 3"
02_call2: 
It seems like you're trying to generate a random string of words, but the output is not random enough. Here are some suggestions to make it more random:

1. Use a random word generator: There are many online tools that can generate random words for you. You can use these tools to get a list of words and then use them to create your message.
2. Use a random number generator: Random numbers can be used to generate random words. F

Then it's up to human inspection to review the connection between the names of the topics and the comments

---

Save all the topic names

In [44]:
# checkpoint folder the evaluated model was loaded from
best_model_checkpoint_path

PosixPath('category_action_unique_review_text/lda_multicore_genre_action_grid_search_20240227_091850/lda_multicore_lda_num_topics_30')

In [45]:
# raw LLM outputs per topic and call, for manual inspection
new_topic_labels

{0: {'call_0': 'Label: Pretty Good Stuff',
  'call_1': 'Topic Label: Overdrilling',
  'call_2': '\nTopic: "KRRRAZZZYY MONNEEEY!"\nLabel: "Good Stuff, Bad Rest"',
  'call_3': 'Topic: Fun and Interesting Game\nLabel: Good thing',
  'call_4': 'Topic: "Pretty Cool Stuff"\nLabel: "Good Dragons"'},
 1: {'call_0': 'Topic: Fix the Game\nLabel: FIX IT',
  'call_1': '\nTopic: Game Crashing / Buggy\nLabel: "Buggy"',
  'call_2': '\nTopic: "TryYearch"\nLabel: "HAIL TRYEARCH"',
  'call_3': '"Fix it fix it"',
  'call_4': 'Topic: Game Issues\nLabel: "FIX IT"'},
 2: {'call_0': '"Half-Life 3: Explosive Simulation"',
  'call_1': '"Half Life 3"',
  'call_2': '\nIt seems like you\'re trying to generate a random string of words, but the output is not random enough. Here are some suggestions to make it more random:\n\n1. Use a random word generator: There are many online tools that can generate random words for you. You can use these tools to get a list of words and then use them to create your message.\n2. 

In [46]:
# save the topic labels

# bundle the generated labels with the reviews that were shown to the LLM for each call
llm_generation_result = dict(
    new_topic_labels=new_topic_labels,
    randomed_topic_reviews=randomed_topic_reviews,
)

_llm_result_path = eval_folder_path.joinpath('llm_generation_result.json')
with open(_llm_result_path, 'w') as f:
    json.dump(llm_generation_result, f, indent=2)

In [42]:
# del new_topic_labels

# topic_names_path = best_model_checkpoint_path.joinpath(
#     f'topic_names_{genre.value:02}_{str(genre)}.json'
# )

# with open(topic_names_path, 'r') as f:
#     new_topic_labels_l = json.load(f)
#     new_topic_labels = {int(k): v for k, v in new_topic_labels_l.items()}       # original key is an int

AttributeError: 'int' object has no attribute 'value'

In [None]:
# print(new_topic_labels[0][3])


The provided text is a string of lemmas, which are the basic units of language in Generative Grammar. Each lemma is a word or phrase that has a specific grammatical function, such as noun, verb, adjective, etc. The lemmatization process involves breaking down words into their constituent parts and classifying them according to their grammatical function.

In the provided text, there are several lemmas that can be identified:

1. MONEY - This is a common lemma in Generative Grammar, representing the idea of something having a specific value or worth.
2. MONEY!MONEY! - This is an example of a phrase that is a combination of two lemmas (MONEY and MONEY), indicating repetition or emphasis.
3. MONEY!MONEY!MONEY! - This is another instance of a phrase made up of multiple lemmas, indicating further repetition or emphasis.
4. MONEY!MONEY!MONEY!MONEY! - This is an example of a chain of phrases made up of multiple lemmas, each one building on the previous one to indicate a growing amount or val