Given a trained BERTopic model, we write LLM prompts to interact with the topics.

And also some qualitative evaluations

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
from datetime import datetime

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

In [5]:
import sys
from gensim.models import CoherenceModel

sys.path.append('../')

from eval_metrics import compute_inverted_rbo, compute_topic_diversity, compute_pairwise_jaccard_similarity, \
                        METRICS, SEARCH_BEHAVIOUR, COHERENCE_MODEL_METRICS

In [3]:
import platform
import torch

if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series machine

print(device)

mps


Dataset

same as training script, just need to reload and apply transform to the dataset for evaluation

In [2]:
dataset_path = Path('../../dataset/topic_modelling/top_11_genres/01_Indie.pkl')

dataset = pd.read_pickle(dataset_path)

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 741913 entries, 25636 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         741913 non-null  int64 
 1   app_id        741913 non-null  int64 
 2   app_name      741913 non-null  object
 3   review_text   741913 non-null  object
 4   review_score  741913 non-null  int64 
 5   review_votes  741913 non-null  int64 
 6   genre_id      741913 non-null  object
 7   category_id   741913 non-null  object
dtypes: int64(4), object(4)
memory usage: 50.9+ MB


In [8]:
%load_ext autoreload
# data preprocessing

sys.path.append('../../sa')

In [9]:
%autoreload 2
import str_cleaning_functions


def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

def cleaning_strlist(str_list):
    str_list = list(map(lambda x: str_cleaning_functions.remove_links(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.remove_links2(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.clean(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.deEmojify(x), str_list))
    str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
    return str_list

In [10]:
cleaning(dataset, 'review_text')
X = dataset['review_text'].values

# remove empty strings
X = list(filter(lambda x: len(x) > 0, X))

In [6]:
from bertopic import BERTopic

def _load_bertopic_model(model_path:Path):
    topic_model = BERTopic.load(str(model_path))

    return topic_model

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
search_behaviour = SEARCH_BEHAVIOUR.RANDOM_SEARCH
training_datetime = datetime(2024, 1, 24, 15, 30, 50)
training_folder = Path(f'bertopic_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')


training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

# embeddings
embeddings_path = training_folder.joinpath(
    f'embeddings_{training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"]}.pkl'
)
if embeddings_path.exists():
    with open(embeddings_path, 'rb') as f:
        embeddings = np.load(f)
else:
    raise Exception('No embeddings found. Function terminates.')


# model
best_model_checkpoint_path = training_result['best_model_checkpoint']

best_model = _load_bertopic_model(best_model_checkpoint_path)

topic_model = best_model
topics, probs = topic_model.transform(X, embeddings=embeddings)

2024-01-25 15:43:29,182 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [27]:
repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids = topic_model._extract_representative_docs(
    topic_model.c_tf_idf_,
    pd.DataFrame({"Document": X, "ID": range(len(X)), "Topic": topics}),
    topic_model.topic_representations_,
    nr_samples=500,
    nr_repr_docs=1,
)

topic_model.representative_docs_ = {k:v[0] for k, v in repr_docs_mappings.items()}

In [54]:
# get top N representative docs
N_repr_docs = 5

repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids = topic_model._extract_representative_docs(
    topic_model.c_tf_idf_,
    pd.DataFrame({"Document": X, "ID": range(len(X)), "Topic": topics}),
    topic_model.topic_representations_,
    nr_samples=500,
    nr_repr_docs=N_repr_docs,
)

In [55]:
repr_docs_mappings

{-1: [" I originally Kickstarted this one a long time ago before its second name change. What was originally known as Ravensdale is now finally released as Rogue Stormers. I didn't have that high of but perhaps too high, as I'm quite disappointed and underwhelmed by this final release. I don't care that much how it changed over development like others seem I just care about what we have now. I put in over a dozen hours, unlocked 2 extra characters, got a bunch of and I'm honestly bored with it, despite never seeing to the end of the game. This is probably as much as I'll ever play it. When you first start playing, it's impressive. The theme is clearly Warhammer, but for whatever reason, they didn't license it, even though that license is given out in backalleys at this point to anyone that wants it. Have you seen some of the Warhammer games on Steam these In any case, it's space marines and orks. It looks nice. It does have good controls, and switches between controller/keyboard on the

Qualitative Evaluation

In [13]:
topic_model.visualize_topics()

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [17]:
N_TOPICS = len(topic_model.get_topic_freq().index)
print(N_TOPICS)

71


In [18]:
# visualize c-TFIDF score for each topic representation

topic_model.visualize_barchart(top_n_topics=N_TOPICS)

selected topics for llm

[1, 3, 5, 7, 11, 16, 36, 45, 56, 67]

In [19]:
topic_model.visualize_heatmap(top_n_topics=N_TOPICS, width=1000, height=1000)

In [42]:
# create a table (df), showing the topic keywords and the most representative documents for each topic

topic_info = topic_model.get_topic_info()

topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,365992,-1_the_and_to_it,"[the, and, to, it, you, of, is, this, that, in]",Epanalepsis for me personally was a very frust...
1,0,242984,0_the_and_of_to,"[the, and, of, to, it, you, is, this, in, that]",Before i jump into things im gonna go ahead an...
2,1,13984,1_terraria_minecraft_bosses_and,"[terraria, minecraft, bosses, and, you, is, of...",Terraria. Because Minecraft's not on Steam!
3,2,12013,2_goat_rats_goats_simulator,"[goat, rats, goats, simulator, 10, bad, cat, t...",You goat to play this game.
4,3,10084,3_mac_crashes_it_fix,"[mac, crashes, it, fix, on, my, windows, scree...",DOES NOT WORK ON MAC. And what does the devel...
...,...,...,...,...,...
66,65,233,65_table_flip_tables_flipping,"[table, flip, tables, flipping, 10, flipped, f...",flip the table!!!
67,66,211,66_fap_sex_10_again,"[fap, sex, 10, again, times, caught, fapped, 6...",10/10 would fap again
68,67,196,67_screensaver_saver_screen_wallpaper,"[screensaver, saver, screen, wallpaper, intera...",It's a screensaver.
69,68,192,68_vᵃᵖᵒᵘʳʷᵃᵛᵉ_ᴱᵈᵍʸ_𝘺𝘰𝘶_𝘵𝘩𝘦,"[vᵃᵖᵒᵘʳʷᵃᵛᵉ, ᴱᵈᵍʸ, 𝘺𝘰𝘶, 𝘵𝘩𝘦, 𝘛𝘩𝘦, ㅤㅤ, 2015, _人...",𝘐 𝘨𝘰𝘵 𝘢 𝘧𝘦𝘦𝘭𝘪𝘯𝘨 𝘸𝘦 𝘢𝘳𝘦 𝘨𝘰𝘯𝘯𝘢 𝘸𝘪𝘯 𝘖𝘶𝘳 𝘣𝘰𝘥𝘪𝘦𝘴 𝘮𝘢...


In [38]:
import random

doc_info = topic_model.get_document_info(X)

doc_info = doc_info[~doc_info['Topic'].isin([-1, 0])]

doc_info

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Representative_document
36,Does not work in Mac OS or No support availabl...,3,3_mac_crashes_it_fix,"[mac, crashes, it, fix, on, my, windows, scree...",DOES NOT WORK ON MAC. And what does the devel...,mac - crashes - it - fix - on - my - windows -...,False
41,its good cuz therse cards an teyy fight guys,9,9_cards_review_trading_refund,"[cards, review, trading, refund, reviews, buy,...","got it for free, only played it to get the tra...",cards - review - trading - refund - reviews - ...,False
46,game keep crashing at a certain point every ti...,3,3_mac_crashes_it_fix,"[mac, crashes, it, fix, on, my, windows, scree...",DOES NOT WORK ON MAC. And what does the devel...,mac - crashes - it - fix - on - my - windows -...,False
47,WIll not open on mac.,3,3_mac_crashes_it_fix,"[mac, crashes, it, fix, on, my, windows, scree...",DOES NOT WORK ON MAC. And what does the devel...,mac - crashes - it - fix - on - my - windows -...,False
59,It's it comes free with windows,3,3_mac_crashes_it_fix,"[mac, crashes, it, fix, on, my, windows, scree...",DOES NOT WORK ON MAC. And what does the devel...,mac - crashes - it - fix - on - my - windows -...,False
...,...,...,...,...,...,...,...
741675,"Great game, nice and casual. 98/100",6,6_10_ign_11_m8,"[10, ign, 11, m8, r8, gr8, best, again, ever, ...",10\10 NGI 10/10 IGN,10 - ign - 11 - m8 - r8 - gr8 - best - again -...,False
741682,Neat.,59,59_crust_pie_sugar_pastry,"[crust, pie, sugar, pastry, degrees, lattice, ...",If your looking for a review on this game you'...,crust - pie - sugar - pastry - degrees - latti...,False
741684,Good Chillout Game,8,8_fun_addictive_addicting_relaxing,"[fun, addictive, addicting, relaxing, hard, ve...",Very fun and addicting.,fun - addictive - addicting - relaxing - hard ...,False
741706,Awesome game. One of the best I've ever played...,8,8_fun_addictive_addicting_relaxing,"[fun, addictive, addicting, relaxing, hard, ve...",Very fun and addicting.,fun - addictive - addicting - relaxing - hard ...,False


In [46]:
# search for a topic given a word
# then list information about these topics

topic_ids, topic_probs = topic_model.find_topics('Graphics are amazing')

specific_topic_info = topic_info[topic_info['Topic'].isin(topic_ids)]
specific_topic_info = specific_topic_info.sort_values('Topic', key=lambda s: s.apply(topic_ids.index), ignore_index=True)
specific_topic_info['Probs'] = topic_probs

specific_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Probs
0,0,242984,0_the_and_of_to,"[the, and, of, to, it, you, is, this, in, that]",Before i jump into things im gonna go ahead an...,0.54206
1,-1,365992,-1_the_and_to_it,"[the, and, to, it, you, of, is, this, that, in]",Epanalepsis for me personally was a very frust...,0.528156
2,5,5546,5_hot_superhot_guns_super,"[hot, superhot, guns, super, gun, gungeon, the...",SUPER. HOT. SUPER. HOT. SUPER. HOT. SUPER. HOT...,0.465767
3,22,1651,22_hacking_tutorial_hacker_ai,"[hacking, tutorial, hacker, ai, hackers, hack,...","AI War: Fleet Command (AIW), is an Asymmetrica...",0.458109
4,8,4954,8_fun_addictive_addicting_relaxing,"[fun, addictive, addicting, relaxing, hard, ve...",Very fun and addicting.,0.457484


---

Prompt LLM to generate topic name based on keywords

in BERTopic implementation, it uses the keywords, and the top 4 representative sentences to generate a short phrase (2 to 3 words) to represent the topic

instead of using their own implementation, we use Ollama and langchain for flexibility (and test abt performance on diff models)

In [68]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [59]:
llm = Ollama(model='llama2')

In [66]:
system_message = 'You are an english speaking professional game reviewer. You are expert in generating keywords summary for a given game and reviews.'

human_template = \
'''I have a topic described by the following KEYWORDS and top {N} representative documents. Describe the topic where exact word should exist in the user inputed text.

Here is the KEYWORDS \'\'\'{keywords}\'\'\'

Here is the top {N} representative documents \'\'\'{topic_reviews}\'\'\'

Output only the name of the topic. Do NOT output adjectives which are not in the KEYWORDS nor the top {N} representative documents.'''

chat_prompt = ChatPromptTemplate.from_messages([
    ('system', system_message),
    ('human', human_template)
])

chain = chat_prompt | llm

In [50]:
# selected topic IDs: 

selected_topic_ids = [1, 3, 5, 7, 11, 16, 36, 45, 56, 67]

selected_topic_info = topic_info[topic_info['Topic'].isin(selected_topic_ids)]

selected_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
2,1,13984,1_terraria_minecraft_bosses_and,"[terraria, minecraft, bosses, and, you, is, of...",Terraria. Because Minecraft's not on Steam!
4,3,10084,3_mac_crashes_it_fix,"[mac, crashes, it, fix, on, my, windows, scree...",DOES NOT WORK ON MAC. And what does the devel...
6,5,5546,5_hot_superhot_guns_super,"[hot, superhot, guns, super, gun, gungeon, the...",SUPER. HOT. SUPER. HOT. SUPER. HOT. SUPER. HOT...
8,7,5120,7_controller_finger_punch_keyboard,"[controller, finger, punch, keyboard, mouse, c...",(This review has been slightly altered. If you...
12,11,4172,11_bird_birds_dating_duck,"[bird, birds, dating, duck, quack, story, pige...",This is going to be my first review made ever....
17,16,2881,16_shark_sharks_divers_diver,"[shark, sharks, divers, diver, as, depth, fish...",Depth is a first person multiplayer game pitti...
37,36,699,36_scary_spooky_spooks_spooked,"[scary, spooky, spooks, spooked, 2spooky4me, s...",scary
46,45,508,45_spacechem_chemistry_puzzle_molecules,"[spacechem, chemistry, puzzle, molecules, prog...",Zachtronic + Chemistry + Insane puzzle = Space...
57,56,330,56_trump_donald_american_america,"[trump, donald, american, america, freedom, mu...",Best Donald Trump Simulator of the world.
68,67,196,67_screensaver_saver_screen_wallpaper,"[screensaver, saver, screen, wallpaper, intera...",It's a screensaver.


In [76]:
all_topics_idx = topic_info[topic_info['Topic'].apply(lambda x: x >= 0)]['Topic'].values

all_topics_idx

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69])

In [77]:
# K = 5
N = 4



for id in all_topics_idx:
    # get topic keywords and top N representative documents

    topic_keywords = topic_info[topic_info['Topic'] == id]['Representation'].values[0]
    top_N_repr_docs = repr_docs_mappings[id][:N]
    print(topic_keywords)
    print('\n'.join(top_N_repr_docs))

    msg = chain.invoke({
        "N": N,
        # "K": K,
        "keywords": topic_keywords,
        "topic_reviews": '\n'.join(top_N_repr_docs)
    })

    print(msg)

    print('\n\n')


['the', 'and', 'of', 'to', 'it', 'you', 'is', 'this', 'in', 'that']
This game is I love it. I can't stop playing. It's so easy to lose track of time. I've played 11+ hours and it felt like I only played for 5 hours. People will say it's like minecraft and other games similar but don't knock it till you try it. It's worth the money and I like that you can play with friends. I currently don't have anyone who I know playing this yet, so I have yet to explore the adventure with others. The art style is just sooo cute. I love the slimes the best. This game starts of super cute and fun with the slimes, bunnies and pigs but go into the wrong portal and hell awaits. I think it's a fun way to explore diffrent places and a good way to test how good you are at the game. It may be a little sad when you die and you got so far but each time you play you get better and better. Just don't stay still for a long will face the horrible fate that has come across you.
If the retail price was $1, I'd recomm