In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
from datetime import datetime

import gensim
import nltk
import pyLDAvis

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

In [2]:
import platform
import torch

# Select the compute device for SBERT encoding.
# Linux/Windows: CUDA when present, otherwise CPU.
# Anything else (macOS): Apple-silicon MPS only when PyTorch reports it as usable,
# otherwise CPU, so Intel Macs or builds without MPS do not fail later at encode time.
if platform.system() in ('Linux', 'Windows'):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
elif torch.backends.mps.is_available():
    device = torch.device('mps')        # m-series machine
else:
    device = torch.device('cpu')

print(device)

cuda


In [3]:
%load_ext autoreload

In [4]:
import sys

sys.path.append('../')

In [5]:
%autoreload 2
from dataset_loader import GENRES, load_dataset

# Genre evaluated in this notebook; its value and string form are used to build file names further down.
genre = GENRES.INDIE
# Column names that are comma-joined into the dataset folder name when the raw dataset is loaded.
# NOTE(review): presumably the columns the reviews were de-duplicated on - confirm in dataset_loader.
# unique_list = ['app_id', 'review_text']
unique_list = ['review_text']

---

Load the dataset from raw

(split and not split)

In [6]:
# load the dataset
# TODO: load any external dataset

# The folder name embeds the comma-joined `unique_list`, e.g. top_11_genres_unique_[review_text]
dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]')
# NOTE(review): the second return value is presumably the path of the file that was read - confirm in dataset_loader
dataset, dataset_path = load_dataset(genre, dataset_folder)

# new: create an untouched ver of the dataset for retrieving original text
# (copied before any cleaning is applied to `dataset`)
dataset_untouched = dataset.copy()

dataset.info(verbose=True)

Load dataset from: /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[review_text]/01_indie.pkl





<class 'pandas.core.frame.DataFrame'>
Index: 725737 entries, 25636 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         725737 non-null  int64 
 1   app_id        725737 non-null  int64 
 2   app_name      725737 non-null  object
 3   review_text   725737 non-null  object
 4   review_score  725737 non-null  int64 
 5   review_votes  725737 non-null  int64 
 6   genre_id      725737 non-null  object
 7   category_id   725737 non-null  object
dtypes: int64(4), object(4)
memory usage: 49.8+ MB


In [7]:
# data preprocessing

sys.path.append('../../sa')

%autoreload 2
import str_cleaning_functions


def cleaning(df, review):
    """Clean the text column ``review`` of ``df`` in place.

    Every value of the column is passed, in this order, through the
    ``str_cleaning_functions`` helpers ``remove_links``, ``remove_links2``,
    ``clean``, ``deEmojify`` and ``unify_whitespaces``; the column is written
    back after each step.

    Args:
        df: DataFrame holding the text column (mutated).
        review: name of the column to clean.

    Returns:
        None. The result is stored in ``df[review]``.
    """
    cleaning_steps = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.unify_whitespaces,
    )
    for step in cleaning_steps:
        df[review] = df[review].apply(step)

# def cleaning_strlist(str_list):
#     str_list = list(map(lambda x: str_cleaning_functions.remove_links(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.remove_links2(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.clean(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.deEmojify(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
#     return str_list

In [8]:
cleaning(dataset, 'review_text')

In [9]:
# same as LDA, we skip removing reviews with too many punctuations for more realistic results

# def calculate_nonalphabet_ratio(review: str) -> float:
#     count = 0
#     for char in review:
#         if not char.isalpha():
#             count += 1
#     return count / (len(review) + 1e-5)

# dataset['alphabet_ratio'] = dataset['review_text'].apply(calculate_nonalphabet_ratio)

# dataset['alphabet_ratio'].describe([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

# dataset = dataset[dataset['alphabet_ratio'] < 0.40]

In [10]:
# remove empty strings

dataset = dataset[dataset['review_text'].apply(lambda x: len(x) > 0)]

In [11]:
# check the length b4 saving
print(len(dataset))

725707


In [11]:
# save the dataset for eval

# The folder is derived from `genre` (as the file name already is) so the notebook
# keeps working when another genre is selected.
dataset_path = Path(f'category_{str(genre)}_unique_review_text').joinpath(
    Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl')
)

if not dataset_path.exists():
    # to_pickle does not create missing folders
    dataset_path.parent.mkdir(parents=True, exist_ok=True)
    dataset.to_pickle(dataset_path)
    print('Save to', dataset_path)
else:
    # never overwrite an existing file; delete it manually to regenerate
    print(f'File {dataset_path} already exists')
    print('Skip saving')

In [13]:
X = dataset['review_text'].values

In [14]:
# check the length to be identical in the training script
print(len(X))
print(X[0])

718440
Take one part Faerie Solitaire and two parts Puzzle Quest and mix in a little Poker or Yahtzee for good measure and you will get something like Runespell: Overture. You're a changeling of some sort and you fight monsters and take quests in exchange for coin and buffs (which come in the form of power-up cards). There's a story but it's not the strongest element in the game. Like the Puzzle Quest games, your battles are determined by playing a mini-game. Instead of match-3 though, the game is a card game similar to poker in which making certain combinations of cards (pairs, 5 of a kind, full house, flush, straight) will do a certain amount of damage to your opponent, who is trying to do the same to you. The ability to steal some cards from your opponent, plus the limited number of moves you get per turn to move cards or play power-ups adds just enough strategy to the game to keep it interesting. Admittedly, the game can get a bit repetitive after a while and I found the dialogue o

Create split text for models trained with split tokens

In [35]:
# token-splitting helper functions
# copied from bertopic_training.ipynb on 20240217

def split_tokens_into_smaller_chunks(
    data,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
) -> dict:
    """Split every tokenised row of a batch into chunks.

    ``data`` must provide the parallel columns ``input_ids``, ``token_type_ids``,
    ``attention_mask`` and ``X_iloc``. Each of the three token columns is cut
    with ``split_overlapping`` using the same parameters. ``X_iloc`` of the
    source row is repeated once per chunk of its ``input_ids`` (eval only), so
    each chunk can be traced back to its original row.

    Returns:
        dict with the same four keys; the lists may hold more (or fewer)
        entries than the input batch.
    """
    token_fields = ('input_ids', 'token_type_ids', 'attention_mask')
    chunked = {'input_ids': [], 'token_type_ids': [], 'attention_mask': [], 'X_iloc': []}

    rows = zip(data['input_ids'], data['token_type_ids'], data['attention_mask'], data['X_iloc'])
    for *token_lists, source_row in rows:
        pieces = [
            split_overlapping(tokens, chunk_size, stride, minimal_chunk_length)
            for tokens in token_lists
        ]
        for field, field_chunks in zip(token_fields, pieces):
            chunked[field].extend(field_chunks)

        # eval_only: one origin marker per chunk of input ids
        chunked['X_iloc'].extend([source_row] * len(pieces[0]))

    return chunked

def split_overlapping(tensor:list[int], chunk_size: int, stride: int, minimal_chunk_length: int) -> list[list[int]]:
    """Divide a 1-dimensional token list into (possibly overlapping) chunks.

    A chunk starts every ``stride`` positions and holds at most ``chunk_size``
    items. Chunks overlap when ``stride < chunk_size``.

    Args:
        tensor: flat list of token values.
        chunk_size: maximum length of a chunk; must be >= 1.
        stride: distance between consecutive chunk starts; must be >= 1.
        minimal_chunk_length: when more than one chunk is produced, chunks
            shorter than this are dropped. A lone chunk is always kept.

    Returns:
        List of chunks; empty when ``tensor`` is empty.

    Raises:
        ValueError: if ``chunk_size`` or ``stride`` is not positive. Without
            this check a negative stride silently yields no chunks and a
            non-positive chunk size yields empty or wrongly sliced chunks.
    """
    if chunk_size < 1 or stride < 1:
        raise ValueError(
            f'chunk_size and stride must be positive, got chunk_size={chunk_size}, stride={stride}'
        )

    chunks = [tensor[start : start + chunk_size] for start in range(0, len(tensor), stride)]
    if len(chunks) > 1:
        # ignore chunks with less than minimal_length number of tokens
        chunks = [chunk for chunk in chunks if len(chunk) >= minimal_chunk_length]
    return chunks


def tokenize_dataset(data, tokenizer):
    """Tokenise ``data['text']`` with ``tokenizer`` and return its output unchanged.

    The tokenizer is asked for attention masks and token type ids, without
    special tokens, without truncation, and with ``return_tensors=None``.
    """
    tokenizer_options = dict(
        return_attention_mask=True,
        return_token_type_ids=True,
        add_special_tokens=False,
        return_tensors=None,
        truncation=False,
    )
    return tokenizer(data['text'], **tokenizer_options)

In [36]:
from sentence_transformers import SentenceTransformer
from datasets import Dataset

split_sentence = True
sbert_model_name = 'all-MiniLM-L6-v2'       # !!! check with the model to be loaded
sbert = SentenceTransformer(sbert_model_name, device=device)

if not split_sentence:
    # one embedding per cleaned review
    embeddings = sbert.encode(X, show_progress_bar=True, batch_size=64)
else:
    tokenizer = sbert[0].tokenizer
    # chunk length and stride are equal, so consecutive chunks do not overlap
    # NOTE(review): the "-2" presumably leaves room for the special tokens added at encode time - confirm
    max_chunk_tokens = sbert.max_seq_length-2

    # tokenize the dataset
    ds_sentences = Dataset.from_dict({'text': X})
    ds_sentences = ds_sentences.map(tokenize_dataset, batched=True, fn_kwargs={'tokenizer':tokenizer})

    # then split the tokens into smaller chunks; X_iloc records the row position of every review in X
    token_columns = {name: ds_sentences[name] for name in ('input_ids', 'token_type_ids', 'attention_mask')}
    token_columns['X_iloc'] = list(range(len(X)))
    ds_sentences2 = Dataset.from_dict(token_columns)
    ds_sentences2 = ds_sentences2.map(
        split_tokens_into_smaller_chunks,
        batched=True,
        fn_kwargs={'chunk_size': max_chunk_tokens, 'stride': max_chunk_tokens, 'minimal_chunk_length': 1},
    )

    # re-create new sentences based on tokens
    X_new = [
        tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_id))
        for input_id in ds_sentences2['input_ids']
    ]

    embeddings = sbert.encode(X_new, show_progress_bar=True, batch_size=64)

    print('Created embeddings with split sentences')
    

Map:   0%|          | 0/725707 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (914 > 512). Running this sequence through the model will result in indexing errors
Map:   0%|          | 3000/725707 [00:00<01:10, 10283.75 examples/s]

Map: 100%|██████████| 725707/725707 [01:28<00:00, 8227.34 examples/s] 
Map: 100%|██████████| 725707/725707 [00:37<00:00, 19578.80 examples/s]
Batches: 100%|██████████| 12362/12362 [04:57<00:00, 41.62it/s] 


Created embeddings with split sentences


In [45]:
len(ds_sentences2['X_iloc'])

791154

In [44]:
dataset.iloc[ds_sentences2['X_iloc']]

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25637,32134,102200,Runespell: Overture,Why don't they make more games like this?! Sim...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
...,...,...,...,...,...,...,...,...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179605,6416381,99900,Spiral Knights,This game use to be they did the rework on ene...,0,0,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179607,6416383,99900,Spiral Knights,This game is good to play by your self or with...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"


In [46]:
# One row per chunk: repeat each review's row for every chunk created from it.
# .copy() makes this an independent frame, so the column assignments below do not
# write onto an indexing result of `dataset` (avoids SettingWithCopyWarning).
dataset_eval_split = dataset.iloc[ds_sentences2['X_iloc']].copy()
dataset_eval_split['review_text_split'] = X_new

# load the untouched dataset, create a column for the review_text from the untouched dataset
# (single .loc[rows, column] lookup instead of chained .loc[rows][column])
dataset_eval_split['review_text_untouched'] = dataset_untouched.loc[dataset_eval_split.index, 'review_text'].values

dataset_eval_split

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,review_text_split,review_text_untouched
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",take one part faerie solitaire and two parts p...,Take one part Faerie Solitaire and two parts P...
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",it ' s an entertaining casual game to play. it...,Take one part Faerie Solitaire and two parts P...
25637,32134,102200,Runespell: Overture,Why don't they make more games like this?! Sim...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",why don ' t they make more games like this?! s...,Why don't they make more games like this?! Si...
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",runespell : overture melds together classic rp...,Runespell: Overture melds together classic RPG...
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]","##itaire, poker game - and it is done very wel...",Runespell: Overture melds together classic RPG...
...,...,...,...,...,...,...,...,...,...,...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",i hadn ' t played spiral knights for over 2 ye...,I hadn't played Spiral Knights for over 2 year...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",you go out of your way to organise something w...,I hadn't played Spiral Knights for over 2 year...
4179605,6416381,99900,Spiral Knights,This game use to be they did the rework on ene...,0,0,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",this game use to be they did the rework on ene...,This game use to be good..until they did the r...
4179607,6416383,99900,Spiral Knights,This game is good to play by your self or with...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",this game is good to play by your self or with...,This game is good to play by your self or with...


In [47]:
# save the eval dataset for reloading

# The file name uses the "_[split]" suffix that the "load the preprocessed data" cell
# reads back; the previous "_split" suffix produced a file that cell never found.
# The folder is derived from `genre` for the same reason.
dataset_path = Path(f'category_{str(genre)}_unique_review_text').joinpath(
    Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval_[split].pkl')
)

if not dataset_path.exists():
    # to_pickle does not create missing folders
    dataset_path.parent.mkdir(parents=True, exist_ok=True)
    dataset_eval_split.to_pickle(dataset_path)

    print('Save to', dataset_path)
else:
    # never overwrite an existing file; delete it manually to regenerate
    print(f'File {dataset_path} already exists')
    print('Skip saving')

----

Or load the preprocessed data (saved during training/first evaluation)

In [6]:
split_sentence = True

dataset_preprocessed_path = Path(f'category_{str(genre)}_unique_review_text')
dataset_preprocessed_path = dataset_preprocessed_path.joinpath(
    Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval{"_[split]" if split_sentence else ""}.pkl')
)

print(dataset_preprocessed_path)

if dataset_preprocessed_path.exists():
    # NOTE: unpickling executes code from the file; only load pickles produced by this project.
    dataset = pd.read_pickle(dataset_preprocessed_path)
    print(len(dataset))
    print('\n\n')
    # DataFrame.info() prints its report itself and returns None,
    # so it must not be wrapped in print() (that printed a stray "None").
    dataset.info(verbose=True)

    # documents fed to the topic model: the chunks when split, the cleaned reviews otherwise
    X = dataset['review_text_split'].values if split_sentence else dataset['review_text'].values
else:
    print(f"{dataset_preprocessed_path} does not exist")

category_indie_unique_review_text/preprocessed_data/01_indie_dataset_eval_[split].pkl
791154



<class 'pandas.core.frame.DataFrame'>
Index: 791154 entries, 25636 to 4179608
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   index                  791154 non-null  int64 
 1   app_id                 791154 non-null  int64 
 2   app_name               791154 non-null  object
 3   review_text            791154 non-null  object
 4   review_score           791154 non-null  int64 
 5   review_votes           791154 non-null  int64 
 6   genre_id               791154 non-null  object
 7   category_id            791154 non-null  object
 8   review_text_split      791154 non-null  object
 9   review_text_untouched  791154 non-null  object
dtypes: int64(4), object(6)
memory usage: 66.4+ MB
None


---

Evaluation

In [7]:
from eval_metrics import SEARCH_BEHAVIOUR
from bertopic_utils import _load_bertopic_model

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Test whether the results are the same when the model is loaded from disk

# load the best model and the embedding from the config folder

split_sentence = True
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
# timestamp of the training run to evaluate; it is part of the training folder name
# training_datetime = datetime(2024, 2, 16, 9, 47, 40)
training_datetime = datetime(2024, 2, 14, 11, 15, 56)
# parent folder holding every run for this genre
training_folder_p = Path(f'category_{str(genre)}_unique_review_text')
# run folder: bertopic[split]_genre_<genre>_<search behaviour>_<YYYYmmdd_HHMMSS>
training_folder = Path(f'bertopic{"[split]" if split_sentence else ""}_genre_{str(genre)}_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
training_folder = training_folder_p.joinpath(training_folder)


# result.json of the run; read below for the 'best_model_checkpoint' entry
training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

# embeddings (uncomment it to load embeddings from training set for quick verification)
# embeddings_path = training_folder.joinpath(
#     f'embeddings_{training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"]}.pkl'
# )
# if embeddings_path.exists():
#     with open(embeddings_path, 'rb') as f:
#         embeddings = np.load(f)

#     assert embeddings.shape[0] == len(X), f'Number of embeddings ({embeddings.shape[0]}) does not match the number of reviews ({len(X)}). Function terminates.'
# else:
#     # raise Exception('No embeddings found. Function terminates.')
#     print('No embeddings found.')
#     embeddings = []


# model
# the stored checkpoint path is joined onto the genre folder (not the run folder)
# best_model_checkpoint_path = training_result['best_model_checkpoint']
best_model_checkpoint_path = training_folder_p.joinpath(
    Path(training_result['best_model_checkpoint'])
)

print(best_model_checkpoint_path)

category_indie_unique_review_text/bertopic[split]_genre_indie_grid_search_20240214_111556/bertopic_bt_nr_topics_100


In [10]:
# Create embeddings from datasets (eval)
from sentence_transformers import SentenceTransformer

# sbert_model = SentenceTransformer(training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"])
sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)        # we only use this model

# encode the chunks when the model was trained on split text, the cleaned reviews otherwise
text_column = 'review_text_split' if split_sentence else 'review_text'
embeddings = sbert_model.encode(dataset[text_column].values, show_progress_bar=True, batch_size=64)

# save the embeddings (an existing file is never overwritten)
save_embs = True
if save_embs:
    embeddings_path = Path(f'category_{str(genre)}_unique_review_text').joinpath(
        Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_embeddings_eval{"_[split]" if split_sentence else ""}.npy')
    )

    if embeddings_path.exists():
        print(f'File {embeddings_path} already exists')
        print('Skip saving')
    else:
        with open(embeddings_path, 'wb') as f:
            np.save(f, embeddings)

        print('Save embedding to', embeddings_path)

Batches: 100%|██████████| 12362/12362 [03:49<00:00, 53.79it/s] 


Save embedding to category_indie_unique_review_text/preprocessed_data/01_indie_embeddings_eval_[split].npy


In [20]:
# reload the embeddings cached by the "create embeddings" cell

embeddings_file = Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_embeddings_eval{"_[split]" if split_sentence else ""}.npy')
embeddings_path = Path(f'category_{str(genre)}_unique_review_text').joinpath(embeddings_file)

if not embeddings_path.exists():
    # nothing cached yet: `embeddings` keeps whatever value it had before this cell
    print(f"{embeddings_path} does not exist")
else:
    embeddings = np.load(embeddings_path)

In [11]:
# or change the model path to a different one if needed
best_model_checkpoint_path = Path(best_model_checkpoint_path).parent.joinpath('bertopic_bt_nr_topics_40')

In [12]:
print(best_model_checkpoint_path)

category_indie_unique_review_text/bertopic[split]_genre_indie_grid_search_20240214_111556/bertopic_bt_nr_topics_40


In [13]:
best_model_loaded = _load_bertopic_model(best_model_checkpoint_path)

# documents to assign: the chunks when split, the cleaned reviews otherwise
X = dataset['review_text_split'].values if split_sentence else dataset['review_text'].values

# `embeddings` may come from several cells (freshly encoded or loaded from disk);
# fail loudly if it does not line up with X instead of assigning topics from stale vectors.
assert len(embeddings) == len(X), (
    f'Number of embeddings ({len(embeddings)}) does not match the number of reviews ({len(X)}).'
)

# precomputed embeddings are passed so the documents are not re-encoded here
topics, probs = best_model_loaded.transform(X, embeddings=embeddings)

topic_model = best_model_loaded

2024-02-18 23:58:33,406 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


---

For visualization, refer to bertopic_eval_vis.ipynb

---

In [14]:
print(len(X))
print(topics.shape)

791154
(791154,)


In [15]:
print(probs[0])
print(len(probs[0]))

[0.5374453  0.5189512  0.4346826  0.37318867 0.58256996 0.2778609
 0.47171327 0.45534953 0.41457385 0.4298167  0.5155171  0.32520804
 0.36372456 0.18628922 0.39025533 0.32786146 0.24533634 0.39875606
 0.4266532  0.36006427 0.34415138 0.37041742 0.25701025 0.31673697
 0.3491488  0.33225518 0.40629965 0.25379664 0.28325963 0.4382311
 0.33966392 0.32805648 0.20663342 0.42446476 0.3163567  0.3581906
 0.33609867 0.28339902 0.4587275  0.43734604 0.40636736]
41


Get the top 10 keywords for each topic

In [21]:
top_N_words = 10

# topic id -> its first `top_N_words` keywords.
# Each entry of get_topics() is a sequence whose items start with the keyword
# (the original code read item[0]). Slicing instead of indexing 0..top_N_words-1
# avoids an IndexError when a topic carries fewer keywords than requested.
topic_keywords = {
    topic_id: [entry[0] for entry in _keywords[:top_N_words]]
    for topic_id, _keywords in topic_model.get_topics().items()
}

for topic_id, keywords in topic_keywords.items():
    print(f'Topic {topic_id}:')
    print(', '.join(keywords))
    print()

Topic -1:
like, play, fun, really, good, 10, great, gameplay, playing, little

Topic 0:
fun, like, play, great, minecraft, good, best, really, friends, buy

Topic 1:
horror, scary, freddy, night, nights, scares, fnaf, scared, scare, jumpscares

Topic 2:
minecraft, 2d, items, world, building, hours, build, new, like, friends

Topic 3:
puzzle, puzzles, minesweeper, platformer, like, short, challenging, great, really, click

Topic 4:
10, ign, 11, pig, killed, wolves, play, best, kill, fun

Topic 5:
dating, visual, novel, stanley, parable, lisa, birds, ending, moon, vn

Topic 6:
addictive, addicting, fun, hours, addicted, boring, waster, crack, played, drugs

Topic 7:
sam, guns, hot, gun, weapons, innovative, played, narrator, gameplay, years

Topic 8:
ship, ships, shark, divers, diver, crew, sea, pirate, combat, like

Topic 9:
hard, level, fun, challenging, relaxing, simple, challenge, quit, really, difficult

Topic 10:
goat, goats, simulator, 10, animal, farm, life, lick, best, mmo

Topi

---

Get the most representative docs per topic

In [22]:
# get top 10 representative docs for each topic

# Approximate most representative documents per topic by sampling
# a subset of the documents in each topic and calculating which are
# most representative to their topic based on the cosine similarity between
# c-TF-IDF representations

# the method was called internally in the fit_transform method
# so that the .get_topic_info() can work properly when the model was reloaded from disk
# NOTE(review): `_extract_representative_docs` is a private BERTopic method (leading underscore);
# its signature and 4-tuple return value may change between versions - confirm after any upgrade.
repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids = topic_model._extract_representative_docs(
    topic_model.c_tf_idf_,
    # one row per evaluated document: its text, its row position in X, and its assigned topic
    pd.DataFrame({"Document": X, "ID": range(len(X)), "Topic": topics}),
    topic_model.topic_representations_,
    nr_samples=500,          # NOTE(review): presumably the number of documents sampled per topic - confirm
    nr_repr_docs=10          # the number of representative documents per topic
)

In [28]:
len(repr_docs_ids)

41

In [23]:
# the mapping is in no particular order

repr_docs_mappings

{-1: ["this is a great time killer in my opinion. personally i play it on difficult and see how far i can go. even on easy it ' s really fun to play through as well, because it is still fairly difficult. a game i ' d recommend anyone to play if they enjoy difficulty or if they have time to kill.",
  "custom maps are back. that ' s pretty much the only thing you need to know if you ' re thinking about getting this game. they give the game tons of hours of play time. as of now though there are almost none since well, the game just launched but soon however i know we ' ll see some epic stories. alright, now moving to the game itself. i played it for about an hour and so far i can say that it ' s awesome. the graphics aren ' t extremely good, they ' re alright, acceptable but nothing more than that. the atmosphere, sound and gameplay are the things that matter here however and those seem to be put in their right place. the story seems to be interesting so far and yeah, i ' ll be updating t

Use the returned document ids to select only the representative docs from the dataframe

In [30]:
# Row positions (into `dataset` / X) of every representative doc, flattened in the
# order returned by _extract_representative_docs (one id list per topic).
repr_doc_positions = np.asarray(
    [doc_id for topic_doc_ids in repr_docs_ids for doc_id in topic_doc_ids],
    dtype=int,
)

# The topic of each representative doc is read from `topics`, the assignment that was
# handed to _extract_representative_docs: representative docs are sampled from the
# documents of their own topic. This replaces "list position - 1", which is only right
# when an outlier topic (-1) exists and the topic ids are contiguous.
# .assign() returns a new frame, so nothing is written onto an indexing result of `dataset`.
df_original_texts = dataset.iloc[repr_doc_positions].assign(
    topic_id=np.asarray(topics)[repr_doc_positions]
)
df_original_texts

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,review_text_split,review_text_untouched,topic_id
2541673,3971922,310890,Breach & Clear: Deadline Rebirth (2016),This game is pretty decent. The best part abou...,1,0,"[1, 23, 3, 2]","[2, 22, 28, 29, 15, 25]",this game is pretty decent. the best part abou...,This game is pretty decent. The best part abou...,-1
2976086,4635679,356570,Party Hard,not a bad game if u like a bit of party killin...,1,0,"[1, 23, 2]","[2, 1, 9, 39, 24, 22, 28, 29, 42, 43, 44]",not a bad game if u like a bit of party killin...,not a bad game if u like a bit of party killin...,-1
3825969,5941023,533300,Zup!,"The devs of this game are seriously geniuses, ...",1,0,"[4, 23]","[2, 22, 29, 23]","the devs of this game are seriously geniuses, ...","The devs of this game are seriously geniuses, ...",-1
2633346,4101564,317410,Color Symphony,Color Symphoney. Really I don't have to say an...,1,0,"[1, 37, 23]","[2, 28, 23]",color symphoney. really i don ' t have to say ...,Color Symphoney. Really I don't have to say an...,-1
3992854,6183334,63700,BIT.TRIP BEAT,"Although hard, this game is fun and has a grea...",1,0,"[1, 4, 23]","[2, 22, 18, 23, 25]","although hard, this game is fun and has a grea...","Although hard, this game is fun and has a grea...",-1
...,...,...,...,...,...,...,...,...,...,...,...
2779714,4331385,333600,NEKOPARA Vol. 1,This game is as colorful as my language while ...,1,1,"[4, 23]","[2, 22, 29, 23, 41, 42]",this game is as colorful as my language while ...,This game is as colorful as my language while ...,39
2168257,3352512,274980,Influent,Great game! It doesnt really help you to learn...,1,0,"[4, 23, 28, 2]","[2, 22, 29, 13, 23, 15]",great game! it doesnt really help you to learn...,Great game! It doesnt really help you to learn...,39
2168712,3353068,274980,Influent,"As a vocab primer, it's fun. My Spanish is OK ...",0,0,"[4, 23, 28, 2]","[2, 22, 29, 13, 23, 15]","as a vocab primer, it ' s fun. my spanish is o...","As a vocab primer, it's fun. My Spanish is OK ...",39
2168642,3352986,274980,Influent,I want to make it clear before I go on that I ...,0,0,"[4, 23, 28, 2]","[2, 22, 29, 13, 23, 15]",i want to make it clear before i go on that i ...,I want to make it clear before I go on that I ...,39


In [36]:
# check which repr docs do not begin from beginning of the original review_text

def check_beginning_of_review_text(row):
    """Return True when the chunk does not start where the review starts.

    Compares, case-insensitively, the first whitespace-separated token of
    ``row['review_text_split']`` with the equally long prefix of
    ``row['review_text']``.
    """
    first_token = row['review_text_split'].split()[0].lower()
    review_prefix = row['review_text'][:len(first_token)].lower()
    return first_token != review_prefix

# row-wise flag: True where the representative chunk is not the opening chunk of its review
not_from_start = df_original_texts.apply(check_beginning_of_review_text, axis=1)
df_original_texts[not_from_start]

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,review_text_split,review_text_untouched,topic_id
2023492,3136574,265930,Goat Simulator,10/10,1,0,"[4, 23, 28]","[2, 1, 24, 22, 28, 29, 30, 23, 25, 17, 43, 44]",10 / 10,... 10/10,4
2265659,3484493,285840,Enemy Mind,"I have over 600 games in my Steam account, and...",1,0,"[1, 23]","[2, 1, 24, 28, 29, 25, 44]","in enemy mind, you don ' t pick up powerups, b...","I have over 600 games in my Steam account, and...",8
2588285,4033857,312960,Starion Tactics,Bugs: - Sometimes had trouble moving ships. - ...,0,0,"[1, 23, 2]","[2, 1, 9, 22, 29]","your own empire, which makes them difficult ( ...",Bugs: - Sometimes had trouble moving ships. - ...,8
2744884,4276279,329130,Reassembly,If you're a sucker for design-it-and-fail kind...,1,0,"[1, 23, 2]","[2, 22, 28, 29, 30, 23, 17]",for - free mode if you don ' t mind grinding a...,If you're a sucker for design-it-and-fail kind...,8
2446342,3770732,300060,ADR1FT,Played it with Vive System. I sure haven't pla...,0,1,"[25, 23]","[2, 22, 28, 52, 53, 31]","is a really poor choice in a vr game, even if ...",Played it with Vive System. I sure haven't pla...,16
2800673,4359636,334230,Town of Salem,Town of Salem is presented somewhat in the sty...,1,1,"[23, 3, 2]","[1, 27, 22, 35, 23]",", and win the game. so what is the gameplay? w...",Town of Salem is presented somewhat in the sty...,17
2464730,3838141,302610,Boson X,Quite possibly the most addicting game I have ...,1,0,"[1, 23]","[2, 22, 28, 29, 25]",. the music is simple. each level has two loop...,Quite possibly the most addicting game I have ...,19
56770,66892,105000,A New Beginning - Final Cut,A New Beginning- Final Cut Fast Review. Point ...,1,0,"[25, 23]","[2, 22]",s a pretty easy perfect ( 100 % achievements )...,A New Beginning- Final Cut Fast Review. Point...,24
216164,259836,11340,Larva Mortus,Top down action RPG-lite with a nice atmospher...,1,0,"[1, 23]","[2, 22]",the achievements.,Top down action RPG-lite with a nice atmospher...,24
2006992,3115635,265630,Fistful of Frags,Pass the Whiskey! Pass the Whiskey! Pass the W...,1,0,"[1, 37, 23]","[1, 49, 36, 47, 9, 38, 48, 27, 22, 30, 15, 25,...",whiskey! pass the whiskey! pass the whiskey! p...,Pass the Whiskey! Pass the Whiskey! Pass the W...,31


In [38]:
# for every topic, list its representative docs: cleaned original text and, when split, the chunk

for topic_id in repr_docs_mappings:
    print(f'Topic {topic_id}:')

    topic_rows = df_original_texts.loc[df_original_texts['topic_id'] == topic_id]
    for index, row in topic_rows.iterrows():
        doc_lines = [f'Doc {index}', f'Original: {row["review_text"]}']
        if split_sentence:
            doc_lines.append(f'Split: {row["review_text_split"]}')
        print('\n'.join(doc_lines))
        print()

Topic -1:
Doc 2541673
Original: This game is pretty decent. The best part about it is the gameplay itself, as I'm always a fan of killing zombies and strategy games but I don't think I've played another game where the two concepts are combined quite like this. The soundtrack is pretty nice too. I did, however, find myself starting to get pretty bored after a while because after a certain point the game starts to feel quite repetitive. Overall I think it's worth playing and it's a neat concept. I'd probably give it a 6/10.
Split: this game is pretty decent. the best part about it is the gameplay itself, as i ' m always a fan of killing zombies and strategy games but i don ' t think i ' ve played another game where the two concepts are combined quite like this. the soundtrack is pretty nice too. i did, however, find myself starting to get pretty bored after a while because after a certain point the game starts to feel quite repetitive. overall i think it ' s worth playing and it ' s a ne

In [23]:
X[333883]

"I'm really impressed with this game, i bought it on sale. Its very atmospheric and the world is really interesting which is brilliant. The premise is amazing too. Unfortunately, i'm not very good at these type of games and have little patience for item based puzzles (never got on with Secret of Monkey island) and so heavily relied on a walkthrough."

---

Miscellaneous

Topic frequency table

In [20]:
# topic frequency table of the loaded model
# NOTE(review): the counts come from the model itself, not from the transform() call above
# (the printed total differs from len(X)) - presumably training-time sizes; confirm.
freq = topic_model.get_topic_freq()
print(freq)
print('Num of topics:', len(freq))
print('\n\n')

# totals: all docs, and docs outside the outlier topic (-1)
total_docs = freq['Count'].sum()
outlier_docs = freq.loc[freq['Topic'] == -1, 'Count'].sum()
inlier_docs = total_docs - outlier_docs

print('Total number of docs:', total_docs)
print('Number of in-liers:', inlier_docs)
print('Ratio of in-liners:', inlier_docs / float(total_docs))

    Topic   Count
0      -1  391903
4       0  132961
29      1   23823
7       2   19587
20      3   16063
2       4   14425
5       5   12623
14      6   10790
17      7   10770
31      8    7863
13      9    6818
16     10    6802
18     11    6318
10     12    6043
11     13    5247
25     14    4041
3      15    3701
19     16    3456
22     17    3071
12     18    2906
40     19    2638
9      20    2363
21     21    2289
1      22    2284
8      23    2180
33     24    1747
35     25    1602
24     26    1377
27     27    1321
32     28    1152
39     29    1131
34     30    1060
6      31    1025
23     32    1020
36     33     962
15     34     944
30     35     852
28     36     827
37     37     803
26     38     801
38     39     722
Num of topics: 41



Total number of docs: 718311
Number of in-liers: 326408
Ratio of in-liners: 0.454410415544242


find related topics based on a sentence/keyword input

In [31]:
topic_model.find_topics('horror')

([3, 9, 7, 5, 36],
 [0.48339954, 0.41095716, 0.28916863, 0.28884655, 0.28675893])

---

Test the capability of bertopic with LLM topic naming

In [39]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [40]:
# can try diff llama2: https://ollama.com/library/yarn-llama2

llm = Ollama(model="llama2")        # assuming the port is 11434

In [41]:
# prompt engineering
system_message = "You are a player of the game who is reading the reviews about the game."

human_template = \
'''Create a name for a topic given the topic's keywords and some most representative reviews of the topic. Output a label for the topic in less than 5 words. Do not output other text. 

The top keywords of the topic is: \'\'\'{topic_keywords}\'\'\'. 

The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'. '''

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_template)
])

chain = chat_prompt | llm

In [46]:
import random
import time

N_times = 5                     # number of LLM calls (label candidates) per topic
reviews_per_prompt = 2          # representative reviews shown to the LLM in each call
max_review_chars = 5000         # longer reviews are never put into the prompt

# column that holds the text the topic model was evaluated on
review_column = 'review_text_split' if split_sentence else 'review_text'

topic_ids = list(repr_docs_mappings.keys())

new_topic_labels = {}

for topic_id in topic_ids:
    _topic_keywords = topic_keywords[topic_id]

    # Candidate reviews are the same for every call of a topic, so filter them once.
    # Sampling from the pre-filtered list picks the same kind of pairs as the former
    # "sample, then retry until both are short enough" loop, but it cannot spin forever
    # when fewer than `reviews_per_prompt` short reviews exist.
    _reviews = [
        review
        for review in df_original_texts.loc[df_original_texts['topic_id'] == topic_id, review_column]
        if len(review) < max_review_chars
    ]
    if len(_reviews) < reviews_per_prompt:
        print(f'Topic {topic_id}: only {len(_reviews)} review(s) shorter than {max_review_chars} characters')

    time.sleep(1)

    for i in range(N_times):
        # never ask for more reviews than are available (random.sample would raise ValueError)
        topic_reviews = random.sample(_reviews, min(reviews_per_prompt, len(_reviews)))

        result = chain.invoke({
            'topic_keywords':_topic_keywords,
            'topic_reviews':topic_reviews
        })

        print(f'{topic_id:02}_call{i}: {result}')

        new_topic_labels.setdefault(topic_id, []).append(result)

    print()

-1_call0: "Fun maps return"
-1_call1: 
Topic Label: Fun Puzzle Game with Mediocre Graphics and Sound.
-1_call2: "Fun, simple game with great story and art."
-1_call3: 
"Simple Fun Game"
-1_call4: Fun Puzzle Game

00_call0: 
Friendly Minecraft
00_call1: Friendly Minecraft
00_call2: "Best Sandbox Game"
00_call3: Fun game for playing with friends.
00_call4: Friendly Minecraft

01_call0: 
Topic Label: "Scary Freddy Nightmare"
01_call1: 
Freddy's Nightmare
01_call2: Spooky Fun
01_call3: Scare Night
01_call4: Freddy's Nightmare

02_call0: Terra Building
02_call1: Terraire - A Minecraft Alternative
02_call2: 
"Minecraft-like in 2D"
02_call3: Terraria
02_call4: 
Topic Label: New Minecraft Alternative

03_call0: Puzzle Game
03_call1: Mix of Puzzle and FPS
03_call2: Puzzle Game
03_call3: Puzzling Platformer
03_call4: Puzzle Platformer

04_call0: "Best killing game ever!"
04_call1: 
Topic Label: "Killed Pig"
04_call2: Topic Label: "Wolves Killed Pig"
04_call3: Best Fun Kill
04_call4: 
Topic Label

In [48]:
# write the generated label candidates next to the model checkpoint, as JSON

print(best_model_checkpoint_path)

topic_names_path = best_model_checkpoint_path.joinpath(f'topic_names_{genre.value:02}_{str(genre)}.json')
topic_names_path.write_text(json.dumps(new_topic_labels, indent=2))

category_indie_unique_review_text/bertopic[split]_genre_indie_grid_search_20240214_111556/bertopic_bt_nr_topics_40


---