In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
from datetime import datetime

import gensim
import nltk
import pyLDAvis

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

In [2]:
import platform
import torch

if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series machine

print(device)

cuda


In [3]:
%load_ext autoreload

In [4]:
import sys

sys.path.append('../')

In [5]:
%autoreload 2
from dataset_loader import GENRES, load_dataset

genre = GENRES.ACTION
# genre = -1
# unique_list = ['app_id', 'review_text']
unique_list = ['review_text']

---

Load the dataset from raw

(split and not split)

In [6]:
# load the dataset
# TODO: load any external dataset

if type(genre) == GENRES:
    dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]')
    dataset, dataset_path = load_dataset(genre, dataset_folder)
else:
    dataset_folder = Path(f'../../dataset/topic_modelling/00_dataset_filtered_all_4045065.pkl').resolve()
    dataset, dataset_path = pd.read_pickle(dataset_folder), dataset_folder
    dataset_folder = dataset_path.parent

# new: create an untouched ver of the dataset for retrieving original text
dataset_untouched = dataset.copy()

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 4045065 entries, 0 to 4180147
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
dtypes: int64(4), object(2)
memory usage: 216.0+ MB


In [7]:
# data preprocessing

sys.path.append('../../sa')

%autoreload 2
import str_cleaning_functions


def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

# def cleaning_strlist(str_list):
#     str_list = list(map(lambda x: str_cleaning_functions.remove_links(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.remove_links2(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.clean(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.deEmojify(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
#     return str_list

In [8]:
cleaning(dataset, 'review_text')

In [9]:
# same as LDA, we skip removing reviews with too many punctuations for more realistic results

# def calculate_nonalphabet_ratio(review: str) -> float:
#     count = 0
#     for char in review:
#         if not char.isalpha():
#             count += 1
#     return count / (len(review) + 1e-5)

# dataset['alphabet_ratio'] = dataset['review_text'].apply(calculate_nonalphabet_ratio)

# dataset['alphabet_ratio'].describe([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

# dataset = dataset[dataset['alphabet_ratio'] < 0.40]

In [10]:
# remove empty strings

dataset = dataset[dataset['review_text'].apply(lambda x: len(x) > 0)]

In [11]:
# check the length b4 saving
print(len(dataset))

4044917


In [18]:
# save the dataset for eval

if type(genre) == GENRES:
    dataset_path = Path(f'category_{str(genre)}_unique_review_text')
    dataset_path = dataset_path.joinpath(
        Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl')
    )

else:
    dataset_path = Path(f'category_all_unique_review_text')
    dataset_path = dataset_path.joinpath(
        Path(f'preprocessed_data/category_all_dataset_eval.pkl')
    )

if not dataset_path.parent.exists():
    dataset_path.parent.mkdir(parents=True)

if not dataset_path.exists():
    dataset.to_pickle(dataset_path)
else:
    print(f'File {dataset_path} already exists')
    print('Skip saving')

In [13]:
X = dataset['review_text'].values

In [14]:
# check the length to be identical in the training script
print(len(X))
print(X[0])

4044917
Ruined my life.


---

If create conducting evaluation on split models, continue

Create split text for models trained with split tokens

In [15]:
# tokens spliting helper functions
# copied from bertopic_training.ipynb on 20240217

def split_tokens_into_smaller_chunks(
    data,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
) -> dict:
    """Splits tokens into overlapping chunks with given size and stride."""

    _new_input_id_chunks = []
    _new_token_type_ids = []
    _new_mask_chunks = []

    # eval_only
    _original_iloc = []

    for input_id, token_type_id, mask_chunk, iloc in zip(data['input_ids'], data['token_type_ids'], data['attention_mask'], data['X_iloc']):
        _input_id_chunk = split_overlapping(input_id, chunk_size, stride, minimal_chunk_length)
        _token_type_id = split_overlapping(token_type_id, chunk_size, stride, minimal_chunk_length)
        _mask_chunk = split_overlapping(mask_chunk, chunk_size, stride, minimal_chunk_length)

        _new_input_id_chunks.extend(_input_id_chunk)
        _new_token_type_ids.extend(_token_type_id)
        _new_mask_chunks.extend(_mask_chunk)

        _original_iloc.extend([iloc] * len(_input_id_chunk))

    return {'input_ids':_new_input_id_chunks, 'token_type_ids':_new_token_type_ids, 'attention_mask': _new_mask_chunks, 'X_iloc': _original_iloc}

def split_overlapping(tensor:list[int], chunk_size: int, stride: int, minimal_chunk_length: int) -> list[list[int]]:
    """Helper function for dividing 1-dimensional tensors into overlapping chunks."""
    # check_split_parameters_consistency(chunk_size, stride, minimal_chunk_length)
    result = [tensor[i : i + chunk_size] for i in range(0, len(tensor), stride)]
    if len(result) > 1:
        # ignore chunks with less than minimal_length number of tokens
        result = [x for x in result if len(x) >= minimal_chunk_length]
    return result


def tokenize_dataset(data, tokenizer):
    # return sbert_model[0].tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)
    return tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)

In [19]:
from sentence_transformers import SentenceTransformer
from datasets import Dataset

split_sentence = False
sbert_model_name = 'all-MiniLM-L6-v2'       # !!! check with the model to be loaded
sbert = SentenceTransformer(sbert_model_name, device=device)

if split_sentence:
    X_new = []

    tokenizer = sbert[0].tokenizer

    # tokenize the dataset
    # then split the tokens into smaller chunks
    ds_sentences = Dataset.from_dict({'text': X})
    ds_sentences = ds_sentences.map(tokenize_dataset, batched=True, fn_kwargs={'tokenizer':tokenizer})
    ds_sentences2 = Dataset.from_dict({'input_ids': ds_sentences['input_ids'], 'token_type_ids': ds_sentences['token_type_ids'], 'attention_mask': ds_sentences['attention_mask'], 'X_iloc': list(range(len(X)))})
    ds_sentences2 = ds_sentences2.map(split_tokens_into_smaller_chunks, batched=True, fn_kwargs={'chunk_size': sbert.max_seq_length-2, 'stride': sbert.max_seq_length-2, 'minimal_chunk_length': 1})

    # re-create new sentences based on tokens
    for input_id in ds_sentences2['input_ids']:
        X_new.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_id)))

    embeddings = sbert.encode(X_new, show_progress_bar=True, batch_size=64)

    print('Created embeddings with split sentences')
else:
    embeddings = sbert.encode(X, show_progress_bar=True, batch_size=64)
    

Batches:   6%|▌         | 3579/63202 [06:17<1:44:42,  9.49it/s]


KeyboardInterrupt: 

In [None]:
len(ds_sentences2['X_iloc'])

791154

In [None]:
dataset.iloc[ds_sentences2['X_iloc']]

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25637,32134,102200,Runespell: Overture,Why don't they make more games like this?! Sim...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
...,...,...,...,...,...,...,...,...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179605,6416381,99900,Spiral Knights,This game use to be they did the rework on ene...,0,0,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179607,6416383,99900,Spiral Knights,This game is good to play by your self or with...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"


In [46]:
dataset_eval_split = dataset.iloc[ds_sentences2['X_iloc']]
dataset_eval_split['review_text_split'] = X_new

# load the untouched dataset, create a column for the review_text from the untouched dataset
dataset_eval_split['review_text_untouched'] = dataset_untouched.loc[dataset_eval_split.index]['review_text'].values

dataset_eval_split

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,review_text_split,review_text_untouched
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",take one part faerie solitaire and two parts p...,Take one part Faerie Solitaire and two parts P...
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",it ' s an entertaining casual game to play. it...,Take one part Faerie Solitaire and two parts P...
25637,32134,102200,Runespell: Overture,Why don't they make more games like this?! Sim...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",why don ' t they make more games like this?! s...,Why don't they make more games like this?! Si...
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",runespell : overture melds together classic rp...,Runespell: Overture melds together classic RPG...
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]","##itaire, poker game - and it is done very wel...",Runespell: Overture melds together classic RPG...
...,...,...,...,...,...,...,...,...,...,...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",i hadn ' t played spiral knights for over 2 ye...,I hadn't played Spiral Knights for over 2 year...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",you go out of your way to organise something w...,I hadn't played Spiral Knights for over 2 year...
4179605,6416381,99900,Spiral Knights,This game use to be they did the rework on ene...,0,0,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",this game use to be they did the rework on ene...,This game use to be good..until they did the r...
4179607,6416383,99900,Spiral Knights,This game is good to play by your self or with...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",this game is good to play by your self or with...,This game is good to play by your self or with...


In [None]:
# save the eval dataset for reloading

if type(genre) == GENRES:
    dataset_path = Path(f'category_{str(genre)}_unique_review_text')
    dataset_path = dataset_path.parent.joinpath(
        Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval{"_split" if split_sentence else ""}.pkl')
    )
else:
    dataset_path = Path(f'category_all_unique_review_text')
    dataset_path = dataset_path.parent.joinpath(
        Path(f'preprocessed_data/category_all_dataset_eval{"_split" if split_sentence else ""}.pkl')
    )

if not dataset_path.exists():
    dataset_eval_split.to_pickle(dataset_path)

    print('Save to', dataset_path)

----

Or load the preprocessed data (saved during training/first evaluation)

In [6]:
split_sentence = True

if type(genre) == GENRES:
    dataset_preprocessed_path = Path(f'category_{str(genre)}_unique_review_text')
    dataset_preprocessed_path = dataset_preprocessed_path.joinpath(
        Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval{"_split" if split_sentence else ""}.pkl')
    )
else:
    dataset_preprocessed_path = Path(f'category_all_unique_review_text')
    dataset_preprocessed_path = dataset_preprocessed_path.joinpath(
        Path(f'preprocessed_data/category_all_dataset_eval{"_split" if split_sentence else ""}.pkl')
    )

print(dataset_preprocessed_path)

if dataset_preprocessed_path.exists():
    dataset = pd.read_pickle(dataset_preprocessed_path)
    print(len(dataset))
    print('\n\n')
    print(dataset.info(verbose=True))

    if split_sentence:
        X = dataset['review_text_split'].values
    else:
        X = dataset['review_text'].values
else:
    print(f"{dataset_preprocessed_path} does not exist")

category_action_unique_review_text/preprocessed_data/00_action_dataset_eval_split.pkl


1379010



<class 'pandas.core.frame.DataFrame'>
Index: 1379010 entries, 0 to 4179608
Data columns (total 10 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   index                  1379010 non-null  int64 
 1   app_id                 1379010 non-null  int64 
 2   app_name               1379010 non-null  object
 3   review_text            1379010 non-null  object
 4   review_score           1379010 non-null  int64 
 5   review_votes           1379010 non-null  int64 
 6   genre_id               1379010 non-null  object
 7   category_id            1379010 non-null  object
 8   review_text_split      1379010 non-null  object
 9   review_text_untouched  1379010 non-null  object
dtypes: int64(4), object(6)
memory usage: 115.7+ MB
None


---

Evaluation

In [7]:
from eval_metrics import SEARCH_BEHAVIOUR
from bertopic_utils import _load_bertopic_model

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Test whether the result are the same when load the model from the disk

# load the best model and the embedding from the config folder

split_sentence = True
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
# training_datetime = datetime(2024, 2, 16, 9, 47, 40)
# training_datetime = datetime(2024, 2, 14, 11, 15, 56)
# training_datetime = datetime(2024, 2, 23, 23, 37, 39)
# training_datetime = datetime(2024, 2, 21, 16, 30, 28)
# training_datetime = datetime(2024, 2, 29, 23, 51, 15)
training_datetime = datetime(2024, 3, 1, 9, 51, 49)
if type(genre) == GENRES:
    training_folder_p = Path(f'category_{str(genre)}_unique_review_text')
    training_folder = Path(f'bertopic{"[split]" if split_sentence else ""}_genre_{str(genre)}_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
else:
    training_folder_p = Path(f'category_all_unique_review_text')
    training_folder = Path(f'bertopic{"[split]" if split_sentence else ""}_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
training_folder = training_folder_p.joinpath(training_folder)


training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

# embeddings (uncomment it to load embeddings from training set for quick verification)
# embeddings_path = training_folder.joinpath(
#     f'embeddings_{training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"]}.pkl'
# )
# if embeddings_path.exists():
#     with open(embeddings_path, 'rb') as f:
#         embeddings = np.load(f)

#     assert embeddings.shape[0] == len(X), f'Number of embeddings ({embeddings.shape[0]}) does not match the number of reviews ({len(X)}). Function terminates.'
# else:
#     # raise Exception('No embeddings found. Function terminates.')
#     print('No embeddings found.')
#     embeddings = []


# model
# best_model_checkpoint_path = training_result['best_model_checkpoint']
best_model_checkpoint_path = Path(training_result['best_model_checkpoint'])


print(best_model_checkpoint_path)

category_action_unique_review_text/bertopic[split]_genre_action_grid_search_20240301_095149/bertopic_bt_nr_topics_100


In [10]:
# Create embeddings from datasets (eval)
from sentence_transformers import SentenceTransformer

# sbert_model = SentenceTransformer(training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"])
sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)        # we only use this model

if split_sentence:
    embeddings = sbert_model.encode(dataset['review_text_split'].values, show_progress_bar=True, batch_size=64)
else:
    embeddings = sbert_model.encode(dataset['review_text'].values, show_progress_bar=True, batch_size=64)

KeyboardInterrupt: 

In [None]:
# save the embeddings
save_embs = True
if save_embs:
    embeddings_path = Path(f'category_{str(genre) if type(genre) == GENRES else "all"}_unique_review_text')
    embeddings_path = embeddings_path.joinpath(
        Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_embeddings_eval{"_[split]" if split_sentence else ""}.npy') if type(genre) == GENRES \
            else Path(f'preprocessed_data/category_all_embeddings_eval{"_[split]" if split_sentence else ""}.npy'
            )
    )

    if not embeddings_path.exists():
        with open(embeddings_path, 'wb') as f:
            np.save(f, embeddings)

        print('Save embedding to', embeddings_path)
    else:
        print(f'File {embeddings_path} already exists')
        print('Skip saving')

In [9]:
# load pre-created embeddings

embeddings_path = Path(f'category_{str(genre) if type(genre) == GENRES else "all"}_unique_review_text')
embeddings_path = embeddings_path.joinpath(
    Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_embeddings_eval{"_[split]" if split_sentence else ""}.npy') if type(genre) == GENRES \
        else Path(f'preprocessed_data/category_all_embeddings_eval{"_[split]" if split_sentence else ""}.npy'
        )
)

if embeddings_path.exists():
    embeddings = np.load(embeddings_path)
    print('Load embeddings from', embeddings_path)
else:
    print(f"{embeddings_path} does not exist")

Load embeddings from category_action_unique_review_text/preprocessed_data/00_action_embeddings_eval_[split].npy


In [10]:
# or change the model path to a different one if needed
best_model_checkpoint_path = Path(best_model_checkpoint_path).parent.joinpath('bertopic_bt_nr_topics_30')

In [11]:
print(best_model_checkpoint_path)

category_action_unique_review_text/bertopic[split]_genre_action_grid_search_20240301_095149/bertopic_bt_nr_topics_30


In [12]:
best_model_loaded = _load_bertopic_model(best_model_checkpoint_path)

X = dataset['review_text_split'].values if split_sentence else dataset['review_text'].values
topics, probs = best_model_loaded.transform(X, embeddings=embeddings)

topic_model = best_model_loaded

2024-03-04 13:23:39,608 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


---

For visualization, refer to bertopic_eval_vis.ipynb

---

In [13]:
print(len(X))
print(topics.shape)

1379010
(1379010,)


In [14]:
print(probs[0])
print(len(probs[0]))

[0.20988199 0.16903743 0.17353551 0.1787247  0.09904128 0.33024308
 0.33011085 0.07244194 0.14272942 0.1598581  0.0760559  0.14272466
 0.1572313  0.15609594 0.18612865 0.16687196 0.12972866 0.10887292
 0.1642027  0.12331469 0.08038902 0.10711574 0.19784753 0.06903511
 0.13948978 0.08854909 0.2573272  0.18537052 0.14068761 0.3523863
 0.11892466]
31


Get top 10 keywords for each topics

In [15]:
top_N_words = 10
topic_keywords = {}

for topic_id, _keywords in topic_model.get_topics().items():
    if topic_id not in topic_keywords:
        topic_keywords[topic_id] = []
    
    for i in range(top_N_words):
        topic_keywords[topic_id].append(_keywords[i][0])

for topic_id, keywords in topic_keywords.items():
    print(f'Topic {topic_id}:')
    print(', '.join(keywords))
    print()

Topic -1:
like, play, fun, good, 10, really, great, playing, played, gameplay

Topic 0:
like, good, great, play, best, really, fun, gameplay, played, new

Topic 1:
op, fun, friends, best, great, play, multiplayer, mods, good, puzzle

Topic 2:
free, buy, worth, dlc, review, reviews, sale, play, pay, good

Topic 3:
minecraft, 2d, like, items, building, world, fun, build, hours, better

Topic 4:
addictive, hours, addicting, fun, addicted, good, awesome, played, boring, pretty

Topic 5:
10, 11, ign, guns, killed, gun, shot, shoot, guy, kill

Topic 6:
arma, moba, play, community, like, mods, dota2, best, players, playing

Topic 7:
valve, puzzles, puzzle, glados, portals, great, best, test, aperture, played

Topic 8:
controls, keyboard, simulator, support, buttons, xbox, button, play, control, fun

Topic 9:
port, pc, gtx, fps, settings, ram, runs, lag, i7, issues

Topic 10:
dinosaurs, shark, hunters, evolve, divers, dinos, dinosaur, diver, fun, play

Topic 11:
crashes, windows, club, rocksta

---

Get the most representative docs per topic

In [16]:
# get top 10 representative docs for each topic

# Approximate most representative documents per topic by sampling
# a subset of the documents in each topic and calculating which are
# most represenative to their topic based on the cosine similarity between
# c-TF-IDF representations

# the method was called internally in the fit_transform method
# so that the .get_topic_info() can work properly when the model was reloaded from disk
repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids = topic_model._extract_representative_docs(
    topic_model.c_tf_idf_,
    pd.DataFrame({"Document": X, "ID": range(len(X)), "Topic": topics}),
    topic_model.topic_representations_,
    nr_samples=500,
    nr_repr_docs=10          # the number of representative documents per topic
)

In [17]:
len(repr_docs_ids)

31

In [18]:
# the mapping is in no particular order

repr_docs_mappings

{-1: ['9 / 10 i bought this game at a reduced price after reading some very good reviews. i was completely new to the franchise but picked up the game relatively quickly. at the time of wrting this reveiw i have completed the game with one of the three characters and i have just started a new story with a second character on veteran. the veteran option allows you to play the story mode from a level 30 character which was a really nice addition which provided a very different style of gameplay and more features, so it was as if you are getting two games for the price of one. however i was initally confused why i was only level 30 after completing the campaign on casual mode. gamplay functions as a normal rpg should. however i do stuggle with some aspects such as very small action bars ( 1 - 6 ) and the left mouse being responsible for movement and the primary attack which would send me running into a hoarde of enemies instead of attacking them. bosses are challenging and hoardes of enem

Use the id to create a df to only select the repr docs from the dataframe

In [19]:
repr_docs_ids

[[481345,
  458523,
  881861,
  961700,
  1335988,
  389484,
  836199,
  431304,
  898053,
  777963],
 [15393,
  55986,
  287149,
  944549,
  824919,
  633282,
  747273,
  565503,
  900258,
  1216934],
 [1209648,
  67093,
  1125256,
  115530,
  1360218,
  1113922,
  235377,
  1253489,
  428622,
  146189],
 [1072793,
  1186501,
  24228,
  30718,
  106544,
  397213,
  607660,
  1010106,
  716850,
  138866],
 [87384, 64308, 90227, 116297, 77081, 65470, 77200, 109420, 86812, 50649],
 [772481,
  36785,
  1169528,
  372434,
  23244,
  1164398,
  94961,
  1146982,
  611836,
  95736],
 [303826,
  359837,
  795179,
  35968,
  1046251,
  300268,
  30448,
  1001006,
  951912,
  23586],
 [158627,
  141935,
  165776,
  928969,
  154458,
  153526,
  142213,
  869551,
  153460,
  1252158],
 [959019,
  1104278,
  1115588,
  1225114,
  1213329,
  973042,
  964231,
  1213427,
  1232908,
  961584],
 [664200,
  598472,
  1090289,
  255341,
  550647,
  381937,
  1000880,
  1019403,
  914083,
  403997],
 [2

In [20]:
df_original_texts = []

for i, topic_repr_docs_id in enumerate(repr_docs_ids):
    t = dataset.iloc[topic_repr_docs_id]
    t['topic_id'] = i - 1           # starts from -1, as -1 represents outliers

    df_original_texts.append(t)

df_original_texts = pd.concat(df_original_texts)
df_original_texts

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,review_text_split,review_text_untouched,topic_id
2170010,3354453,275390,Guacamelee! Super Turbo Championship Edition,"I must admit, I wanted it out of curiosity in ...",1,0,"[1, 25, 23]","[2, 1, 9, 39, 24, 22, 28, 29, 30, 23, 25, 41, ...",never really feel dangerous on their own right...,"I must admit, I wanted it out of curiosity in ...",-1
2134803,3308090,272470,The Incredible Adventures of Van Helsing II,9/10 I bought this game at a reduced price aft...,1,0,"[1, 25, 23, 3]","[2, 1, 9, 22, 28, 29, 23, 15, 25, 43]",9 / 10 i bought this game at a reduced price a...,9/10 I bought this game at a reduced price aft...,-1
2860929,4441403,341020,Chronicles of Teddy,This is another game that I just finished. Whe...,1,0,"[1, 25, 23, 3]","[2, 22, 28, 29, 15]",this is another game that i just finished. whe...,This is another game that I just finished. Whe...,-1
3387862,5233120,400,Portal,A friend gave this game to me as a gift. I've ...,1,0,[1],"[2, 22, 13, 18, 17, 16, 14, 41, 42, 44]",a friend gave this game to me as a gift. i ' v...,A friend gave this game to me as a gift. I've ...,-1
4111561,6328816,8870,BioShock Infinite,"Amazing Game, Really great gameplay, interesti...",1,0,[1],"[2, 22, 28, 29, 23, 42, 43]","amazing game, really great gameplay, interesti...","Amazing Game, Really great gameplay, interesti...",-1
...,...,...,...,...,...,...,...,...,...,...,...
2672866,4156572,322170,Geometry Dash,Great Rage Game I Play Geometry Dash when I ca...,1,0,"[1, 23]","[2, 22, 29, 18, 17, 41, 42]",great rage game i play geometry dash when i ca...,Great Rage Game I Play Geometry Dash when I ca...,29
2677124,4161122,322170,Geometry Dash,Geometry Dash is funny games!,1,0,"[1, 23]","[2, 22, 29, 18, 17, 41, 42]",geometry dash is funny games!,Geometry Dash is funny games!,29
2669841,4153299,322170,Geometry Dash,Geometry Dash,1,1,"[1, 23]","[2, 22, 29, 18, 17, 41, 42]",geometry dash,Geometry Dash,29
2669933,4153399,322170,Geometry Dash,Geometry Dash servers are,1,0,"[1, 23]","[2, 22, 29, 18, 17, 41, 42]",geometry dash servers are,Geometry Dash servers are down....,29


In [24]:
# check which repr docs do not begin from beginning of the original review_text
# only for models trained with split text

def check_beginning_of_review_text(row):
    t = row['review_text_split'].split()[0].lower()
    return t != row['review_text'][:len(t)].lower()

df_original_texts[
df_original_texts.apply(
    lambda x: check_beginning_of_review_text(x),
    axis=1
)]

KeyError: 'review_text_split'

In [21]:
# print out the original text and the split (if any) for reference

for topic_id in repr_docs_mappings.keys():
    print(f'Topic {topic_id}:')

    for index, row in df_original_texts[df_original_texts['topic_id'] == topic_id].iterrows():
        print(f'Doc {index}')
        print(f'Original: {row["review_text"]}')
        if split_sentence:
            print(f'Split: {row["review_text_split"]}')
        print()

Topic -1:
Doc 2170010
Original: I must admit, I wanted it out of curiosity in the beginning, but I couldn't expect how much fun it could actually be. The fighting system feels fast, responsive, and with a lot of potential for combos, and juggling enemies in the air. Each move you have has a specific purpose and is clearly differentiated from the others, creating a small yet varied set of tools at your disposal. Yet one can't help but think that a couple more moves wpuld help make battles more varied and interesting. The tactical side of things, later on with the addition of shielded and alternate dimension enemies you can't just take them all on at once, you have to be more methodical, to decide which targets are a priority, which should be knocked down for later. The game can be pretty tactical, yet not so much so as to become slow or unenjoyable. The game doesn't ask you to make big plans about each fight, but to assess the situation and make on-the-fly choices to defeat you enemies 

In [22]:
X[333883]

'counter stirke : source is a game where you have two ( 2 ) sides, counter terrorist and terrorist. if you are on the counter terrorist side you can defend the bomb site and or hostages. if you are the terrorist you will have to plant the bomb and kill the counter terrorist. there is also custom gamemodes such as zombie : escape, deathrun, bunny hop, and minigames plus much much more. if you are looking for a skilled shooter game this is the game.'

In [23]:
# save the df_original_text object for reference
eval_folder_path = Path('../eval_results')
eval_folder_path = eval_folder_path.joinpath(
    best_model_checkpoint_path
)

if not eval_folder_path.exists():
    eval_folder_path.mkdir(parents=True)

print(eval_folder_path)

../eval_results/category_action_unique_review_text/bertopic[split]_genre_action_grid_search_20240301_095149/bertopic_bt_nr_topics_30


In [24]:
top_n = 10
df_original_texts.to_pickle(eval_folder_path.joinpath(f'df_eval_top_{top_n}.pkl'))

In [25]:
# also need to save the top N keywords for each topic as json
with open(eval_folder_path.joinpath(f'top_{top_N_words}_keywords.json'), 'w') as f:
    json.dump(topic_keywords, f, indent=2)

---

Miscellenous

Topic frequency table

In [26]:
# get topic frequency table
freq = topic_model.get_topic_freq()
print(freq)
print('Num of topics:', len(freq))
print('\n\n')

# sum the 'Count'
print('Total number of docs:', freq['Count'].sum())
print('Number of in-liers:', freq['Count'].sum() - freq[freq['Topic'] == -1]['Count'].sum())
print('Ratio of in-liners:', (freq['Count'].sum() - freq[freq['Topic'] == -1]['Count'].sum()) / float(freq['Count'].sum()))

    Topic   Count
0      -1  712326
1       0  310555
5       1   74546
7       2   35732
16      3   32438
6       4   29370
11      5   26502
15      6   20802
10      7   20362
18      8   19046
4       9   11986
24     10   11726
17     11   10519
8      12    6701
20     13    5396
3      14    4641
19     15    4109
2      16    3100
13     17    2911
25     18    2701
22     19    2669
26     20    2545
27     21    2207
28     22    2194
29     23    1952
12     24    1805
9      25    1469
14     26    1415
23     27    1353
21     28    1349
30     29    1282
Num of topics: 31



Total number of docs: 1365709
Number of in-liers: 653383
Ratio of in-liners: 0.47842036627129203


In [27]:
freq.to_pickle(eval_folder_path.joinpath(f'df_eval_topic_freq.pkl'))

find related topics based on a sentence/keyword input

In [28]:
topic_model.find_topics('horror')

([4, 5, 25, 17, -1],
 [0.3456078, 0.30335438, 0.28459024, 0.28219795, 0.27011302])

---

Test the capability of bertopic with LLM topic naming

In [29]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [30]:
# can try diff llama2: https://ollama.com/library/yarn-llama2

llm = Ollama(model="llama2")        # assuming the port is 11434

In [31]:
# prompt engineering
system_message = "You are a player of the game who is reading the reviews about the game."

human_template = \
'''Create a name for a topic given the topic's keywords and some most representative reviews of the topic. Output a label for the topic in less than 5 words. Do not output other text. 

The top keywords of the topic is: \'\'\'{topic_keywords}\'\'\'. 

The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'. '''

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_template)
])

chain = chat_prompt | llm

In [33]:
new_topic_labels = {}
randomed_topic_reviews = {}

In [34]:
import time

N_times = 5

topic_ids = list(repr_docs_mappings.keys())

for topic_id in topic_ids:
    _topic_keywords = topic_keywords[topic_id]

    time.sleep(1)

    temp_disable_char_limit = False
    _count = 0

    _reviews_df = df_original_texts[df_original_texts['topic_id'] == topic_id]
    for i in range(N_times):
        if new_topic_labels.get(topic_id, {}).get(f"call_{i}", None) is not None:
            print(f'{topic_id:02}_call{i}: {new_topic_labels[topic_id][f"call_{i}"]}')
            continue

        while True:
            if _count > 20:
                temp_disable_char_limit = True

            
            _sampled_reviews_df = _reviews_df.sample(n=2, replace=False)

            if split_sentence:
                check_bool = _sampled_reviews_df.apply(lambda x: len(x['review_text_split']) < 5000, axis=1)
            else:
                check_bool = _sampled_reviews_df.apply(lambda x: len(x['review_text']) < 5000, axis=1)
        
            
            if temp_disable_char_limit:
                break
            
            if all(check_bool):
                break
            else:
                _count += 1

        if split_sentence:
            topic_reviews = list(_sampled_reviews_df['review_text_split'].values)
        else:
            topic_reviews = list(_sampled_reviews_df['review_text'].values)

        result = chain.invoke({
            'topic_keywords':_topic_keywords,
            'topic_reviews':topic_reviews
        })

        print(f'{topic_id:02}_call{i}: {result}')

        if topic_id not in new_topic_labels:
            new_topic_labels[topic_id] = {}
            randomed_topic_reviews[topic_id] = {}


        new_topic_labels[topic_id][f"call_{i}"] = result
        randomed_topic_reviews[topic_id][f"call_{i}"] = {
            'reviews': topic_reviews,
            "col_index": _sampled_reviews_df['index'].values.tolist()
        }

    print()

-1_call0: 
Topic: Fun Survival Game with Good Gameplay
Label: Good Survival Game
-1_call1: Game: Fun Shooter
-1_call2: Game with Puzzles - Great Experience
-1_call3: Luminosity: Fun and simple arcade-style game.
-1_call4: Game: Fun Adventure

00_call0: Game with engaging co-op and challenging bosses.
00_call1: 
Label: Fun Samurai War Game
00_call2: New AC Game - Fun and Improved Experience
00_call3: 
Label: Fun Paris Adventure
00_call4: Topic: Half-Life 2: Episode One
Label: "Underwhelming continuation"

01_call0: Co-op Fun
01_call1: Co-op Fun
01_call2: "Fun multiplayer puzzle game"
01_call3: Topic Label: Great Multiplayer Fun
01_call4: "Fun Multiplayer Puzzler"

02_call0: "Free Battlefield Alternative"
02_call1: "Free Gem"
02_call2: "Free Battlefield Alternative"
02_call3: "Good Free Game"
02_call4: "Buy Now"

03_call0: MiniCraft
03_call1: 
Topic Label: Terraria
03_call2: 
Minecraft 2.0
03_call3: 
Topic Label: Minecraft-like game
03_call4: Terraria

04_call0: "Fun and Addictive Game"


In [35]:
randomed_topic_reviews

{-1: {'call_0': {'reviews': ['9 / 10 i bought this game at a reduced price after reading some very good reviews. i was completely new to the franchise but picked up the game relatively quickly. at the time of wrting this reveiw i have completed the game with one of the three characters and i have just started a new story with a second character on veteran. the veteran option allows you to play the story mode from a level 30 character which was a really nice addition which provided a very different style of gameplay and more features, so it was as if you are getting two games for the price of one. however i was initally confused why i was only level 30 after completing the campaign on casual mode. gamplay functions as a normal rpg should. however i do stuggle with some aspects such as very small action bars ( 1 - 6 ) and the left mouse being responsible for movement and the primary attack which would send me running into a hoarde of enemies instead of attacking them. bosses are challeng

In [36]:
# save the topic labels, sampled docs and their ids
# in the eval folder

llm_generation_result = {
    'new_topic_labels': new_topic_labels,
    'randomed_topic_reviews': randomed_topic_reviews
}

with open(eval_folder_path.joinpath('llm_generation_result.json'), 'w') as f:
    json.dump(llm_generation_result, f, indent=2)

---