In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
from datetime import datetime

import gensim
import nltk
import pyLDAvis

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

In [2]:
import platform
import torch

# Select the compute device for sentence-transformers.
# The previous version returned 'mps' for every host that is not Linux/Windows, without
# checking that the MPS backend is actually usable (e.g. Intel Macs), which only fails
# later when a model is moved to the device.
_mps_backend = getattr(torch.backends, 'mps', None)     # attribute is missing on older torch builds

if platform.system() in ('Linux', 'Windows'):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')        # m-series machine
else:
    device = torch.device('cpu')        # no usable accelerator on this host

print(device)

cuda


In [3]:
%load_ext autoreload

In [4]:
import sys

sys.path.append('../')

In [5]:
%autoreload 2
from dataset_loader import GENRES, load_dataset

# Evaluation target. Set `genre` to a GENRES member (e.g. GENRES.INDIE) to work on one genre.
# Any value that is not a GENRES instance (here the sentinel -1) makes the cells below take
# their "category_all" branch.
# genre = GENRES.INDIE
genre = -1
# `unique_list` is only used to build the name of the per-genre dataset folder
# (presumably the columns the genre datasets were de-duplicated on — TODO confirm).
# unique_list = ['app_id', 'review_text']
unique_list = ['review_text']

---

Load the dataset from raw

(split and not split)

In [6]:
# load the dataset
# TODO: load any external dataset

if isinstance(genre, GENRES):
    # single genre: the loader resolves the concrete file inside the genre folder
    dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]')
    dataset, dataset_path = load_dataset(genre, dataset_folder)
else:
    # all genres: read the combined pickle directly and remember its folder
    dataset_path = Path('../../dataset/topic_modelling/00_dataset_filtered_all_4045065.pkl').resolve()
    dataset = pd.read_pickle(dataset_path)
    dataset_folder = dataset_path.parent

# keep an unmodified copy so the original (uncleaned) review text can be looked up later
dataset_untouched = dataset.copy()

dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 4045065 entries, 0 to 4180147
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
dtypes: int64(4), object(2)
memory usage: 216.0+ MB


In [7]:
# data preprocessing

sys.path.append('../../sa')

%autoreload 2
import str_cleaning_functions


def cleaning(df, review):
    """Clean the text column `review` of `df` in place.

    The helpers from `str_cleaning_functions` are applied one after another, in this
    order: remove_links, remove_links2, clean, deEmojify, unify_whitespaces.
    The column is overwritten after every step; nothing is returned.
    """
    cleaning_steps = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.unify_whitespaces,
    )
    for step in cleaning_steps:
        df[review] = df[review].apply(step)

# def cleaning_strlist(str_list):
#     str_list = list(map(lambda x: str_cleaning_functions.remove_links(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.remove_links2(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.clean(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.deEmojify(x), str_list))
#     str_list = list(map(lambda x: str_cleaning_functions.unify_whitespaces(x), str_list))
#     return str_list

In [8]:
cleaning(dataset, 'review_text')

In [9]:
# same as LDA, we skip removing reviews with too many punctuations for more realistic results

# def calculate_nonalphabet_ratio(review: str) -> float:
#     count = 0
#     for char in review:
#         if not char.isalpha():
#             count += 1
#     return count / (len(review) + 1e-5)

# dataset['alphabet_ratio'] = dataset['review_text'].apply(calculate_nonalphabet_ratio)

# dataset['alphabet_ratio'].describe([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

# dataset = dataset[dataset['alphabet_ratio'] < 0.40]

In [10]:
# drop reviews whose text became an empty string after cleaning

has_text = dataset['review_text'].apply(lambda text: len(text) > 0)
dataset = dataset[has_text]

In [11]:
# check the length b4 saving
print(len(dataset))

4044917


In [18]:
# save the dataset for eval

# Target: <category folder>/preprocessed_data/<name>_dataset_eval.pkl
if isinstance(genre, GENRES):
    dataset_path = Path(f'category_{str(genre)}_unique_review_text').joinpath(
        f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl'
    )
else:
    dataset_path = Path('category_all_unique_review_text').joinpath(
        'preprocessed_data/category_all_dataset_eval.pkl'
    )

# exist_ok=True replaces the separate exists() check, so the cell can be re-run safely
dataset_path.parent.mkdir(parents=True, exist_ok=True)

# never overwrite an existing file: the training script relies on the same data
if not dataset_path.exists():
    dataset.to_pickle(dataset_path)
    print('Save to', dataset_path)
else:
    print(f'File {dataset_path} already exists')
    print('Skip saving')

In [13]:
X = dataset['review_text'].values

In [14]:
# check the length to be identical in the training script
print(len(X))
print(X[0])

4044917
Ruined my life.


---

If you are evaluating models trained on split text, continue with this section.

Create split text for models trained with split tokens

In [6]:
# tokens spliting helper functions
# copied from bertopic_training.ipynb on 20240217

def split_tokens_into_smaller_chunks(
    data,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
) -> dict:
    """Split every tokenized example of a batch into chunks.

    Args:
        data: batch mapping with the keys 'input_ids', 'token_type_ids',
            'attention_mask' and 'X_iloc' (one entry per example).
        chunk_size: maximum number of tokens per chunk.
        stride: step between the start positions of consecutive chunks.
        minimal_chunk_length: chunks shorter than this are dropped when an
            example produces more than one chunk.

    Returns:
        A dict with the same four keys, holding one entry per produced chunk.
        'X_iloc' repeats the source example's value once for each of its chunks.
    """
    split_args = (chunk_size, stride, minimal_chunk_length)

    chunked_input_ids = []
    chunked_token_type_ids = []
    chunked_attention_masks = []
    source_ilocs = []       # eval only: which example each chunk came from

    examples = zip(data['input_ids'], data['token_type_ids'], data['attention_mask'], data['X_iloc'])
    for ids, type_ids, mask, iloc in examples:
        id_chunks = split_overlapping(ids, *split_args)

        chunked_input_ids += id_chunks
        chunked_token_type_ids += split_overlapping(type_ids, *split_args)
        chunked_attention_masks += split_overlapping(mask, *split_args)
        source_ilocs += [iloc] * len(id_chunks)

    return {
        'input_ids': chunked_input_ids,
        'token_type_ids': chunked_token_type_ids,
        'attention_mask': chunked_attention_masks,
        'X_iloc': source_ilocs,
    }

def split_overlapping(tensor: list[int], chunk_size: int, stride: int, minimal_chunk_length: int) -> list[list[int]]:
    """Cut a flat token list into windows of `chunk_size` taken every `stride` positions.

    When more than one window is produced, windows shorter than
    `minimal_chunk_length` are discarded; a single window is always kept as is.
    """
    windows = []
    for start in range(0, len(tensor), stride):
        windows.append(tensor[start:start + chunk_size])

    # a lone (or no) window is returned untouched, even if it is shorter than the minimum
    if len(windows) <= 1:
        return windows

    return [window for window in windows if len(window) >= minimal_chunk_length]


def tokenize_dataset(data, tokenizer):
    """Tokenize data['text'] with `tokenizer` and return the tokenizer's output.

    Special tokens are not added and nothing is truncated; attention masks and
    token type ids are requested explicitly.
    """
    tokenizer_options = dict(
        return_attention_mask=True,
        return_token_type_ids=True,
        add_special_tokens=False,
        return_tensors=None,
        truncation=False,
    )
    return tokenizer(data['text'], **tokenizer_options)

In [7]:
from sentence_transformers import SentenceTransformer
from datasets import Dataset

# Build sentence embeddings for the evaluation texts, optionally after splitting long
# reviews into token chunks.
# NOTE: this cell reads `X`, which is only defined by the `X = dataset['review_text'].values`
# cell (or by the "load the preprocessed data" cell). Running it on a fresh kernel without
# one of those raises NameError.
split_sentence = False
sbert_model_name = 'all-MiniLM-L6-v2'       # !!! check with the model to be loaded
sbert = SentenceTransformer(sbert_model_name, device=device)

if split_sentence:
    # one entry per chunk, rebuilt from token ids further below
    X_new = []

    # sbert[0] is the first module of the SentenceTransformer pipeline; its tokenizer is reused
    tokenizer = sbert[0].tokenizer

    # tokenize the dataset
    # then split the tokens into smaller chunks
    ds_sentences = Dataset.from_dict({'text': X})
    ds_sentences = ds_sentences.map(tokenize_dataset, batched=True, fn_kwargs={'tokenizer':tokenizer})
    # X_iloc stores the position of each review in X so every chunk can be traced back to its review
    ds_sentences2 = Dataset.from_dict({'input_ids': ds_sentences['input_ids'], 'token_type_ids': ds_sentences['token_type_ids'], 'attention_mask': ds_sentences['attention_mask'], 'X_iloc': list(range(len(X)))})
    # stride == chunk_size, so consecutive chunks do not overlap.
    # NOTE(review): the "-2" presumably reserves room for the two special tokens the encoder adds — confirm
    ds_sentences2 = ds_sentences2.map(split_tokens_into_smaller_chunks, batched=True, fn_kwargs={'chunk_size': sbert.max_seq_length-2, 'stride': sbert.max_seq_length-2, 'minimal_chunk_length': 1})

    # re-create new sentences based on tokens
    for input_id in ds_sentences2['input_ids']:
        X_new.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_id)))

    embeddings = sbert.encode(X_new, show_progress_bar=True, batch_size=64)

    print('Created embeddings with split sentences')
else:
    # no splitting: encode the cleaned reviews as they are
    embeddings = sbert.encode(X, show_progress_bar=True, batch_size=64)
    

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'X' is not defined

In [None]:
len(ds_sentences2['X_iloc'])

791154

In [None]:
dataset.iloc[ds_sentences2['X_iloc']]

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25637,32134,102200,Runespell: Overture,Why don't they make more games like this?! Sim...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]"
...,...,...,...,...,...,...,...,...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179605,6416381,99900,Spiral Knights,This game use to be they did the rework on ene...,0,0,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"
4179607,6416383,99900,Spiral Knights,This game is good to play by your self or with...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]"


In [46]:
# One row per text chunk: repeat each review's row for every chunk created from it.
# .copy() makes this an independent frame, so adding columns below does not trigger
# SettingWithCopyWarning or depend on whether pandas returned a view of `dataset`.
dataset_eval_split = dataset.iloc[ds_sentences2['X_iloc']].copy()
dataset_eval_split['review_text_split'] = X_new

# load the untouched dataset, create a column for the review_text from the untouched dataset
# (single .loc[rows, column] lookup instead of chained indexing)
dataset_eval_split['review_text_untouched'] = dataset_untouched.loc[dataset_eval_split.index, 'review_text'].values

dataset_eval_split

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,review_text_split,review_text_untouched
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",take one part faerie solitaire and two parts p...,Take one part Faerie Solitaire and two parts P...
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",it ' s an entertaining casual game to play. it...,Take one part Faerie Solitaire and two parts P...
25637,32134,102200,Runespell: Overture,Why don't they make more games like this?! Sim...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",why don ' t they make more games like this?! s...,Why don't they make more games like this?! Si...
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",runespell : overture melds together classic rp...,Runespell: Overture melds together classic RPG...
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]","##itaire, poker game - and it is done very wel...",Runespell: Overture melds together classic RPG...
...,...,...,...,...,...,...,...,...,...,...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",i hadn ' t played spiral knights for over 2 ye...,I hadn't played Spiral Knights for over 2 year...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",you go out of your way to organise something w...,I hadn't played Spiral Knights for over 2 year...
4179605,6416381,99900,Spiral Knights,This game use to be they did the rework on ene...,0,0,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",this game use to be they did the rework on ene...,This game use to be good..until they did the r...
4179607,6416383,99900,Spiral Knights,This game is good to play by your self or with...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",this game is good to play by your self or with...,This game is good to play by your self or with...


In [None]:
# save the eval dataset for reloading

# The file has to be written to <category folder>/preprocessed_data/ with the "_[split]"
# suffix, because that is exactly the path the "load the preprocessed data" cell reads.
# The previous version called `.parent` on the category folder (which resolves to the
# current directory) and used a "_split" suffix, so the saved file was never found on reload.
split_suffix = "_[split]" if split_sentence else ""

if isinstance(genre, GENRES):
    dataset_path = Path(f'category_{str(genre)}_unique_review_text').joinpath(
        f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval{split_suffix}.pkl'
    )
else:
    dataset_path = Path('category_all_unique_review_text').joinpath(
        f'preprocessed_data/category_all_dataset_eval{split_suffix}.pkl'
    )

# make sure the target folder exists before pickling
dataset_path.parent.mkdir(parents=True, exist_ok=True)

if not dataset_path.exists():
    dataset_eval_split.to_pickle(dataset_path)

    print('Save to', dataset_path)
else:
    # report the skip explicitly instead of silently doing nothing
    print(f'File {dataset_path} already exists')
    print('Skip saving')

----

Or load the preprocessed data (saved during training/first evaluation)

In [6]:
# Reload the evaluation dataset that was saved during training / the first evaluation run.
split_sentence = True
split_suffix = "_[split]" if split_sentence else ""

if isinstance(genre, GENRES):
    dataset_preprocessed_path = Path(f'category_{str(genre)}_unique_review_text').joinpath(
        f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval{split_suffix}.pkl'
    )
else:
    dataset_preprocessed_path = Path('category_all_unique_review_text').joinpath(
        f'preprocessed_data/category_all_dataset_eval{split_suffix}.pkl'
    )

print(dataset_preprocessed_path)

if dataset_preprocessed_path.exists():
    dataset = pd.read_pickle(dataset_preprocessed_path)
    print(len(dataset))
    print('\n\n')
    # DataFrame.info() writes its report itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output
    dataset.info(verbose=True)

    # texts handed to the topic model: the chunks for split models, else the cleaned reviews
    X = dataset['review_text_split'].values if split_sentence else dataset['review_text'].values
else:
    print(f"{dataset_preprocessed_path} does not exist")

category_all_unique_review_text/preprocessed_data/category_all_dataset_eval_[split].pkl
4454791



<class 'pandas.core.frame.DataFrame'>
Index: 4454791 entries, 0 to 4180147
Data columns (total 8 columns):
 #   Column                 Dtype 
---  ------                 ----- 
 0   index                  int64 
 1   app_id                 int64 
 2   app_name               object
 3   review_text            object
 4   review_score           int64 
 5   review_votes           int64 
 6   review_text_split      object
 7   review_text_untouched  object
dtypes: int64(4), object(4)
memory usage: 305.9+ MB
None


---

Evaluation

In [7]:
from eval_metrics import SEARCH_BEHAVIOUR
from bertopic_utils import _load_bertopic_model

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Test whether the result are the same when load the model from the disk

# load the best model and the embedding from the config folder

split_sentence = True
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
# training_datetime = datetime(2024, 2, 16, 9, 47, 40)
# training_datetime = datetime(2024, 2, 14, 11, 15, 56)
training_datetime = datetime(2024, 2, 23, 23, 37, 39)
# training_datetime = datetime(2024, 2, 21, 16, 30, 28)
# training_datetime = datetime(2024, 2, 29, 23, 51, 15)
# training_datetime = datetime(2024, 3, 1, 9, 51, 49)
# Rebuild the training output folder name from the run settings above.
split_tag = "[split]" if split_sentence else ""
training_timestamp = training_datetime.strftime("%Y%m%d_%H%M%S")

if isinstance(genre, GENRES):
    training_folder_p = Path(f'category_{str(genre)}_unique_review_text')
    training_folder = Path(f'bertopic{split_tag}_genre_{str(genre)}_{search_behaviour.value}_{training_timestamp}')
else:
    training_folder_p = Path('category_all_unique_review_text')
    training_folder = Path(f'bertopic{split_tag}_{search_behaviour.value}_{training_timestamp}')
training_folder = training_folder_p.joinpath(training_folder)


# result.json is read with an explicit encoding so the outcome does not depend on the
# platform's default locale encoding
training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r', encoding='utf-8') as f:
    training_result = json.load(f)

# embeddings (uncomment it to load embeddings from training set for quick verification)
# embeddings_path = training_folder.joinpath(
#     f'embeddings_{training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"]}.pkl'
# )
# if embeddings_path.exists():
#     with open(embeddings_path, 'rb') as f:
#         embeddings = np.load(f)

#     assert embeddings.shape[0] == len(X), f'Number of embeddings ({embeddings.shape[0]}) does not match the number of reviews ({len(X)}). Function terminates.'
# else:
#     # raise Exception('No embeddings found. Function terminates.')
#     print('No embeddings found.')
#     embeddings = []


# model
# best_model_checkpoint_path = training_result['best_model_checkpoint']
best_model_checkpoint_path = Path(training_result['best_model_checkpoint'])


print(best_model_checkpoint_path)

category_all_unique_review_text/bertopic[split]_grid_search_20240223_233739/bertopic_bt_nr_topics_100


In [10]:
# Create embeddings from datasets (eval)
from sentence_transformers import SentenceTransformer

# sbert_model = SentenceTransformer(training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"])
sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)        # we only use this model

# split models are evaluated on the text chunks, the others on the cleaned reviews
eval_text_column = 'review_text_split' if split_sentence else 'review_text'
embeddings = sbert_model.encode(dataset[eval_text_column].values, show_progress_bar=True, batch_size=64)

KeyboardInterrupt: 

In [None]:
# save the embeddings
save_embs = True
if save_embs:
    split_suffix = "_[split]" if split_sentence else ""

    if isinstance(genre, GENRES):
        embeddings_path = Path(f'category_{str(genre)}_unique_review_text').joinpath(
            f'preprocessed_data/{genre.value:02}_{str(genre)}_embeddings_eval{split_suffix}.npy'
        )
    else:
        embeddings_path = Path('category_all_unique_review_text').joinpath(
            f'preprocessed_data/category_all_embeddings_eval{split_suffix}.npy'
        )

    if not embeddings_path.exists():
        # create the folder first: without it open() raises FileNotFoundError and the
        # (expensive) embeddings would have to be kept in memory until the folder is fixed
        embeddings_path.parent.mkdir(parents=True, exist_ok=True)

        with open(embeddings_path, 'wb') as f:
            np.save(f, embeddings)

        print('Save embedding to', embeddings_path)
    else:
        print(f'File {embeddings_path} already exists')
        print('Skip saving')

In [9]:
# load pre-created embeddings

# same location the "save the embeddings" cell writes to
if isinstance(genre, GENRES):
    embeddings_path = Path(f'category_{str(genre)}_unique_review_text').joinpath(
        f'preprocessed_data/{genre.value:02}_{str(genre)}_embeddings_eval{"_[split]" if split_sentence else ""}.npy'
    )
else:
    embeddings_path = Path('category_all_unique_review_text').joinpath(
        f'preprocessed_data/category_all_embeddings_eval{"_[split]" if split_sentence else ""}.npy'
    )

if not embeddings_path.exists():
    print(f"{embeddings_path} does not exist")
else:
    embeddings = np.load(embeddings_path)
    print('Load embeddings from', embeddings_path)

Load embeddings from category_all_unique_review_text/preprocessed_data/category_all_embeddings_eval_[split].npy


In [10]:
# or change the model path to a different one if needed
best_model_checkpoint_path = Path(best_model_checkpoint_path).parent.joinpath('bertopic_bt_nr_topics_10')

In [11]:
print(best_model_checkpoint_path)

category_all_unique_review_text/bertopic[split]_grid_search_20240223_233739/bertopic_bt_nr_topics_10


In [12]:
best_model_loaded = _load_bertopic_model(best_model_checkpoint_path)

X = dataset['review_text_split'].values if split_sentence else dataset['review_text'].values
topics, probs = best_model_loaded.transform(X, embeddings=embeddings)

topic_model = best_model_loaded

2024-03-14 18:48:50,662 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


---

For visualization, refer to bertopic_eval_vis.ipynb

---

In [13]:
print(len(X))
print(topics.shape)

4454791
(4454791,)


In [14]:
print(probs[0])
print(len(probs[0]))

[0.21346521 0.1684452  0.13875614 0.3599972  0.2636776  0.3092895
 0.2348781  0.315082   0.21682432 0.20033419 0.14932235]
11


Get the top 10 keywords for each topic

In [15]:
top_N_words = 10

# Map topic id -> its first `top_N_words` keywords (each entry of get_topics() is indexed
# as keyword at position 0). Slicing instead of indexing range(top_N_words) means a topic
# with fewer keywords no longer raises IndexError.
topic_keywords = {
    topic_id: [keyword_entry[0] for keyword_entry in _keywords[:top_N_words]]
    for topic_id, _keywords in topic_model.get_topics().items()
}

for topic_id, keywords in topic_keywords.items():
    print(f'Topic {topic_id}:')
    print(', '.join(keywords))
    print()

Topic -1:
like, play, good, fun, 10, really, great, playing, played, gameplay

Topic 0:
like, good, play, great, fun, really, best, played, 10, new

Topic 1:
port, play, uplay, pc, controls, crashes, keyboard, mac, work, ubisoft

Topic 2:
10, goat, 11, goats, best, play, killed, simulator, cat, fun

Topic 3:
addictive, clicker, addicting, hours, fun, click, boring, addicted, play, played

Topic 4:
good, awesome, pretty, cool, yes, bad, say, amazing, guess, awsome

Topic 5:
russian, 10, chinese, allahu, russians, akbar, language, nazis, hitler, russia

Topic 6:
cried, 10, tears, emotional, crying, quit, play, like, fun, sad

Topic 7:
achievements, achievement, award, badge, nominated, awards, 100, 10, minutes, hours

Topic 8:
short, sweet, fun, good, longer, worth, really, great, bit, little

Topic 9:
gud, que, dis, gaem, jogo, la, gam, es, en, se



---

Get the most representative docs per topic

In [16]:
# get top 10 representative docs for each topic

# Approximate most representative documents per topic by sampling
# a subset of the documents in each topic and calculating which are
# most representative to their topic based on the cosine similarity between
# c-TF-IDF representations

# the method was called internally in the fit_transform method
# so that the .get_topic_info() can work properly when the model was reloaded from disk
# NOTE(review): `_extract_representative_docs` is a private BERTopic method; its signature
# and the 4-tuple it returns may differ between BERTopic versions — confirm against the
# installed version.
# The DataFrame passed in pairs every evaluation text in X with its position ("ID") and
# the topic predicted for it by transform().
repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids = topic_model._extract_representative_docs(
    topic_model.c_tf_idf_,
    pd.DataFrame({"Document": X, "ID": range(len(X)), "Topic": topics}),
    topic_model.topic_representations_,
    nr_samples=500,
    nr_repr_docs=10          # the number of representative documents per topic
)

In [17]:
len(repr_docs_ids)

11

In [18]:
# the mapping is in no particular order

repr_docs_mappings

{-1: ['fun game and is easy to play 8 / 8',
  'this game is great even more fun with friends. if you dont like this something is wrong with you',
  'this game plays out like a movie. if i saw it as a game, i would not recommend it as the gameplay is not something i would like to see copied. the side quests are their own nice little stories, but largely forgettable in the main story. i would recommend buying this game on sale and playing through the story mission. the enemies are absolutely terrifying and i shuttered every time i needed to come face to face with them, but were just another step to get me to the next plot point. which was thankfully compelling enough to have me overlook continuity errors like how ronan mannaged to get into the church when there were no doors opened for him. somehow, i feel like the best experience from this game would come from following the main story step by step and not to stop for every little thing you find. in short, if you treat it more like a mov

Use the document ids to build a DataFrame that contains only the representative docs of each topic

In [19]:
repr_docs_ids

[[3469931,
  1158977,
  3314281,
  203263,
  1534516,
  4387614,
  948432,
  1382368,
  306095,
  3420140],
 [2951311,
  3267226,
  2510612,
  951510,
  3964312,
  1307971,
  1278467,
  1998264,
  3346247,
  1077431],
 [3022005,
  4307783,
  4265624,
  997699,
  2319399,
  3306775,
  700080,
  4141786,
  1632640,
  2745369],
 [1164487,
  1757971,
  3011431,
  1385780,
  1009741,
  2145033,
  3496397,
  2607703,
  3936960,
  3123696],
 [2655191,
  1815812,
  3082313,
  1948250,
  108307,
  2241079,
  1892098,
  4259171,
  980246,
  1611457],
 [3022091,
  1661191,
  146396,
  823060,
  2220815,
  2191913,
  3921545,
  3022041,
  4423489,
  3658131],
 [1623146,
  4168658,
  1068375,
  1463495,
  1367345,
  4168251,
  4171062,
  1062829,
  4153834,
  2426414],
 [582290,
  579326,
  2372994,
  1184183,
  1859164,
  3545444,
  217530,
  585985,
  3582308,
  3821644],
 [3238311,
  2073248,
  4386689,
  4368599,
  155724,
  2101779,
  2578946,
  4011820,
  1338485,
  4095193],
 [3637401,
  206

In [20]:
# Collect the dataset rows of the representative docs and tag each with its topic id.
#
# `repr_docs_ids` is a plain list with one entry per topic. The previous version labelled
# entry i as topic "i - 1", which is only right when the model has an outlier topic -1 and
# contiguous ids. Pairing with the sorted topic ids gives the same labels in that case and
# stays correct for models without outliers.
# NOTE(review): assumes BERTopic returns `repr_docs_ids` in ascending topic-id order —
# confirm against the installed version.
_repr_topic_ids = sorted(repr_docs_mappings.keys())
assert len(_repr_topic_ids) == len(repr_docs_ids), \
    f'Got {len(repr_docs_ids)} id lists for {len(_repr_topic_ids)} topics'

# .assign() returns a new frame, so no column is written onto an .iloc slice of `dataset`
df_original_texts = pd.concat(
    [
        dataset.iloc[topic_repr_docs_id].assign(topic_id=topic_id)
        for topic_id, topic_repr_docs_id in zip(_repr_topic_ids, repr_docs_ids)
    ]
)
df_original_texts

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,review_text_split,review_text_untouched,topic_id
3234105,5020077,385590,Lethal RPG: War,This game is another title of the series 'Gene...,0,1,this game is another title of the series ' gen...,This game is another title of the series 'Gene...,-1
1106892,1599579,224260,No More Room in Hell,"I recommend this game to zombie game fans, but...",1,0,"i recommend this game to zombie game fans, but...","I recommend this game to zombie game fans, but...",-1
3093104,4803277,368360,60 Seconds!,fun game and is easy to play 8/8,1,0,fun game and is easy to play 8 / 8,fun game and is easy to play 8/8,-1
197186,238459,113020,Monaco,this game is great even more fun with friends....,1,0,this game is great even more fun with friends....,this game is great even more fun with friends....,-1
1453491,2146357,238460,BattleBlock Theater,Good stuff about this game This is Super fun g...,1,0,good stuff about this game this is super fun g...,Good stuff about this game This is Super fun ...,-1
...,...,...,...,...,...,...,...,...,...
1963666,3041957,262280,Dungeons 2,A very gud game.,1,0,a very gud game.,A very gud game.,9
707134,950576,212680,FTL: Faster Than Light,is gud gaem has spess ships n stuff,1,0,is gud gaem has spess ships n stuff,is gud gaem has spess ships n stuff,9
3729426,5785855,48700,Mount & Blade: Warband,r very gud grephicxA+++g,1,1,r very gud grephicxa + + + g,r very gud grephicxA+++g,9
3137698,4871784,374320,DARK SOULS™ III,"Is very gud, but for casuals is very hurd so g...",1,0,"is very gud, but for casuals is very hurd so g...","Is very gud, but for casuals is very hurd so g...",9


In [21]:
# check which repr docs do not begin from beginning of the original review_text
# only for models trained with split text

def check_beginning_of_review_text(row):
    """Return True when the split text does NOT start where the cleaned review starts.

    The first whitespace-separated token of row['review_text_split'] is compared,
    case-insensitively, with the equally long prefix of row['review_text'].
    """
    first_token = row['review_text_split'].split()[0].lower()
    review_prefix = row['review_text'][:len(first_token)].lower()
    return first_token != review_prefix

df_original_texts[
df_original_texts.apply(
    lambda x: check_beginning_of_review_text(x),
    axis=1
)]

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,review_text_split,review_text_untouched,topic_id
1217848,1793237,230410,Warframe,"I've played since early Beta, and have seen th...",1,0,without its flaws. but as with everyone else w...,"I've played since early Beta, and have seen th...",0
4038879,6238447,7010,Project: Snowblind,"So starting with the bad, this is a console po...",1,0,stay far away from this console port if you ' ...,"So starting with the bad, this is a console po...",1
951309,1312327,220240,Far Cry® 3,"VALUE: I'm a frugal gamer, and so value is one...",1,0,"t have a really powerful graphics card, but wi...","VALUE: I'm a frugal gamer, and so value is on...",1
2918710,4544185,348620,Voices from the Sea,10/10 would cry again,1,0,10 / 10 would cry again,this... was... amazing... 10/10 would cry again,2
782551,1051254,215930,Jagged Alliance 2 - Wildfire,nothing so good,0,0,nothing so good,.... nothing so good,4
1388658,2056740,236390,War Thunder,What can be said about War Thunder. Looks goo...,1,0,what can be said about war thunder. looks good...,So... What can be said about War Thunder. Look...,5
1017312,1479811,222880,Insurgency,"I'm pinned down behind a broken down jeep, som...",1,0,"at what cost? how many families on both sides,...","I'm pinned down behind a broken down jeep, som...",9
3605176,5585207,442080,Riders of Icarus,"English: This game its P2W, you just can't bea...",0,1,"##2 e e pvp random, simplesmente uma guild gra...","English: This game its P2W, you just can't bea...",9


In [22]:
# print out the original text and the split (if any) for reference

for topic_id in repr_docs_mappings.keys():
    print(f'Topic {topic_id}:')

    for index, row in df_original_texts[df_original_texts['topic_id'] == topic_id].iterrows():
        print(f'Doc {index}')
        print(f'Original: {row["review_text"]}')
        if split_sentence:
            print(f'Split: {row["review_text_split"]}')
        print()

Topic -1:
Doc 3234105
Original: This game is another title of the series 'Generic JRPG with no real value whatsoever'. Why? I'm so glad you asked, so let's find out, shall we? First off, the story is dull and uninteresting. It will provide you with no motivation to keep playing. And here's the next thing: Gameplay is just as bad. Don't get me wrong, I love JRPGs if they have been done well - especially if you can grind your chars to be very powerful against upcoming enemies, which feels good, just because you spent that much time doing it. But in this game it's like in The Last Remnant - enemies will always be leveled to match yours. I know there are people who like that but I do not. Also, the graphics and especially the animations are really, really bad. Oh, and the music does not even remotely match the scenery it's being played it. Just listen to the battle music once and you will know what I mean. It's really sad as there are a few good things about this game as well - it definite

In [23]:
X[333883]

'can help you decide wether or not you want to attempt animation using this service. as it is free, and can be very advanced for some, dank memes are made in dis town yo.'

In [24]:
# save the df_original_text object for reference
# results go to ../eval_results/<checkpoint path of the evaluated model>
eval_folder_path = Path('../eval_results').joinpath(best_model_checkpoint_path)

folder_missing = not eval_folder_path.exists()
if folder_missing:
    eval_folder_path.mkdir(parents=True)

print(eval_folder_path)

../eval_results/category_all_unique_review_text/bertopic[split]_grid_search_20240223_233739/bertopic_bt_nr_topics_10


In [25]:
top_n = 10
df_original_texts.to_pickle(eval_folder_path.joinpath(f'df_eval_top_{top_n}.pkl'))

In [26]:
# also need to save the top N keywords for each topic as json
with open(eval_folder_path.joinpath(f'top_{top_N_words}_keywords.json'), 'w') as f:
    json.dump(topic_keywords, f, indent=2)

---

Miscellaneous

Topic frequency table

In [27]:
# get topic frequency table
freq = topic_model.get_topic_freq()
print(freq)
print('Num of topics:', len(freq))
print('\n\n')

# document counts: everything, the outlier topic (-1), and the remainder
total_docs = freq['Count'].sum()
outlier_docs = freq[freq['Topic'] == -1]['Count'].sum()
inlier_docs = total_docs - outlier_docs

print('Total number of docs:', total_docs)
print('Number of in-liers:', inlier_docs)
print('Ratio of in-liners:', inlier_docs / float(total_docs))

    Topic    Count
0      -1  2590430
1       0  1521163
3       1   112511
2       2    62670
4       3    49289
5       4    23672
8       5    13923
9       6    13824
6       7    11256
10      8     7248
7       9     6562
Num of topics: 11



Total number of docs: 4412548
Number of in-liers: 1822118
Ratio of in-liners: 0.41294009719554325


In [28]:
freq.to_pickle(eval_folder_path.joinpath(f'df_eval_topic_freq.pkl'))

find related topics based on a sentence/keyword input

In [29]:
topic_model.find_topics('horror')

([4, 2, -1, 9, 3], [0.37968403, 0.33279437, 0.2719255, 0.2647285, 0.2597218])

---

Test the capability of bertopic with LLM topic naming

In [30]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [31]:
# LLM used for topic naming, accessed through the langchain Ollama wrapper.
# Other llama2 variants can be tried, e.g. https://ollama.com/library/yarn-llama2

llm = Ollama(model="llama2")        # no base URL is passed: assumes an Ollama server on its default port (11434) — confirm

In [32]:
# prompt engineering
# System message: sets the role the LLM should assume.
system_message = "You are a player of the game who is reading the reviews about the game."

# Human message template with two placeholders, {topic_keywords} and
# {topic_reviews}, which are filled in when the chain is invoked.
# The escaped \'\'\' sequences put literal triple quotes around each value.
human_template = \
'''Create a name for a topic given the topic's keywords and some most representative reviews of the topic. Output a label for the topic in less than 5 words. Do not output other text. 

The top keywords of the topic is: \'\'\'{topic_keywords}\'\'\'. 

The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'. '''

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_template)
])

# Pipe the formatted prompt into the LLM; invoked with a dict of placeholder values.
chain = chat_prompt | llm

In [33]:
# Accumulators for the LLM labelling loop, both keyed by topic id.
# They live in their own cell so the loop can be re-run and reuse the
# labels that were already generated.
new_topic_labels, randomed_topic_reviews = dict(), dict()

In [34]:
import time

# Ask the LLM for several candidate labels per topic. Each call is given the
# topic keywords plus a few randomly sampled reviews of that topic. Results
# are cached in new_topic_labels / randomed_topic_reviews, so re-running the
# cell only issues the calls that are still missing.

N_times = 5                     # number of labelling calls per topic
N_REVIEWS_PER_CALL = 2          # reviews sampled for each prompt
MAX_REVIEW_CHARS = 5000         # prefer reviews shorter than this many characters
MAX_RESAMPLE_ATTEMPTS = 20      # re-draws allowed before accepting long reviews

# column that holds the text handed to the LLM
_text_col = 'review_text_split' if split_sentence else 'review_text'

topic_ids = list(repr_docs_mappings.keys())

for topic_id in topic_ids:
    _topic_keywords = topic_keywords[topic_id]
    _reviews_df = df_original_texts[df_original_texts['topic_id'] == topic_id]

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the sample size at the number of available reviews.
    _n_reviews = min(N_REVIEWS_PER_CALL, len(_reviews_df))
    if _n_reviews == 0:
        print(f'{topic_id:02}: no reviews available, skipped')
        print()
        continue

    time.sleep(1)

    for i in range(N_times):
        _call_key = f"call_{i}"

        # reuse the label from a previous (possibly interrupted) run
        _cached_label = new_topic_labels.get(topic_id, {}).get(_call_key)
        if _cached_label is not None:
            print(f'{topic_id:02}_call{i}: {_cached_label}')
            continue

        # Re-draw until every sampled review is short enough. The retry
        # budget applies to each call separately, so one difficult draw does
        # not switch the length check off for the remaining calls of the
        # topic. When the budget runs out the last draw is used as-is.
        for _attempt in range(MAX_RESAMPLE_ATTEMPTS + 1):
            _sampled_reviews_df = _reviews_df.sample(n=_n_reviews, replace=False)
            if _sampled_reviews_df[_text_col].str.len().lt(MAX_REVIEW_CHARS).all():
                break

        topic_reviews = _sampled_reviews_df[_text_col].tolist()

        result = chain.invoke({
            'topic_keywords': _topic_keywords,
            'topic_reviews': topic_reviews
        })

        print(f'{topic_id:02}_call{i}: {result}')

        # setdefault keeps both dicts usable even if only one of them
        # already has an entry for this topic
        new_topic_labels.setdefault(topic_id, {})[_call_key] = result
        randomed_topic_reviews.setdefault(topic_id, {})[_call_key] = {
            'reviews': topic_reviews,
            "col_index": _sampled_reviews_df['index'].values.tolist()
        }

    print()

-1_call0: Game for Zombie Fans - 5 words
-1_call1: 
Label: Fun horror game with online multiplayer
-1_call2: 
The topic label in less than 5 words: "Underwhelming experience with too much repetition."
-1_call3: Topic Label: Underwhelming Experience with Too Much Grinding
-1_call4: Game has Potential but Falls Short | 5 Words: "Underwhelming experience with too repetition."

00_call0: Here is a label for the topic in less than 5 words: "Fun RPG with engaging storyline".
00_call1: 
Topic Label: Great Warhammer Game
00_call2: New Version
00_call3: "Fun game with mild strategy"
00_call4: 
Topic Label: "Warhammer Total War"

01_call0: Port issue
01_call1: 
Topic Label: "Deus Ex Mankind Divided"
01_call2: Ubisoft game issues
01_call3: Great game!
01_call4: Port and play with ease

02_call0: "Goat Simulator"
02_call1: "Crush Goats"
02_call2: 
Crushing Fun
02_call3: "Goat Riding Simulator"
02_call4: 
Topic Label: Goat Simulator

03_call0: "Addictive Clicker Game"
03_call1: Addictive Clicker Ga

In [35]:
# Inspect the reviews that were sampled for each labelling call.
# NOTE(review): this displays the whole dict, full review texts included —
# consider previewing a single topic to keep the notebook output small.
randomed_topic_reviews

{-1: {'call_0': {'reviews': ["i recommend this game to zombie game fans, but, for others, i am skeptical that you will be among those to like this game. i love this game, i really do, but, after taking a long break from it, i discovered why it got a little stale over prolonged periods of play. like the doom franchise, you have to get from point a to point b, but with objectives to do so in between to get there. that being said, the majority of the time, it ' s fun to panic with a team through zombie infested streets and tunnels. but, it gets old really fast over time as your progress through the same order of business over and over again. not only does this get boring after this, but, the team play does not work with the gameplay if the team does not play like a team. it pains me to say it, but, some players can kill the experience and really mess things up for the others enjoying the game. it doesn ' t work all the time and, unfortunately, games like these could become rather stale wi

In [36]:
# Persist the LLM labelling run in the eval folder: the generated labels
# together with the sampled reviews and their ids.
llm_generation_result = dict(
    new_topic_labels=new_topic_labels,
    randomed_topic_reviews=randomed_topic_reviews,
)

with (eval_folder_path / 'llm_generation_result.json').open('w') as f:
    json.dump(llm_generation_result, f, indent=2)

---