Separated from `bertopic_external_doc_quali.ipynb`.

For small games, we use the 30-topic model and test its general performance, since it is impossible to create useful topics per game.

In [107]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
from datetime import datetime
import pickle
import traceback

import gensim
import nltk
import pyLDAvis

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

import sys

sys.path.append('../')

In [108]:
import platform
import torch

# Pick the best available accelerator: CUDA first, then Apple-silicon MPS, else CPU.
# The MPS backend is probed explicitly, so a machine without it falls back to the CPU
# instead of being handed an 'mps' device it cannot use.
_mps_backend = getattr(torch.backends, 'mps', None)     # attribute is absent on older torch builds

if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')        # m-series machine
else:
    device = torch.device('cpu')

print(device)

cuda


In [109]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [110]:
# the text to be evaluated
# (keep exactly one game_steamid / game_name pair uncommented)

# game_steamid = 1814540
# game_name = 'cosmos_conquer'

# game_steamid = 1757970
# game_name = 'oasis_dark_forest'

# game_steamid = 1607720
# game_name = 'rimebeard'

# game_steamid = 1814460
# game_name = 'the_brave_vs_dragon'

game_steamid = 1814510
game_name = 'yavi'

# load the reviews from folder
# reviews_reqs stays an empty list when nothing can be loaded; a message is printed in that case.
reviews_reqs = []

game_folder = Path(f'../../dataset/data_scraping/steam_comments_scraping/{game_name}').resolve()

if not game_folder.exists():
    print('Review folder not found:', game_folder)
else:
    # "latest" = the file whose path sorts last among steam_reviews_*.pkl
    review_files = sorted(game_folder.glob('steam_reviews_*.pkl'))
    if not review_files:
        print('No steam_reviews_*.pkl file found in:', game_folder)
    else:
        latest_file_path = review_files[-1]
        # NOTE: pickle.load can execute arbitrary code -- only load files written by our own scraper.
        with open(latest_file_path, 'rb') as f:
            reviews_reqs = pickle.load(f)           # retrieve the list of reviews
        print('Loaded:', latest_file_path)

Loaded: /root/FYP/NLP/dev-workspace/dataset/data_scraping/steam_comments_scraping/yavi/steam_reviews_1814510_unique.pkl


In [111]:
# create a dataframe like in training/evaluation
reviews_df = pd.DataFrame(reviews_reqs)

reviews_df = reviews_df[['recommendationid', 'review', 'timestamp_created', 'voted_up', 'steam_purchase', 'received_for_free']]
# reviews_df = reviews_df[reviews_df['timestamp_created'] >= datetime_until.timestamp()]

# filter unique reviews
reviews_df = reviews_df.drop_duplicates(subset=['review', 'voted_up'])

# convert timestamp to datetime. The datetime converted is in utc+0
reviews_df['timestamp_created'] = pd.to_datetime(reviews_df['timestamp_created'], unit='s')

# convert the voted_up to 1 and -1
reviews_df['voted_up'] = reviews_df['voted_up'].apply(lambda x: 1 if x else -1)

reviews_df['review_original'] = reviews_df['review']

reviews_df

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free,review_original
0,150188804,"Neat pokemon-like game, it's really fun and th...",2023-11-15 20:48:55,1,False,True,"Neat pokemon-like game, it's really fun and th..."
1,142944112,Okay - the game is interesting and I like the ...,2023-07-28 09:42:48,-1,False,False,Okay - the game is interesting and I like the ...
2,140306633,Originally a $9.99 game that went Free. The ba...,2023-06-18 23:39:28,-1,False,True,Originally a $9.99 game that went Free. The ba...
3,133609998,Pokemon-style game as interplanet delivery boy...,2023-02-25 20:34:32,-1,False,True,Pokemon-style game as interplanet delivery boy...
4,129715521,Cute sprites. Amusing dialogues. Moody music.,2022-12-30 19:34:07,1,False,True,Cute sprites. Amusing dialogues. Moody music.
5,122704322,This is a very very skeptical answer...\nI usu...,2022-09-23 20:13:48,-1,True,False,This is a very very skeptical answer...\nI usu...
6,120590370,I was given a copy of this game specifically t...,2022-08-14 02:23:29,1,False,True,I was given a copy of this game specifically t...
7,116999542,A cute little game that mixes pokemon-like mec...,2022-06-15 00:45:15,1,True,False,A cute little game that mixes pokemon-like mec...
8,113971695,If you take pokemon and replace pokemon with r...,2022-04-16 21:01:32,1,True,False,If you take pokemon and replace pokemon with r...
9,113345116,Never thought it's gonna be that engaging! The...,2022-04-04 18:41:00,1,True,False,Never thought it's gonna be that engaging! The...


In [112]:
sys.path.append('../../sa')
import str_cleaning_functions


def cleaning(df, review):
    """Run the text-cleaning helpers over one column of ``df``, in place.

    The column is overwritten after every step. The helpers from
    ``str_cleaning_functions`` are applied element-wise in this order:
    ``remove_links``, ``remove_links2``, ``clean``, ``deEmojify``,
    ``unify_whitespaces``.

    Args:
        df: DataFrame holding the text column; it is modified in place.
        review: name of the column to clean.

    Returns:
        None.
    """
    cleaning_steps = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.unify_whitespaces,
    )
    for step in cleaning_steps:
        df[review] = df[review].apply(step)

In [113]:
# Clean the 'review' column in place; 'review_original' still holds the uncleaned text.
cleaning(reviews_df, 'review')

In [114]:
# Cleaned review texts as an array; this is the input to the tokenisation / embedding step below.
X = reviews_df['review'].values

In [115]:
# tokens spliting helper functions
# copied from bertopic_training.ipynb on 20240217

def split_tokens_into_smaller_chunks(
    data,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
) -> dict:
    """Cut every tokenised row of a batch into chunks and record each chunk's source row.

    For each row, ``input_ids``, ``token_type_ids`` and ``attention_mask`` are
    split with ``split_overlapping`` using the same parameters. ``X_iloc`` is
    repeated once per ``input_ids`` chunk, so every output chunk can be traced
    back to the row it was cut from (used for evaluation only).

    Args:
        data: mapping with the keys ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``X_iloc``, each a sequence with one entry per row.
        chunk_size: maximum number of tokens per chunk.
        stride: distance between the start positions of consecutive chunks.
        minimal_chunk_length: passed through to ``split_overlapping``.

    Returns:
        dict with the same four keys, holding one entry per chunk.
    """
    chunked = {'input_ids': [], 'token_type_ids': [], 'attention_mask': [], 'X_iloc': []}

    rows = zip(data['input_ids'], data['token_type_ids'], data['attention_mask'], data['X_iloc'])
    for ids, type_ids, mask, source_iloc in rows:
        id_chunks = split_overlapping(ids, chunk_size, stride, minimal_chunk_length)
        type_id_chunks = split_overlapping(type_ids, chunk_size, stride, minimal_chunk_length)
        mask_chunks = split_overlapping(mask, chunk_size, stride, minimal_chunk_length)

        chunked['input_ids'] += id_chunks
        chunked['token_type_ids'] += type_id_chunks
        chunked['attention_mask'] += mask_chunks

        # eval only: one source index per chunk produced from this row
        chunked['X_iloc'] += [source_iloc] * len(id_chunks)

    return chunked

def split_overlapping(tensor:list[int], chunk_size: int, stride: int, minimal_chunk_length: int) -> list[list[int]]:
    """Divide a 1-dimensional token sequence into (possibly overlapping) chunks.

    Chunks start at positions 0, ``stride``, 2 * ``stride``, ... and hold at most
    ``chunk_size`` items, so consecutive chunks overlap when ``stride < chunk_size``.
    When more than one chunk is produced, chunks shorter than
    ``minimal_chunk_length`` are dropped; a single chunk is returned as is.

    Args:
        tensor: the sequence to split.
        chunk_size: maximum number of items per chunk; must be >= 1.
        stride: distance between consecutive chunk starts; must be >= 1.
        minimal_chunk_length: minimum length a chunk needs to be kept when
            several chunks were produced.

    Returns:
        The list of chunks (slices of ``tensor``); empty when ``tensor`` is empty.

    Raises:
        ValueError: if ``chunk_size`` or ``stride`` is not positive. Without this
            check a negative stride would silently yield no chunks at all.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}')
    if stride < 1:
        raise ValueError(f'stride must be a positive integer, got {stride}')

    result = [tensor[i : i + chunk_size] for i in range(0, len(tensor), stride)]
    if len(result) > 1:
        # ignore chunks with less than minimal_length number of tokens
        result = [x for x in result if len(x) >= minimal_chunk_length]
    return result


def tokenize_dataset(data, tokenizer):
    """Tokenise ``data['text']`` with the given tokenizer.

    The tokenizer is asked for attention masks and token type ids, with no
    special tokens, no tensor conversion and no truncation.

    Args:
        data: mapping with a ``'text'`` entry holding the texts to tokenise.
        tokenizer: callable tokenizer accepting the keyword options set below.

    Returns:
        Whatever the tokenizer call returns.
    """
    tokenizer_options = dict(
        return_attention_mask=True,
        return_token_type_ids=True,
        add_special_tokens=False,
        return_tensors=None,
        truncation=False,
    )
    return tokenizer(data['text'], **tokenizer_options)

In [116]:
from sentence_transformers import SentenceTransformer
from datasets import Dataset

# Embed the reviews with SBERT, optionally cutting long reviews into model-sized chunks first.
split_sentence = True
sbert_model_name = 'all-MiniLM-L6-v2'       # !!! check with the model to be loaded
sbert = SentenceTransformer(sbert_model_name, device=device)

if not split_sentence:
    embeddings = sbert.encode(X, show_progress_bar=True, batch_size=64)
else:
    tokenizer = sbert[0].tokenizer
    # chunk length and stride are equal, so the chunks do not overlap
    max_chunk_tokens = sbert.max_seq_length - 2

    # tokenize the dataset
    ds_sentences = Dataset.from_dict({'text': X}).map(
        tokenize_dataset,
        batched=True,
        fn_kwargs={'tokenizer': tokenizer},
    )

    # then split the tokens into smaller chunks; X_iloc remembers the source row of each chunk
    ds_sentences2 = Dataset.from_dict({
        'input_ids': ds_sentences['input_ids'],
        'token_type_ids': ds_sentences['token_type_ids'],
        'attention_mask': ds_sentences['attention_mask'],
        'X_iloc': list(range(len(X))),
    }).map(
        split_tokens_into_smaller_chunks,
        batched=True,
        fn_kwargs={'chunk_size': max_chunk_tokens, 'stride': max_chunk_tokens, 'minimal_chunk_length': 1},
    )

    # re-create new sentences based on tokens
    X_new = [
        tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(chunk_ids))
        for chunk_ids in ds_sentences2['input_ids']
    ]

    embeddings = sbert.encode(X_new, show_progress_bar=True, batch_size=64)

    print('Created embeddings with split sentences')
    

Map:   0%|          | 0/15 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1560 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 15/15 [00:00<00:00, 2511.16 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 4942.62 examples/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.34it/s]

Created embeddings with split sentences





In [117]:
if split_sentence:
    # One row per chunk: each review's row is repeated for every chunk cut from it
    # (the original index labels are kept, so they repeat as well).
    # .copy() makes this an independent frame, so adding the column below does not
    # trigger SettingWithCopyWarning on the .iloc selection.
    reviews_df_split = reviews_df.iloc[ds_sentences2['X_iloc']].copy()
    reviews_df_split['review_split'] = X_new

else:
    # independent copy as well, so later edits never leak back into reviews_df
    reviews_df_split = reviews_df.copy()

reviews_df_split

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free,review_original,review_split
0,150188804,"Neat pokemon-like game, it's really fun and th...",2023-11-15 20:48:55,1,False,True,"Neat pokemon-like game, it's really fun and th...","neat pokemon - like game, it ' s really fun an..."
1,142944112,Okay - the game is interesting and I like the ...,2023-07-28 09:42:48,-1,False,False,Okay - the game is interesting and I like the ...,okay - the game is interesting and i like the ...
2,140306633,Originally a game that went Free. The battles ...,2023-06-18 23:39:28,-1,False,True,Originally a $9.99 game that went Free. The ba...,originally a game that went free. the battles ...
3,133609998,Pokemon-style game as interplanet delivery boy...,2023-02-25 20:34:32,-1,False,True,Pokemon-style game as interplanet delivery boy...,pokemon - style game as interplanet delivery b...
4,129715521,Cute sprites. Amusing dialogues. Moody music.,2022-12-30 19:34:07,1,False,True,Cute sprites. Amusing dialogues. Moody music.,cute sprites. amusing dialogues. moody music.
5,122704322,This is a very very skeptical usually go for t...,2022-09-23 20:13:48,-1,True,False,This is a very very skeptical answer...\nI usu...,this is a very very skeptical usually go for t...
5,122704322,This is a very very skeptical usually go for t...,2022-09-23 20:13:48,-1,True,False,This is a very very skeptical answer...\nI usu...,##cs with a robot of their own - you then save...
5,122704322,This is a very very skeptical usually go for t...,2022-09-23 20:13:48,-1,True,False,This is a very very skeptical answer...\nI usu...,to use to teleport you to the workshops to mak...
5,122704322,This is a very very skeptical usually go for t...,2022-09-23 20:13:48,-1,True,False,This is a very very skeptical answer...\nI usu...,- walk back to heal. costs 5g i think you can ...
5,122704322,This is a very very skeptical usually go for t...,2022-09-23 20:13:48,-1,True,False,This is a very very skeptical answer...\nI usu...,stations as affordable as workshops ( this als...


Evaluation

In [118]:
from eval_metrics import SEARCH_BEHAVIOUR

sys.path.append('../../topic_modelling/bertopic_dev')
from bertopic_utils import _load_bertopic_model

%autoreload 2
from dataset_loader import GENRES, load_dataset

In [119]:
# Locate the trained BERTopic run (best model + config folder) and read its result.json.
# (Originally also used to test whether results are the same when the model is loaded from disk.)
# The folder name is rebuilt from the values below, so they have to match the run on disk.

genre = GENRES.INDIE
split_sentence = True
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
training_datetime = datetime(2024, 2, 14, 11, 15, 56)

training_folder_p = Path(f'../bertopic_dev/category_{str(genre)}_unique_review_text')

split_tag = "[split]" if split_sentence else ""
run_timestamp = training_datetime.strftime("%Y%m%d_%H%M%S")
training_folder = training_folder_p.joinpath(
    f'bertopic{split_tag}_genre_{str(genre)}_{search_behaviour.value}_{run_timestamp}'
)

training_result_json_path = training_folder.joinpath('result.json')
training_result = json.loads(training_result_json_path.read_text())

# embeddings (uncomment it to load embeddings from training set for quick verification)
# embeddings_path = training_folder.joinpath(
#     f'embeddings_{training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"]}.pkl'
# )
# if embeddings_path.exists():
#     with open(embeddings_path, 'rb') as f:
#         embeddings = np.load(f)

#     assert embeddings.shape[0] == len(X), f'Number of embeddings ({embeddings.shape[0]}) does not match the number of reviews ({len(X)}). Function terminates.'
# else:
#     # raise Exception('No embeddings found. Function terminates.')
#     print('No embeddings found.')
#     embeddings = []


# model: the checkpoint path stored in result.json is joined onto the parent of training_folder_p
best_model_checkpoint_path = training_folder_p.parent / Path(training_result['best_model_checkpoint'])

print(best_model_checkpoint_path)

../bertopic_dev/category_indie_unique_review_text/bertopic[split]_genre_indie_grid_search_20240214_111556/bertopic_bt_nr_topics_100


In [120]:
# Optionally switch to a sibling checkpoint in the same run folder,
# selected by its number of topics.

N_TOPICS = 30

best_model_checkpoint_path = Path(best_model_checkpoint_path).parent / f'bertopic_bt_nr_topics_{N_TOPICS}'

print(best_model_checkpoint_path)

../bertopic_dev/category_indie_unique_review_text/bertopic[split]_genre_indie_grid_search_20240214_111556/bertopic_bt_nr_topics_30


In [121]:
bertopic_model = _load_bertopic_model(best_model_checkpoint_path)

# Documents handed to the model: the chunk texts when reviews were split,
# otherwise the cleaned reviews.
if split_sentence:
    X = reviews_df_split['review_split'].values
else:
    X = reviews_df_split['review'].values

# split_sentence is set in more than one cell; if the values disagree (or cells were run
# out of order) the documents no longer line up with the precomputed embeddings.
assert len(X) == len(embeddings), (
    f'Number of documents ({len(X)}) does not match the number of embeddings ({len(embeddings)}).'
)

topics, probs = bertopic_model.transform(X, embeddings=embeddings)

topic_model = bertopic_model

2024-03-15 00:08:08,117 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


Evaluation

Getting the top-N keywords is meaningless here; what we want is the topic frequency table.

In [122]:
# Per-game output folder for the external evaluation results.
eval_results_external_folder_path = Path(f'../eval_results_external/{game_name}')

print(eval_results_external_folder_path)

# exist_ok=True keeps the cell re-runnable and avoids the separate exists() check
eval_results_external_folder_path.mkdir(parents=True, exist_ok=True)

../eval_results_external/yavi


In [123]:
# build the topic frequency table from results

from collections import Counter

# One row per predicted topic id with its number of documents, ordered by topic id.
topic_freq = pd.DataFrame(
    sorted(Counter(topics).items()),
    columns=['Topic', 'Count'],
)
topic_freq

Unnamed: 0,Topic,Count
0,-1,7
1,0,4
2,4,1
3,6,2
4,13,1
5,14,1
6,18,5


In [124]:
# Persist the topic frequency table in this game's external-evaluation folder.
topic_freq.to_pickle(eval_results_external_folder_path / 'df_eval_topic_freq.pkl')

Since the number of reviews is small, we can print all reviews together with their topics.

Beforehand, we read the topic_id_to_label.json file to view the topic names

In [125]:
# save the df_original_text object for reference
eval_folder_path = Path('../eval_results')
eval_folder_path = eval_folder_path.joinpath(
    *best_model_checkpoint_path.parts[2:]
)

if not eval_folder_path.exists():
    eval_folder_path.mkdir(parents=True)

print(eval_folder_path)

../eval_results/category_indie_unique_review_text/bertopic[split]_genre_indie_grid_search_20240214_111556/bertopic_bt_nr_topics_30


In [126]:
# Topic labels for the selected checkpoint; the JSON keys are topic ids as strings.
topic_id_to_label_json_path = eval_folder_path.joinpath('topic_id_to_label.json')
topic_id_to_label_json = json.loads(topic_id_to_label_json_path.read_text())

topic_id_to_label_json

{'-1': 'Challenging Fun',
 '0': 'Friendly Fun',
 '1': "Freddy's Nightmares",
 '2': 'Mac/Linux Support',
 '3': 'Better Minecraft',
 '4': 'VR Truck Simulator',
 '5': 'Goat Simulator',
 '6': 'Good Story',
 '7': 'Mountain Adventure',
 '8': 'Addictive Fun',
 '9': 'Easy Achievements',
 '10': 'Shark vs Divers Fun',
 '11': 'Trading Cards Simulator',
 '12': 'Lemon Capitalist',
 '13': 'Salem Prison Escape',
 '14': 'Short & Fun',
 '15': 'Depression Game',
 '16': 'Whiskey Party',
 '17': 'Mountains Screensaver',
 '18': 'Sexy Anime',
 '19': 'Bad Rats',
 '20': 'Trine 3: Disappointing 3D Puzzle Adventure',
 '21': 'Cat Game',
 '22': 'Geometry Dash',
 '23': 'Quacking Fun',
 '24': 'Gud Magic Vidy Gaem',
 '25': 'BAD GAME - Requested Refund',
 '26': 'Cancer Game',
 '27': 'Beautiful Indie Puzzle Platformer',
 '28': 'Learn English Vocabulary',
 '29': 'Toilet Disaster'}

In [127]:
# print the topic id, the original reviews
# and the topic label

for i, (topic, prob) in enumerate(zip(topics, probs)):
    print(f'Topic {topic} ({topic_id_to_label_json[str(topic)]})')
    # print(f'Prob: {prob}')
    print(f'Review: {X[i]}')
    print('')

Topic 0 (Friendly Fun)
Review: neat pokemon - like game, it ' s really fun and the monsters are really cool and interesting. it feels polished but there are some aspects that yet feel unfinished, overall i ' d say it ' s worth it to give it a try if you are into this type of games

Topic -1 (Challenging Fun)
Review: okay - the game is interesting and i like the humour and the signs telling you facts you would rather not read but the music is very repetitive and annoying. it ' s sounds interesting and kind of cool but only for a short time before i hate hearing it. it also doesn ' t alive enough if that makes sense. a lot of similar games are just walking simulators ( they shouldn ' t be ) but this game has that annoying wandering around feeling that i hate. not all games in this genre feel like that.

Topic 0 (Friendly Fun)
Review: originally a game that went free. the battles are pretty boring with a lot of the fights being against the same enemies, terrible to look at, and the music 