CTM evaluation

In [1]:
import pandas as pd
import numpy as np


from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
# from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

import nltk

import os
from pathlib import Path
import json
from datetime import datetime
import pickle

os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload

In [3]:
import sys

sys.path.append('../')

In [4]:
# load the dataset
# TODO: load external dataset

%autoreload 2
from dataset_loader import GENRES, load_dataset

# genre = -1
genre = GENRES.ACTION
unique_list = ['review_text']

if type(genre) == GENRES:
    dataset_folder = Path(f'../../dataset/topic_modelling/top_11_genres_unique_[{",".join(unique_list)}]')
    dataset, dataset_path = load_dataset(genre, dataset_folder)
else:
    dataset_folder = Path(f'../../dataset/topic_modelling/00_dataset_filtered_all_4045065.pkl').resolve()
    dataset, dataset_path = pd.read_pickle(dataset_folder), dataset_folder

# create an untoucher ver of the dataset for retrieving original text
dataset_untouched = dataset.copy()

dataset.info(verbose=True)

Load dataset from: /root/FYP/NLP/dev-workspace/dataset/topic_modelling/top_11_genres_unique_[review_text]/00_action.pkl



<class 'pandas.core.frame.DataFrame'>
Index: 1273475 entries, 0 to 4179608
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   index         1273475 non-null  int64 
 1   app_id        1273475 non-null  int64 
 2   app_name      1273475 non-null  object
 3   review_text   1273475 non-null  object
 4   review_score  1273475 non-null  int64 
 5   review_votes  1273475 non-null  int64 
 6   genre_id      1273475 non-null  object
 7   category_id   1273475 non-null  object
dtypes: int64(4), object(4)
memory usage: 87.4+ MB


In [8]:
# data preprocessing
# MUST BE IDENTICAL TO THE ONE USED IN TRAINING

import sys
sys.path.append('../../sa/')

%autoreload 2
import str_cleaning_functions

# copied from lda_demo_gridsearch.ipynb
def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_non_letters(x))
    df[review] = df[review].apply(lambda x: x.lower())
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_stopword(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

# def cleaning_strlist(str_list):
#     str_list = list(map(lambda x: clean(x), str_list))
#     str_list = list(map(lambda x: deEmojify(x), str_list))

#     str_list = list(map(lambda x: x.lower(), str_list))
#     str_list = list(map(lambda x: remove_num(x), str_list))
#     str_list = list(map(lambda x: unify_whitespaces(x), str_list))

#     str_list = list(map(lambda x: _deaccent(x), str_list))
#     str_list = list(map(lambda x: remove_non_alphabets(x), str_list))
#     str_list = list(map(lambda x: remove_stopword(x), str_list))
#     return str_list

# copied from bert_demo_gridsearch.ipynb
def cleaning_little(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

In [9]:
# create a copy of the dataset, as we need both untouched text and cleaned text
dataset['review_text_bow'] = dataset['review_text'].copy()

# also a copy for eval with LLM
# dataset_eval = dataset.copy()

In [10]:
cleaning(dataset, 'review_text_bow')
cleaning_little(dataset, 'review_text')

In [11]:
# skip removing reviews with too many punctuations for real-life performance

In [12]:
# remove docs with 0 len

# remove docs with 0 len

def _filter_zero_len(x):
    if len(x['review_text']) == 0 or len(x['review_text_bow']) == 0:
        return False
    return True

dataset = dataset[dataset.apply(lambda x: _filter_zero_len(x), axis=1)]

In [13]:
print(len(dataset))

723659


Apply lemmatizing to the preprocessed dataset as well (for BoW)

The function is identical in LDA

In [14]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

In [15]:
from datasets import Dataset

# X_preprocessed = list(map(lambda x: lemmatization(x), X_preprocessed))
# X_preprocessed = list(map(lambda x: ' '.join(x), X_preprocessed))

def lemmatization_dataset(data):
    return {'review_text2': ' '.join(lemmatization(data['review_text']))}

temp_dataset = Dataset.from_dict({'review_text': dataset['review_text_bow'].values})
temp_dataset = temp_dataset.map(lemmatization_dataset, num_proc=4)      # speed up lemmatization
dataset['review_text_bow'] = temp_dataset['review_text2']

Map (num_proc=4): 100%|██████████| 723659/723659 [02:50<00:00, 4243.50 examples/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
from copy import deepcopy

X_contextual = dataset['review_text'].values
X_bow = dataset['review_text_bow'].values
X = deepcopy(X_contextual)

In [61]:
# save the dataset for eval

dataset_path = Path(f'category_{str(genre)}_unique_review_text')
dataset_path = dataset_path.joinpath(
    Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl')
)

if not dataset_path.exists():
    dataset.to_pickle(dataset_path)
else:
    print(f'File {dataset_path} already exists')
    print('Skip saving')

Create split text for models trained with split tokens (Optional)

In [16]:
import torch
import platform
if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series mac machine

print(device)

cuda


In [53]:
# copy from ctm_training.ipynb

# from datasets import Dataset
from tqdm.autonotebook import trange

def split_X_contextual_X_bow(X_contextual, X_bow, X, sbert, split:bool=False):
    if not split:
        return X_contextual, X_bow, X, list(range(len(X_contextual)))
    else:
        X_contextual_new, X_bow_new, X_new = [], [], []
        tokenizer = sbert[0].tokenizer

        # for eval
        _original_iloc = []

        batch_size = 64
        for start_index in trange(0, len(X_contextual), batch_size, desc="Batches", disable=False):
            sentence_batch = X_contextual[start_index:start_index+batch_size]
            features = tokenizer(sentence_batch, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)

            # split overlapping
            features_split = split_tokens_into_smaller_chunks(features, sbert.max_seq_length-2,  sbert.max_seq_length-2, 1)

            for i, input_id_list in enumerate(features_split['input_ids']):
                for input_id in input_id_list:
                    X_contextual_new.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_id)))
                    X_bow_new.append(X_bow[start_index+i])
                    X_new.append(X[start_index+i])

                # for eval
                _original_iloc.extend([start_index+i]*len(input_id_list))


        assert len(X_contextual_new) == len(X_bow_new), "X_contextual_new and X_bow_new should have the same length. Found: {} and {}".format(len(X_contextual_new), len(X_bow_new))
        assert len(X_contextual_new) == len(X_new), "X_contextual_new and X_new should have the same length. Found: {} and {}".format(len(X_contextual_new), len(X_new))
        assert len(X_contextual_new) == len(_original_iloc), "X_contextual_new and _original_iloc should have the same length. Found: {} and {}".format(len(X_contextual_new), len(_original_iloc))
        return X_contextual_new, X_bow_new, X_new, _original_iloc
   
# ####################
# # helper functions
# ####################
    
# tokens spliting helper functions

def split_tokens_into_smaller_chunks(
    data,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
) -> dict:
    """Splits tokens into overlapping chunks with given size and stride."""

    _new_input_id_chunks = []
    _new_token_type_ids = []
    _new_mask_chunks = []

    for input_id, token_type_id, mask_chunk in zip(data['input_ids'], data['token_type_ids'], data['attention_mask']):
        _input_id_chunk = split_overlapping(input_id, chunk_size, stride, minimal_chunk_length)
        _token_type_id = split_overlapping(token_type_id, chunk_size, stride, minimal_chunk_length)
        _mask_chunk = split_overlapping(mask_chunk, chunk_size, stride, minimal_chunk_length)

        _new_input_id_chunks.append(_input_id_chunk)
        _new_token_type_ids.append(_token_type_id)
        _new_mask_chunks.append(_mask_chunk)    

    return {'input_ids':_new_input_id_chunks, 'token_type_ids':_new_token_type_ids, 'attention_mask': _new_mask_chunks}

def split_overlapping(tensor:list[int], chunk_size: int, stride: int, minimal_chunk_length: int) -> list[list[int]]:
    """Helper function for dividing 1-dimensional tensors into overlapping chunks."""
    # check_split_parameters_consistency(chunk_size, stride, minimal_chunk_length)
    result = [tensor[i : i + chunk_size] for i in range(0, len(tensor), stride)]
    if len(result) > 1:
        # ignore chunks with less than minimal_length number of tokens
        result = [x for x in result if len(x) >= minimal_chunk_length]
    return result


def tokenize_dataset(data, tokenizer):
    # return sbert_model[0].tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)
    return {'tokenized': tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)}


In [18]:
# ATTENTION !!!!!
# define the sbert model (SHOULD BE THE SAME AS TRAINING)
# also define whether we want to split the tokens or not

split_sentence = True
sbert_model_name = 'all-MiniLM-L6-v2'

# load the sbert model
from sentence_transformers import SentenceTransformer
sbert = SentenceTransformer(sbert_model_name, device=device)

In [54]:
X = list(dataset['review_text'].values)
X_preprocessed = list(dataset['review_text_bow'].values)

X_contextual, X_bow, X, _original_iloc = split_X_contextual_X_bow(
    X, X_preprocessed, X, 
    sbert, 
    split=split_sentences)

Batches: 100%|██████████| 11308/11308 [01:58<00:00, 95.14it/s] 


In [55]:
_original_iloc

[0,
 0,
 1,
 2,
 2,
 3,
 4,
 4,
 4,
 4,
 5,
 5,
 6,
 7,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 22,
 23,
 24,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 104,
 105,
 106,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,

In [23]:
print(len(X_contextual), len(X_bow), len(X))

789451 789451 789451


In [58]:
# save the preprocessed data

dataset_eval_split = dataset.iloc[_original_iloc]
dataset_eval_split['review_text_split'] = X_contextual

dataset_eval_split['review_text_untouched'] = dataset_untouched.loc[dataset_eval_split.index]['review_text'].values

dataset_eval_split

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_eval_split['review_text_split'] = X_contextual
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_eval_split['review_text_untouched'] = dataset_untouched.loc[dataset_eval_split.index]['review_text'].values


Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,review_text_bow,review_text_split,review_text_untouched
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",take one part faerie solitaire two part puzzle...,take one part faerie solitaire and two parts p...,Take one part Faerie Solitaire and two parts P...
25636,32133,102200,Runespell: Overture,Take one part Faerie Solitaire and two parts P...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",take one part faerie solitaire two part puzzle...,it ' s an entertaining casual game to play. it...,Take one part Faerie Solitaire and two parts P...
25637,32134,102200,Runespell: Overture,Why don't they make more games like this?! Sim...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",make game like simple card play mechanic fun a...,why don ' t they make more games like this?! s...,Why don't they make more games like this?! Si...
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",runespell overture meld together classic rpg c...,runespell : overture melds together classic rp...,Runespell: Overture melds together classic RPG...
25638,32135,102200,Runespell: Overture,Runespell: Overture melds together classic RPG...,1,0,"[25, 23, 3]","[2, 22, 23, 15, 25]",runespell overture meld together classic rpg c...,"##itaire, poker game - and it is done very wel...",Runespell: Overture melds together classic RPG...
...,...,...,...,...,...,...,...,...,...,...,...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",played spiral knight year decide give another ...,i hadn ' t played spiral knights for over 2 ye...,I hadn't played Spiral Knights for over 2 year...
4179604,6416380,99900,Spiral Knights,I hadn't played Spiral Knights for over 2 year...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",played spiral knight year decide give another ...,you go out of your way to organise something w...,I hadn't played Spiral Knights for over 2 year...
4179605,6416381,99900,Spiral Knights,This game use to be they did the rework on ene...,0,0,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",game use rework energy heat level long time ag...,this game use to be they did the rework on ene...,This game use to be good..until they did the r...
4179607,6416383,99900,Spiral Knights,This game is good to play by your self or with...,1,1,"[1, 25, 4, 37, 23, 29, 3]","[2, 1, 20, 9, 22, 29, 35, 18]",game good play self friends get annoy farm new...,this game is good to play by your self or with...,This game is good to play by your self or with...


In [6]:
# save the eval dataset for reloading

dataset_path = Path(f'category_{str(genre) if type(genre) == GENRES else "all"}_unique_review_text')
dataset_path = dataset_path.joinpath(
    Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval.pkl') if type(genre) == GENRES \
    else Path(f'preprocessed_data/category_all_dataset_eval{"_[split]" if split_sentence else ""}.pkl')
)

if not dataset_path.parent.exists():
    dataset_path.parent.mkdir(parents=True)

if not dataset_path.exists():
    dataset_eval_split.to_pickle(dataset_path)

    print('Save to', dataset_path)

---

Reload the preprocessed data

In [5]:
# load the preprocessed data

split_sentence = False

dataset_preprocessed_path = Path(f'category_{str(genre) if type(genre) == GENRES else "all"}_unique_review_text')
dataset_preprocessed_path = dataset_preprocessed_path.joinpath(
    Path(f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval{"_[split]" if split_sentence else ""}.pkl') if type(genre) == GENRES \
    else Path(f'preprocessed_data/category_all_dataset_eval{"_[split]" if split_sentence else ""}.pkl')
)

print(dataset_preprocessed_path)

if dataset_preprocessed_path.exists():
    dataset = pd.read_pickle(dataset_preprocessed_path)
    print(len(dataset))
    print('\n\n')
    print(dataset.info(verbose=True))

    if split_sentence:
        X_contextual, X_bow, X  = list(dataset['review_text_split'].values), list(dataset['review_text_bow'].values), list(dataset['review_text'].values)
    else:
        X_contextual, X_bow, X  = list(dataset['review_text'].values), list(dataset['review_text_bow'].values), list(dataset['review_text'].values)
        
else:
    print(f"{dataset_preprocessed_path} does not exist")

category_action_unique_review_text/preprocessed_data/00_action_dataset_eval.pkl
1269558



<class 'pandas.core.frame.DataFrame'>
Index: 1269558 entries, 0 to 4179608
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   index            1269558 non-null  int64 
 1   app_id           1269558 non-null  int64 
 2   app_name         1269558 non-null  object
 3   review_text      1269558 non-null  object
 4   review_score     1269558 non-null  int64 
 5   review_votes     1269558 non-null  int64 
 6   genre_id         1269558 non-null  object
 7   category_id      1269558 non-null  object
 8   review_text_bow  1269558 non-null  object
dtypes: int64(4), object(5)
memory usage: 96.9+ MB
None


---

Load the training result

In [6]:
sys.path.append('../')

from eval_metrics import compute_inverted_rbo, compute_topic_diversity, compute_pairwise_jaccard_similarity, \
                        METRICS, SEARCH_BEHAVIOUR, COHERENCE_MODEL_METRICS

In [7]:
%autoreload 2
from ctm_dataset_creation import create_ctm_dataset
from ctm_utils import _load_ctm_model, _get_topics, _get_topic_document_metrix, _get_topic_word_metrix

In [8]:
import torch
import platform
if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series mac machine

print(device)

cuda


In [9]:
# copy from ctm_training.ipynb

from tqdm.autonotebook import trange

def split_X_contextual_X_bow(X_contextual, X_bow, X, sbert, split:bool=False):
    if not split:
        return X_contextual, X_bow, X
    else:
        X_contextual_new, X_bow_new, X_new = [], [], []
        tokenizer = sbert[0].tokenizer

        batch_size = 64
        for start_index in trange(0, len(X_contextual), batch_size, desc="Batches", disable=False):
            sentence_batch = X_contextual[start_index:start_index+batch_size]
            features = tokenizer(sentence_batch, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)

            # split overlapping
            features_split = split_tokens_into_smaller_chunks(features, sbert.max_seq_length-2,  sbert.max_seq_length-2, 1)

            for i, input_id_list in enumerate(features_split['input_ids']):
                for input_id in input_id_list:
                    X_contextual_new.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_id)))
                    X_bow_new.append(X_bow[start_index+i])
                    X_new.append(X[start_index+i])


        assert len(X_contextual_new) == len(X_bow_new), "X_contextual_new and X_bow_new should have the same length. Found: {} and {}".format(len(X_contextual_new), len(X_bow_new))
        assert len(X_contextual_new) == len(X_new), "X_contextual_new and X_new should have the same length. Found: {} and {}".format(len(X_contextual_new), len(X_new))
        return X_contextual_new, X_bow_new, X_new
    
# ####################
# # helper functions
# ####################
    
# tokens spliting helper functions

def split_tokens_into_smaller_chunks(
    data,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
) -> dict:
    """Splits tokens into overlapping chunks with given size and stride."""

    _new_input_id_chunks = []
    _new_token_type_ids = []
    _new_mask_chunks = []

    for input_id, token_type_id, mask_chunk in zip(data['input_ids'], data['token_type_ids'], data['attention_mask']):
        _input_id_chunk = split_overlapping(input_id, chunk_size, stride, minimal_chunk_length)
        _token_type_id = split_overlapping(token_type_id, chunk_size, stride, minimal_chunk_length)
        _mask_chunk = split_overlapping(mask_chunk, chunk_size, stride, minimal_chunk_length)

        _new_input_id_chunks.append(_input_id_chunk)
        _new_token_type_ids.append(_token_type_id)
        _new_mask_chunks.append(_mask_chunk)    

    return {'input_ids':_new_input_id_chunks, 'token_type_ids':_new_token_type_ids, 'attention_mask': _new_mask_chunks}

def split_overlapping(tensor:list[int], chunk_size: int, stride: int, minimal_chunk_length: int) -> list[list[int]]:
    """Helper function for dividing 1-dimensional tensors into overlapping chunks."""
    # check_split_parameters_consistency(chunk_size, stride, minimal_chunk_length)
    result = [tensor[i : i + chunk_size] for i in range(0, len(tensor), stride)]
    if len(result) > 1:
        # ignore chunks with less than minimal_length number of tokens
        result = [x for x in result if len(x) >= minimal_chunk_length]
    return result


def tokenize_dataset(data, tokenizer):
    # return sbert_model[0].tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)
    return {'tokenized': tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)}


In [10]:
# load the model from disk to compare the results

search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
split_sentences = False
sbert_model_name = 'all-MiniLM-L6-v2'

# training_datetime = datetime(2024, 2, 14, 22, 4, 32)      # split=True
# training_datetime = datetime(2024, 2, 16, 11, 59, 10)        # split=False
# training_datetime = datetime(2024, 2, 24, 14, 32, 52)        # split=True
# training_datetime = datetime(2024, 2, 23, 0, 8, 27)        # split=True
# training_datetime = datetime(2024, 2, 29, 8, 43, 52)        # split=True
training_datetime = datetime(2024, 2, 29, 0, 55, 30)        # split=False

if type(genre) == GENRES:
    training_folder_p = Path(f'category_{str(genre)}_unique_review_text')
    training_folder = Path(f'ctm{"[split]" if split_sentence else ""}_genre_{str(genre)}_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
else:
    training_folder_p = Path(f'category_all_unique_review_text')
    training_folder = Path(f'ctm{"[split]" if split_sentence else ""}_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
training_folder = training_folder_p.joinpath(training_folder)

training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)


# load the embeddings (training)
# model_name_or_path = training_result['best_hyperparameters']['sbert_params']['model_name_or_path']
# embeddings_path = training_folder.joinpath(f'embeddings_{model_name_or_path}.pkl')
# with open(embeddings_path, 'rb') as f:
#     embeddings = np.load(f)

best_model_path = training_result['best_model_checkpoint']
ctm_hyperparameters = training_result['best_hyperparameters']['ctm_params']
sbert_params = training_result['best_hyperparameters']['sbert_params']

# ctm_hyperparameters['bow_size'] = 2000
# ctm_hyperparameters['contextual_size'] = 768

# create the dataset on the fly
with open(Path(best_model_path).joinpath('count_vectorizer.pkl'), 'rb') as f:
    vectorizer = pickle.load(f)



In [11]:
training_dataset, _, _, _, _ = create_ctm_dataset(
    X_contextual, X_bow, X,
    sbert_params, training_folder, 
    vectorizer=vectorizer,
    X_contextual_embedding_path=training_folder.parent.joinpath(
        f'preprocessed_data/{genre.value:02}_{str(genre)}_dataset_eval{"_[split]" if split_sentences else ""}_X_contextual_embeddings.npy') if type(genre) == GENRES \
        else training_folder.parent.joinpath(f'preprocessed_data/category_all_dataset_eval{"_[split]" if split_sentences else ""}_X_contextual_embeddings.npy')
)

Found existing sbert embeddings at category_action_unique_review_text/preprocessed_data/00_action_dataset_eval_X_contextual_embeddings.npy. Reusing them.


In [89]:
# # save the embeddings for later use
# X_contextual_embeddings_path_p = Path(best_model_path).parent.parent.joinpath('preprocessed_data')
# X_contextual_embeddings_path = X_contextual_embeddings_path_p.joinpath(
#     f'{genre.value:02}_{str(genre)}_dataset_eval{"_[split]" if split_sentences else ""}_X_contextual_embeddings.npy')       # save the embeddings for future evaluation

# if not X_contextual_embeddings_path_p.exists():
#     X_contextual_embeddings_path_p.mkdir(parents=True)

# if not X_contextual_embeddings_path.exists():
#     with open(X_contextual_embeddings_path, 'wb') as f:
#         np.save(f, training_dataset.X_contextual)

#     print('Save X_contextual embeddings to', X_contextual_embeddings_path)

Save X_contextual embeddings to category_indie_unique_review_text/preprocessed_data/01_indie_dataset_eval_X_contextual_embeddings.npy


In [12]:
# change the best model path

best_model_path = Path(training_result['best_model_checkpoint']).parent.joinpath(
    'ctm_ctm_n_components_30'
)

In [13]:
print(Path(best_model_path))

category_action_unique_review_text/ctm_genre_action_grid_search_20240229_005530/ctm_ctm_n_components_30


In [14]:
best_model = _load_ctm_model(Path(best_model_path), ctm_hyperparameters)



In [15]:
# whether the model is trained with sentence-split or not

---

Visualization

(Go to another notebook ctm_eval_vis.ipynb for those results)

---

In [16]:
# save the df_original_text object for reference
eval_folder_path = Path('../eval_results')
eval_folder_path = eval_folder_path.joinpath(
    best_model_path
)

if not eval_folder_path.exists():
    eval_folder_path.mkdir(parents=True)

Get top N keywords for each topic

In [17]:
# get top N keywords for each topic

topic_list = best_model.get_topic_lists(10)

topic_list

top_N_words = 10

topic_keywords = {}

for topic_id, _keywords in enumerate(best_model.get_topic_lists(top_N_words)):
    topic_keywords[topic_id] = _keywords

    print(f'Topic {topic_id}:')
    print(', '.join(_keywords))
    print()

Topic 0:
friend, play, multiplayer, fun, friends, op, online, server, singleplayer, campaign

Topic 1:
pretty, good, graphic, bad, damn, realy, ok, kinda, overall, job

Topic 2:
dungeon, character, new, soul, item, level, isaac, stats, generate, different

Topic 3:
ship, station, review, like, thing, sky, say, really, hour, money

Topic 4:
kill, shoot, guy, shot, gun, dinosaur, die, wan, cop, blow

Topic 5:
youtube, epic, video, brilliant, award, deserve, favourite, favorite, masterpiece, absolutely

Topic 6:
mainly, alright, conclusion, dr, worry, prefer, rating, speak, tl, obviously

Topic 7:
highly, action, fan, recommend, rpg, paced, recommended, person, genre, fast

Topic 8:
friend, friends, addict, addictive, ton, fun, alot, especially, extremely, hilarious

Topic 9:
minecraft, world, terrarium, building, build, boss, craft, adventure, stuff, explore

Topic 10:
ive, simulator, market, possibly, best, period, date, history, star, duty

Topic 11:
alright, mainly, conclusion, worry,

Get most representative docs

In [18]:
# note that due to its stochastic (sampling) nature, the doc-topic distribution may not be the same each time.
# although a n_samples param is applied
doc_topic_distribution = best_model.get_doc_topic_distribution(training_dataset, n_samples=20)

# sample call
top_docs = best_model.get_top_documents_per_topic_id(X, doc_topic_distribution, 8, k=10)

# repr_docs = {}
# repr_docs_ids = {}

# for topic_id in range(best_model.n_components):
#     repr_docs[topic_id] = best_model.get_top_documents_per_topic_id(X, doc_topic_distribution, topic_id, k=10)
#     repr_docs_ids[topic_id] = list(_get_top_docs_index_per_topic(X, doc_topic_distribution, topic_id, 10))

  0%|          | 0/19360 [00:00<?, ?it/s]

100%|██████████| 19360/19360 [00:39<00:00, 490.90it/s]


In [19]:
repr_docs = {}
repr_docs_ids = {}

# to get the index of the top documents
def _get_top_docs_index_per_topic(X, doc_topic_distributions, topic_id, k=10):
    probability_list = doc_topic_distributions.T[topic_id]
    ind = probability_list.argsort()[-k:][::-1]
    return ind

for topic_id in range(best_model.n_components):
    repr_docs[topic_id] = best_model.get_top_documents_per_topic_id(X, doc_topic_distribution, topic_id, k=10)
    repr_docs_ids[topic_id] = list(_get_top_docs_index_per_topic(X, doc_topic_distribution, topic_id, 10))

In [20]:
repr_docs

{0: [("Mount & Blade: Warband is a game like none ever before. It is peerless, perfect and a true everlasting masterpiece. It is an open world sandbox medieval RPG large scale battle simulation. Start out as an unknown adventurer and rise to be one of the mightiest lords, if not a ruler, of the entire realm. Don't get put off by the dated visuals, the depth is infinite and Warband is one of the best ten games I have ever played in my entire life so far. (And I played thousands of them) What might start out as a confusing journey to some, becomes the pinnacle of immersement when you're riding with 200 knights along you into battle, singling out the enemy commander and killing him in single combat while dozens of arrows hit the ground next to you. I can't wait for Mount & Blade 2: Bannerlord to finally finish development. It is personally my most expected game of the decade.",
   0.9995994),
  ('cant believe a free game can be this much fun, best to play with friend for easier communicat

In [21]:
repr_docs_ids

{0: [948056,
  891962,
  121559,
  1188801,
  280468,
  129648,
  526145,
  642973,
  261588,
  710713],
 1: [334039,
  55238,
  540617,
  973837,
  1045864,
  897888,
  1008057,
  983236,
  1037023,
  153873],
 2: [22356,
  575833,
  93722,
  375877,
  661637,
  376419,
  757002,
  1128223,
  458516,
  464808],
 3: [489304,
  441754,
  447886,
  453975,
  438799,
  441028,
  487786,
  451662,
  453330,
  439399],
 4: [656658,
  700278,
  698480,
  300873,
  700010,
  701274,
  700120,
  697658,
  299921,
  698409],
 5: [856343,
  859971,
  1060064,
  856319,
  1104357,
  856478,
  856181,
  309560,
  266413,
  856704],
 6: [358563,
  358381,
  360504,
  359028,
  362780,
  361362,
  1173872,
  359006,
  364430,
  360478],
 7: [771032,
  135455,
  362709,
  1154559,
  305211,
  473410,
  1217818,
  369554,
  1201223,
  757141],
 8: [549044,
  356636,
  544910,
  985136,
  174836,
  1020653,
  1110855,
  890273,
  73985,
  1128619],
 9: [99595, 39726, 68675, 44675, 101512, 85726, 59482,

In [22]:
# create a dataframe with only these repr docs, their topic id, and the probability
df_original_texts = []

for topic_id, _repr_docs_ids in repr_docs_ids.items():
    t = dataset.iloc[_repr_docs_ids]
    t['topic_id'] = topic_id

    df_original_texts.append(t)

df_original_texts = pd.concat(df_original_texts)
df_original_texts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['topic_id'] = topic_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['topic_id'] = topic_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['topic_id'] = topic_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,genre_id,category_id,review_text_bow,topic_id
3723741,5777553,48700,Mount & Blade: Warband,Mount & Blade: Warband is a game like none eve...,1,0,"[1, 3]","[2, 1, 22, 29, 30]",mount blade warband game like none ever peerle...,0
3489274,5374980,41300,Altitude,"cant believe a free game can be this much fun,...",1,0,"[1, 4, 37, 23, 28, 2]","[2, 1, 49, 36, 47, 9, 38, 48, 27, 22, 18, 15, 17]",cant believe free game much fun best play frie...,0
139523,160499,107100,Bastion,I heard a lot about this game before I decided...,1,0,"[1, 23, 3]","[2, 22, 28, 29, 23, 25, 41, 42, 43]",heard lot game decide buy heard great soundtra...,0
4060015,6265724,7670,BioShock,Great game i beat it on xbox 360 along time ag...,1,0,"[1, 3]","[2, 18]",great game beat xbox along time ago bring beat...,0
1075691,1554118,22380,Fallout: New Vegas,Johnny didn't have enough funds to buy Fallout...,1,0,"[1, 3]","[2, 22, 18, 25, 41, 42]",johnny enough fund buy fallout johnny saw fall...,0
...,...,...,...,...,...,...,...,...,...,...
3390611,5236128,400,Portal,A brilliant puzzler that is easy to pick up on...,1,0,[1],"[2, 22, 13, 18, 17, 16, 14, 41, 42, 44]",brilliant puzzler easy pick lot fun play,29
1966431,3045141,262410,World of Guns: Gun Disassembly,The only thing you do in this game is disassem...,0,0,"[1, 4, 37, 23, 28, 2]","[2, 22, 29, 30, 35]",thing game disassemble gun put back together w...,29
2236110,3442790,282140,SOMA,This game invokes that feeling of dread in the...,1,0,"[1, 25, 23]","[2, 22, 28, 29, 30, 23, 43, 44]",game invokes feel dread pit stomach slowly cre...,29
2690657,4193533,322920,theHunter: Primal,"Save yourself $32 CAD, and go buy Ark: Surviva...",0,0,"[1, 25, 28]","[2, 1, 9, 22, 15, 25]",save cad go buy ark survival evolve thehunter ...,29


In [23]:
# check which repr docs do not begin from beginning of the original review_text

def check_beginning_of_review_text(row):
    t = row['review_text_split'].split()[0].lower()
    return t != row['review_text'][:len(t)].lower()

t = df_original_texts[
    df_original_texts.apply(
        lambda x: check_beginning_of_review_text(x),
        axis=1
    )]

print(len(t))

26


In [24]:
t

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes,review_text_bow,review_text_split,review_text_untouched,topic_id
1376097,2034132,235800,Audiosurf 2,"Ah, Audiosurf. The game where you can fly thro...",1,0,ah audiosurf game fly color change track colle...,' s lots of fun.,"Ah, Audiosurf. The game where you can fly thro...",0
3316346,5129879,391540,Undertale,there's certainly a lot to be said of this ga...,1,0,certainly lot say game amaziingly story depart...,there ' s certainly a lot to be said of this g...,Well.... there's certainly a lot to be said of...,0
1654568,2494421,245170,Skullgirls 2nd Encore,TL;DR: This is fighting games done right. If y...,1,0,tl dr fight game do right enjoy marvel v capco...,death ' combos that existed early in the game ...,TL;DR: This is fighting games done right. If y...,1
1654568,2494421,245170,Skullgirls 2nd Encore,TL;DR: This is fighting games done right. If y...,1,0,tl dr fight game do right enjoy marvel v capco...,they ' re totally adding characters to the gam...,TL;DR: This is fighting games done right. If y...,1
1654568,2494421,245170,Skullgirls 2nd Encore,TL;DR: This is fighting games done right. If y...,1,0,tl dr fight game do right enjoy marvel v capco...,in my mouth. every character brings something ...,TL;DR: This is fighting games done right. If y...,1
2477625,3862880,304050,Trove,"Trove, it was fun for a few minutes until you ...",0,1,trove fun minute relize entire game diffrent d...,as the last but you get slightly diffrent loot...,"Trove, it was fun for a few minutes until you ...",7
4012527,6206983,65980,Sid Meier's Civilization: Beyond Earth,Warning! This review contains lots of comparis...,0,0,warn review contains lot comparison civilizati...,trade route with the station to gain its advan...,Warning! This review contains lots of comparis...,7
2083876,3223211,270130,The Gallery - Episode 1: Call of the Starseed,"TL;DR : If it comes free with your Vive, or if...",0,0,tl dr come free vive access via family share g...,and the only replay value is completing achiev...,"TL;DR : If it comes free with your Vive, or if...",9
1958894,3032510,262060,Darkest Dungeon®,Even if at the time I purchased this (when eve...,0,1,even time purchase everybody dog post gameplay...,": / as a result of which jim sterling, as a st...",Even if at the time I purchased this (when eve...,9
1958894,3032510,262060,Darkest Dungeon®,Even if at the time I purchased this (when eve...,0,1,even time purchase everybody dog post gameplay...,##madeath and a single automatic save – and th...,Even if at the time I purchased this (when eve...,9


In [35]:
# print out the original text and the split (if any) for reference

for topic_id in repr_docs_ids.keys():
    print(f'Topic {topic_id}:')

    for index, row in df_original_texts[df_original_texts['topic_id'] == topic_id].iterrows():
        print(f'Doc {index}')
        print(f'Original: {row["review_text"]}')
        if split_sentence:
            print(f'Split: {row["review_text_split"]}')
        print()

Topic 0:
Doc 3723741
Original: Mount & Blade: Warband is a game like none ever before. It is peerless, perfect and a true everlasting masterpiece. It is an open world sandbox medieval RPG large scale battle simulation. Start out as an unknown adventurer and rise to be one of the mightiest lords, if not a ruler, of the entire realm. Don't get put off by the dated visuals, the depth is infinite and Warband is one of the best ten games I have ever played in my entire life so far. (And I played thousands of them) What might start out as a confusing journey to some, becomes the pinnacle of immersement when you're riding with 200 knights along you into battle, singling out the enemy commander and killing him in single combat while dozens of arrows hit the ground next to you. I can't wait for Mount & Blade 2: Bannerlord to finally finish development. It is personally my most expected game of the decade.

Doc 3489274
Original: cant believe a free game can be this much fun, best to play with fr

---

In [24]:
# compute topic frequency using the doc-topic distribution
from collections import Counter

topic_frequency = np.argmax(doc_topic_distribution, axis=1)

topic_frequency_counter = dict(Counter(topic_frequency))

# create dataframe for the topic frequency
# with key of the dict as topic_id and value as frequency
df_topic_freq = pd.DataFrame.from_dict(topic_frequency_counter, orient='index', columns=['frequency'])
df_topic_freq.reset_index(inplace=True)
df_topic_freq.rename(columns={'index': 'topic_id'}, inplace=True)
df_topic_freq.sort_values(by='frequency', ascending=False, inplace=True)
df_topic_freq.reset_index(drop=True, inplace=True)

df_topic_freq

Unnamed: 0,topic_id,frequency
0,10,63103
1,8,61635
2,5,61178
3,29,58071
4,14,58044
5,6,57557
6,17,56781
7,11,56524
8,27,53382
9,1,51839


In [25]:
# within the topic lists (the words)
# find out common words between topics

# from itertools import combinations

# topic_list = best_model.get_topic_lists(k=10)

# common_words = set()
# for topic1, topic2 in combinations(topic_list, 2):
#     common_words.update(set(topic1).intersection(set(topic2)))

# common_words = list(common_words)
# common_words.sort()
# common_words

In [26]:
# len(common_words)

In [27]:
eval_folder_path.joinpath('df_eval_top_10.pkl').resolve()

PosixPath('/root/FYP/NLP/dev-workspace/topic_modelling/eval_results/category_action_unique_review_text/ctm_genre_action_grid_search_20240229_005530/ctm_ctm_n_components_30/df_eval_top_10.pkl')

In [28]:
# save the top N (10) keywords
with open(eval_folder_path.joinpath('top_N_keywords.json'), 'w') as f:
    json.dump(topic_keywords, f, indent=2)

# save the top N (10) documents
df_original_texts.to_pickle(eval_folder_path.joinpath('df_eval_top_10.pkl'))

df_topic_freq.to_pickle(eval_folder_path.joinpath('df_eval_topic_freq.pkl'))

---

Test the capability of CTM with LLM topic naming

In [29]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [30]:
# can try diff llama2: https://ollama.com/library/yarn-llama2

llm = Ollama(model="llama2")        # assuming the port is 11434

In [31]:
# prompt engineering
system_message = "You are a player of the game who is reading the reviews about the game."

human_template = \
'''Create a name for a topic given the topic's keywords and some most representative reviews of the topic. Output a label for the topic in less than 5 words. Do not output other text. 

The top keywords of the topic is: \'\'\'{topic_keywords}\'\'\'. 

The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'. '''

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_template)
])

chain = chat_prompt | llm

In [32]:
import random
import time

N_times = 5

topic_ids = list(repr_docs.keys())

new_topic_labels = {}
randomed_topic_reviews = {}

for topic_id in topic_ids:
    _topic_keywords = topic_keywords[topic_id]

    time.sleep(1)

    _reviews_df = df_original_texts[df_original_texts['topic_id'] == topic_id]
    for i in range(N_times):
        while True:
            _sampled_reviews_df = _reviews_df.sample(n=2, replace=False)
        
            if split_sentence:
                check_bool = _sampled_reviews_df.apply(lambda x: len(x['review_text_split']) < 5000, axis=1)
            else:
                check_bool = _sampled_reviews_df.apply(lambda x: len(x['review_text']) < 5000, axis=1)

            if all(check_bool):
                break
        
        if split_sentence:
            topic_reviews = list(_sampled_reviews_df['review_text_split'].values)
        else:
            topic_reviews = list(_sampled_reviews_df['review_text'].values)

        result = chain.invoke({
            'topic_keywords':_topic_keywords,
            'topic_reviews':topic_reviews
        })

        print(f'{topic_id:02}_call{i}: {result}')

        if topic_id not in new_topic_labels:
            new_topic_labels[topic_id] = {}
            randomed_topic_reviews[topic_id] = {}

        new_topic_labels[topic_id][f"call_{i}"] = result
        randomed_topic_reviews[topic_id][f"call_{i}"] = {
            'reviews': topic_reviews,
            "col_index": _sampled_reviews_df['index'].values.tolist()
        }

    print()

00_call0: Friendly Survival
00_call1: 
Topic Label: Fun Indie Game


00_call2: Friendly Multiplayer Fun
00_call3: Friendly Survival
00_call4: Fun Multiplayer Game

01_call0: Co-op fun with toxic players.
01_call1: "Good Coop Game with Toxic Players"
01_call2: Co-op fun
01_call3: Coop Fun
01_call4: 
Nonfunctioning Eyeballs

02_call0: 
Dungeon RPG
02_call1: 
Topic: Epic Game with Great Single Player and Coop Experience.
02_call2: 
"New Dungeon RPG"
02_call3: 
Topic Label: "Underwhelming Experience with too much repetition"
02_call4: 
Topic Label: Incomplete Game

03_call0: Fun Waste Game
03_call1: Money-Worthy Experience
03_call2: "Fun and challenging game with great visuals"
03_call3: "Fun in a short hour"
03_call4: 
Topic Label: Fun shooter with balanced gameplay

04_call0: Action-packed
04_call1: Killer Game
04_call2: "Dino Shooter Woe"
04_call3: "Dinosaur Shootout"
04_call4: 
"Addictive and Rage-Inducing"

05_call0: 
Topic Label: Brilliant Puzzle Platformer
05_call1: 
Topic Label: Trine
05_call2: Trine: Creativity Unleashed
05_call3: 
Game: Brilliant 

In [33]:
randomed_topic_reviews

{0: {'call_0': {'reviews': ["Hands-down the best multiplayer survival game on the market. Terrifying and stressful, but fun and interesting at the same time. Unlike many other survival games, the difficulty of this one never diminishes based on how far into it you've progressed. -Excellent Atmosphere -Multitude of areas and gameplay elements to explore -Very difficult If you like survival with friends, you'll like this game. Just be prepared to lot.",
    'A good little game with a story that makes no sense towards the who cares,the narrative guy is awesome.'],
   'col_index': [4168662, 168989]},
  'call_1': {'reviews': ['unoptimized bag of crap',
    "I heard a lot about this game before I decided to buy it! I heard about a great soundtrack playing in the background, creating a nice atmosphere that carries you from hour to hour of playing. I heard about a neat art style, distinguishing itself from a lot of other games out there while still being nice to look at and sort of familiar. I

In [34]:
# save the topic labels

llm_generation_result = {
    'new_topic_labels': new_topic_labels,
    'randomed_topic_reviews': randomed_topic_reviews
}

with open(eval_folder_path.joinpath('llm_generation_result.json'), 'w') as f:
    json.dump(llm_generation_result, f, indent=2)

---