BERTopic qualitative review on external documents

(i.e. review comments scraped from Steam)

In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
from datetime import datetime
import pickle
import traceback

import gensim
import nltk
import pyLDAvis

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

import sys

sys.path.append('../')

In [2]:
import platform
import torch

# Pick the torch device used by the sentence-embedding model.
# Linux/Windows: CUDA when a GPU is visible, otherwise CPU.
# Any other platform (e.g. macOS): use MPS only when the backend is actually
# available, otherwise fall back to CPU instead of failing later when the
# model is moved to a device that does not exist.
if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    _mps_backend = getattr(torch.backends, 'mps', None)     # absent on old torch builds
    _mps_available = _mps_backend is not None and _mps_backend.is_available()
    device = torch.device('mps' if _mps_available else 'cpu')        # m-series machine

print(device)

cuda


In [3]:
%load_ext autoreload

In [71]:
# the text to be evaluated

# game_steamid = 730
# game_name = 'counter-strike_2'

# game_steamid = 2138330
# game_name = 'cyberpunk2077_phantom_liberty'

# game_steamid = 1091500
# game_name = 'cyberpunk2077'

# game_steamid = 582010
# game_name = 'monster_hunter_world'

# game_steamid = 1118010
# game_name = 'monster_hunter_world_iceborne'

# game_steamid = 1716740
# game_name = 'starfield'

game_steamid = 570
game_name = "dota2"

datetime_until = datetime(2024, 1, 1, 0, 0, 0)      # only analyse reviews from this date until now (GMT+8)
# NOTE(review): the only use of datetime_until in this notebook is the
# commented-out timestamp filter in the dataframe cell, so it has no effect now.

# load the reviews from folder

reviews_reqs = []

# load the latest file
# "Latest" means last in lexicographic filename order; that equals chronological
# order only if the scraper puts a sortable timestamp in the filename.
game_folder = Path(f'../../dataset/data_scraping/steam_comments_scraping/{game_name}').resolve()
if not game_folder.exists():
    # Report the problem here: the next cell fails with a much less obvious
    # KeyError when reviews_reqs stays empty.
    print('Game folder not found:', game_folder)
else:
    review_files = sorted(game_folder.glob('steam_reviews_*.pkl'))
    if not review_files:
        print('No steam_reviews_*.pkl file found in:', game_folder)
    else:
        latest_file_path = review_files[-1]
        # SECURITY: pickle.load can execute arbitrary code. Only load files that
        # were written by our own scraping pipeline.
        with open(latest_file_path, 'rb') as f:
            reviews_reqs = pickle.load(f)           # retrieve the list of reviews
        print('Loaded:', latest_file_path)

Loaded: /root/FYP/NLP/dev-workspace/dataset/data_scraping/steam_comments_scraping/dota2/steam_reviews_570_unique.pkl


In [72]:
# Build the review dataframe with the same columns as in training/evaluation.
# Steps, in order:
#   1. keep only the columns of interest
#   2. drop duplicated (review text, vote) pairs
#   3. convert the epoch-seconds timestamp to datetime (the result is in utc+0)
#   4. map voted_up to 1 (truthy) / -1 (falsy)
#   5. keep an untouched copy of the text in 'review_original'
# The date filter is intentionally left disabled:
# reviews_df = reviews_df[reviews_df['timestamp_created'] >= datetime_until.timestamp()]
reviews_df = (
    pd.DataFrame(reviews_reqs)[
        ['recommendationid', 'review', 'timestamp_created', 'voted_up', 'steam_purchase', 'received_for_free']
    ]
    .drop_duplicates(subset=['review', 'voted_up'])
    .assign(
        timestamp_created=lambda frame: pd.to_datetime(frame['timestamp_created'], unit='s'),
        voted_up=lambda frame: frame['voted_up'].apply(lambda voted: 1 if voted else -1),
        review_original=lambda frame: frame['review'],
    )
)

reviews_df

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free,review_original
0,160077941,My all time favorite game,2024-03-08 03:28:29,1,False,False,My all time favorite game
1,160077923,very good\r\n,2024-03-08 03:27:55,1,False,False,very good\r\n
2,160077236,swag,2024-03-08 03:10:58,1,False,False,swag
3,160077159,"I played this game for almost 5,000 hours now ...",2024-03-08 03:09:20,1,False,False,"I played this game for almost 5,000 hours now ..."
4,160077082,1,2024-03-08 03:07:38,1,False,False,1
...,...,...,...,...,...,...,...
60613,129791250,pls dont,2022-12-31 17:07:26,-1,False,False,pls dont
60614,129791154,steal your soul,2022-12-31 17:06:08,1,False,False,steal your soul
60615,129790983,DotA is weird game where anything can work and...,2022-12-31 17:03:26,1,False,False,DotA is weird game where anything can work and...
60617,129788206,Still beta so be patient\n,2022-12-31 16:22:59,1,False,False,Still beta so be patient\n


In [73]:
sys.path.append('../../sa')
import str_cleaning_functions


def cleaning(df, review):
    """Clean the text column ``review`` of ``df`` in place.

    The column is passed, value by value, through the ``str_cleaning_functions``
    helpers in this fixed order: ``remove_links``, ``remove_links2``, ``clean``,
    ``deEmojify``, ``unify_whitespaces``.

    Args:
        df: dataframe holding the text column; it is modified in place.
        review: name of the column to clean.

    Returns:
        None. The cleaned text overwrites ``df[review]``.
    """
    cleaning_steps = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.unify_whitespaces,
    )
    for step in cleaning_steps:
        df[review] = df[review].apply(step)

In [74]:
# Clean the 'review' column in place (cleaning() has no return value); the raw
# text stays available in the 'review_original' column created earlier.
cleaning(reviews_df, 'review')

In [75]:
# Cleaned review texts as an array; position k in X is positional row k
# (iloc) of reviews_df, not the dataframe's index label.
X = reviews_df['review'].values

In [76]:
# tokens spliting helper functions
# copied from bertopic_training.ipynb on 20240217

def split_tokens_into_smaller_chunks(
    data,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
) -> dict:
    """Split every tokenized row of a batch into chunks.

    Each of ``input_ids``, ``token_type_ids`` and ``attention_mask`` is cut with
    ``split_overlapping`` using the same ``chunk_size``, ``stride`` and
    ``minimal_chunk_length``. The ``X_iloc`` value of a source row is repeated
    once per chunk produced from its ``input_ids`` (eval only), so every chunk
    can be traced back to the row it came from.

    Args:
        data: mapping with the keys 'input_ids', 'token_type_ids',
            'attention_mask' and 'X_iloc', each an iterable with one entry per row.
        chunk_size: maximum number of tokens per chunk.
        stride: step between the start positions of consecutive chunks.
        minimal_chunk_length: chunks shorter than this are dropped when a row
            yields more than one chunk.

    Returns:
        dict with the same four keys, holding one entry per chunk.
    """
    chunked = {'input_ids': [], 'token_type_ids': [], 'attention_mask': [], 'X_iloc': []}

    rows = zip(data['input_ids'], data['token_type_ids'], data['attention_mask'], data['X_iloc'])
    for ids, type_ids, mask, source_row in rows:
        id_chunks = split_overlapping(ids, chunk_size, stride, minimal_chunk_length)
        type_id_chunks = split_overlapping(type_ids, chunk_size, stride, minimal_chunk_length)
        mask_chunks = split_overlapping(mask, chunk_size, stride, minimal_chunk_length)

        chunked['input_ids'].extend(id_chunks)
        chunked['token_type_ids'].extend(type_id_chunks)
        chunked['attention_mask'].extend(mask_chunks)

        # eval_only: remember which source row every chunk belongs to
        chunked['X_iloc'].extend([source_row] * len(id_chunks))

    return chunked

def split_overlapping(tensor:list[int], chunk_size: int, stride: int, minimal_chunk_length: int) -> list[list[int]]:
    """Cut a 1-dimensional sequence into chunks of at most ``chunk_size`` items.

    A chunk starts every ``stride`` positions, so chunks overlap when
    ``stride < chunk_size``. When more than one chunk is produced, chunks with
    fewer than ``minimal_chunk_length`` items are discarded; a single chunk is
    always returned as is, whatever its length.
    """
    # check_split_parameters_consistency(chunk_size, stride, minimal_chunk_length)
    chunks = []
    for start in range(0, len(tensor), stride):
        chunks.append(tensor[start : start + chunk_size])

    if len(chunks) <= 1:
        return chunks

    # ignore chunks with less than minimal_length number of tokens
    return [chunk for chunk in chunks if len(chunk) >= minimal_chunk_length]


def tokenize_dataset(data, tokenizer):
    """Tokenize ``data['text']`` with ``tokenizer`` and return its output.

    The tokenizer is asked for attention masks and token type ids, without
    special tokens, without tensor conversion and without truncation.
    (The tokenizer used to be taken from ``sbert_model[0].tokenizer``; it is now
    passed in explicitly.)
    """
    tokenizer_options = {
        'return_attention_mask': True,
        'return_token_type_ids': True,
        'add_special_tokens': False,
        'return_tensors': None,
        'truncation': False,
    }
    return tokenizer(data['text'], **tokenizer_options)

In [77]:
from sentence_transformers import SentenceTransformer
from datasets import Dataset

split_sentence = True
sbert_model_name = 'all-MiniLM-L6-v2'       # !!! check with the model to be loaded
sbert = SentenceTransformer(sbert_model_name, device=device)

if not split_sentence:
    # embed every cleaned review as a whole
    embeddings = sbert.encode(X, show_progress_bar=True, batch_size=64)
else:
    tokenizer = sbert[0].tokenizer

    # Chunk size and stride are equal, so chunks do not overlap.
    # presumably the "- 2" leaves room for the special tokens added at encode time -- TODO confirm
    chunking_kwargs = {
        'chunk_size': sbert.max_seq_length-2,
        'stride': sbert.max_seq_length-2,
        'minimal_chunk_length': 1,
    }

    # tokenize the dataset
    ds_sentences = Dataset.from_dict({'text': X}).map(
        tokenize_dataset, batched=True, fn_kwargs={'tokenizer':tokenizer}
    )

    # then split the tokens into smaller chunks; X_iloc records the source position in X
    ds_sentences2 = Dataset.from_dict({
        'input_ids': ds_sentences['input_ids'],
        'token_type_ids': ds_sentences['token_type_ids'],
        'attention_mask': ds_sentences['attention_mask'],
        'X_iloc': list(range(len(X))),
    })
    ds_sentences2 = ds_sentences2.map(split_tokens_into_smaller_chunks, batched=True, fn_kwargs=chunking_kwargs)

    # re-create new sentences based on tokens
    X_new = [
        tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(chunk_ids))
        for chunk_ids in ds_sentences2['input_ids']
    ]

    embeddings = sbert.encode(X_new, show_progress_bar=True, batch_size=64)

    print('Created embeddings with split sentences')
    

Map:   0%|          | 0/33063 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1770 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 33063/33063 [00:01<00:00, 20536.82 examples/s]
Map: 100%|██████████| 33063/33063 [00:00<00:00, 54875.14 examples/s]
Batches: 100%|██████████| 525/525 [00:05<00:00, 88.07it/s] 


Created embeddings with split sentences


In [78]:
# Align the review metadata with the texts that were embedded.
if split_sentence:
    # One row per chunk: X_iloc holds, for every chunk, the positional row of the
    # review it was cut from, so a review split into k chunks appears k times.
    # .copy() turns the selection into an independent frame; adding a column to
    # the bare .iloc result triggers pandas' SettingWithCopyWarning.
    reviews_df_split = reviews_df.iloc[ds_sentences2['X_iloc']].copy()
    reviews_df_split['review_split'] = X_new

else:
    reviews_df_split = reviews_df

reviews_df_split

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free,review_original,review_split
0,160077941,My all time favorite game,2024-03-08 03:28:29,1,False,False,My all time favorite game,my all time favorite game
1,160077923,very good\r\n,2024-03-08 03:27:55,1,False,False,very good\r\n,very good
2,160077236,swag,2024-03-08 03:10:58,1,False,False,swag,swag
3,160077159,"I played this game for almost 5,000 hours now ...",2024-03-08 03:09:20,1,False,False,"I played this game for almost 5,000 hours now ...","i played this game for almost 5, 000 hours now..."
4,160077082,1,2024-03-08 03:07:38,1,False,False,1,1
...,...,...,...,...,...,...,...,...
60613,129791250,pls dont,2022-12-31 17:07:26,-1,False,False,pls dont,pls dont
60614,129791154,steal your soul,2022-12-31 17:06:08,1,False,False,steal your soul,steal your soul
60615,129790983,DotA is weird game where anything can work and...,2022-12-31 17:03:26,1,False,False,DotA is weird game where anything can work and...,dota is weird game where anything can work and...
60617,129788206,Still beta so be patient\n,2022-12-31 16:22:59,1,False,False,Still beta so be patient\n,still beta so be patient


---

Evaluation

In [79]:
from eval_metrics import SEARCH_BEHAVIOUR

sys.path.append('../../topic_modelling/bertopic_dev')
from bertopic_utils import _load_bertopic_model

%autoreload 2
from dataset_loader import GENRES, load_dataset

In [80]:
# Locate the trained BERTopic run on disk and read its result.json.
# The folder name is rebuilt from the run settings below (genre, split flag,
# search behaviour, training datetime), so they must match the training run.

# Test whether the result are the same when load the model from the disk

# load the best model and the embedding from the config folder

genre = GENRES.ACTION
# NOTE(review): split_sentence is assigned again here. It must equal the value
# used when `embeddings` was created, otherwise the texts and embeddings passed
# to the model later no longer correspond to the same training setup.
split_sentence = True
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
training_datetime = datetime(2024, 3, 1, 9, 51, 49)
# parent folder of all runs for this genre
training_folder_p = Path(f'../bertopic_dev/category_{str(genre)}_unique_review_text')
# folder of this specific run: bertopic[split]_genre_<genre>_<search>_<YYYYmmdd_HHMMSS>
training_folder = Path(f'bertopic{"[split]" if split_sentence else ""}_genre_{str(genre)}_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
training_folder = training_folder_p.joinpath(training_folder)


# result.json is read for the 'best_model_checkpoint' path below and for
# 'best_hyperparameters' later in the notebook
training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)

# embeddings (uncomment it to load embeddings from training set for quick verification)
# embeddings_path = training_folder.joinpath(
#     f'embeddings_{training_result["best_hyperparameters"]["sbert_params"]["model_name_or_path"]}.pkl'
# )
# if embeddings_path.exists():
#     with open(embeddings_path, 'rb') as f:
#         embeddings = np.load(f)

#     assert embeddings.shape[0] == len(X), f'Number of embeddings ({embeddings.shape[0]}) does not match the number of reviews ({len(X)}). Function terminates.'
# else:
#     # raise Exception('No embeddings found. Function terminates.')
#     print('No embeddings found.')
#     embeddings = []


# model
# The checkpoint path stored in result.json is joined onto the parent of
# training_folder_p (i.e. '../bertopic_dev'), not onto training_folder itself.
# best_model_checkpoint_path = training_result['best_model_checkpoint']
best_model_checkpoint_path = training_folder_p.parent.joinpath(
    Path(training_result['best_model_checkpoint'])
)

print(best_model_checkpoint_path)

../bertopic_dev/category_action_unique_review_text/bertopic[split]_genre_action_grid_search_20240301_095149/bertopic_bt_nr_topics_100


In [81]:
# or change the model path to a different one if needed
# This cell always replaces the checkpoint chosen above with the sibling folder
# 'bertopic_bt_nr_topics_<N_TOPICS>' of the same training run.

N_TOPICS = 10

best_model_checkpoint_path = Path(best_model_checkpoint_path).parent.joinpath(f'bertopic_bt_nr_topics_{N_TOPICS}')

print(best_model_checkpoint_path)

../bertopic_dev/category_action_unique_review_text/bertopic[split]_genre_action_grid_search_20240301_095149/bertopic_bt_nr_topics_10


In [82]:
# Load the selected checkpoint and assign a topic to every external document.
bertopic_model = _load_bertopic_model(best_model_checkpoint_path)

# X is rebound here: from now on it holds the texts that correspond row by row
# to `embeddings` (the chunked texts when split_sentence is True).
if split_sentence:
    X = reviews_df_split['review_split'].values
else:
    X = reviews_df_split['review'].values

# the precomputed embeddings are passed in, so the documents are not re-embedded
topics, probs = bertopic_model.transform(X, embeddings=embeddings)

# alias used by the rest of the notebook
topic_model = bertopic_model

2024-03-14 23:38:34,630 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


---

Get the top 10 keywords for each topic

The keywords are re-extracted from the new (external) documents, so they will differ from the keywords found at training time

Then evaluate

In [83]:
top_N_words = 10
topic_keywords = {}

# for new text, we re-run the func to get the keywords on the new text
# NOTE(review): _extract_topics is a private BERTopic method and appears to
# update topic_model in place (get_topics() is read right after it) -- confirm
# against the installed BERTopic version before upgrading.
documents = pd.DataFrame({"Document": X, "ID": range(len(X)), 'Topic': topics})
topic_model._extract_topics(
    documents, embeddings
)

# Each entry of get_topics() is a list of keyword tuples whose first element is
# the word. Slicing (rather than indexing 0..top_N_words-1) keeps this working
# when a topic has fewer than top_N_words keywords.
for topic_id, _keywords in topic_model.get_topics().items():
    topic_keywords[topic_id] = [_keyword[0] for _keyword in _keywords[:top_N_words]]

for topic_id, keywords in topic_keywords.items():
    print(f'Topic {topic_id}:')
    print(', '.join(keywords))
    print()

Topic -1:
play, playing, good, fun, toxic, players, like, life, hours, team

Topic 0:
best, moba, play, like, players, community, playing, new, toxic, fun

Topic 1:
update, fix, play, new, server, bad, valve, pc, uninstall, crashes

Topic 2:
10, life, recommend, cat, kill, mid, 11, years, ruined, play

Topic 3:
good, hate, hi, life, ash, steven, disney, youre, watching, nice

Topic 4:
не, игра, на, что, но, russian, это, как, для, все

Topic 5:
barry, pills, disease, inactive, ingredients, sfx, stand, pill, durring, want

Topic 6:
soul, sugar, bowl, mixture, eggs, flour, barry, pancakes, tin, crust

Topic 7:
old, classic, barry, masterpiece, best, greatest, gold, al, miss, school

Topic 8:
skins, valve, ranks, buy, items, need, hats, available, cap, tinker

Topic 9:
ether, addiction, water, meth, addictive, heroin, tsp, crack, meat, chicken




divide by zero encountered in divide



---

Evaluation (visualization)

In [84]:
# Output folder for this game and model:
#   ../eval_results_external/<game_name>/<checkpoint path below bertopic_dev>
eval_results_external_folder_path = Path(f'../eval_results_external/{game_name}')

# parts[2:] drops the first two path components of the checkpoint path
# ('..' and 'bertopic_dev' for the path printed above).
eval_results_external_folder_path = eval_results_external_folder_path.joinpath(
    *best_model_checkpoint_path.parts[2:]
)

print(eval_results_external_folder_path)

# exist_ok=True makes the cell safe to re-run and removes the separate exists() check
eval_results_external_folder_path.mkdir(parents=True, exist_ok=True)

../eval_results_external/dota2/category_action_unique_review_text/bertopic[split]_genre_action_grid_search_20240301_095149/bertopic_bt_nr_topics_10


In [50]:
# visualize topics (intertopic distance map)

fig = topic_model.visualize_topics()

save_html = True
if save_html:
    # The file name does not depend on the genre: the former
    # `if type(genre) == GENRES ... else ...` had two identical branches.
    fig.write_html(str(eval_results_external_folder_path.joinpath('intertopic_dist_map.html')))

fig

In [85]:
# Prepare data for PyLDAVis
# Builds the five inputs of pyLDAvis.prepare from the BERTopic model and the
# transform() output, then renders the interactive view.

# ref: https://github.com/rafaelvalero/different_notebooks/blob/master/bertopics_pyldavis.ipynb

top_n = N_TOPICS      # !! change the number of topics to match the model
R = 10

# first top_n + 1 rows of the c-TF-IDF matrix
topic_term_dists = topic_model.c_tf_idf_.toarray()[:top_n+1, ]
# first top_n columns of probs, plus one extra column holding 1 - (row sum)
# NOTE(review): this requires probs to be 2-D, and treats the row remainder as
# the weight of one more topic -- confirm that this matches what probs contains
# for this model (the cell output shows log warnings on zero/invalid values).
new_probs = probs[:, :top_n]
outlier = np.array(1 - new_probs.sum(axis=1)).reshape(-1, 1)
doc_topic_dists = np.hstack((new_probs, outlier))
# length of each document in characters (len of the string), not in tokens
doc_lengths = [len(doc) for doc in X]
# NOTE(review): if vectorizer_model is a scikit-learn CountVectorizer,
# vocabulary_ maps term -> column index. In that case term_frequency below holds
# column indices rather than counts, and vocab (dict order) may not line up with
# the column order of c_tf_idf_ -- TODO confirm and fix.
vocab = [word for word in topic_model.vectorizer_model.vocabulary_.keys()]
term_frequency = [topic_model.vectorizer_model.vocabulary_[word] for word in vocab]

data = {'topic_term_dists': topic_term_dists,
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_frequency}

# Visualize using pyLDAvis
vis_data= pyLDAvis.prepare(**data,R=R, n_jobs = 1, mds='mmds')

# saving is switched off here; both branches of the conditional below give the same file name
save_html = False
if save_html:
    pyLDAvis.save_html(vis_data, str(eval_results_external_folder_path.joinpath(
        f'pyLDAvis.html' if type(genre) == GENRES \
            else f'pyLDAvis.html'))
    )

pyLDAvis.display(vis_data)


divide by zero encountered in log


invalid value encountered in log


divide by zero encountered in log


invalid value encountered in log


divide by zero encountered in log



In [86]:
# visualize selected terms for a few topics
# creating bar charts out of the c-TF-IDF scores for each topic representation.

# Use the topic count of the checkpoint that was actually loaded (N_TOPICS).
# The 'nr_topics' value in training_result describes the best model of the
# search, which is not the loaded one once the checkpoint path was switched to
# bertopic_bt_nr_topics_<N_TOPICS>.
n_topics = N_TOPICS

fig = topic_model.visualize_barchart(top_n_topics=n_topics, n_words=10)
# fig.write_html(str(eval_results_external_folder_path.joinpath('topwordscores.html')))

fig

In [87]:
# topic similarity heatmap, saved next to the other evaluation artefacts
fig = topic_model.visualize_heatmap(top_n_topics=n_topics, width=1000, height=1000)

# The file name does not depend on the genre: the former
# `if type(genre) == GENRES ... else ...` had two identical branches.
fig.write_html(str(eval_results_external_folder_path.joinpath('topicsimilarity.html')))

fig

---

Qualitative evaluation

In [66]:
# get top 10 representative docs for each topic

# Approximate most representative documents per topic by sampling
# a subset of the documents in each topic and calculating which are
# most representative to their topic based on the cosine similarity between
# c-TF-IDF representations

# the method was called internally in the fit_transform method
# so that the .get_topic_info() can work properly when the model was reloaded from disk
# NOTE(review): _extract_representative_docs is a private BERTopic method; the
# four return values are used below as: topic -> docs mapping (its keys are the
# topic ids) and per-topic lists of document IDs. Sampling is involved, so the
# selected documents may differ between runs -- confirm if reproducibility matters.
repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids = topic_model._extract_representative_docs(
    topic_model.c_tf_idf_,
    # "ID" is the position of the document in X
    pd.DataFrame({"Document": X, "ID": range(len(X)), "Topic": topics}),
    topic_model.topic_representations_,
    nr_samples=500,
    nr_repr_docs=10          # the number of representative documents per topic
)

In [67]:
# Collect, for every topic, the review rows of its representative documents.
# repr_docs_ids holds one list of positional document IDs per topic; the IDs are
# positions in X, which is aligned row by row with reviews_df_split.
topic_frames = []

# NOTE(review): topic ids are derived from the list position, starting at -1
# (-1 represents outliers). This assumes repr_docs_ids is ordered -1, 0, 1, ...
# with no gaps -- confirm against repr_docs_mappings.keys().
for topic_id, topic_repr_docs_id in enumerate(repr_docs_ids, start=-1):
    # assign() returns a new frame, so no column is written into an .iloc
    # selection (which triggers pandas' SettingWithCopyWarning).
    topic_frames.append(
        reviews_df_split.iloc[topic_repr_docs_id].assign(topic_id=topic_id)
    )

df_original_texts = pd.concat(topic_frames)
df_original_texts

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free,review_original,review_split,topic_id
96345,145736692,After having played almost 60 hours in the fir...,2023-09-06 00:02:13,1,True,False,After having played almost 60 hours in the fir...,"some things are a bit unclear or easy to miss,...",-1
80995,145811986,The Short Version after beating the game:\n\nT...,2023-09-06 20:30:48,-1,True,False,The Short Version after beating the game:\n\nT...,the short version after beating the game : the...,-1
65535,146021164,Boring game with a lame main story. Pretty muc...,2023-09-10 01:09:00,-1,False,False,Boring game with a lame main story. Pretty muc...,boring game with a lame main story. pretty muc...,-1
7725,154551697,"I mean, 158 hours in, and I finally ran out of...",2023-12-27 16:35:36,1,True,False,"I mean, 158 hours in, and I finally ran out of...","i mean, 158 hours in, and i finally ran out of...",-1
43391,146884772,I've heard quite a few things about this game ...,2023-09-23 20:11:22,1,True,False,I've heard quite a few things about this game ...,i ' ve heard quite a few things about this gam...,-1
...,...,...,...,...,...,...,...,...,...
52474,146348864,Fast Travel not to Fast travel everywhere is j...,2023-09-15 17:44:25,-1,True,False,Fast Travel Simulator.Choosing not to Fast tra...,talk to npc / interact with quest objective - ...,9
26627,148740912,The premise of Starfield is incredibly compell...,2023-10-23 15:59:46,-1,False,False,The premise of Starfield is incredibly compell...,there ' s a massive disparity in the variety o...,9
52474,146348864,Fast Travel not to Fast travel everywhere is j...,2023-09-15 17:44:25,-1,True,False,Fast Travel Simulator.Choosing not to Fast tra...,r + x - talk to npc / interact with quest obje...,9
28346,148441089,"Bethesda has the good ingredients, but failed ...",2023-10-18 15:19:57,-1,True,False,"Bethesda has the good ingredients, but failed ...",##es. you can do away with 95 % of the science...,9


In [68]:
# Show, topic by topic, every representative document: its original review text
# and, when the reviews were chunked, the chunk that was assigned to the topic.

for topic_id in repr_docs_mappings:
    print(f'Topic {topic_id}:')

    topic_rows = df_original_texts.loc[df_original_texts['topic_id'] == topic_id]
    for index, row in topic_rows.iterrows():
        doc_lines = [f'Doc {index}', f'Original: {row["review_original"]}']
        if split_sentence:
            doc_lines.append(f'Split: {row["review_split"]}')
        print('\n'.join(doc_lines))
        print()

Topic -1:
Doc 96345
Original: After having played almost 60 hours in the first 4 days since release I think I'm in the position to give some first impressions. And yes I call it first impressions because I'm taking it slow and I feel like after 60 hours I only scratched the surface of this game for what will probably take me 200 hours to complete. 

The graphics range from breathtaking to disappointing. For the most part, the graphics look fantastic and some of the environments and planets just look amazing and the lighting is done really well in the game. There are are places which really do not look good at all. One would be the big city of New Atlantis which is one of the first places you will visit in the game. Don't be put off by that, the other bigger settlements/cities look significantly better visually.

One thing you need to know about is what this game is and what it isn't. This game isn't a space sim. You will spend very little time in your ship and most time on planets. The

In [88]:
# Topic frequency table: number of external documents assigned to each topic,
# ordered by topic id, with the columns 'Topic' and 'Count'.

from collections import Counter

topic_freq = (
    pd.DataFrame(Counter(topics).items(), columns=['Topic', 'Count'])
    .sort_values(by='Topic')
    .reset_index(drop=True)
)
topic_freq

Unnamed: 0,Topic,Count
0,-1,9154
1,0,5692
2,1,1467
3,2,1384
4,3,8202
5,4,6532
6,5,22
7,6,180
8,7,367
9,8,190


In [58]:
# Save the representative documents of every topic.
# top_n only names the output file here; the number of representative documents
# per topic is set by nr_repr_docs in the _extract_representative_docs call.
# Note that this rebinds top_n, which the pyLDAvis cell sets to N_TOPICS.
top_n = 10
df_original_texts.to_pickle(eval_results_external_folder_path.joinpath(f'df_eval_top_{top_n}.pkl'))

In [59]:
# also need to save the top N keywords for each topic as json
# (topic ids become string keys in the file, as JSON object keys are always strings)
with open(eval_results_external_folder_path.joinpath(f'top_{top_N_words}_keywords.json'), 'w') as f:
    json.dump(topic_keywords, f, indent=2)

In [89]:
# save the topic frequency table in the same evaluation folder
topic_freq.to_pickle(eval_results_external_folder_path.joinpath(f'df_eval_topic_freq.pkl'))

---

LLM topic naming on external docs

(copied from bertopic_eval_quali.ipynb)

In [61]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [62]:
# can try diff llama2: https://ollama.com/library/yarn-llama2
# NOTE(review): presumably this needs a local Ollama server running with the
# "llama2" model already pulled -- confirm before running the cells below.

llm = Ollama(model="llama2")        # assuming the port is 11434

In [63]:
# prompt engineering
system_message = "You are a player of the game who is reading the reviews about the game."

human_template = \
'''Create a name for a topic given the topic's keywords and some most representative reviews of the topic. Output a label for the topic in less than 5 words. Output "NA" if the topic is not clear. Do not output other text. 

The top keywords of the topic is: \'\'\'{topic_keywords}\'\'\'. 

The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'. '''

# Two-message chat prompt: the fixed system message plus the human template,
# whose {topic_keywords} and {topic_reviews} placeholders are filled in by the
# dictionary passed to chain.invoke(...).
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_template)
])

# pipe the formatted prompt into the LLM
chain = chat_prompt | llm

In [64]:
# Result stores, both keyed by topic id and then by "call_<i>".
# They live in their own cell so that re-running the generation loop skips the
# calls that already have a stored label; re-run this cell to start from scratch.
new_topic_labels = {}
randomed_topic_reviews = {}

In [65]:
import time

# Ask the LLM N_times per topic for a label. Every call shows it the topic's
# keywords and a fresh random sample of the topic's representative reviews.
N_times = 5
N_SAMPLED_REVIEWS = 2           # reviews shown to the LLM per call
MAX_REVIEW_CHARS = 5000         # sampled reviews should be shorter than this
MAX_RESAMPLE_ATTEMPTS = 20      # after more failed samples than this, the length limit is lifted

# column holding the text that was given to the topic model
_text_column = 'review_split' if split_sentence else 'review'

topic_ids = list(repr_docs_mappings.keys())

for topic_id in topic_ids:
    _topic_keywords = topic_keywords[topic_id]

    # both are kept per topic: once the limit is lifted it stays lifted for the
    # remaining calls of this topic
    temp_disable_char_limit = False
    _count = 0

    _reviews_df = df_original_texts[df_original_texts['topic_id'] == topic_id]
    if _reviews_df.empty:
        # nothing to show to the LLM; sampling from an empty frame would raise
        print(f'{topic_id:02}: no representative reviews found, skipped')
        print()
        continue

    # sample(replace=False) raises ValueError when n exceeds the number of rows,
    # so topics with a single representative review use a sample of one
    _n_sampled = min(N_SAMPLED_REVIEWS, len(_reviews_df))

    for i in range(N_times):
        # resume support: skip calls whose label is already stored
        if new_topic_labels.get(topic_id, {}).get(f"call_{i}", None) is not None:
            print(f'{topic_id:02}_call{i}: {new_topic_labels[topic_id][f"call_{i}"]}')
            continue

        # re-sample until every sampled review is short enough (or the limit is lifted)
        while True:
            if _count > MAX_RESAMPLE_ATTEMPTS:
                temp_disable_char_limit = True

            _sampled_reviews_df = _reviews_df.sample(n=_n_sampled, replace=False)
            check_bool = _sampled_reviews_df[_text_column].map(len) < MAX_REVIEW_CHARS

            if temp_disable_char_limit or all(check_bool):
                break
            _count += 1

        topic_reviews = list(_sampled_reviews_df[_text_column].values)

        print(topic_reviews)

        result = chain.invoke({
            'topic_keywords':_topic_keywords,
            'topic_reviews':topic_reviews
        })

        print(f'{topic_id:02}_call{i}: {result}')

        if topic_id not in new_topic_labels:
            new_topic_labels[topic_id] = {}
            randomed_topic_reviews[topic_id] = {}

        # keep the label together with the reviews (and their ids) that produced it
        new_topic_labels[topic_id][f"call_{i}"] = result
        randomed_topic_reviews[topic_id][f"call_{i}"] = {
            'reviews': topic_reviews,
            "recommendationid": _sampled_reviews_df['recommendationid'].values.tolist()
        }

    print()

['the missions were fun, as was the new area, the biggest problem i have is the ending to it. it \' s, much like my opinion on the main game, extremely poorly written and just strait up not enjoyable. full of plot holes, 50 % of the problems could have been solved by a cell which we have, the entire crazy that the secret ending could have been solved [ spoiler ] with a phone call or you know, demanding you bring your romantic interest with you ( in my case judy ), or you emergency contact? have the doctor or your newfound friend at nusa keep your friends up to date on your you don \' t get to play your character, you play cd projekt red \' s character the entire game. enjoy your " rpg " that \' s actually just a driven action adventure writer of this ending clearly just wanted people to feel like shit for having the audacity of wanting a subvert of expectations, you know, the thing everyone considers good story telling. i know the cyberpunk series extremely well, it \' s a massive subv

In [66]:
# save the topic labels, sampled docs and their ids
# in the eval folder
# Both dictionaries are keyed by topic id and then by "call_<i>"; the file is
# overwritten on every run of this cell.

llm_generation_result = {
    'new_topic_labels': new_topic_labels,
    'randomed_topic_reviews': randomed_topic_reviews
}

with open(eval_results_external_folder_path.joinpath('llm_generation_result.json'), 'w') as f:
    json.dump(llm_generation_result, f, indent=2)