CTM qualitative review on external doc

(i.e. comments scraped from steam)

In [1]:
import pandas as pd
import numpy as np


from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
# from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

import nltk

import os
from pathlib import Path
import json
from datetime import datetime
import pickle
import traceback

os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

import sys

sys.path.append('../')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload

In [28]:
%autoreload 2
from dataset_loader import GENRES, load_dataset
from eval_metrics import SEARCH_BEHAVIOUR

In [4]:
# the text to be evaluated

game_steamid = 730
game_name = 'counter-strike_2'

# game_steamid = 1091500
# game_name = 'cyberpunk2077'

datetime_until = datetime(2024, 1, 1, 0, 0, 0)      # only analyse reviews from this date until now (GMT+8)

# load the reviews from folder

reviews_reqs = []

# get existing folder and retrieve the cursor object (?)

# load the latest file
game_folder = Path(f'../../dataset/data_scraping/steam_comments_scraping/{game_name}').resolve()
if game_folder.exists():
    try:
        latest_file_path = sorted(game_folder.glob('steam_reviews_*.pkl'))[-1]
        with open(latest_file_path, 'rb') as f:
            reviews_reqs = pickle.load(f)           # retrieve the list of reviews
            print('Loaded:', latest_file_path)
    except IndexError as e:
        print('Error loading the latest file:', e)
        traceback.print_exc()

Loaded: /root/FYP/NLP/dev-workspace/dataset/data_scraping/steam_comments_scraping/counter-strike_2/steam_reviews_730_unique.pkl


In [6]:
# create a dataframe like in training/evaluation
reviews_df = pd.DataFrame(reviews_reqs)

reviews_df = reviews_df[['recommendationid', 'review', 'timestamp_created', 'voted_up', 'steam_purchase', 'received_for_free']]
# reviews_df = reviews_df[reviews_df['timestamp_created'] >= datetime_until.timestamp()]

# filter unique reviews
reviews_df = reviews_df.drop_duplicates(subset=['review', 'voted_up'])

# convert timestamp to datetime. The datetime converted is in utc+0
reviews_df['timestamp_created'] = pd.to_datetime(reviews_df['timestamp_created'], unit='s')

# convert the voted_up to 1 and -1
reviews_df['voted_up'] = reviews_df['voted_up'].apply(lambda x: 1 if x else -1)

reviews_df['review_original'] = reviews_df['review']
reviews_df['review_bow'] = reviews_df['review']

reviews_df

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free,review_original,review_bow
0,159325319,causes sleepless nights 10/10,2024-02-26 13:59:29,1,False,False,causes sleepless nights 10/10,causes sleepless nights 10/10
1,159325241,Nice game,2024-02-26 13:58:24,1,False,False,Nice game,Nice game
2,159324916,nice game\n,2024-02-26 13:51:51,1,False,False,nice game\n,nice game\n
3,159324780,game keep crashing too many cheaters subtick s...,2024-02-26 13:49:38,-1,False,False,game keep crashing too many cheaters subtick s...,game keep crashing too many cheaters subtick s...
4,159324627,best,2024-02-26 13:46:45,1,True,False,best,best
...,...,...,...,...,...,...,...,...
33969,154907923,Cheat Shooter 2,2023-12-31 16:29:26,-1,True,False,Cheat Shooter 2,Cheat Shooter 2
33970,154907896,fuck cs2. bring back csgo,2023-12-31 16:29:07,-1,False,False,fuck cs2. bring back csgo,fuck cs2. bring back csgo
33971,154907281,Good game with good skins,2023-12-31 16:20:31,1,False,False,Good game with good skins,Good game with good skins
33972,154906860,i like css better,2023-12-31 16:14:38,-1,True,False,i like css better,i like css better


In [7]:
import sys
sys.path.append('../../sa/')

%autoreload 2
import str_cleaning_functions

# copied from lda_demo_gridsearch.ipynb
def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_non_letters(x))
    df[review] = df[review].apply(lambda x: x.lower())
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_stopword(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

# copied from bert_demo_gridsearch.ipynb
def cleaning_little(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

In [9]:
cleaning(reviews_df, 'review_bow')
cleaning_little(reviews_df, 'review')

In [11]:
def _filter_zero_len(x):
    if len(x['review']) == 0 or len(x['review_bow']) == 0:
        return False
    return True

reviews_df = reviews_df[reviews_df.apply(lambda x: _filter_zero_len(x), axis=1)]

In [12]:
print(len(reviews_df))

18344


In [13]:
# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

In [14]:
from datasets import Dataset

# X_preprocessed = list(map(lambda x: lemmatization(x), X_preprocessed))
# X_preprocessed = list(map(lambda x: ' '.join(x), X_preprocessed))

def lemmatization_dataset(data):
    return {'review_text2': ' '.join(lemmatization(data['review_text']))}

temp_dataset = Dataset.from_dict({'review_text': reviews_df['review_bow'].values})
temp_dataset = temp_dataset.map(lemmatization_dataset, num_proc=4)      # speed up lemmatization
reviews_df['review_bow'] = temp_dataset['review_text2']

Map (num_proc=4): 100%|██████████| 18344/18344 [00:01<00:00, 10864.72 examples/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df['review_bow'] = temp_dataset['review_text2']


In [22]:
from copy import deepcopy

X_contextual = reviews_df['review'].values
X_bow = reviews_df['review_bow'].values
X = deepcopy(X_contextual)

Create split text for models trained with split tokens

In [23]:
import torch
import platform
if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series mac machine

print(device)

cuda


In [24]:
# copy from ctm_training.ipynb

# from datasets import Dataset
from tqdm.autonotebook import trange

def split_X_contextual_X_bow(X_contextual, X_bow, X, sbert, split:bool=False):
    if not split:
        return X_contextual, X_bow, X, list(range(len(X_contextual)))
    else:
        X_contextual_new, X_bow_new, X_new = [], [], []
        tokenizer = sbert[0].tokenizer

        # for eval
        _original_iloc = []

        batch_size = 64
        for start_index in trange(0, len(X_contextual), batch_size, desc="Batches", disable=False):
            sentence_batch = X_contextual[start_index:start_index+batch_size]
            features = tokenizer(sentence_batch, return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)

            # split overlapping
            features_split = split_tokens_into_smaller_chunks(features, sbert.max_seq_length-2,  sbert.max_seq_length-2, 1)

            for i, input_id_list in enumerate(features_split['input_ids']):
                for input_id in input_id_list:
                    X_contextual_new.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_id)))
                    X_bow_new.append(X_bow[start_index+i])
                    X_new.append(X[start_index+i])

                # for eval
                _original_iloc.extend([start_index+i]*len(input_id_list))


        assert len(X_contextual_new) == len(X_bow_new), "X_contextual_new and X_bow_new should have the same length. Found: {} and {}".format(len(X_contextual_new), len(X_bow_new))
        assert len(X_contextual_new) == len(X_new), "X_contextual_new and X_new should have the same length. Found: {} and {}".format(len(X_contextual_new), len(X_new))
        assert len(X_contextual_new) == len(_original_iloc), "X_contextual_new and _original_iloc should have the same length. Found: {} and {}".format(len(X_contextual_new), len(_original_iloc))
        return X_contextual_new, X_bow_new, X_new, _original_iloc
   
# ####################
# # helper functions
# ####################
    
# tokens spliting helper functions

def split_tokens_into_smaller_chunks(
    data,
    chunk_size: int,
    stride: int,
    minimal_chunk_length: int,
) -> dict:
    """Splits tokens into overlapping chunks with given size and stride."""

    _new_input_id_chunks = []
    _new_token_type_ids = []
    _new_mask_chunks = []

    for input_id, token_type_id, mask_chunk in zip(data['input_ids'], data['token_type_ids'], data['attention_mask']):
        _input_id_chunk = split_overlapping(input_id, chunk_size, stride, minimal_chunk_length)
        _token_type_id = split_overlapping(token_type_id, chunk_size, stride, minimal_chunk_length)
        _mask_chunk = split_overlapping(mask_chunk, chunk_size, stride, minimal_chunk_length)

        _new_input_id_chunks.append(_input_id_chunk)
        _new_token_type_ids.append(_token_type_id)
        _new_mask_chunks.append(_mask_chunk)    

    return {'input_ids':_new_input_id_chunks, 'token_type_ids':_new_token_type_ids, 'attention_mask': _new_mask_chunks}

def split_overlapping(tensor:list[int], chunk_size: int, stride: int, minimal_chunk_length: int) -> list[list[int]]:
    """Helper function for dividing 1-dimensional tensors into overlapping chunks."""
    # check_split_parameters_consistency(chunk_size, stride, minimal_chunk_length)
    result = [tensor[i : i + chunk_size] for i in range(0, len(tensor), stride)]
    if len(result) > 1:
        # ignore chunks with less than minimal_length number of tokens
        result = [x for x in result if len(x) >= minimal_chunk_length]
    return result


def tokenize_dataset(data, tokenizer):
    # return sbert_model[0].tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)
    return {'tokenized': tokenizer(data['text'], return_attention_mask=True, return_token_type_ids=True, add_special_tokens=False, return_tensors=None, truncation=False)}


In [25]:
# ATTENTION !!!!!
# define the sbert model (SHOULD BE THE SAME AS TRAINING)
# also define whether we want to split the tokens or not

split_sentences = False
sbert_model_name = 'all-MiniLM-L6-v2'

# load the sbert model
from sentence_transformers import SentenceTransformer
sbert = SentenceTransformer(sbert_model_name, device=device)

In [26]:
X = list(reviews_df['review'].values)
X_preprocessed = list(reviews_df['review_bow'].values)

X_contextual, X_bow, X, _original_iloc = split_X_contextual_X_bow(
    X, X_preprocessed, X, 
    sbert, 
    split=split_sentences)

---

Load the trained model

In [27]:
%autoreload 2
sys.path.append('../ctm_dev')
from ctm_dataset_creation import create_ctm_dataset
from ctm_utils import _load_ctm_model

In [49]:
# load the model from disk to compare the results

genre = GENRES.INDIE
search_behaviour = SEARCH_BEHAVIOUR.GRID_SEARCH
split_sentences = False
sbert_model_name = 'all-MiniLM-L6-v2'


# training_datetime = datetime(2024, 2, 14, 22, 4, 32)
training_datetime = datetime(2024, 2, 16, 11, 59, 10)
training_folder_p = Path(f'../ctm_dev/category_{str(genre)}_unique_review_text').resolve()
training_folder = Path(f'ctm{"[split]" if split_sentences else ""}_genre_{str(genre)}_{search_behaviour.value}_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
training_folder = training_folder_p.joinpath(training_folder)

training_result_json_path = training_folder.joinpath('result.json')
with open(training_result_json_path, 'r') as f:
    training_result = json.load(f)


# load the embeddings (training)
# model_name_or_path = training_result['best_hyperparameters']['sbert_params']['model_name_or_path']
# embeddings_path = training_folder.joinpath(f'embeddings_{model_name_or_path}.pkl')
# with open(embeddings_path, 'rb') as f:
#     embeddings = np.load(f)

best_model_path = training_folder_p.parent.joinpath(training_result['best_model_checkpoint'])
ctm_hyperparameters = training_result['best_hyperparameters']['ctm_params']
sbert_params = training_result['best_hyperparameters']['sbert_params']

# ctm_hyperparameters['bow_size'] = 2000
# ctm_hyperparameters['contextual_size'] = 768

# create the dataset on the fly
with open(Path(best_model_path).joinpath('count_vectorizer.pkl'), 'rb') as f:
    vectorizer = pickle.load(f)

training_dataset, _, _, _, _ = create_ctm_dataset(
    X_contextual, X_bow, X,
    sbert_params, training_folder,
    vectorizer=vectorizer,
    X_contextual_embedding_path = Path(best_model_path).parent.parent.joinpath('preprocessed_data').joinpath(
        f'{genre.value:02}_{str(genre)}_dataset_eval{"_[split]" if split_sentences else ""}_X_contextual_embeddings.npy')       # attempt to get existing embeddings for evaluation
    )

Found existing sbert embeddings at /root/FYP/NLP/dev-workspace/topic_modelling/ctm_dev/category_indie_unique_review_text/preprocessed_data/01_indie_dataset_eval_X_contextual_embeddings.npy. Reusing them.




In [50]:
best_model_path = best_model_path.parent.joinpath(
    'ctm_ctm_n_components_20'
)

In [51]:
ctm_model = _load_ctm_model(best_model_path, ctm_hyperparameters)



---

Visualization

In [55]:
import pyLDAvis as vis

vocab = vectorizer.get_feature_names_out()

lda_vis_data = ctm_model.get_ldavis_data_format(vocab, training_dataset, n_samples=10)

ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

  0%|          | 0/219 [00:00<?, ?it/s]

100%|██████████| 219/219 [00:00<00:00, 258.50it/s]


Evaluation

Cannot... as the topic keywords will not be updated when u fed a out-of-context text

In [76]:
topic_list = ctm_model.get_topic_lists(10)

topic_list

[['hate', 'suck', 'kid', 'god', 'pass', 'damn', 'ok', 'wan', 'hat', 'gon'],
 ['wow',
  'weird',
  'boring',
  'funny',
  'neutral',
  'confuse',
  'mainly',
  'appearance',
  'cough',
  'conclusion'],
 ['terrarium',
  'minecraft',
  'biome',
  'npc',
  'item',
  'boss',
  'craft',
  'armor',
  'world',
  'hardmode'],
 ['night',
  'play',
  'freddy',
  'scary',
  'friend',
  'know',
  'fnaf',
  'think',
  'say',
  'na'],
 ['puzzle',
  'horror',
  'look',
  'solve',
  'world',
  'gameplay',
  'like',
  'feel',
  'music',
  'great'],
 ['novel',
  'character',
  'depression',
  'ending',
  'date',
  'girl',
  'vn',
  'visual',
  'end',
  'like'],
 ['eat',
  'cat',
  'life',
  'goat',
  'fly',
  'kill',
  'simulator',
  'blood',
  'man',
  'human'],
 ['pretty',
  'good',
  'nice',
  'short',
  'graphic',
  'cool',
  'really',
  'bit',
  'music',
  'kinda'],
 ['relax',
  'simple',
  'puzzle',
  'easy',
  'challenge',
  'achievement',
  'solve',
  'difficult',
  'little',
  'complete'],
 ['in

In [77]:
# get top N keywords for each topic

topic_list = ctm_model.get_topic_lists(10)

topic_list

top_N_words = 10

topic_keywords = {}
topic_keywords_pyldavis = {}

for topic_id, _keywords in enumerate(ctm_model.get_topic_lists(top_N_words)):
    topic_keywords[topic_id] = _keywords
    topic_keywords_pyldavis[topic_id] = list(ctm_pd.sorted_terms(topic=topic_id+1, _lambda=0.6)['Term'].values[:top_N_words])

    print(f'Topic {topic_id}:')
    print(', '.join(_keywords))
    print(', '.join(topic_keywords_pyldavis[topic_id]))
    print()

Topic 0:
hate, suck, kid, god, pass, damn, ok, wan, hat, gon
fix, window, crash, bug, file, port, menu, lag, unplayable, forum

Topic 1:
wow, weird, boring, funny, neutral, confuse, mainly, appearance, cough, conclusion
truck, mod, ets, workshop, car, server, community, drive, map, garry

Topic 2:
terrarium, minecraft, biome, npc, item, boss, craft, armor, world, hardmode
hate, suck, kid, pass, god, damn, ok, hat, wan, thanks

Topic 3:
night, play, freddy, scary, friend, know, fnaf, think, say, na
night, play, freddy, scary, fnaf, na, know, scar, animatronics, friend

Topic 4:
puzzle, horror, look, solve, world, gameplay, like, feel, music, great
eat, cat, life, goat, fly, blood, body, human, man, water

Topic 5:
novel, character, depression, ending, date, girl, vn, visual, end, like
play, card, player, mode, level, deck, campaign, match, hour, class

Topic 6:
eat, cat, life, goat, fly, kill, simulator, blood, man, human
tl, dr, conclusion, mainly, pros, factor, credit, suit, differenc

In [60]:
# note that due to its stochastic (sampling) nature, the doc-topic distribution may not be the same each time.
# although a n_samples param is applied
doc_topic_distribution = ctm_model.get_doc_topic_distribution(training_dataset, n_samples=20)

# sample call
top_docs = ctm_model.get_top_documents_per_topic_id(X, doc_topic_distribution, 8, k=10)

100%|██████████| 219/219 [00:00<00:00, 343.81it/s]


In [61]:
repr_docs = {}
repr_docs_ids = {}

# to get the index of the top documents
def _get_top_docs_index_per_topic(X, doc_topic_distributions, topic_id, k=10):
    probability_list = doc_topic_distributions.T[topic_id]
    ind = probability_list.argsort()[-k:][::-1]
    return ind

for topic_id in range(ctm_model.n_components):
    repr_docs[topic_id] = ctm_model.get_top_documents_per_topic_id(X, doc_topic_distribution, topic_id, k=10)
    repr_docs_ids[topic_id] = list(_get_top_docs_index_per_topic(X, doc_topic_distribution, topic_id, 10))

In [62]:
repr_docs

{0: [('CHEATERS!, CHEATERS! EVERY GAME :) VALVE LOVE MONEY FROM BOUGHT ACC',
   1.0),
  ('good enough', 0.9364315),
  ('A shooter that can be described as the "dark souls" of competitive strategy takes centre stage in this action packed skill based shooter. This experience will be like no shooter you\'ve ever played; the learning curve is fresh, exciting ad Highly recommend for people who are tactical and love low TTK. This game is for you.',
   0.80000824),
  ('my hours tell it all', 0.7426502),
  ('The game has improved a lot, but I wish they would ban cheaters! RIP',
   0.73692316),
  ("My friend who I play CS 2 with said if I post a review on community and it gets 100 likes and 10 Awards that he will buy me a RTX 3060 and 2 Kilos of canned ham. So I'm just gonna leave this here. Help a brotha!",
   0.7152298),
  ('hi to game. Is good.', 0.7009601),
  ('CS2 is a good game play it also its hard', 0.6996366),
  ('very good game i love it and its free and i think that is perfect',
   0

In [63]:
# create a dataframe with only these repr docs, their topic id, and the probability
df_original_texts = []

for topic_id, _repr_docs_ids in repr_docs_ids.items():
    t = reviews_df.iloc[_repr_docs_ids]
    t['topic_id'] = topic_id

    df_original_texts.append(t)

df_original_texts = pd.concat(df_original_texts)
df_original_texts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['topic_id'] = topic_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['topic_id'] = topic_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['topic_id'] = topic_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free,review_original,review_bow,topic_id
1549,159046956,"CHEATERS!, CHEATERS! EVERY GAME :) VALVE LOVE ...",2024-02-23 10:56:33,-1,False,False,"CHEATERS!, CHEATERS! EVERY GAME :) VALVE LOVE ...",cheater cheater every game valve love money bu...,0
17035,157284581,good enough,2024-01-31 21:12:10,1,True,False,good enough,good enough,0
7413,158364201,"A shooter that can be described as the ""dark s...",2024-02-15 10:00:34,1,True,False,"A shooter that can be described as the ""dark s...",shooter describe dark soul competitive strateg...,0
1460,159072132,my hours tell it all,2024-02-23 16:50:02,1,True,False,my hours tell it all,hour tell,0
16891,157314749,"The game has improved a lot, but I wish they w...",2024-02-01 08:08:02,1,False,False,"The game has improved a lot, but I wish they w...",game improve lot wish would ban cheater rip,0
...,...,...,...,...,...,...,...,...,...
5720,158623434,a brief and fun introduction to ecosystem rest...,2024-02-18 13:05:47,1,True,False,a brief and fun introduction to ecosystem rest...,brief fun introduction ecosystem restoration m...,19
1885,158997012,salam,2024-02-22 19:26:55,1,False,False,salam,salam,19
26899,156144443,Another piece of sh111t from gaben that can't ...,2024-01-17 02:48:36,-1,True,False,Another piece of sh111t from gaben that can't ...,another piece sh gaben even moderate cheater,19
26119,156158978,ewr4t er,2024-01-17 09:45:27,1,True,False,ewr4t er,ewr er,19


In [65]:
# check which repr docs do not begin from beginning of the original review_text

def check_beginning_of_review_text(row):
    t = row['review_text_split'].split()[0].lower()
    return t != row['review_text'][:len(t)].lower()

if split_sentences:
    t = df_original_texts[
        df_original_texts.apply(
            lambda x: check_beginning_of_review_text(x),
            axis=1
        )]

    print(len(t))

In [75]:
# print out the original text and the split (if any) for reference

for topic_id in repr_docs_ids.keys():
    print(f'Topic {topic_id}:')

    for index, row in df_original_texts[df_original_texts['topic_id'] == topic_id].iterrows():
        print(f'Doc {index}')
        print(f'Original: {row["review"]}')
        if split_sentences:
            print(f'Split: {row["review_split"]}')
        print()

    print(); print()

Topic 0:
Doc 1549
Original: CHEATERS!, CHEATERS! EVERY GAME :) VALVE LOVE MONEY FROM BOUGHT ACC

Doc 17035
Original: good enough

Doc 7413
Original: A shooter that can be described as the "dark souls" of competitive strategy takes centre stage in this action packed skill based shooter. This experience will be like no shooter you've ever played; the learning curve is fresh, exciting ad Highly recommend for people who are tactical and love low TTK. This game is for you.

Doc 1460
Original: my hours tell it all

Doc 16891
Original: The game has improved a lot, but I wish they would ban cheaters! RIP

Doc 14760
Original: My friend who I play CS 2 with said if I post a review on community and it gets 100 likes and 10 Awards that he will buy me a RTX 3060 and 2 Kilos of canned ham. So I'm just gonna leave this here. Help a brotha!

Doc 2383
Original: hi to game. Is good.

Doc 6188
Original: CS2 is a good game play it also its hard

Doc 13074
Original: very good game i love it and its free an

---

LLM topic naming, with external text

In [69]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [70]:
# can try diff llama2: https://ollama.com/library/yarn-llama2

llm = Ollama(model="llama2")        # assuming the port is 11434

Since there is no way to update the topic keywords with the new reviews, we generate the keywords using LLM, then explicitly told the LLM to output the label using the keywords and the representative docs

In [94]:
# prompt engineering
system_message = "You are a player of the game who is reading the reviews about the game."

# human_template = \
# '''Create a name for a topic given the topic's keywords and some most representative reviews of the topic. Output a label for the topic in less than 5 words. Do not output other text. 

# The top keywords of the topic is: \'\'\'{topic_keywords}\'\'\'. 

# The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'. '''

human_template = \
'''Create 10 keywords given some most representative reviews of the topic. Output the keywords in a numberred list. Then output a label for the topic in less than 5 words by considering the keywords and the representative docs. Do not output other text.

The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'.'''

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_template)
])

chain = chat_prompt | llm

In [95]:
import random
import time

N_times = 5

topic_ids = list(repr_docs.keys())

new_topic_labels = {}

for topic_id in topic_ids:
    # _topic_keywords = topic_keywords[topic_id]
    _topic_keywords = topic_keywords_pyldavis[topic_id]

    time.sleep(1)

    for i in range(N_times):
        if split_sentences:
            _reviews = list(df_original_texts[df_original_texts['topic_id'] == topic_id]['review_split'].values)
        else:
            _reviews = list(df_original_texts[df_original_texts['topic_id'] == topic_id]['review_original'].values)


        while True:
            topic_reviews = random.sample(_reviews, 2)

            check_bool = [len(topic_reviews[i]) < 5000 for i in range(len(topic_reviews))]
            if all(check_bool):
                break

        result = chain.invoke({
            'topic_keywords':_topic_keywords,
            'topic_reviews':topic_reviews
        })

        print(f'{topic_id:02}_call{i}: {result}')

        if topic_id not in new_topic_labels:
            new_topic_labels[topic_id] = [result]
        else:
            new_topic_labels[topic_id].append(result)

    print()

00_call0: 
Keywords:

1. Cool
2. Sometimes
3. Good
4. Fun
5. Gameplay
6. Enjoyable
7. Entertainment
8. Engaging
9. Interactive
10. Exciting

Label: Casual Gaming
00_call1: 
Keywords:

1. CS 2
2. Review
3. Community
4. Likes
5. Awards
6. RTX 3060
7. Canned ham
8. Friend
9. Buy
10. Help

Label: Gaming
00_call2: 
Keywords:

1. Cool
2. Sometimes
3. Good
4. Hard
5. Play
6. Game
7. Fun
8. Enjoyable
9. Challenging
10. Engaging

Label: "Cool game with challenging gameplay."
00_call3: 
Keywords:

1. CS 2
2. Review
3. Community
4. Likes
5. Awards
6. RTX 3060
7. Canned ham
8. Brother
9. Good enough
10. Friend

Label: Gaming
00_call4: 
Keywords:

1. Hours
2. Good
3. Gameplay
4. Fun
5. Enjoyable
6. Playability
7. Addictive
8. Satisfying
9. Well-designed
10. Entertaining

Label: Fun game with engaging gameplay

01_call0: 
Keywords:

1. After update
2. Runs poorly
3. Valve issues
4. New VAC
5. Performance decrease
6. Bugs and glitches
7. Unstable gameplay
8. Poor optimization
9. Framerate drops
10. C