Testing topic models with external documents

(i.e. comments from later games)

LDA

In [26]:
import pandas as pd
import numpy as np

from pathlib import Path
import json
import pickle
from datetime import datetime
import traceback

import gensim
import nltk

import sys
sys.path.append('../')

In [2]:
%load_ext autoreload

In [12]:
# the text to be evaluated

# game_steamid = 730
# game_name = 'counter-strike_2'

game_steamid = 1091500
game_name = 'cyberpunk2077'

datetime_until = datetime(2024, 1, 1, 0, 0, 0)      # only analyse reviews from this date until now (GMT+8)

# load the reviews from folder

reviews_reqs = []

# get existing folder and retrieve the cursor object (?)

# load the latest file
game_folder = Path(f'../../dataset/data_scraping/steam_comments_scraping/{game_name}').resolve()
if game_folder.exists():
    try:
        latest_file_path = sorted(game_folder.glob('steam_reviews_*.pkl'))[-1]
        with open(latest_file_path, 'rb') as f:
            reviews_reqs = pickle.load(f)           # retrieve the list of reviews
            print('Loaded:', latest_file_path)
    except IndexError as e:
        print('Error loading the latest file:', e)
        traceback.print_exc()

Loaded: /Users/michaelcheng/Documents/MyDocs/HKU/COMP4801 FYP/FYP/NLP/dev-workspace/dataset/data_scraping/steam_comments_scraping/cyberpunk2077/steam_reviews_1091500_unique.pkl


In [13]:
# create a dataframe like in training/evaluation
reviews_df = pd.DataFrame(reviews_reqs)

reviews_df = reviews_df[['recommendationid', 'review', 'timestamp_created', 'voted_up', 'steam_purchase', 'received_for_free']]

# convert timestamp to datetime. The datetime converted is in utc+0
reviews_df['timestamp_created'] = pd.to_datetime(reviews_df['timestamp_created'], unit='s')

# convert the voted_up to 1 and -1
reviews_df['voted_up'] = reviews_df['voted_up'].apply(lambda x: 1 if x else -1)

reviews_df

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free
0,157989688,"awesome stuff right here dude, far out stuff bro",2024-02-10 10:59:33,1,True,False
1,157989192,Awesome game!,2024-02-10 10:50:16,1,True,False
2,157988968,Key bindings are the hardest boss in this game...,2024-02-10 10:46:32,-1,True,False
3,157987236,This game is now fantastic! Despite early issu...,2024-02-10 10:11:09,1,True,False
4,157987210,"This game has been changed completely, made a ...",2024-02-10 10:10:46,1,True,False
...,...,...,...,...,...,...
299929,81918924,Still waiting to be able to play.\n\nEdit: I'm...,2020-12-10 00:11:38,1,True,False
299930,81918918,"Bigot developer\n\nTo clarify, I got this game...",2020-12-10 00:11:29,-1,True,True
299931,81918912,Obligatory It’s breathtaking!\nA wonderful sci...,2020-12-10 00:11:10,1,False,False
299932,81918903,It's a city of dreams... and I'm a big dreamer.,2020-12-10 00:10:50,1,False,True


In [14]:
%autoreload 2
sys.path.append('../../sa')
import str_cleaning_functions

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def cleaning(df, review):
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_links2(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.clean(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.deEmojify(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_non_letters(x))
    df[review] = df[review].apply(lambda x: x.lower())
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.remove_stopword(x))
    df[review] = df[review].apply(lambda x: str_cleaning_functions.unify_whitespaces(x))

# do lemmatization, but not stemming (as part of speech is important in topic modelling)
# use nltk wordnet for lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemma = WordNetLemmatizer()

# from https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word

# from: https://www.cnblogs.com/jclian91/p/9898511.html
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None     # if none -> created as noun by wordnet
    
def lemmatization(text):
   # use nltk to get PoS tag
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # then we only need adj, adv, verb, noun
    # convert from nltk Penn Treebank tag to wordnet tag
    wn_tagged = list(map(lambda x: (x[0], get_wordnet_pos(x[1])), tagged))

    # lemmatize by the PoS
    lemmatized = list(map(lambda x: lemma.lemmatize(x[0], pos=x[1] if x[1] else wordnet.NOUN), wn_tagged))
    # lemma.lemmatize(wn_tagged[0], pos=wordnet.NOUN)

    return lemmatized

def lemmatization_dataset(data):
    return {'review_text2': lemmatization(data['review_text'])}

In [15]:
# apply data preprocessing
cleaning(reviews_df, 'review')

from datasets import Dataset

temp_dataset = Dataset.from_dict({'review_text': reviews_df['review']})
temp_dataset = temp_dataset.map(lemmatization_dataset, num_proc=4)
reviews_df['review_lemmatized'] = temp_dataset['review_text2']

# filter empty list of strings in X_lemmatized, as they are not useful for topic modelling
# X_lemmatized = list(filter(lambda x: len(x) > 0, X_lemmatized))
reviews_df = reviews_df[reviews_df['review_lemmatized'].apply(len) > 0]
X_lemmatized = reviews_df['review_lemmatized'].values

print(len(X_lemmatized))
print(X_lemmatized[0])

Map (num_proc=4): 100%|██████████| 299934/299934 [01:38<00:00, 3056.66 examples/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


292201
['awesome', 'stuff', 'right', 'dude', 'far', 'stuff', 'bro']


In [73]:
# load the LDA model
%autoreload 2
from dataset_loader import GENRES

genre = GENRES.INDIE
training_datetime = datetime(2024, 2, 17, 1, 18, 55)
N_topics = 20

lda_model_folder = Path(f'../lda_dev/category_{str(genre)}_unique_review_text')
lda_model_folder = lda_model_folder.joinpath(
    Path(f'lda_multicore_genre_{str(genre)}_grid_search_{training_datetime.strftime("%Y%m%d_%H%M%S")}')
)
lda_model_folder = lda_model_folder.joinpath(
    Path(f'lda_multicore_lda_num_topics_{N_topics}')
)

# load the id2word and the model
id2word = gensim.corpora.Dictionary.load(str(lda_model_folder.joinpath('lda_multicore.id2word')))
lda_model = gensim.models.LdaMulticore.load(str(lda_model_folder.joinpath('lda_multicore')))

In [74]:
# create corpus object from the lemmatized reviews and id2word
corpus = [id2word.doc2bow(text) for text in X_lemmatized]

---

Evaluation copied from lda_eval_vis.ipynb and lda_eval_vis.ipynb

Evaluation

In [75]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis



---

Qualitative evaluation

Top 10 keywords for each topics

depends on the lda model, or the pyldavis

In [76]:
list(vis.sorted_terms(topic=1, _lambda=0.6)['Term'].values[:10])

['puzzle',
 'feel',
 'end',
 'character',
 'design',
 'experience',
 'mechanic',
 'quite',
 'really',
 'little']

In [77]:
topic_keywords = {}
topic_keywords_pyldavis = {}
top_N_words = 10

for i, topic in lda_model.show_topics(num_topics=lda_model.num_topics, num_words=top_N_words, formatted=False):
    topic_keywords[i] = [word for word, _ in topic]
    topic_keywords_pyldavis[i] = list(vis.sorted_terms(topic=i+1, _lambda=0.6)['Term'].values[:top_N_words])
    
    print(f'Topic {i}:')
    print(', '.join([word for word, _ in topic]))
    print(', '.join([word for word in topic_keywords_pyldavis[i]]))

    print()

Topic 0:
worth, buy, money, sale, pay, definitely, dollar, cheap, wait, sell
puzzle, feel, end, character, design, experience, mechanic, quite, really, little

Topic 1:
shoot, gun, fps, like, sam, fly, creepy, physic, wall, weapon
click, screen, control, work, crash, button, review, bug, bad, problem

Topic 2:
recommend, highly, amazing, mod, garry, value, replay, great, like, absolutely
like, say, think, know, thing, really, bad, good, feel, play

Topic 3:
die, life, scary, room, man, night, girl, walk, scare, real
defense, weapon, ship, level, upgrade, attack, different, dungeon, mission, combat

Topic 4:
level, weapon, defense, different, character, ship, like, combat, upgrade, dungeon
player, multiplayer, server, free, community, online, update, download, dlc, map

Topic 5:
kill, dinosaur, simulator, fight, loot, bos, death, best, epic, kick
adventure, art, style, beautiful, old, great, classic, fan, school, fantastic

Topic 6:
great, adventure, style, art, old, character, gameplay

---

Get most representative docs

In [78]:
# setup: get the model's topics in their native ordering...
all_topics = lda_model.print_topics(num_topics=-1)
# ...then create a empty list per topic to collect the docs:
docs_per_topic = {topic_id: [] for (topic_id, _) in all_topics}

# now, for every doc...
for doc_id, doc_bow in enumerate(corpus):
    # ...get its topics...
    doc_topics = lda_model.get_document_topics(doc_bow)
    # ...& for each of its topics...
    for topic_id, score in doc_topics:
        # ...add the doc_id & its score to the topic's doc list
        docs_per_topic[topic_id].append((doc_id, score))

In [79]:
for doc_list in docs_per_topic.values():
    doc_list.sort(key=lambda id_and_score: id_and_score[1], reverse=True)

In [80]:
top_N_docs = 10

for i in range(len(docs_per_topic)):
    print(docs_per_topic[i][:top_N_docs])

[(257277, 0.99263155), (42471, 0.9547618), (25815, 0.9499999), (59344, 0.8642658), (101951, 0.8416663), (138691, 0.84166366), (78214, 0.8416635), (215442, 0.8416618), (250727, 0.8416597), (84231, 0.8416552)]
[(17590, 0.9136113), (282974, 0.84165925), (242966, 0.841655), (285088, 0.8416434), (200460, 0.8099985), (225905, 0.8099982), (23995, 0.8099968), (162809, 0.8099963), (256154, 0.80999196), (107771, 0.8099917)]
[(6323, 0.9472219), (28129, 0.9472219), (11398, 0.94411725), (234743, 0.8944438), (12535, 0.88122493), (18015, 0.8642854), (104730, 0.86428493), (275276, 0.86428493), (291006, 0.86428493), (277610, 0.8416658)]
[(27309, 0.9833327), (220509, 0.9703122), (15226, 0.9693543), (57811, 0.9406244), (117436, 0.89444345), (72151, 0.89444053), (278109, 0.8944143), (20580, 0.8812333), (109998, 0.86427873), (61618, 0.8642615)]
[(243512, 0.89443606), (193849, 0.8944341), (43808, 0.88124543), (169408, 0.8812452), (119459, 0.88120013), (15064, 0.8642838), (46372, 0.8642792), (80637, 0.864273

In [81]:
reviews_df

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free,review_lemmatized
0,157989688,awesome stuff right dude far stuff bro,2024-02-10 10:59:33,1,True,False,"[awesome, stuff, right, dude, far, stuff, bro]"
1,157989192,awesome game,2024-02-10 10:50:16,1,True,False,"[awesome, game]"
2,157988968,key bindings hardest boss game unacceptable,2024-02-10 10:46:32,-1,True,False,"[key, binding, hard, bos, game, unacceptable]"
3,157987236,game fantastic despite early issues cd projekt...,2024-02-10 10:11:09,1,True,False,"[game, fantastic, despite, early, issue, cd, p..."
4,157987210,game changed completely made absolute unpreced...,2024-02-10 10:10:46,1,True,False,"[game, change, completely, make, absolute, unp..."
...,...,...,...,...,...,...,...
299929,81918924,still waiting able waiting anymore great game,2020-12-10 00:11:38,1,True,False,"[still, wait, able, wait, anymore, great, game]"
299930,81918918,bigot developer clarify got game friend,2020-12-10 00:11:29,-1,True,True,"[bigot, developer, clarify, get, game, friend]"
299931,81918912,obligatory breathtaking wonderful sci fi cyber...,2020-12-10 00:11:10,1,False,False,"[obligatory, breathtaking, wonderful, sci, fi,..."
299932,81918903,city big dreamer,2020-12-10 00:10:50,1,False,True,"[city, big, dreamer]"


In [82]:
# use the ID to retrieve the top docs, and copy them to a file for inspection

# retrieve the original text
df_original_texts = []
for topic_id in docs_per_topic.keys():
    t = reviews_df.iloc[[doc_id for doc_id, _ in docs_per_topic[topic_id][:top_N_docs]]]
    t['topic_id'] = topic_id        # store the topic id

    df_original_texts.append(t)

df_original_texts = pd.concat(df_original_texts)
df_original_texts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['topic_id'] = topic_id        # store the topic id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['topic_id'] = topic_id        # store the topic id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['topic_id'] = topic_id        # store the topic id
A value is trying to be set on a copy of a sli

Unnamed: 0,recommendationid,review,timestamp_created,voted_up,steam_purchase,received_for_free,review_lemmatized,topic_id
264131,82120980,liars liars liars liars liars liars liars liar...,2020-12-11 23:36:39,-1,True,False,"[liar, liars, liar, liars, liar, liars, liar, ...",0
43862,141847314,buy game buy game buy game buy game buy game b...,2023-07-11 17:32:45,1,True,False,"[buy, game, buy, game, buy, game, buy, game, b...",0
26647,147822817,buy buy buy buy buy buy buy buy buy buy buy bu...,2023-10-07 19:32:07,1,True,False,"[buy, buy, buy, buy, buy, buy, buy, buy, buy, ...",0
61212,130219338,glad decided buy game definitely worth sale price,2023-01-05 20:58:38,1,True,False,"[glad, decide, buy, game, definitely, worth, s...",0
105137,109118052,way shape form game worth got sale though wort...,2022-01-30 16:51:38,1,True,False,"[way, shape, form, game, worth, get, sale, tho...",0
...,...,...,...,...,...,...,...,...
222616,82438196,bit glitchy really fun play,2020-12-15 16:14:45,1,True,False,"[bit, glitchy, really, fun, play]",19
239521,82291894,glitchy great story tons fun plenty,2020-12-13 19:05:53,1,True,False,"[glitchy, great, story, ton, fun, plenty]",19
104313,109988146,fun perfect really really fun,2022-02-11 15:22:25,1,True,False,"[fun, perfect, really, really, fun]",19
114734,104432401,great game story great really interesting firs...,2021-11-27 22:09:01,1,True,False,"[great, game, story, great, really, interest, ...",19


In [83]:
# print out the original texts as a log

for topic_id in docs_per_topic.keys():
    print(f'Topic {topic_id}:')
    print()
    t = reviews_df.iloc[[doc_id for doc_id, _ in docs_per_topic[topic_id][:top_N_docs]]]
    for index, row in t.iterrows():
        print(f'Doc {index}:')
        print(row['review'])
        print()
    print()

Topic 0:

Doc 264131:
liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars liars 

---

LLM topic naming

Script copied from lda_eval_quali.ipynb

In [84]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"          # disable huggingface warning

# device check
import platform
import torch
if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series machine

print(device)

mps


In [85]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [86]:
llm = Ollama(model="llama2")        # assuming the port is 11434

In [87]:
# prompt engineering
system_message = "You are a player of the game who is reading the reviews about the game."

human_template = \
'''Create a name for a topic given the topic's keywords and some most representative reviews of the topic. Output a label for the topic in less than 5 words. Do not output other text. 

The top keywords of the topic is: \'\'\'{topic_keywords}\'\'\'. 

The most representative reviews of the topic are: \'\'\'{topic_reviews}\'\'\'.'''

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_template)
])

chain = chat_prompt | llm

In [88]:
import random
import time

N_times = 5

topic_ids = list(docs_per_topic.keys())           # also generate the labels for the outlier topic, as its part of the topic_labels_ attribute

new_topic_labels = {}

for topic_id in topic_ids:
    # _topic_keywords = topic_keywords[topic_id]
    _topic_keywords = topic_keywords_pyldavis[topic_id]

    time.sleep(2)

    for i in range(N_times):
        while True:
            topic_reviews = random.sample(list(df_original_texts[df_original_texts['topic_id'] == topic_id]['review'].values), 2)            # only 2 reviews, sampled randomly        # TODO: called multiple times to select the best topic label

            # check the length of the topic reviews so that the llm won't be overloaded
            # 5000 character limits
            check_bool = [len(topic_reviews[i]) < 5000 for i in range(len(topic_reviews))]
            if all(check_bool):
                break


        result = chain.invoke(
            {
                "topic_keywords": _topic_keywords,
                "topic_reviews": topic_reviews
            }
        )

        print(f'{topic_id:02}_call{i}: {result}')

        if topic_id not in new_topic_labels:
            new_topic_labels[topic_id] = [result]
        else:
            new_topic_labels[topic_id].append(result)
    print('\n')

00_call0: "Puzzle game with too much repetition."
00_call1: "Puzzle game with good character design and mechanics, but some bugs."
00_call2: 
Topic Label: Disappointing Puzzle Experience with Little Character Design
00_call3: 
Topic Label: Worth Buying
00_call4: 
Topic Label: Disappointing Puzzle Experience with Too Much Repetition.


01_call0: Badly Made Game
01_call1: 
Topic Label: Buggy Game
01_call2: 
Topic Label: "Buggy Controls"
01_call3: 
Topic Label: "Buggy Controls"
01_call4: 
Flying Car Glitches


02_call0: 
Label: Positive Reviews
02_call1: 
Topic Label: Game Review
02_call2: 
Topic Label: "Game Reviews"
02_call3: 
Topic Label: Thoughts on the Game
02_call4: 
Topic Label: "Thoughts on Playing the Game"


03_call0: 
Topic Label: "Combat Action"
03_call1: 
Topic Label: "Ship Combat"
03_call2: 
Topic Label: "Battle Royale"
03_call3: 
Topic Label: Action-Packed Combat
03_call4: 
Topic Label: "Combat Grind"


04_call0: 
Topic Label: "Challenging Multiplayer Experience with Variet

---

Save the result (by external doc, then by the model)

In [90]:
lda_model_folder.relative_to(Path('../lda_dev'))

PosixPath('category_indie_unique_review_text/lda_multicore_genre_indie_grid_search_20240217_011855/lda_multicore_lda_num_topics_20')

In [93]:
result_folder = Path(f'{game_name}/lda').joinpath(lda_model_folder.relative_to(Path('../lda_dev')))

# save the new topic labels, top N words, and the top N reviews (original text) to a json file
if not result_folder.exists():
    result_folder.mkdir(parents=True)

with open(result_folder.joinpath('topic_labels.json'), 'w') as f:
    json.dump(new_topic_labels, f, indent=2)

with open(result_folder.joinpath('topic_keywords.json'), 'w') as f:
    json.dump(topic_keywords, f, indent=2)

with open(result_folder.joinpath('topic_keywords_pyldavis.json'), 'w') as f:
    json.dump(topic_keywords_pyldavis, f, indent=2)

df_original_texts.to_pickle(result_folder.joinpath('top_reviews.pkl'))