Part 2 of the LLM per game TLDR generation project


Passing the review to both sentiment analysis and topic modeling model

In [1]:
from datetime import datetime
import pytz
import os
import time
import requests
import json
import sys

import pickle
from pathlib import Path
import traceback

In [2]:
# game_steamid = 1716740              # starfield
# game_name = 'starfield'             # also the folder name where the reviews are stored

# game_steamid = 1118010
# game_name = 'monster_hunter_world_iceborne'

game_steamid = 582010
game_name = 'monster_hunter_world'


In [3]:
# load the reviews from folder

reviews_reqs = []

# get existing folder and retrieve the cursor object (?)

# load the latest file
game_folder = Path(f'../../dataset/data_scraping/steam_comments_scraping/{game_name}')
if game_folder.exists():
    try:
        latest_file_path = game_folder.joinpath(f'steam_reviews_{game_steamid}_unique_with_gendata.pkl')
        with open(latest_file_path, 'rb') as f:
            reviews_reqs = pickle.load(f)           # retrieve the list of reviews
            print('Loaded:', latest_file_path)
    except IndexError as e:
        print('Error loading the latest file:', e)
        traceback.print_exc()

Loaded: ../../dataset/data_scraping/steam_comments_scraping/monster_hunter_world/steam_reviews_582010_unique_with_gendata.pkl


In [4]:
reviews_reqs

[{'recommendationid': '159297882',
  'author': {'steamid': '76561198957496334',
   'num_games_owned': 0,
   'num_reviews': 2,
   'playtime_forever': 636,
   'playtime_last_two_weeks': 636,
   'playtime_at_review': 606,
   'last_played': 1708919153},
  'language': 'english',
  'review': 'One of my favorite games over 2-300 hours over platforms still go back for more',
  'timestamp_created': 1708916570,
  'timestamp_updated': 1708916570,
  'voted_up': True,
  'votes_up': 0,
  'votes_funny': 0,
  'weighted_vote_score': 0,
  'comment_count': 0,
  'steam_purchase': True,
  'received_for_free': False,
  'written_during_early_access': False,
  'hidden_in_steam_china': True,
  'steam_china_location': '',
  'author_generated': {'name': 'dunnrobert1697',
   'email': 'dunnrobert1697@protonmail.com',
   'password': 'dunnrobert1697',
   'birthdate': '1988-10-14',
   'gender': 'MALE'}},
 {'recommendationid': '159297800',
  'author': {'steamid': '76561199130278616',
   'num_games_owned': 0,
   'num_r

---

Load both sentiment analysis and topic modeling model

SA: load the BERT 480K-bal model

TM: load the by genre model, and the topic-id-to-name json file

In [5]:
import platform
import torch

if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    device = torch.device('mps')        # m-series machine

print(device)

mps


In [6]:
sa_model_path = Path('../../sa/bert_2023-12-13/bert-finetune_480k_bal_2024-01-18')
sa_model_path = sa_model_path.joinpath(
    sa_model_path.name + '_model'
)

if not sa_model_path.exists():
    print('SA model not found:', sa_model_path)
    exit()

# load the model (use gpu acceleration)
from transformers import AutoTokenizer, AutoModelForSequenceClassification
sa_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', device=device)
sa_model = AutoModelForSequenceClassification.from_pretrained(sa_model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# load the BERTopic model

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

sys.path.append('../../topic_modelling/bertopic_dev')
from bertopic_utils import _load_bertopic_model

sbert_model_name = 'all-MiniLM-L6-v2'
sbert = SentenceTransformer(sbert_model_name, device=device)

tm_model_path = Path('../../topic_modelling/bertopic_dev/category_action_unique_review_text/bertopic[split]_genre_action_grid_search_20240301_095149/bertopic_bt_nr_topics_10')
# TODO: finalize the model path

topic_model = _load_bertopic_model(tm_model_path)

# load the topic-id-to-name json mapping
topic_id_to_label_path = Path(f'../../topic_modelling/eval_results_external/{game_name}').joinpath(
    *tm_model_path.parts[-3:], 'topic_id_to_label.json')

if not topic_id_to_label_path.exists():
    print('Topic id to label mapping not found:', topic_id_to_label_path)
    exit()

with open(topic_id_to_label_path, 'r') as f:
    topic_id_to_label = json.load(f)

In [7]:
from datasets import Dataset

sys.path.append('../../sa')
import str_cleaning_functions

def cleaning(s:str):
    s = str_cleaning_functions.remove_links(s)
    s = str_cleaning_functions.remove_links2(s)
    s = str_cleaning_functions.clean(s)
    s = str_cleaning_functions.deEmojify(s)
    s = str_cleaning_functions.unify_whitespaces(s)

    return s

# use huggingface datasets package to accelerate the cleaning process
def cleaning_datasets(data):
    return {'review_text': cleaning(data['review_text'])}

In [8]:
# since both BERTopic and SA BERT model shares the same preprocessing procedure
# use the same cleaning function for both

# load the reviews into a dataset object
reviews_dict = {
    'recommendationid' : [r['recommendationid'] for r in reviews_reqs],     # need the recommendation id for identifying the original data
    'review_text': [r['review'] for r in reviews_reqs]
}

# create a dataset object from the list
reviews_dataset = Dataset.from_dict(reviews_dict)

In [9]:
# common preprocessing
reviews_dataset_p = reviews_dataset.map(cleaning_datasets, num_proc=4)
reviews_dataset_p = reviews_dataset_p.filter(lambda x: len(x['review_text']) > 0)

Map (num_proc=4):   0%|          | 0/143026 [00:00<?, ? examples/s]

Map (num_proc=4): 100%|██████████| 143026/143026 [00:07<00:00, 18574.84 examples/s]
Filter: 100%|██████████| 143026/143026 [00:00<00:00, 422919.89 examples/s]


In [10]:
# SA specific
def tokenize_dataset(data):
    return sa_tokenizer(data['review_text'], max_length=sa_tokenizer.model_max_length, padding='max_length', truncation=True, return_tensors='pt')

reviews_dataset_p_sa = reviews_dataset_p.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 142613/142613 [00:44<00:00, 3239.64 examples/s]


In [15]:
# BERTopic specific

reviews_dataset_p_bertopic = Dataset.from_dict({
    'recommendationid': reviews_dataset_p['recommendationid'],
    'review_text': reviews_dataset_p['review_text'],
    'embeddings': sbert.encode(reviews_dataset_p['review_text'], show_progress_bar=True, batch_size=64)
})

Batches:   9%|▉         | 202/2229 [00:37<06:18,  5.35it/s]


KeyboardInterrupt: 

---

Inference

In [11]:
sa_output = sa_model(reviews_dataset_p_sa)

sa_output

TypeError: '>' not supported between instances of 'list' and 'slice'

In [12]:
# SA with BERT

from transformers import Trainer, TrainingArguments

BATCH_SIZE = 32

trainer_args = TrainingArguments(output_dir='eval',
    per_device_eval_batch_size=BATCH_SIZE)

trainer = Trainer(
    model=sa_model,
    args=trainer_args,
    # train_dataset=ds_test,
    # eval_dataset=ds_test,
)

sa_output = trainer.predict(reviews_dataset_p_sa)


  0%|          | 20/4457 [00:11<45:50,  1.61it/s]

KeyboardInterrupt: 

In [None]:
sa_outputs

transformers.models.bert.modeling_bert.BertForSequenceClassification

In [None]:
# bertopic

topics, probs = topic_model.transform(
    reviews_dataset_p_bertopic['review_text'],
    embeddings=reviews_dataset_p_bertopic['embeddings']
)

---

formulate the data and put it back to the reviews