Part 2 of the LLM per-game TL;DR generation project


Passing the reviews to both the sentiment analysis model and the topic modeling model

In [1]:
from datetime import datetime
import pytz
import os
import time
import requests
import json
import sys

import pickle
from pathlib import Path
import traceback

In [2]:
# Target game selection: leave exactly one (game_steamid, game_name) pair uncommented.
# game_steamid is embedded in the review pickle filename; game_name is the dataset
# sub-folder that holds the reviews and is also used to locate the topic label mapping.
# game_steamid = 1716740              # starfield
# game_name = 'starfield'             # also the folder name where the reviews are stored

# game_steamid = 1118010
# game_name = 'monster_hunter_world_iceborne'

# game_steamid = 582010
# game_name = 'monster_hunter_world'

# game_steamid = 2138330          # cyberpunk2077 phantom liberty
# game_name = 'cyberpunk2077_phantom_liberty'

# game_steamid = 1091500          # cyberpunk2077
# game_name = 'cyberpunk2077'

# game_steamid = 730
# game_name = 'counter-strike_2'

game_steamid = 570
game_name = "dota2"


In [3]:
# Load the scraped reviews (a pickled list of review dicts) for the selected game.
# On any failure `reviews_reqs` stays an empty list and the reason is printed.

reviews_reqs = []

game_folder = Path(f'../../dataset/data_scraping/steam_comments_scraping/{game_name}')
latest_file_path = game_folder.joinpath(f'steam_reviews_{game_steamid}_unique_with_gendata.pkl')

if not game_folder.exists():
    # A missing folder used to be skipped silently, leaving an empty review
    # list for every downstream cell with no hint as to why.
    print('Game folder not found:', game_folder)
else:
    try:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files produced by this project's own scraping notebooks.
        with open(latest_file_path, 'rb') as f:
            reviews_reqs = pickle.load(f)           # retrieve the list of reviews
        print('Loaded:', latest_file_path)
    except (OSError, pickle.UnpicklingError, EOFError) as e:
        # OSError covers a missing/unreadable file; the other two cover a
        # corrupt or truncated pickle. (The previous `except IndexError`
        # could not catch any of these.)
        print('Error loading the latest file:', e)
        traceback.print_exc()

Loaded: ../../dataset/data_scraping/steam_comments_scraping/dota2/steam_reviews_570_unique_with_gendata.pkl


In [4]:
reviews_reqs[:10]

[{'recommendationid': '160077941',
  'author': {'steamid': '76561199383301245',
   'num_games_owned': 0,
   'num_reviews': 1,
   'playtime_forever': 60593,
   'playtime_last_two_weeks': 1795,
   'playtime_at_review': 60045,
   'last_played': 1710129709},
  'language': 'english',
  'review': 'My all time favorite game',
  'timestamp_created': 1709868509,
  'timestamp_updated': 1709868509,
  'voted_up': True,
  'votes_up': 0,
  'votes_funny': 0,
  'weighted_vote_score': 0,
  'comment_count': 0,
  'steam_purchase': False,
  'received_for_free': False,
  'written_during_early_access': False,
  'hidden_in_steam_china': True,
  'steam_china_location': '',
  'author_generated': {'name': 'wrightjeffrey1',
   'email': 'wrightjeffrey1@narod.ru',
   'password': 'wrightjeffrey1',
   'birthdate': '2002-03-25',
   'gender': 'MALE'}},
 {'recommendationid': '160077923',
  'author': {'steamid': '76561199434553630',
   'num_games_owned': 2,
   'num_reviews': 2,
   'playtime_forever': 725,
   'playtime_l

---

Load both the sentiment analysis model and the topic modeling model

SA: load the BERT 480K-bal model

TM: load the per-genre model and the topic-id-to-name JSON file

In [5]:
import platform
import torch

# Pick the torch device used by the SA model and the sentence encoder.
if platform.system() == 'Linux' or platform.system() == 'Windows':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
    # Any other platform (e.g. an m-series Mac): use MPS only when this torch
    # build actually exposes a usable MPS backend, otherwise fall back to CPU
    # instead of failing later on the first tensor/model placement.
    _mps_backend = getattr(torch.backends, 'mps', None)
    _mps_available = _mps_backend is not None and _mps_backend.is_available()
    device = torch.device('mps' if _mps_available else 'cpu')

print(device)

cuda


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The fine-tuned checkpoint lives in "<run_dir>/<run_dir_name>_model".
sa_model_path = Path('../../sa/bert_2023-12-13/bert-finetune_480k_bal_2024-01-18')
sa_model_path = sa_model_path.joinpath(
    sa_model_path.name + '_model'
)

if not sa_model_path.exists():
    # Raise instead of calling exit(): an exception reliably aborts the cell
    # (and a "Run All") so the load below never runs against a missing path.
    raise FileNotFoundError(f'SA model not found: {sa_model_path}')

# Tokenizer of the base checkpoint; the classifier weights come from the
# fine-tuned folder. `device` is not a tokenizer loading option, so it is no
# longer passed here -- device placement of the model is left to the Trainer.
sa_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
sa_model = AutoModelForSequenceClassification.from_pretrained(sa_model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# load the BERTopic model

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# make the project-local BERTopic helper module importable
sys.path.append('../../topic_modelling/bertopic_dev')
from bertopic_utils import _load_bertopic_model

# sentence encoder used to embed the reviews before topic assignment
sbert_model_name = 'all-MiniLM-L6-v2'
sbert = SentenceTransformer(sbert_model_name, device=device)

tm_model_path = Path('../../topic_modelling/bertopic_dev/category_action_unique_review_text/bertopic[split]_genre_action_grid_search_20240301_095149/bertopic_bt_nr_topics_10')
# TODO: finalize the model path

topic_model = _load_bertopic_model(tm_model_path)

# load the topic-id-to-name json mapping
# (stored per game, under the last three path components of the model path)
topic_id_to_label_path = Path(f'../../topic_modelling/eval_results_external/{game_name}').joinpath(
    *tm_model_path.parts[-3:], 'topic_id_to_label.json')

if not topic_id_to_label_path.exists():
    # Raise instead of calling exit(): an exception reliably aborts the cell
    # so the open() below never runs against a missing file.
    raise FileNotFoundError(f'Topic id to label mapping not found: {topic_id_to_label_path}')

# explicit encoding so the labels are decoded the same way on every platform
with open(topic_id_to_label_path, 'r', encoding='utf-8') as f:
    topic_id_to_label = json.load(f)

In [8]:
from datasets import Dataset

sys.path.append('../../sa')
import str_cleaning_functions

def cleaning(s:str):
    """Run a review string through the shared cleaning steps, in order.

    The steps come from the project's `str_cleaning_functions` module and are
    applied as: remove_links -> remove_links2 -> clean -> deEmojify ->
    unify_whitespaces. Returns the result of the final step.
    """
    steps = (
        str_cleaning_functions.remove_links,
        str_cleaning_functions.remove_links2,
        str_cleaning_functions.clean,
        str_cleaning_functions.deEmojify,
        str_cleaning_functions.unify_whitespaces,
    )

    for step in steps:
        s = step(s)

    return s

# use huggingface datasets package to accelerate the cleaning process
def cleaning_datasets(data):
    """Row-level mapper: return the row's cleaned 'review_text' as a one-key dict."""
    cleaned_text = cleaning(data['review_text'])
    return {'review_text': cleaned_text}

In [9]:
# BERTopic and the SA BERT model share the same preprocessing, so a single
# dataset (and a single cleaning pass) feeds both of them.

# Column-oriented view of the raw reviews. The recommendation id is carried
# along so results can be matched back to the original review records later.
reviews_dict = {
    'recommendationid': list(map(lambda review: review['recommendationid'], reviews_reqs)),
    'review_text': list(map(lambda review: review['review'], reviews_reqs)),
}

# wrap the columns in a huggingface Dataset
reviews_dataset = Dataset.from_dict(reviews_dict)

In [10]:
# common preprocessing: clean every review, then drop the ones whose text
# ends up empty after cleaning
reviews_dataset_p = (
    reviews_dataset
    .map(cleaning_datasets, num_proc=4)
    .filter(lambda row: len(row['review_text']) > 0)
)

Map (num_proc=4):   0%|          | 0/60619 [00:00<?, ? examples/s]

Map (num_proc=4): 100%|██████████| 60619/60619 [00:00<00:00, 96396.38 examples/s] 
Filter: 100%|██████████| 60619/60619 [00:00<00:00, 820178.50 examples/s]


In [11]:
# SA specific
def tokenize_dataset(data):
    """Tokenize data['review_text'] with the SA tokenizer.

    Every sequence is truncated and padded to the tokenizer's
    `model_max_length`; PyTorch tensors are requested from the tokenizer.
    """
    tokenizer_options = {
        'max_length': sa_tokenizer.model_max_length,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    return sa_tokenizer(data['review_text'], **tokenizer_options)

# Tokenize the cleaned reviews for the SA model. With batched=True the mapper
# is expected to receive batches of rows rather than single rows -- confirm
# against the `datasets` documentation if tokenize_dataset is changed.
reviews_dataset_p_sa = reviews_dataset_p.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 60431/60431 [00:08<00:00, 6890.56 examples/s]


In [12]:
# BERTopic specific

# Stand-alone dataset holding only the two columns BERTopic needs.
_bertopic_columns = ('recommendationid', 'review_text')
reviews_dataset_p_bertopic = Dataset.from_dict(
    {column: reviews_dataset_p[column] for column in _bertopic_columns}
)

# Sentence embeddings of the cleaned reviews, computed once up front so they
# can be handed to the topic model at inference time.
sbert_embeddings = sbert.encode(
    reviews_dataset_p['review_text'],
    batch_size=64,
    show_progress_bar=True,
)

Batches: 100%|██████████| 945/945 [00:06<00:00, 142.91it/s]


---

Inference

In [13]:
# SA with BERT

from transformers import Trainer, TrainingArguments

BATCH_SIZE = 32

# The Trainer is used purely as a batched inference driver here, so no
# train/eval datasets are attached to it.
trainer_args = TrainingArguments(
    output_dir='eval',
    per_device_eval_batch_size=BATCH_SIZE,
)
trainer = Trainer(model=sa_model, args=trainer_args)

# raw model outputs for every cleaned, tokenized review
sa_output = trainer.predict(reviews_dataset_p_sa)


In [14]:
sa_output

PredictionOutput(predictions=array([[-2.513191  ,  2.8408263 ],
       [-2.8129225 ,  3.2663047 ],
       [-1.4244823 ,  1.3042884 ],
       ...,
       [-2.0817068 ,  2.1522136 ],
       [-0.09953638, -0.46936586],
       [ 0.02777606, -0.38265294]], dtype=float32), label_ids=None, metrics={'test_runtime': 262.3552, 'test_samples_per_second': 230.34, 'test_steps_per_second': 7.2})

In [15]:
from scipy.special import softmax

# Normalise the raw model outputs across the class axis. Despite its name,
# `sa_logits` holds the softmax output, not the raw logits.
sa_logits = softmax(sa_output.predictions, axis=-1)

# Predicted class index per review (presumably 1 = positive / recommended --
# TODO confirm against the SA training labels).
sa_pred_result = sa_logits.argmax(axis=-1)

sa_pred_result

array([1, 1, 1, ..., 1, 0, 0])

In [16]:
# bertopic

# Assign a topic to every cleaned review, reusing the precomputed SBERT
# embeddings instead of letting the topic model embed the documents itself.
topics, probs = topic_model.transform(reviews_dataset_p_bertopic['review_text'], embeddings=sbert_embeddings)

2024-03-22 11:39:51,411 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [17]:
topics

array([7, 3, 8, ..., 3, 1, 3])

In [18]:
probs

array([[ 0.5594574 ,  0.57658625,  0.38608336, ...,  0.58039004,
         0.3552853 ,  0.28020176],
       [ 0.2610594 ,  0.22799659,  0.1720337 , ...,  0.26279494,
         0.14184105,  0.24393168],
       [ 0.14688447,  0.13080573,  0.08629075, ...,  0.14431764,
         0.30286062,  0.22244917],
       ...,
       [ 0.27115703,  0.25759235,  0.16022427, ...,  0.3832994 ,
         0.19056311,  0.27444   ],
       [ 0.26007795,  0.24094352,  0.28893963, ...,  0.05636574,
         0.17065516,  0.09285688],
       [ 0.11547672,  0.10740201, -0.01128568, ...,  0.05336908,
         0.08607227,  0.07022146]], dtype=float32)

In [19]:
# build the topic frequency table from results

import pandas as pd
from collections import Counter

# JSON object keys are strings; convert them to int so they can be matched
# against the numeric topic ids
topic_id_to_label = {int(topic_id): label for topic_id, label in topic_id_to_label.items()}

# one (topic id, count) row per assigned topic, ordered by topic id
topic_freq = pd.DataFrame(
    sorted(Counter(topics).items()),
    columns=['Topic', 'Count'],
).reset_index(drop=True)

# add the topic names
topic_freq['Topic Name'] = topic_freq['Topic'].map(topic_id_to_label)

topic_freq

Unnamed: 0,Topic,Count,Topic Name
0,-1,10379,Fun game with toxic players.
1,0,21922,Best MOBA Ever
2,1,1466,Frustrating Experience after new updates
3,2,1644,Ruined Game
4,3,16307,Good game
5,4,7584,Russian Game
6,5,18,Barry's Pills
7,6,173,Pie Recipe
8,7,399,Old Classic Masterpiece
9,8,192,Money-gated skins


---

Format the analysis results and attach them back to the original reviews

In [20]:
sa_pred_result.shape

(60431,)

In [21]:
topics.shape

(60431,)

In [22]:
len(reviews_reqs)

60619

In [23]:
# Attach the SA / topic results to the original review records.
#
# The processed dataset is walked in step with `reviews_reqs`. `dataset_ptr`
# points at the next processed row that still has to be matched. A review
# whose id differs from that row is skipped and gets no 'analysis' key
# (the reviews filtered out after cleaning end up here).
dataset_ptr = 0

reviews_dataset_p_sa_recommendationid_list = reviews_dataset_p_sa['recommendationid']
num_processed = len(reviews_dataset_p_sa_recommendationid_list)

for i in range(len(reviews_reqs)):
    # Every processed row has been matched: any remaining reviews were
    # filtered out. Without this guard, indexing with `dataset_ptr` raises
    # IndexError whenever the trailing review(s) were dropped by the filter.
    if dataset_ptr >= num_processed:
        break

    # get recommendationid
    recommendationid_reviewreqs = reviews_reqs[i]['recommendationid']

    # get the recommendationid from the dataset
    recommendationid_dataset = reviews_dataset_p_sa_recommendationid_list[dataset_ptr]

    # check if the recommendationid matches
    if recommendationid_reviewreqs != recommendationid_dataset:
        continue

    analysis_json = {
        'SA': sa_pred_result[dataset_ptr],
        'TM_topic_id': topics[dataset_ptr],
    }

    reviews_reqs[i]['analysis'] = analysis_json

    dataset_ptr += 1

# If the two sequences were not in the same relative order, some processed
# rows never matched; report it instead of silently dropping their results.
if dataset_ptr != num_processed:
    print('Warning: only', dataset_ptr, 'of', num_processed,
          'analysed reviews could be matched back to reviews_reqs')



In [24]:
# keep only the reviews that received analysis results
reviews_reqs_w_analysis = [
    review for review in reviews_reqs
    if 'analysis' in review
]

len(reviews_reqs_w_analysis)

60431

In [25]:
# Persist the results next to the input data in the game folder.

# 1) the reviews that carry analysis results, as a pickle
reviews_reqs_w_analysis_path = game_folder / f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis.pkl'

with reviews_reqs_w_analysis_path.open('wb') as out_file:
    pickle.dump(reviews_reqs_w_analysis, out_file)

    print('Saved:', reviews_reqs_w_analysis_path)

# 2) the topic frequency table, as a CSV without the index column
topic_freq_path = game_folder / f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis_topic_freq.csv'

topic_freq.to_csv(topic_freq_path, index=False)
print('Saved:', topic_freq_path)

Saved: ../../dataset/data_scraping/steam_comments_scraping/dota2/steam_reviews_570_unique_with_gendata_with_analysis.pkl
Saved: ../../dataset/data_scraping/steam_comments_scraping/dota2/steam_reviews_570_unique_with_gendata_with_analysis_topic_freq.csv
