Part 3 of the LLM per game TLDR generation project

Given the processed game reviews, we create different tables like in GameAnalysis API

In [43]:
from datetime import datetime
import pytz
import os
import time
import requests
import json
import sys

import pickle
from pathlib import Path
import traceback

import pandas as pd
import numpy as np

In [2]:
game_steamid = 1716740              # starfield
game_name = 'starfield'             # also the folder name where the reviews are stored

# game_steamid = 1118010
# game_name = 'monster_hunter_world_iceborne'

# game_steamid = 582010
# game_name = 'monster_hunter_world'

# game_steamid = 2138330          # cyberpunk2077 phantom liberty
# game_name = 'cyberpunk2077_phantom_liberty'

# game_steamid = 1091500          # cyberpunk2077
# game_name = 'cyberpunk2077'

# game_steamid = 730
# game_name = 'counter-strike_2'

# game_steamid = 570
# game_name = "dota2"


In [3]:
# load the reviews from folder

reviews_reqs = []

# get existing folder and retrieve the cursor object (?)

# load the latest file
game_folder = Path(f'../../dataset/data_scraping/steam_comments_scraping/{game_name}')
if game_folder.exists():
    try:
        latest_file_path = game_folder.joinpath(f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis.pkl')
        with open(latest_file_path, 'rb') as f:
            reviews_reqs = pickle.load(f)           # retrieve the list of reviews
            print('Loaded:', latest_file_path)
    except IndexError as e:
        print('Error loading the latest file:', e)
        traceback.print_exc()

Loaded: ../../dataset/data_scraping/steam_comments_scraping/starfield/steam_reviews_1716740_unique_with_gendata_with_analysis.pkl


In [23]:
reviews_reqs[0]

{'recommendationid': '157967184',
 'author': {'steamid': '76561198005388655',
  'num_games_owned': 121,
  'num_reviews': 7,
  'playtime_forever': 556,
  'playtime_last_two_weeks': 0,
  'playtime_at_review': 556,
  'last_played': 1695409417},
 'language': 'english',
 'review': "I have loved every Bethesda game.\nI can't say I particularly even like this one :(",
 'timestamp_created': 1707534074,
 'timestamp_updated': 1707534074,
 'voted_up': False,
 'votes_up': 0,
 'votes_funny': 0,
 'weighted_vote_score': 0,
 'comment_count': 0,
 'steam_purchase': True,
 'received_for_free': False,
 'written_during_early_access': False,
 'hidden_in_steam_china': True,
 'steam_china_location': '',
 'author_generated': {'name': 'richardjackson0',
  'email': 'richardjackson0@icloud.com',
  'password': 'richardjackson0',
  'birthdate': '1997-10-04',
  'gender': 'MALE'},
 'analysis': {'SA': 0, 'TM_topic_id': 0}}

In [4]:
# also load the topic frequency table

topic_freq_path = game_folder.joinpath(
    f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis_topic_freq.csv'
)

topic_freq = pd.read_csv(topic_freq_path, index_col=None)

Create different tables

- genderReviews
- ageReviews


Using sentiment analysis results

- sentimentReviews
- sentimentReviewsByGender
- sentimentReviewsByAge


In [17]:
MIN_MAX_AGE = {
    '13-19': (13, 19),
    '20-29': (20, 29),
    '30-39': (30, 39),
    '40-49': (40, 49),
    '50-59': (50, 59),
    '60-69': (60, 69),
    '70-79': (70, 79),
    '80-89': (80, 89),
    '90-99': (90, 99)
}

AGE_GROUPS = list(MIN_MAX_AGE.keys()) + ["N/A"]
AGE_RANGE_TO_AGE_GROUP = {v: k for k, v in MIN_MAX_AGE.items()}

GENDERS = ['MALE', 'FEMALE', "OTHER", "UNDISCLOSED", "N/A"]

In [18]:
analysis_date = datetime(2024, 3, 22)       # or get latest date using datetime.now().today()

In [19]:
def get_age_range(review:json):
    bod = review['author_generated']['birthdate']
    bod = datetime.strptime(bod, '%Y-%m-%d')
    age = analysis_date.year - bod.year - ((analysis_date.month, analysis_date.day) < (bod.month, bod.day))

    for age_group, (min_age, max_age) in MIN_MAX_AGE.items():
        if min_age <= age <= max_age:
            return age_group
        
    return "N/A"        # default value in the API

In [20]:
# create df for ageReviews

ageReviews_json = {}
for age_group in AGE_GROUPS:
    ageReviews_json[age_group] = 0

for review in reviews_reqs:
    age_group = get_age_range(review)
    ageReviews_json[age_group] += 1

ageReviews_df = pd.DataFrame(ageReviews_json.items(), columns=['age_group', 'count'])
ageReviews_df

Unnamed: 0,age_group,count
0,13-19,9833
1,20-29,38403
2,30-39,29110
3,40-49,14313
4,50-59,4800
5,60-69,0
6,70-79,0
7,80-89,0
8,90-99,0
9,,0


In [22]:
# create df for genderReviews

genderReviews_json = {k:0 for k in GENDERS}

for review in reviews_reqs:
    genderReviews_json[review['author_generated']['gender']] += 1

genderReviews_df = pd.DataFrame(genderReviews_json.items(), columns=['gender', 'count'])
genderReviews_df

Unnamed: 0,gender,count
0,MALE,77200
1,FEMALE,19259
2,OTHER,0
3,UNDISCLOSED,0
4,,0


In [26]:
# create sentimentReviews

sentimentReviews_json = {k:0 for k in ['POSITIVE', 'NEGATIVE', "N/A"]}
for review in reviews_reqs:
    sentimentReviews_json["POSITIVE" if review['analysis']['SA'] else "NEGATIVE"] += 1        # 1 for positive, 0 for negative

sentimentReviews_df = pd.DataFrame(sentimentReviews_json.items(), columns=['sentiment', 'count'])
sentimentReviews_df

# and the actual sentiment count

sentimentReviews_truelabel_json = {k:0 for k in ['POSITIVE', 'NEGATIVE', "N/A"]}
for review in reviews_reqs:
    sentimentReviews_truelabel_json["POSITIVE" if review['voted_up'] else "NEGATIVE"] += 1      # "voted_up" is the actual sentiment, and stores a boolean value

sentimentReviews_truelabel_df = pd.DataFrame(sentimentReviews_truelabel_json.items(), columns=['sentiment', 'count'])
sentimentReviews_truelabel_df

Unnamed: 0,sentiment,count
0,POSITIVE,58153
1,NEGATIVE,38306
2,,0


In [33]:
from itertools import product

sentimentByAgeGroup_df_template = pd.DataFrame(product(
    AGE_GROUPS, ['POSITIVE', 'NEGATIVE', "N/A"], [0]
), columns=['age_group', 'sentiment', 'count'])

sentimentByAgeGroup_df_template

Unnamed: 0,age_group,sentiment,count
0,13-19,POSITIVE,0
1,13-19,NEGATIVE,0
2,13-19,,0
3,20-29,POSITIVE,0
4,20-29,NEGATIVE,0
5,20-29,,0
6,30-39,POSITIVE,0
7,30-39,NEGATIVE,0
8,30-39,,0
9,40-49,POSITIVE,0


In [46]:
# sentiment by age group

sentimentByAgeGroup = []

for review in reviews_reqs:
    # get the age group of the review
    age_group = get_age_range(review)
    sentiment = "POSITIVE" if review['analysis']['SA'] else "NEGATIVE"
    sentimentByAgeGroup.append((age_group, sentiment))

# count the number of reviews for each (age_group, sentiment) pair
sentimentByAgeGroup_df = pd.DataFrame(sentimentByAgeGroup, columns=['age_group', 'sentiment'])
sentimentByAgeGroup_df = sentimentByAgeGroup_df.groupby(['age_group', 'sentiment']).size().reset_index(name='count')
# reorder columns
sentimentByAgeGroup_df = sentimentByAgeGroup_df[['sentiment', 'age_group', 'count']]

# update with the template
sentimentByAgeGroup_df = sentimentByAgeGroup_df_template.merge(sentimentByAgeGroup_df, on=['age_group', 'sentiment'], how='left')
sentimentByAgeGroup_df['count'] = np.max(sentimentByAgeGroup_df[['count_x', 'count_y']], axis=1)
sentimentByAgeGroup_df = sentimentByAgeGroup_df[['sentiment', 'age_group', 'count']]

# drop rows with count = 0
sentimentByAgeGroup_df = sentimentByAgeGroup_df[sentimentByAgeGroup_df['count'] > 0]

# sort by sentiment
sentimentByAgeGroup_df = sentimentByAgeGroup_df.sort_values(by=['sentiment', 'age_group'])
sentimentByAgeGroup_df.reset_index(drop=True, inplace=True)


sentimentByAgeGroup_df

Unnamed: 0,sentiment,age_group,count
0,NEGATIVE,13-19,4203.0
1,NEGATIVE,20-29,16643.0
2,NEGATIVE,30-39,12712.0
3,NEGATIVE,40-49,6145.0
4,NEGATIVE,50-59,2046.0
5,POSITIVE,13-19,5630.0
6,POSITIVE,20-29,21760.0
7,POSITIVE,30-39,16398.0
8,POSITIVE,40-49,8168.0
9,POSITIVE,50-59,2754.0


In [47]:
sentimentByGender_df_template = pd.DataFrame(product(
    GENDERS, ['POSITIVE', 'NEGATIVE', "N/A"], [0]
), columns=['gender', 'sentiment', 'count'])

sentimentByGender_df_template

Unnamed: 0,gender,sentiment,count
0,MALE,POSITIVE,0
1,MALE,NEGATIVE,0
2,MALE,,0
3,FEMALE,POSITIVE,0
4,FEMALE,NEGATIVE,0
5,FEMALE,,0
6,OTHER,POSITIVE,0
7,OTHER,NEGATIVE,0
8,OTHER,,0
9,UNDISCLOSED,POSITIVE,0


In [50]:
# sentiment by gender

sentimentByGender = []

for review in reviews_reqs:
    # get the gender of the review
    gender = review['author_generated']['gender']
    sentiment = "POSITIVE" if review['analysis']['SA'] else "NEGATIVE"
    
    sentimentByGender.append([gender, sentiment])

# count the number of reviews for each (gender, sentiment) pair
sentimentByGender_df = pd.DataFrame(sentimentByGender, columns=['gender', 'sentiment'])
sentimentByGender_df = sentimentByGender_df.groupby(['gender', 'sentiment']).size().reset_index(name='count')

# reorder columns
sentimentByGender_df = sentimentByGender_df[['sentiment', 'gender', 'count']]

# update with the template
sentimentByGender_df = sentimentByGender_df_template.merge(sentimentByGender_df, on=['sentiment', 'gender'], how='left')
sentimentByGender_df['count'] = np.max(sentimentByGender_df[['count_x', 'count_y']], axis=1)
sentimentByGender_df = sentimentByGender_df[['sentiment', 'gender', 'count']]

# drop rows with count = 0
sentimentByGender_df = sentimentByGender_df[sentimentByGender_df['count'] > 0]

# sort by sentiment
sentimentByGender_df = sentimentByGender_df.sort_values(by=['sentiment', 'gender'])
sentimentByGender_df.reset_index(drop=True, inplace=True)


sentimentByGender_df


Unnamed: 0,sentiment,gender,count
0,NEGATIVE,FEMALE,8321.0
1,NEGATIVE,MALE,33428.0
2,POSITIVE,FEMALE,10938.0
3,POSITIVE,MALE,43772.0


In [51]:
# save each csv
save_path = game_folder.joinpath(f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis_ageGroup.csv')
ageReviews_df.to_csv(save_path, index=False)
print('Saved:', save_path)

save_path = game_folder.joinpath(f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis_genderReviews.csv')
genderReviews_df.to_csv(save_path, index=False)
print('Saved:', save_path)

save_path = game_folder.joinpath(f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis_sentimentReviews.csv')
sentimentReviews_df.to_csv(save_path, index=False)
print('Saved:', save_path)

save_path = game_folder.joinpath(f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis_sentimentReviews_truelabel.csv')
sentimentReviews_truelabel_df.to_csv(save_path, index=False)
print('Saved:', save_path)

save_path = game_folder.joinpath(f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis_sentimentByAgeGroup.csv')
sentimentByAgeGroup_df.to_csv(save_path, index=False)
print('Saved:', save_path)

save_path = game_folder.joinpath(f'steam_reviews_{game_steamid}_unique_with_gendata_with_analysis_sentimentByGender.csv')
sentimentByGender_df.to_csv(save_path, index=False)
print('Saved:', save_path)

Saved: ../../dataset/data_scraping/steam_comments_scraping/starfield/steam_reviews_1716740_unique_with_gendata_with_analysis_ageGroup.csv
Saved: ../../dataset/data_scraping/steam_comments_scraping/starfield/steam_reviews_1716740_unique_with_gendata_with_analysis_genderReviews.csv
Saved: ../../dataset/data_scraping/steam_comments_scraping/starfield/steam_reviews_1716740_unique_with_gendata_with_analysis_sentimentReviews.csv
Saved: ../../dataset/data_scraping/steam_comments_scraping/starfield/steam_reviews_1716740_unique_with_gendata_with_analysis_sentimentReviews_truelabel.csv
Saved: ../../dataset/data_scraping/steam_comments_scraping/starfield/steam_reviews_1716740_unique_with_gendata_with_analysis_sentimentByAgeGroup.csv
Saved: ../../dataset/data_scraping/steam_comments_scraping/starfield/steam_reviews_1716740_unique_with_gendata_with_analysis_sentimentByGender.csv
