In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import plotly.express as px
import nbformat

In [2]:
np.random.seed(42)

Data source: https://www.kaggle.com/datasets/najzeko/steam-reviews-2021/data

In [3]:
# Get the total number of rows in the CSV file
total_rows = sum(1 for row in open('data/steam_reviews.csv')) - 1  # subtract 1 for header
sample_size = int(total_rows * 0.1)

# Read only 10% of the CSV file
steam_reviews = pd.read_csv('data/steam_reviews.csv', skiprows=lambda x: x > 0 and np.random.rand() > 0.1, nrows=sample_size)

In [4]:
# Get all unique app names present in the dataset
app_names = steam_reviews['app_name'].unique()
app_names = app_names.tolist()
app_names

['The Witcher 3: Wild Hunt',
 'Half-Life',
 'Counter-Strike: Source',
 'Half-Life 2: Episode Two',
 'Portal 2',
 'X Rebirth',
 "Garry's Mod",
 "Sid Meier's Civilization V",
 'Dead by Daylight',
 "Sid Meier's Civilization VI",
 'Subnautica',
 'Human: Fall Flat',
 'Beat Saber',
 'Cold Waters',
 'Banished',
 'Celeste',
 'Getting Over It with Bennett Foddy',
 'A Hat in Time',
 'Overcooked! 2',
 'Slipstream',
 'The Forest',
 'Pogostuck: Rage With Your Friends',
 'PC Building Simulator',
 'RollerCoaster Tycoon World',
 'NBA 2K18',
 'NBA 2K21',
 'Deus Ex: The Fall',
 'Rapture Rejects',
 'Artifact',
 'Call of Duty: Infinite Warfare',
 'Cube World',
 'NBA 2K19',
 'Nether',
 'Wolfenstein: Youngblood',
 'Warhammer 40,000: Dawn of War III',
 'Takedown: Red Sabre',
 'ATLAS',
 'Stay Out',
 'Identity',
 'Umbrella Corps',
 'Hunt Down The Freeman',
 'WWE 2K20',
 'Down To One',
 'Axiom Verge',
 'Guacamelee! Super Turbo Championship Edition',
 'The Binding of Isaac: Rebirth',
 'To the Moon',
 'Cave Story

In [5]:
# Number of reviews in English
steam_reviews[(steam_reviews["language"] == "english")].shape

(962962, 23)

In [6]:
#Possible review Bombing for GTA V between 2017-06-01 and 2017-07-31
one_game_only_english = steam_reviews[(steam_reviews["app_name"].str.contains("Grand Theft Auto", case = False)) 
                                      & (steam_reviews["language"] == "english")
                                      & (steam_reviews["recommended"] == False)
                                      & (steam_reviews["timestamp_updated"] > 1496268000)
                                      & (steam_reviews["timestamp_updated"] < 1501538399)
                                      & ((steam_reviews["review"].str.contains("Take-Two", case = False)) 
                                         | (steam_reviews["review"].str.contains("OpenIV", case = False)))
                                      & (steam_reviews["votes_funny"] == 0)]
one_game_only_english.head()

Unnamed: 0.1,Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
1349925,13496915,271590,Grand Theft Auto V,33812000,english,OpenIV fiasco shows how a company tries to dic...,1501085473,1501085531,False,0,...,True,False,False,76561198024673004,15,2,10175.0,0.0,4410.0,1509200000.0
1350441,13501704,271590,Grand Theft Auto V,33345032,english,Take Two basically killed their own reputation...,1499439418,1499439567,False,2,...,False,False,False,76561198087267083,20,2,9593.0,0.0,1280.0,1565367000.0
1350443,13501715,271590,Grand Theft Auto V,33343849,english,What's done is done. Removing OpenIV greatly d...,1499436776,1499436776,False,2,...,True,False,False,76561198239439730,24,1,16134.0,0.0,7453.0,1592070000.0
1350493,13502206,271590,Grand Theft Auto V,33307413,english,You focus on banning single player modding not...,1499337665,1499337665,False,5,...,True,False,False,76561198045782126,121,20,32323.0,0.0,18078.0,1589024000.0
1350624,13503350,271590,Grand Theft Auto V,33238809,english,Here's my philosophy on the games as it relate...,1499187311,1499187311,False,1,...,True,False,False,76561198070812011,439,26,863.0,0.0,,1591927000.0


In [7]:
total_gta_reviews = steam_reviews[(steam_reviews["app_name"].str.contains("Grand Theft Auto", case = False))
                                  & (steam_reviews["language"] == "english")
                                  & (steam_reviews["timestamp_updated"] > 1496268000)
                                  & (steam_reviews["timestamp_updated"] < 1501538399)]
total_gta_reviews.head()

Unnamed: 0.1,Unnamed: 0,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,...,steam_purchase,received_for_free,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played
1349836,13496038,271590,Grand Theft Auto V,33917530,english,This Positive Review was Ruined by the Error\n...,1501462530,1501462530,False,1,...,True,False,False,76561198033281896,143,1,25309.0,0.0,16139.0,1598193000.0
1349838,13496053,271590,Grand Theft Auto V,33915584,english,This game is fun and ha a lot of aspects to it...,1501453984,1501453984,True,0,...,True,True,False,76561198332312545,18,1,2823.0,65.0,2659.0,1610338000.0
1349846,13496157,271590,Grand Theft Auto V,33901816,english,░░░░░░░░░░░░░░░░░░░░░░█████████\n░░███████░░░░...,1501411178,1501411178,True,1,...,True,False,False,76561198035380665,129,74,17746.0,0.0,15743.0,1579447000.0
1349848,13496167,271590,Grand Theft Auto V,33900688,english,Always a have a fun time playing online and th...,1501407064,1501407064,True,1,...,True,False,False,76561197984771212,356,2,8695.0,0.0,837.0,1526875000.0
1349849,13496172,271590,Grand Theft Auto V,33899864,english,lets m e m e the community by banning people f...,1501403596,1501403596,False,2,...,True,False,False,76561198085821142,182,6,10094.0,0.0,6345.0,1580589000.0


In [8]:
# Compare the number of rows of both tables
rows_one_game_only_english = one_game_only_english.shape[0]
rows_total_gta_reviews = total_gta_reviews.shape[0]

print(f"Number of rows in one_game_only_english: {rows_one_game_only_english}")
print(f"Number of rows in total_gta_reviews: {rows_total_gta_reviews}")
print(f"Percentage of review bombing: {rows_one_game_only_english / rows_total_gta_reviews * 100:.2f}%")

Number of rows in one_game_only_english: 591
Number of rows in total_gta_reviews: 4111
Percentage of review bombing: 14.38%


### Attempt to turn Tabular data into a Graph for GNNs

In [9]:
steam_reviews_english = steam_reviews[(steam_reviews["language"] == "english")]
steam_reviews_english = steam_reviews_english[['app_id', 'app_name', 'review', 'timestamp_created', 'timestamp_updated', 'recommended', 'author.steamid', 'weighted_vote_score']]
steam_reviews_english = steam_reviews_english.sort_values(by=['timestamp_created', 'timestamp_updated'], ascending=[True, True])

# Replace the author.steamid with smaller numbers starting from 0
steam_reviews_english['author.steamid'] = pd.factorize(steam_reviews_english['author.steamid'])[0]

# Replace the app_id with smaller numbers starting from 0
steam_reviews_english['app_id'] = pd.factorize(steam_reviews_english['app_id'])[0]

steam_reviews_english.head()

Unnamed: 0,app_id,app_name,review,timestamp_created,timestamp_updated,recommended,author.steamid,weighted_vote_score
156030,0,Garry's Mod,For creative & awesome people,1290197836,1290197836,True,0,0.0
156029,0,Garry's Mod,If you don't own Garry's Mod. You aren't truel...,1290198787,1386116339,True,1,0.472947
156028,0,Garry's Mod,TOP NOTCH GAME RIGHT HERE.\nSo many ways to pl...,1290200117,1385462680,True,2,0.0
156027,0,Garry's Mod,You All Should have this game by now,1290202964,1290202964,True,3,0.0
156026,0,Garry's Mod,Is a fantastic sandbox which you can give 👍 to...,1290208819,1561670109,True,4,0.584615


In [10]:
# Number of times the review was updated
steam_reviews_english[(steam_reviews_english['timestamp_created'] != steam_reviews_english['timestamp_updated'])]

Unnamed: 0,app_id,app_name,review,timestamp_created,timestamp_updated,recommended,author.steamid,weighted_vote_score
156029,0,Garry's Mod,If you don't own Garry's Mod. You aren't truel...,1290198787,1386116339,True,1,0.472947
156028,0,Garry's Mod,TOP NOTCH GAME RIGHT HERE.\nSo many ways to pl...,1290200117,1385462680,True,2,0.000000
156026,0,Garry's Mod,Is a fantastic sandbox which you can give 👍 to...,1290208819,1561670109,True,4,0.584615
156023,0,Garry's Mod,Taking the aspects of the Source Engine and ex...,1290219743,1543063053,True,7,0.000000
156020,0,Garry's Mod,funnest game ever you do anything you want,1290242820,1385416637,True,10,0.000000
...,...,...,...,...,...,...,...,...
724595,155,Little Nightmares,[table]\n[tr]\n\t[td]⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀...,1611418352,1611418487,True,455498,0.000000
244867,60,Subnautica,Too many buggs.....\n\nimagine working around ...,1611418455,1611421527,False,856800,0.476190
2126224,48,FINAL FANTASY XIV Online,"Great game untill you hit lvl 50, then you're ...",1611418995,1611420014,False,396196,0.000000
89922,8,Portal 2,If you havent been spoiled and you havent play...,1611421120,1611422394,True,856824,0.000000


#### The network should be in the following format:

- One line per interaction/edge.
- Each line should be: user, item, timestamp, state label, comma-separated array of features.
- First line is the network format.
- User and item fields can be alphanumeric.
- Timestamp should be in cardinal format (not in datetime).
- State label should be 1 whenever the user state changes, 0 otherwise. If there are no state labels, use 0 for all interactions.
- Feature list can be as long as desired. It should be atleast 1 dimensional. If there are no features, use 0 for all interactions.


In [11]:
# Define a function to transform the reviews into the required network format
def transform_to_network(df):
    network_data = []
    review_field = df['review']
    vectorizer = TfidfVectorizer(max_features = 10)
    tdfidf_features = vectorizer.fit_transform(review_field.fillna("")).toarray()
    i = 0
    for _, row in df.iterrows():
        user = row['author.steamid']
        item = row['app_id']
        timestamp = row['timestamp_created']
        weighted_vote_score = row['weighted_vote_score']
        state_label = 0
        recommended = 1 if row['recommended'] else 0
        # Add the features list to the network data
        array_to_append = [user, item, timestamp, state_label, recommended]
        array_to_append.extend(tdfidf_features[i])
        array_to_append.extend((weighted_vote_score))
        network_data.append(array_to_append)
        i += 1
    # Create a DataFrame from the network data
    network_df = pd.DataFrame(network_data)
    return network_df

In [12]:
# Transform the steam_reviews in english DataFrame
network_df = transform_to_network(steam_reviews_english)

# Display the first few rows of the transformed network DataFrame
network_df.head()

Unnamed: 0,0,1,2,3,4
0,0,0,1290197836,0,1
1,1,0,1290198787,0,1
2,2,0,1290200117,0,1
3,3,0,1290202964,0,1
4,4,0,1290208819,0,1


In [13]:
network_df.rename(columns={0: 'user_id', 1: 'item_id', 2: 'timestamp', 3: 'state_label', 4: 'comma_separated_list_of_features'}, inplace=True)

In [14]:
network_df.head()

Unnamed: 0,user_id,item_id,timestamp,state_label,comma_separated_list_of_features
0,0,0,1290197836,0,1
1,1,0,1290198787,0,1
2,2,0,1290200117,0,1
3,3,0,1290202964,0,1
4,4,0,1290208819,0,1


In [15]:
network_df.to_csv('data/steam.csv', index=False)