# Imports

In [1]:
import pandas as pd
import numpy as np

# Reading data

In [2]:
df_games = pd.read_csv("../datasets/games.csv")
df_purchases = pd.read_csv("../datasets/purchased_games.csv")
df_prices = pd.read_csv("../datasets/prices.csv")
df_players = pd.read_csv("../datasets/players.csv")
df_reviews = pd.read_csv("../datasets/reviews.csv")


# Sample by Date and Language

## Import relevant libraries

In [5]:
%pip install langdetect
%pip install emoji

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install vaderSentiment

Note: you may need to restart the kernel to use updated packages.


In [5]:
%pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Using cached googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting httpx==0.13.3
  Using cached httpx-0.13.3-py3-none-any.whl (55 kB)
Collecting sniffio
  Using cached sniffio-1.3.1-py3-none-any.whl (10 kB)
Collecting chardet==3.*
  Using cached chardet-3.0.4-py2.py3-none-any.whl (133 kB)
Collecting rfc3986<2,>=1.3
  Using cached rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting hstspreload
  Using cached hstspreload-2025.1.1-py3-none-any.whl (1.3 MB)
Collecting idna==2.*
  Using cached idna-2.10-py2.py3-none-any.whl (58 kB)
Collecting httpcore==0.9.*
  Using cached httpcore-0.9.1-py3-none-any.whl (42 kB)
Collecting h11<0.10,>=0.8
  Using cached h11-0.9.0-py2.py3-none-any.whl (53 kB)
Collecting h2==3.*
  Using cached h2-3.2.0-py2.py3-none-any.whl (65 kB)
Collecting hyperframe<6,>=5.2.0
  Using cached hyperframe-5.2.0-py2.py3-none-any.whl (12 kB)
Collecting hpack<4,>=3.0
  Using cached hpack-3.0.0-py2.

In [7]:
# For language detection & translation
from langdetect import detect
from googletrans import Translator


# For sentiment analysis
import re
import torch
import emoji
import torch.nn.functional as F
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from tqdm import tqdm

import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Olivia/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Sample Reviews by Time

In [15]:
# cast posted from df_reviews to date time
df_reviews['posted'] = pd.to_datetime(df_reviews['posted'])

# filter reviews posted only after Jan 2020
reviews = df_reviews[df_reviews['posted'] > '2020-01-01']

In [16]:
reviews

Unnamed: 0,reviewid,playerid,gameid,review,helpful,funny,awards,posted
1,639544,76561198028706627,393380,---{ Graphics }---‚òê You forget what reality is...,0,0,0,2025-01-03
4,639547,76561198272817436,730,ONE OF THE GAMES I COME BACK TO,2,0,0,2020-01-23
5,639548,76561198399037664,271590,Ë∂ÖÂ§öbugÔºåË∂ÖÂ§öÊåÇ,0,0,0,2020-01-28
7,639550,76561198399037664,601150,È¨ºÊ≥£Á≥ªÂàóÊàëÁöÑÊúÄÁà±,0,0,0,2020-12-14
8,639551,76561198891812676,359550,‚ô•‚ô•‚ô•‚ô•,3,0,1,2021-08-09
...,...,...,...,...,...,...,...,...
1204528,639537,76561198400392792,730,√© b√£o.,0,0,0,2022-12-20
1204530,639539,76561198375531601,739630,jogo pika,0,0,1,2021-09-14
1204531,639540,76561198375531601,945360,bullet,0,0,0,2020-09-17
1204532,639541,76561198199434213,412220,–î–∞ –º–Ω–µ –Ω—Ä–∞–≤–∏—Ç—Å—è –∏–≥—Ä–∞ –ø–æ—Ç–æ–º—É-—á—Ç–æ —Ç–∞–º –µ—Å—Ç—å –∫i–ªi–±...,0,0,0,2024-11-03


## Detect Language of Reviews

In [17]:
# Detect Language
def detect_language(text):
    try:
        return detect(text)
    except:
        pass

# Translate non-english to english for each row of reviews
translator = Translator()
def translate_to_english(df):
    if df['language'] == 'en':
        return df['review']
    else:
        try:
            return translator.translate(df['review'], src=df['language'], dest='en').text
        except:
            return df['review']

In [18]:
# Add progress bar for language detection
tqdm.pandas(desc="Detecting language")
reviews['language'] = reviews['review'].progress_apply(detect_language)

Detecting language: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 749302/749302 [23:25<00:00, 533.01it/s]  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['language'] = reviews['review'].progress_apply(detect_language)


In [None]:
reviews.to_csv("../datasets/reviews_lang_detect.csv", index=False)

In [None]:
reviews = pd.read_csv("../datasets/reviews_lang_detect.csv")
reviews

Unnamed: 0,reviewid,playerid,gameid,review,helpful,funny,awards,posted,language
0,639544,76561198028706627,393380,---{ Graphics }---‚òê You forget what reality is...,0,0,0,2025-01-03,en
1,639547,76561198272817436,730,ONE OF THE GAMES I COME BACK TO,2,0,0,2020-01-23,en
2,639548,76561198399037664,271590,Ë∂ÖÂ§öbugÔºåË∂ÖÂ§öÊåÇ,0,0,0,2020-01-28,zh-cn
3,639550,76561198399037664,601150,È¨ºÊ≥£Á≥ªÂàóÊàëÁöÑÊúÄÁà±,0,0,0,2020-12-14,ko
4,639551,76561198891812676,359550,‚ô•‚ô•‚ô•‚ô•,3,0,1,2021-08-09,
...,...,...,...,...,...,...,...,...,...
749297,639537,76561198400392792,730,√© b√£o.,0,0,0,2022-12-20,pt
749298,639539,76561198375531601,739630,jogo pika,0,0,1,2021-09-14,sl
749299,639540,76561198375531601,945360,bullet,0,0,0,2020-09-17,tr
749300,639541,76561198199434213,412220,–î–∞ –º–Ω–µ –Ω—Ä–∞–≤–∏—Ç—Å—è –∏–≥—Ä–∞ –ø–æ—Ç–æ–º—É-—á—Ç–æ —Ç–∞–º –µ—Å—Ç—å –∫i–ªi–±...,0,0,0,2024-11-03,ru


In [9]:
en_reviews = reviews[reviews['language'] == "en"]
en_reviews.shape

(303871, 9)

In [None]:
en_reviews.to_csv("../datasets/english_reviews.csv", index=False)

# Post-Sentiment Analysis Data

In [3]:
reviews_scores = pd.read_csv("../datasets/sentiment_reviews_18oct.csv")

In [4]:
reviews_scores

Unnamed: 0,reviewid,playerid,gameid,review,helpful,funny,awards,posted,language,sentiment_score
0,639544,76561198028706627,393380,---{ Graphics }---‚òê You forget what reality is...,0,0,0,2025-01-03,en,5.000000
1,639547,76561198272817436,730,ONE OF THE GAMES I COME BACK TO,2,0,0,2020-01-23,en,3.049362
2,639571,76561198111259840,1476680,I love this game!Of course it is currently in ...,5,0,1,2022-09-23,en,4.779496
3,639583,76561198164012532,244210,only purchase if using a sim wheel setup with ...,0,0,0,2021-09-17,en,4.823064
4,639671,76561198979874677,739630,Fun game and the community is great ive made t...,2,0,0,2020-11-27,en,5.000000
...,...,...,...,...,...,...,...,...,...,...
303866,639524,76561198985274668,440,A funny game with a cool community.I play this...,0,0,0,2021-07-24,en,4.886784
303867,639526,76561198985274668,271590,- DIFFICULTY -üî≤ My 90 year old grandma could p...,0,0,0,2021-05-07,en,5.000000
303868,639528,76561198985274668,550,- DIFFICULTY -üî≤ My 90 year old grandma could p...,0,0,0,2021-01-15,en,5.000000
303869,639529,76561198985274668,730,Good job Valve. I love this game <3,0,0,0,2020-12-29,en,5.000000
