# Baldur's Gate 3 Steam Reviews EDA
Mahan Madani - Mohammad Mehdi Begmaz

## Load Dataset and import libraries

In [1]:
import os, zipfile

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from textblob import TextBlob

from sklearn.neighbors import NearestNeighbors

In [None]:
!pip install kaggle

# Upload your kaggle.json file and replace the path below
os.environ['KAGGLE_CONFIG_DIR'] = "./.kaggle"

!kaggle datasets download -p ./dataset harisyafie/baldurs-gate-3-steam-reviews

[0mCollecting kaggle
[0m  Downloading kaggle-1.6.3.tar.gz (84 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.5/84.5 kB[0m [31m20.6 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting python-slugify (from kaggle)
  Obtaining dependency information for python-slugify from https://files.pythonhosted.org/packages/09/49/e05adaaa2d8604b7cfbce81af14c7a48c67d70a6e06cb47473c9673267db/python_slugify-8.0.2-py2.py3-none-any.whl.metadata
  Downloading python_slugify-8.0.2-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 kB[0m [31m56.9 kB/s[0m eta [36m0:00:00[0m31m45.7 MB/s[0m eta [36m0:00:01[0m
Downloading python_slugify-8.0.2-py2.py3-none-any.whl (10 kB)
Building wheels for collecte

In [3]:
with zipfile.ZipFile('./dataset/baldurs-gate-3-steam-reviews.zip', 'r') as zip_ref:
    zip_ref.extractall('./dataset')

In [4]:
df = pd.read_csv("dataset/BG3_reviews_updated.csv")
print(df.columns)
print(df.shape)

Index(['recommendationid', 'language', 'review', 'timestamp_created',
       'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
       'weighted_vote_score', 'written_during_early_access', 'comment_count',
       'steam_purchase', 'received_for_free'],
      dtype='object')
(309103, 13)


## Basic Data Analysis

In [7]:
def explore_data(x):
    display("Data Head", x.head())
    display("Data Sample",x.sample(5))
    rows, columns = x.shape
    print("Data Shape")
    print(f"This datset has {rows} rows and {columns} columns")
    display("Data Describe",x.describe().T)
    display("Data Types", x.dtypes)
    
    
explore_data(df)

Unnamed: 0,recommendationid,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,written_during_early_access,comment_count,steam_purchase,received_for_free
0,153560814,english,This game hits all the right marks. 10/10,1702542971,1702542971,True,0,0,0.0,False,0,True,False
1,153560623,english,took me like 11 hours to understand the basics,1702542657,1702542657,True,0,0,0.0,False,0,True,False
2,153560414,english,10/10 game play and story! It's my first turn ...,1702542275,1702542275,True,0,0,0.0,False,0,True,False


In [6]:
df.describe(include='object')

Unnamed: 0,language,review
count,309103,308086
unique,1,255104
top,english,good
freq,309103,1937


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309103 entries, 0 to 309102
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   recommendationid             309103 non-null  int64  
 1   language                     309103 non-null  object 
 2   review                       308086 non-null  object 
 3   timestamp_created            309103 non-null  int64  
 4   timestamp_updated            309103 non-null  int64  
 5   voted_up                     309103 non-null  bool   
 6   votes_up                     309103 non-null  int64  
 7   votes_funny                  309103 non-null  int64  
 8   weighted_vote_score          309103 non-null  float64
 9   written_during_early_access  309103 non-null  bool   
 10  comment_count                309103 non-null  int64  
 11  steam_purchase               309103 non-null  bool   
 12  received_for_free            309103 non-null  bool   
dtyp

In [19]:
null_counts = df.isnull().sum()
null_table = pd.DataFrame(null_counts, columns=["Null Count"])
null_table

Unnamed: 0,Null Count
recommendationid,0
language,0
review,1017
timestamp_created,0
timestamp_updated,0
voted_up,0
votes_up,0
votes_funny,0
weighted_vote_score,0
written_during_early_access,0


In [21]:
df.dropna()
df.shape

(309103, 13)

## Feature Engineering

In [23]:
# def count_words(text):
#     return len(text.split())
df['review'] = df['review'].astype(str)
df['word_count'] = df['review'].apply(lambda text: len(text.split()))

In [25]:
df['word_count'].describe()

count    309103.000000
mean         37.377816
std          88.723252
min           0.000000
25%           4.000000
50%          11.000000
75%          34.000000
max        2286.000000
Name: word_count, dtype: float64

## Visualization

## Feature Transformation