# Baldur's Gate 3 Steam Reviews EDA
Mahan Madani - Mohammad Mehdi Begmaz

## Load Dataset and import libraries

In [2]:
import re

import pandas as pd
import numpy as np
from profanity_check import predict

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
# Read the raw review export, then show its column labels and (rows, columns) shape.
df = pd.read_csv("dataset/BG3_reviews_updated.csv")
for summary in (df.columns, df.shape):
    print(summary)

Index(['recommendationid', 'language', 'review', 'timestamp_created',
       'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
       'weighted_vote_score', 'written_during_early_access', 'comment_count',
       'steam_purchase', 'received_for_free'],
      dtype='object')
(309103, 13)


In [4]:
# DataFrame.dropna() returns a new frame and leaves the caller untouched, so the
# result has to be assigned back for the rows with missing values to be removed.
df = df.dropna()
df.shape

(309103, 13)

In [5]:
# Mark every row whose review text occurs more than once (keep=False flags all
# copies, including the first occurrence).
duplicated = df.duplicated(subset=['review'], keep=False)
duplicate_rows = df.loc[duplicated]

if duplicate_rows.empty:
    print("No duplicate records found.")
else:
    print("Duplicate records found:")
    print(duplicated.sum())

Duplicate records found:
60446


In [6]:
# Keep only the first occurrence of each review text and renumber the rows.
repeated_review = df.duplicated(subset='review', keep='first')
df = df.loc[~repeated_review].reset_index(drop=True)
print("Dataframe with no duplicate values:")
print(df.shape)

Dataframe with no duplicate values:
(255105, 13)


## Feature Transformation

In [7]:
def clean_text(text):
    """Normalize a raw review string.

    Steps, in order:
      1. Lowercase the text and replace each newline with a space.
      2. Remove every ``[...]`` span (shortest match), e.g. ``[b]`` / ``[/b]``.
      3. Remove every character that is not an ASCII letter, a digit, a space,
         or one of ``( ) / ! ? & ' " , .``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. Whitespace runs are not collapsed, so removed
        characters can leave consecutive spaces behind.
    """
    cleaned_text = text.lower().replace('\n', ' ')
    # Raw string: in a normal literal '\[' and '\]' are invalid escape sequences,
    # which Python reports with a warning on recent versions.
    cleaned_text = re.sub(r'\[.*?\]', '', cleaned_text)
    cleaned_text = re.sub('[^a-zA-Z0-9()/!?&\'",. ]', '', cleaned_text)

    return cleaned_text

In [8]:
# Cast every review to str first (clean_text calls str methods), then clean it.
df['review'] = df['review'].astype(str).apply(clean_text)

In [9]:
# Columns removed here are not referenced by any later cell in this notebook.
unused_columns = [
    'recommendationid',
    'language',
    'timestamp_created',
    'timestamp_updated',
    'written_during_early_access',
    'comment_count',
    'steam_purchase',
    'received_for_free',
]
df = df.drop(columns=unused_columns)

## Feature Engineering

In [10]:
# Number of whitespace-separated tokens in each cleaned review.
tokens_per_review = [len(review.split()) for review in df['review']]
df['word_count'] = pd.Series(tokens_per_review, index=df.index)
df['word_count'].describe()

count    255105.000000
mean         44.504373
std          95.168806
min           0.000000
25%           7.000000
50%          16.000000
75%          42.000000
max        2286.000000
Name: word_count, dtype: float64

In [31]:
# Keep reviews whose up-vote count is non-negative.
has_valid_votes = df['votes_up'].ge(0)
filtered_df = df.loc[has_valid_votes].reset_index(drop=True)

In [32]:
# Keep reviews with a weighted vote score strictly above 0.1.
score_above_floor = filtered_df['weighted_vote_score'].gt(0.1)
filtered_df = filtered_df.loc[score_above_floor].reset_index(drop=True)

In [33]:
# Keep reviews with more than 20 and fewer than 1000 words (both bounds exclusive).
word_counts = filtered_df['word_count']
length_in_range = (word_counts > 20) & (word_counts < 1000)
filtered_df = filtered_df.loc[length_in_range].reset_index(drop=True)

Delete reviews with profanity

In [34]:
# Flag each review with the label returned by profanity_check's `predict`.
# The result is assigned directly instead of being wrapped in a DataFrame:
# assigning a DataFrame aligns on index labels, which is only correct while
# filtered_df has a fresh 0..n-1 index, whereas an array is assigned by position.
# NOTE(review): assumes `predict` returns one label per input row, in order — confirm.
filtered_df['profanity'] = predict(filtered_df['review'])
filtered_df['profanity'].value_counts()

0    29723
1     1078
Name: profanity, dtype: int64

In [35]:
# Keep only the reviews whose profanity flag is 0 (not flagged).
not_profane = filtered_df['profanity'] == 0
filtered_df = filtered_df.loc[not_profane]

Save preprocessed data

In [36]:
# Class balance of the filtered reviews (recommended vs. not recommended).
filtered_label_counts = filtered_df['voted_up'].value_counts()
filtered_label_counts

True     24178
False     5545
Name: voted_up, dtype: int64

In [44]:
# Split the filtered reviews into not-recommended (0) and recommended (1) frames.
voted_up = filtered_df['voted_up']
negative_df = filtered_df.loc[voted_up.eq(0)].reset_index(drop=True)
positive_df = filtered_df.loc[voted_up.eq(1)].reset_index(drop=True)

In [45]:
# Order each class by weighted vote score, highest first, and renumber the rows.
negative_df, positive_df = (
    frame.sort_values(by='weighted_vote_score', ascending=False).reset_index(drop=True)
    for frame in (negative_df, positive_df)
)

In [57]:
# Take the first 5000 rows of each class and stack them, negatives first.
top_negative = negative_df.iloc[:5000]
top_positive = positive_df.iloc[:5000]
combined_df = pd.concat([top_negative, top_positive], ignore_index=True)

In [61]:
# Shuffle all rows with a fixed seed so the order is reproducible, then renumber.
shuffled_rows = combined_df.sample(frac=1, random_state=7)
combined_df = shuffled_rows.reset_index(drop=True)

In [62]:
# Preview the first five rows of the combined frame.
combined_df.head(n=5)

Unnamed: 0,review,voted_up,votes_up,votes_funny,weighted_vote_score,word_count,profanity
0,"its a good game, if you play it exactly how th...",False,2,1,0.464075,488,0
1,20 hrs in while i will admit it does not have...,True,2,0,0.542175,247,0
2,"lives, all mortal lives, expire souls go to ...",True,2,0,0.545455,146,0
3,"i realise its in early access, and i am hoping...",False,7,0,0.563492,476,0
4,the best sixty dollars i've ever spent in my l...,True,3,0,0.558011,492,0


In [60]:
# Class balance of the combined frame.
combined_label_counts = combined_df['voted_up'].value_counts()
combined_label_counts

False    5000
True     5000
Name: voted_up, dtype: int64

In [63]:
# Write the combined frame to CSV without the row index column.
output_path = './dataset/BG3_reviews_more_negative.csv'
combined_df.to_csv(output_path, index=False)