# Importing Libraries, defining paths & creating functions

In [1]:
# Importing libraries and paths

import pandas as pd
import gzip
import ast
from textblob import TextBlob
from scipy import stats
# tensorflow

# Paths

path1 = 'steam_games.json.gz'
path2 = 'user_reviews.json.gz'
path3 = 'users_items.json.gz'

# Creating a function to read the paths

def read_path(file):
    with gzip.open(file, 'rt', encoding='utf-8') as myfile:
        return [ast.literal_eval(line.strip()) for line in myfile]


# We generated the function to unnest the 'items' and 'reviews' columns
def unnest_items(df):
    # Explode the 'items' column to create a new row for each element in the list
    items = df['items'].explode()

    # Drop the original 'items' column from the DataFrame
    df = df.drop('items', axis=1)

    # Join the exploded 'items' Series as new columns in the DataFrame
    df = df.join(items.apply(pd.Series))

    # Return the modified DataFrame
    return df


# Reading datasets

In [2]:
#   Let's read all the datasets

# Reading the dataset from 'steam_games.json.gz' ↓↓↓
with gzip.open(path1, 'rt', encoding='utf-8') as file:
    df_games = pd.read_json(file, lines=True)

# Reading datasets 
reviews = read_path(path2)
items = read_path(path3)

# Transforming datasets to dataframes ↓:
df_reviews = pd.DataFrame(reviews)
# df_items = pd.DataFrame(items)

# Let's prepare the files by cleaning null values, normalizing data, addressing outliers, and including dummy variables.

## First, we'll begin cleaning 'df_games'

In [None]:
# Let's start with df_games

# In the next code cell, I'm going to delete specific columns that we don't for the tasks
columns_to_drop = ['publisher', 'url', 'tags', 'reviews_url', 'specs', 'price', 'early_access', 'developer']

df_games.drop(columns=columns_to_drop, inplace=True)

# I use inplace=True because I want to change the original dataframe too

In [22]:
# Are there some null values?
print(df_games.isnull())

# Deleting null values from this columns
df_games.dropna(subset=['genres', 'app_name', 'title', 'release_date', 'id'])

# We see date
df_games.astype(str) # This is for convert dataframe to a list so we can manipulate data
mask = df_games['release_date'].notnull()
print('\n', df_games[mask])

        genres  app_name  title  release_date     id
88310    False     False  False         False  False
88311    False     False  False         False  False
88312    False     False  False         False  False
88313    False     False  False         False  False
88315    False     False  False         False  False
...        ...       ...    ...           ...    ...
120439   False     False  False         False  False
120440   False     False  False         False  False
120441   False     False  False         False  False
120442   False     False  False         False  False
120443   False     False  False         False  False

[29894 rows x 5 columns]

                                                    genres  \
88310       [Action, Casual, Indie, Simulation, Strategy]   
88311                [Free to Play, Indie, RPG, Strategy]   
88312   [Casual, Free to Play, Indie, Simulation, Sports]   
88313                         [Action, Adventure, Casual]   
88315                     [Acti

In [33]:
# This function is for extract the year
def extract_year(date_str):
    try:
        return pd.to_datetime(date_str, errors='coerce').year
    except:
        return None


# We only the year, this is needed for EDA, days and months aren't needed
df_games['release_date'] = df_games['release_date'].apply(extract_year)

# We delete rows where year is an invalid value
df_games.dropna(subset=['release_date'], how='all', inplace=True)

df_games

Unnamed: 0_level_0,genres,app_name,title,release_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
761140.0,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,1970
643980.0,"[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,1970
670290.0,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,1970
767400.0,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,1970
772540.0,"[Action, Adventure, Simulation]",Battle Royale Trainer,Battle Royale Trainer,1970
...,...,...,...,...
745400.0,"[Action, Adventure, Casual, Indie]",Kebab it Up!,Kebab it Up!,1970
773640.0,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,1970
733530.0,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,1970
610660.0,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,1970


In [34]:
# Last step: We see our dataframe 

print(df_games.info(), '\n')

<class 'pandas.core.frame.DataFrame'>
Index: 29894 entries, 761140.0 to 658870.0
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        28660 non-null  object
 1   app_name      29893 non-null  object
 2   title         29893 non-null  object
 3   release_date  29894 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.1+ MB
None 



## Second Step: We'll clean 'df_reviews'

In [87]:
# We continue cleaning dataframes and the next is: df_reviews

nulls = df_reviews.isnull() == True
#print(nulls)


# duplicates = df_reviews.astype(str).duplicated().any() == True
# print("\n", "Are there some duplicated values? ", duplicates, "\n")
'''
duplicates_rows = df_reviews[df_reviews.astype(str).duplicated()]
print(duplicates_rows, '\n')

df_reviews.info()
deleting_duplicates = df_reviews.astype(str).drop_duplicates()
df_reviews.info()
'''

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


# Saving dataframes to csv & parquet files 

Finally, we're going to save our dataframes to files so that we can conduct the Exploratory Data Analysis (EDA).



In [None]:
# Saving dataframes in csv and parquet files respectively ↓:
df_games.to_csv("Games.csv")
df_reviews.to_csv("Reviews.csv")
df_items.to_parquet("Items.parquet")

# Why did I choose Parquet? This is related to the structure of the dataset itself.

# Note: this cell might take some time to finish the task.