# Importing Libraries, defining paths & creating functions

In [1]:
# Importing libraries and paths

import pandas as pd
import gzip
import ast
from textblob import TextBlob
from scipy import stats
# tensorflow

# Paths

path1 = 'steam_games.json.gz'
path2 = 'user_reviews.json.gz'
path3 = 'users_items.json.gz'

# Creating a function to read the paths

def read_path(file):
    with gzip.open(file, 'rt', encoding='utf-8') as myfile:
        return [ast.literal_eval(line.strip()) for line in myfile]

# Reading datasets

In [10]:
#   Let's read all the datasets

# Reading the dataset from 'steam_games.json.gz' ↓↓↓
with gzip.open(path1, 'rt', encoding='utf-8') as file:
    df_games = pd.read_json(file, lines=True)

# Reading datasets 
reviews = read_path(path2)
items = read_path(path3)

# Transforming datasets to dataframes ↓:
df_reviews = pd.DataFrame(reviews)
df_items = pd.DataFrame(items)

# Let's prepare the files by cleaning null values, normalizing data, addressing outliers, and including dummy variables.

## First, we'll begin cleaning 'df_games'

In [None]:
# Let's start with df_games

# In the next code cell, I'm going to delete specific columns that we don't for the tasks
columns_to_drop = ['publisher', 'url', 'tags', 'reviews_url', 'specs', 'price', 'early_access', 'developer']

df_games.drop(columns=columns_to_drop, inplace=True)

# I use inplace=True because I want to change the original dataframe too

In [22]:
# Are there some null values?
print(df_games.isnull())

# Deleting null values from this columns
df_games.dropna(subset=['genres', 'app_name', 'title', 'release_date', 'id'])

# We see date
df_games.astype(str) # This is for convert dataframe to a list so we can manipulate data
mask = df_games['release_date'].notnull()
print('\n', df_games[mask])

        genres  app_name  title  release_date     id
88310    False     False  False         False  False
88311    False     False  False         False  False
88312    False     False  False         False  False
88313    False     False  False         False  False
88315    False     False  False         False  False
...        ...       ...    ...           ...    ...
120439   False     False  False         False  False
120440   False     False  False         False  False
120441   False     False  False         False  False
120442   False     False  False         False  False
120443   False     False  False         False  False

[29894 rows x 5 columns]

                                                    genres  \
88310       [Action, Casual, Indie, Simulation, Strategy]   
88311                [Free to Play, Indie, RPG, Strategy]   
88312   [Casual, Free to Play, Indie, Simulation, Sports]   
88313                         [Action, Adventure, Casual]   
88315                     [Acti

In [33]:
# This function is for extract the year
def extract_year(date_str):
    try:
        return pd.to_datetime(date_str, errors='coerce').year
    except:
        return None


# We only the year, this is needed for EDA, days and months aren't needed
df_games['release_date'] = df_games['release_date'].apply(extract_year)

# We delete rows where year is an invalid value
df_games.dropna(subset=['release_date'], how='all', inplace=True)

df_games

Unnamed: 0_level_0,genres,app_name,title,release_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
761140.0,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,1970
643980.0,"[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,1970
670290.0,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,1970
767400.0,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,1970
772540.0,"[Action, Adventure, Simulation]",Battle Royale Trainer,Battle Royale Trainer,1970
...,...,...,...,...
745400.0,"[Action, Adventure, Casual, Indie]",Kebab it Up!,Kebab it Up!,1970
773640.0,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,1970
733530.0,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,1970
610660.0,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,1970


In [34]:
# Last step: We see our dataframe 

print(df_games.info(), '\n')

<class 'pandas.core.frame.DataFrame'>
Index: 29894 entries, 761140.0 to 658870.0
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        28660 non-null  object
 1   app_name      29893 non-null  object
 2   title         29893 non-null  object
 3   release_date  29894 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.1+ MB
None 



## Second Step: We'll clean 'df_reviews'

In [7]:
# We continue cleaning dataframes and the next is: df_reviews

nulls = df_reviews.isnull() == True
print(nulls.sum()) 

# We don't have nulls, that's great!

# The next function is for sentiment analysis:
def sentiment_analysis(review):
    if isinstance(review, list) and len(review) > 0:
        text = review[0].get('review', '')
        sentiment = TextBlob(text).sentiment.polarity
        
        if sentiment < -0.2:
            return 0 # Negative
        elif sentiment >= -0.2 and sentiment <= 0.2:
            return 1 # Neutral
        else:
            return 2 # Positive
    else:
        return 1 # This is when no value exists
    
# Creating a new column and aplying the function for sentiment analysis:
df_reviews['sentiment_analysis'] = df_reviews['reviews'].apply(sentiment_analysis)



user_id     0
user_url    0
reviews     0
dtype: int64


## Third step: Cleaning 'df_items'

In [11]:
# We are going to read one element from the dataset this why the structure is nest
df_items_element = df_items.sample(1)
for item in df_items_element['items']:
    print(item)

[{'item_id': '10090', 'item_name': 'Call of Duty: World at War', 'playtime_forever': 9292, 'playtime_2weeks': 0}, {'item_id': '1250', 'item_name': 'Killing Floor', 'playtime_forever': 2578, 'playtime_2weeks': 0}, {'item_id': '35420', 'item_name': 'Killing Floor Mod: Defence Alliance 2', 'playtime_forever': 1, 'playtime_2weeks': 0}, {'item_id': '620', 'item_name': 'Portal 2', 'playtime_forever': 342, 'playtime_2weeks': 0}, {'item_id': '105600', 'item_name': 'Terraria', 'playtime_forever': 340, 'playtime_2weeks': 0}, {'item_id': '49520', 'item_name': 'Borderlands 2', 'playtime_forever': 2284, 'playtime_2weeks': 0}, {'item_id': '202970', 'item_name': 'Call of Duty: Black Ops II', 'playtime_forever': 333, 'playtime_2weeks': 0}, {'item_id': '202990', 'item_name': 'Call of Duty: Black Ops II - Multiplayer', 'playtime_forever': 87593, 'playtime_2weeks': 0}, {'item_id': '212910', 'item_name': 'Call of Duty: Black Ops II - Zombies', 'playtime_forever': 58703, 'playtime_2weeks': 0}, {'item_id': 

In [12]:
# Now we unnest 'items' column
items_column = df_items['items'].explode()
# We drop 'items' column
df_items = df_items.drop('items', axis=1)
# We join the new column 'items' unnest
df_items = df_items.join(items_column.apply(pd.Series))

In [None]:
# Let's the new structure for df_items
df_items

In [None]:
# Now, we save the result in a parquet file in this way we don't have to wait and do all the previous steps
# This file is temporary

df_items_temporary = df_items.to_parquet("df_items_temporary.parquet", index=False)

In [None]:
# We delete unnecesary columns

df_items.drop([0,'steam_id','playtime_2weeks','user_url','items_count'], axis=1, inplace=True)

# Deleting nulls

df_items.dropna(how='all', inplace=True)

# Deleting duplicates

df_items=df_items.drop_duplicates()

# Deleting null values from useful columns:

df_items.dropna(subset=['playtime_forever'],inplace=True)
df_items.dropna(subset=['item_id'],inplace=True)
df_items.dropna(subset=['user_id'],inplace=True)

In [None]:
df_items.info()
df_items.head(6)

In [None]:
# We save the file again

df_items_data = df_items.to_parquet("df_items_data.parquet", index=False)



In [None]:
# We create a sample because is too big 

df_items_sample = df_items.sample(10000)
df_items_sample.to_parquet("df_items_sample.parquet", index=False)

Why did I choose Parquet? This is related to the structure of the dataset itself.


# Saving dataframes to csv & parquet files 

Finally, we're going to save our dataframes to files so that we can conduct the Exploratory Data Analysis (EDA).

