# Importing Libraries, defining paths & creating functions

In [1]:
# Importing libraries and paths

import pandas as pd
import gzip
import ast

# Creating a function to read the paths

def read_path(file):
    """Read a gzip-compressed text file that holds one Python literal per line.

    Parameters
    ----------
    file : str or path-like
        Path to the ``.gz`` file. It is opened in text mode as UTF-8.

    Returns
    -------
    list
        One parsed object per non-blank line, in file order.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank line is not a valid Python literal.
    """
    records = []
    with gzip.open(file, 'rt', encoding='utf-8') as myfile:
        for line in myfile:
            text = line.strip()
            if not text:
                # A blank line (e.g. a trailing newline) carries no record and
                # would make ast.literal_eval raise SyntaxError.
                continue
            # literal_eval only evaluates literals, so no code from the file is executed
            records.append(ast.literal_eval(text))
    return records
    
# Creating a function to change datetime in columns
    
def datetime_change(var):
    """Convert a single value to a pandas Timestamp, or None if that is not possible.

    Parameters
    ----------
    var : scalar
        The value to convert (typically a date string).

    Returns
    -------
    pandas.Timestamp or None
        The parsed timestamp; None when ``var`` is missing (NaN/None/NaT)
        or cannot be interpreted as a date.
    """
    if pd.isna(var):
        return None

    try:
        return pd.to_datetime(var)
    except (ValueError, TypeError):
        # ValueError: the value looks like text but is not a parseable date.
        # TypeError: the value is of a type pandas cannot convert at all.
        # Both are treated as "no date", mirroring how to_float handles bad input.
        return None
    

# Creating a function to unnest data in columns
    
def unnesting(dataframe, column):
    """Flatten a column of lists of dicts into one row per dict with one column per key.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the nested column. It is not modified.
    column : str
        Name of the column whose cells are lists of dicts.

    Returns
    -------
    pandas.DataFrame
        One row per nested record. Columns are: ``index`` (the row label the
        record had in ``dataframe``), the original columns (including
        ``column`` itself, still holding the dict), and one column per key of
        the nested dicts. Rows containing any missing value are dropped.
    """
    exploded = dataframe.explode(column)

    # Rows whose list was empty or missing explode to NaN. They must be removed
    # BEFORE normalizing: the two frames are glued together by position, so any
    # extra row on one side would attach nested fields to the wrong parent row.
    exploded = exploded[exploded[column].notna()].reset_index()

    # A plain list gives a fresh 0..n-1 index that lines up with `exploded`
    normalized = pd.json_normalize(exploded[column].tolist())

    combined = pd.concat([exploded, normalized], axis=1)

    # Keep only fully populated rows
    return combined.dropna()


def to_float(data):
    """Return ``data`` as a float, falling back to 0.0.

    Missing values (as detected by ``pd.isna``) and values that ``float()``
    rejects both yield 0.0.
    """
    if not pd.isna(data):
        try:
            return float(data)
        except (TypeError, ValueError):
            # not numeric: fall through to the default below
            pass
    return 0.0

## First, let's begin cleaning 'df_games'

In [2]:
path1 = 'Datasets/steam_games.json.gz'

In [3]:
# Reading the dataset from 'steam_games.json.gz' ↓↓↓
with gzip.open(path1, 'rt', encoding='utf-8') as file:
    df_games = pd.read_json(file, lines=True)

In [4]:
print(df_games.shape)
print(df_games.columns)

(120445, 13)
Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')


In [8]:
df_games['genres'].explode()

0               None
1               None
2               None
3               None
4               None
             ...    
120442        Racing
120442    Simulation
120443        Casual
120443         Indie
120444          None
Name: genres, Length: 163147, dtype: object

In [9]:
# Drop the columns that are not needed for the tasks, discard the games that
# have no genre information, and renumber the remaining rows from 0.
# reset_index(drop=True) throws the old index away directly, so no temporary
# 'index' column has to be created and then dropped again.

df_games = (
    df_games
    .drop(columns=['url', 'reviews_url', 'specs', 'id'])
    .dropna(subset=['genres'])
    .reset_index(drop=True)
)

In [10]:
# Series.apply returns a new Series; it must be assigned back, otherwise the
# 'price' column keeps its original unconverted values.
df_games['price'] = df_games['price'].apply(to_float)

# Release dates that cannot be parsed become NaT instead of raising
df_games['release_date'] = pd.to_datetime(df_games['release_date'], errors='coerce')


In [11]:
df_games['release_year'] = df_games['release_date'].dt.year


In [12]:
df_games['release_year']

0        2018.0
1        2018.0
2        2017.0
3        2017.0
4        2018.0
          ...  
28847    2018.0
28848    2018.0
28849    2018.0
28850    2018.0
28851    2017.0
Name: release_year, Length: 28852, dtype: float64

In [15]:
df_games['genres']

0            [Action, Casual, Indie, Simulation, Strategy]
1                     [Free to Play, Indie, RPG, Strategy]
2        [Casual, Free to Play, Indie, Simulation, Sports]
3                              [Action, Adventure, Casual]
4                          [Action, Adventure, Simulation]
                               ...                        
28847                   [Action, Adventure, Casual, Indie]
28848                [Casual, Indie, Simulation, Strategy]
28849                            [Casual, Indie, Strategy]
28850                          [Indie, Racing, Simulation]
28851                                      [Casual, Indie]
Name: genres, Length: 28852, dtype: object

In [17]:
df_games.explode('genres')

Unnamed: 0,publisher,genres,app_name,title,release_date,tags,price,early_access,developer,release_year
0,Kotoshiro,Action,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro,2018.0
0,Kotoshiro,Casual,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro,2018.0
0,Kotoshiro,Indie,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro,2018.0
0,Kotoshiro,Simulation,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro,2018.0
0,Kotoshiro,Strategy,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro,2018.0
...,...,...,...,...,...,...,...,...,...,...
28850,Laush Studio,Indie,Russian Roads,Russian Roads,2018-01-04,"[Indie, Simulation, Racing]",1.99,0.0,Laush Dmitriy Sergeevich,2018.0
28850,Laush Studio,Racing,Russian Roads,Russian Roads,2018-01-04,"[Indie, Simulation, Racing]",1.99,0.0,Laush Dmitriy Sergeevich,2018.0
28850,Laush Studio,Simulation,Russian Roads,Russian Roads,2018-01-04,"[Indie, Simulation, Racing]",1.99,0.0,Laush Dmitriy Sergeevich,2018.0
28851,SIXNAILS,Casual,EXIT 2 - Directions,EXIT 2 - Directions,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",4.99,0.0,"xropi,stev3ns",2017.0


In [19]:
df_games.shape

(28852, 10)

In [20]:
# Last but not least let's save it to a csv file.
# to_csv returns None when it is given a path, so its result is not assigned:
# df_games stays bound to the DataFrame for any later cell.

df_games.to_csv('Games.csv', index=False)

## Second Step: Let's clean 'df_reviews'

In [None]:
path2 = 'Datasets/user_reviews.json.gz'

# Reading dataset
reviews = read_path(path2)


# Transforming dataset to dataframes ↓:
df_reviews = pd.DataFrame(reviews)

In [None]:
df_reviews = unnesting(df_reviews, "reviews")

In [None]:
df_reviews.columns

In [None]:
# Transforming the date

# Keep only the text that follows "Posted " (letters, digits, whitespace and commas)
df_reviews["posted"] = df_reviews["posted"].str.extract(r"Posted ([\w\s\d,]+)") 
# Parse each extracted string; datetime_change returns None for missing or unparseable values
df_reviews["posted_date"] = df_reviews["posted"].apply(datetime_change) 
# NOTE(review): .dt needs a datetime column — assumes the Timestamp/None mix from apply was inferred as datetime; confirm
df_reviews['posted_year'] = df_reviews["posted_date"].dt.year


In [None]:
df_reviews.drop(["reviews","last_edited","index","posted"], axis=1, inplace=True)

In [None]:
df_reviews.columns

In [None]:
df_reviews.to_csv("Reviews.csv", index=False)

## Third step: Cleaning 'df_items'

In [None]:
path3 = 'Datasets/users_items.json.gz'
items = read_path(path3)

In [None]:
df_items = pd.DataFrame(items)

In [None]:
df_items.unstack()

In [None]:
df_items = unnesting(df_items, "items")

In [None]:
df_items.to_csv("DF Items.csv", index=False)

In [None]:
DF_Items = pd.read_csv("DF Items.csv")

In [None]:
DF_Items.drop(['items', 'index'], axis=1, inplace=True)

In [None]:
df_items.drop(['items', 'index'], axis=1, inplace=True)

In [None]:
# to_parquet returns None when it is given a path, so the result is not
# assigned: df_items stays bound to the DataFrame.
df_items.to_parquet("Items.parquet", index=False)

In [None]:
DF_Items.to_csv("DF Items.csv", index=False)

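Why did I choose Parquet? The choice follows from the structure of the dataset itself: Parquet is a typed, compressed, columnar format, so a table such as the unnested items is smaller on disk and faster to reload than the same data saved as CSV.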


# General Information

In [None]:
DF_Items = pd.read_csv("DF Items.csv")
DF_Items.info()

In [None]:
Items = pd.read_parquet("Items.parquet")
Items.info()

In [None]:
Games = pd.read_csv('Games.csv')
Games.info()

In [None]:
Reviews = pd.read_csv("Reviews.csv")
Reviews.info()