# Importing Libraries, defining paths & creating functions

In [1]:
# Importing libraries and paths

import pandas as pd
import gzip
import ast

# Creating a function to read the paths

def read_path(file):
    with gzip.open(file, 'rt', encoding='utf-8') as myfile:
        return [ast.literal_eval(line.strip()) for line in myfile]
    
# Creating a function to change datetime in columns
    
def datetime_change(var):

    if pd.isna(var):
        return None

    try:
        return pd.to_datetime(var)
    except ValueError:
        return None
    

# Creating a function to unnest data in columns
    
def unnesting(dataframe,column):
    
    df_aux = dataframe.explode(column)
    df_normal = pd.json_normalize(df_aux[column].dropna())

    df_aux.reset_index(inplace=True)
    df_normal.reset_index(inplace=True)
    dataframe = pd.concat([df_aux,df_normal],axis=1)
    dataframe.dropna(inplace=True)
    
    return dataframe


def to_float(data):
    if pd.isna(data):
        return 0.0
        
    try:
        return float(data)
    except(ValueError, TypeError):
        return 0.0
    
def null_ratio(Dataset):
    null_ratio = ((Dataset.isnull().sum() / len(Dataset))*100).sort_values(ascending=False)
    return null_ratio

## First, let's begin cleaning 'df_games'

In [31]:
path1 = 'Datasets/steam_games.json.gz'

In [32]:
# Reading the dataset from 'steam_games.json.gz' ↓↓↓
with gzip.open(path1, 'rt', encoding='utf-8') as file:
    df_games = pd.read_json(file, lines=True)

In [33]:
print(df_games.shape)
print(df_games.columns)

(120445, 13)
Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')


In [34]:
nulls = null_ratio(df_games)
nulls

publisher       80.004982
developer       76.058782
genres          76.045498
release_date    75.035909
title           75.021794
price           74.463033
specs           73.876043
tags            73.455104
app_name        73.321433
reviews_url     73.321433
id              73.321433
url             73.319773
early_access    73.319773
dtype: float64

In [35]:
df_games.fillna("No data")

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data
1,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data
2,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data
3,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data
4,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,0.0,773640.0,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,0.0,733530.0,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,0.0,610660.0,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,0.0,658870.0,"xropi,stev3ns"


In [38]:
df_games.dropna(how='all', inplace=True)
df_games.shape

(32135, 13)

In [39]:
df_games.drop_duplicates(subset='id', inplace=True)
df_games.shape

(32133, 13)

In [40]:
genres = df_games.explode('genres')

In [41]:
genres['genres'].value_counts()

genres
Indie                        15858
Action                       11319
Casual                        8282
Adventure                     8242
Strategy                      6957
Simulation                    6699
RPG                           5479
Free to Play                  2031
Early Access                  1462
Sports                        1257
Massively Multiplayer         1108
Racing                        1083
Design &amp; Illustration      460
Utilities                      340
Web Publishing                 268
Animation &amp; Modeling       183
Education                      125
Video Production               116
Software Training              105
Audio Production                93
Photo Editing                   77
Accounting                       7
Name: count, dtype: int64

In [47]:
null_ratio(df_games)

publisher       25.058351
developer       10.266704
genres          10.216911
release_date     6.432639
title            6.379734
price            4.285314
specs            2.085084
tags             0.507267
app_name         0.006224
reviews_url      0.003112
id               0.003112
url              0.000000
early_access     0.000000
dtype: float64

In [46]:
null_ratio(genres)

publisher       20.838122
developer        4.648956
price            4.471230
genres           4.387043
release_date     2.808884
title            2.740733
specs            1.257450
tags             0.247214
app_name         0.004009
reviews_url      0.001336
id               0.001336
url              0.000000
early_access     0.000000
dtype: float64

In [48]:
genres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74834 entries, 88310 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     59240 non-null  object 
 1   genres        71551 non-null  object 
 2   app_name      74831 non-null  object 
 3   title         72783 non-null  object 
 4   url           74834 non-null  object 
 5   release_date  72732 non-null  object 
 6   tags          74649 non-null  object 
 7   reviews_url   74833 non-null  object 
 8   specs         73893 non-null  object 
 9   price         71488 non-null  object 
 10  early_access  74834 non-null  float64
 11  id            74833 non-null  float64
 12  developer     71355 non-null  object 
dtypes: float64(2), object(11)
memory usage: 8.0+ MB


In [49]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32133 entries, 88310 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24081 non-null  object 
 1   genres        28850 non-null  object 
 2   app_name      32131 non-null  object 
 3   title         30083 non-null  object 
 4   url           32133 non-null  object 
 5   release_date  30066 non-null  object 
 6   tags          31970 non-null  object 
 7   reviews_url   32132 non-null  object 
 8   specs         31463 non-null  object 
 9   price         30756 non-null  object 
 10  early_access  32133 non-null  float64
 11  id            32132 non-null  float64
 12  developer     28834 non-null  object 
dtypes: float64(2), object(11)
memory usage: 3.4+ MB


In [55]:
# In the next code cell, I'm going to delete specific columns that we don't need for the tasks

genres.drop(['url','reviews_url','specs', 'id'],axis=1,inplace=True)

# I use inplace=True because I want to change the original dataframe too

genres.reset_index(inplace=True)
# 
genres.drop(columns='index', inplace=True)

In [56]:
genres.head()

Unnamed: 0,publisher,genres,app_name,title,release_date,tags,price,early_access,developer
0,Kotoshiro,Action,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro
1,Kotoshiro,Casual,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro
2,Kotoshiro,Indie,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro
3,Kotoshiro,Simulation,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro
4,Kotoshiro,Strategy,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,0.0,Kotoshiro


In [59]:
genres['price'].apply(to_float)

genres['release_date'] = pd.to_datetime(genres['release_date'], errors='coerce')


In [60]:
genres['release_year'] = genres['release_date'].dt.year


In [73]:
genres['release_date']

TypeError: 'NoneType' object is not subscriptable

In [63]:
# Last but not least let's save it to a csv file

genres = genres.to_csv('Games.csv', index=False)

## Second Step: Let's clean 'df_reviews'

In [None]:
path2 = 'Datasets/user_reviews.json.gz'

# Reading dataset
reviews = read_path(path2)


# Transforming dataset to dataframes ↓:
df_reviews = pd.DataFrame(reviews)

In [None]:
df_reviews = unnesting(df_reviews, "reviews")

In [None]:
df_reviews.columns

In [None]:
# Transforming the date

df_reviews["posted"] = df_reviews["posted"].str.extract(r"Posted ([\w\s\d,]+)") 
df_reviews["posted_date"] = df_reviews["posted"].apply(datetime_change) 
df_reviews['posted_year'] = df_reviews["posted_date"].dt.year


In [None]:
df_reviews.drop(["reviews","last_edited","index","posted"], axis=1, inplace=True)

In [None]:
df_reviews.columns

In [None]:
df_reviews.to_csv("Reviews.csv", index=False)

## Third step: Cleaning 'df_items'

In [None]:
path3 = 'Datasets/users_items.json.gz'
items = read_path(path3)

In [None]:
df_items = pd.DataFrame(items)

In [None]:
df_items.unstack()

In [None]:
df_items = unnesting(df_items, "items")

In [None]:
df_items.to_csv("DF Items.csv", index=False)

In [None]:
DF_Items = pd.read_csv("DF Items.csv")

In [None]:
DF_Items.drop(['items', 'index'], axis=1, inplace=True)

In [None]:
df_items.drop(['items', 'index'], axis=1, inplace=True)

In [None]:
df_items = df_items.to_parquet("Items.parquet", index=False)

In [None]:
DF_Items.to_csv("DF Items.csv", index=False)

Why did I choose Parquet? This is related to the structure of the dataset itself.


# General Information

In [None]:
DF_Items = pd.read_csv("DF Items.csv")
DF_Items.info()

In [None]:
Items = pd.read_parquet("Items.parquet")
Items.info()

In [None]:
Games = pd.read_csv('Games.csv')
Games.info()

In [None]:
Reviews = pd.read_csv("Reviews.csv")
Reviews.info()