# ETL (Extract Transform Load) 
## Extraer, Transformar, Carga del archivo Steam-Games

In [2]:
# Importamos la librerias 
import pandas as pd 
import numpy as np
import ast

# Cargar el archivo 
df_games = pd.read_json("output_steam_games.json", lines=True)

##### Vista general

In [3]:
# Veamos una vista General del set de datos df_games
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 11.9+ MB


### COMENZAMOS CON LA LIMPIEZA CON DF_GAMES

In [4]:
#Borramos filas con todos sus valores nulos
df_games = df_games.dropna(how='all')

In [5]:
# El dataset luego de eliminar todas las filas que contengan valores nules en todas las columnas 
print(f"Filas del dataset df_games ({df_games.shape[0]}), columnas del dataset df_games ({df_games.shape[1]})")

Filas del dataset df_games (32135), columnas del dataset df_games (13)


In [6]:
# Encontramos valores null, luego de eliminar filas con todos sus valores null
df_games.isna().sum()

publisher       8052
genres          3283
app_name           2
title           2050
url                0
release_date    2067
tags             163
reviews_url        2
specs            670
price           1377
early_access       0
id                 2
developer       3299
dtype: int64

##### Eliminamos columnas no necesarias

In [7]:
# Borramos las columnas 'early_access', 'reviews_url', 'url'
df_games.drop(columns= ['early_access', 'reviews_url', 'url'], inplace=True)

##### app_name & title

In [8]:
# Observamos que App_name y title tienen los mismos datos
df_games[['app_name','title']]

Unnamed: 0,app_name,title
88310,Lost Summoner Kitty,Lost Summoner Kitty
88311,Ironbound,Ironbound
88312,Real Pool 3D - Poolians,Real Pool 3D - Poolians
88313,弹炸人2222,弹炸人2222
88314,Log Challenge,
...,...,...
120440,Colony On Mars,Colony On Mars
120441,LOGistICAL: South Africa,LOGistICAL: South Africa
120442,Russian Roads,Russian Roads
120443,EXIT 2 - Directions,EXIT 2 - Directions


In [9]:
#Observaremos cual tienen menos datos nulos y borraremos la columna con más datos nulos.
df_games[['app_name', 'title']].isna().sum()

app_name       2
title       2050
dtype: int64

In [10]:
#Borramos title
df_games.drop(columns= ['title'], inplace=True)

In [11]:
#Cambiamos los nombres de las columnas 
df_games.rename(columns={'app_name' : 'name', 'id': 'item_id' }, inplace=True)

##### Publisher & Developer

In [12]:
#Observemos Publisher y Developer 
df_games[['publisher', 'developer']].head()

Unnamed: 0,publisher,developer
88310,Kotoshiro,Kotoshiro
88311,"Making Fun, Inc.",Secret Level SRL
88312,Poolians.com,Poolians.com
88313,彼岸领域,彼岸领域
88314,,


In [13]:
#Rellenamos las columna Publisher con los valores de la columna Developer y viceversa
df_games['publisher'].fillna(df_games['developer'], inplace=True)
df_games['developer'].fillna(df_games['publisher'], inplace=True)

In [14]:
#Ahora remplazaremos los valores vacios o None por Desconocido(unknow) 
# En la columna Developer y Publisher
df_games['publisher'].replace(['', None], 'unknow', inplace=True)
df_games['developer'].replace(['', None], 'unknow', inplace=True)

In [15]:
# observemos que no hay valores null
df_games[['publisher', 'developer']].isna().sum()

publisher    0
developer    0
dtype: int64

In [16]:
# Ya tenemos Developer sin valores nulos podemos borrar publisher 
df_games.drop(columns=['publisher'], inplace=True)

#####  item_id

In [17]:
#Encotremos los valores faltantes de item_id
df_games[df_games['item_id'].isna()]

Unnamed: 0,genres,name,release_date,tags,specs,price,item_id,developer
88384,,,,,,19.99,,unknow
119271,"[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...","[Single-player, Steam Achievements, Steam Trad...",19.99,,"Rocksteady Studios,Feral Interactive (Mac)"


In [18]:
# Eliminamos esos valores faltantes
df_games = df_games.dropna(subset=['item_id'])

In [19]:
# Convertimos la columna tipo de dato int 
df_games.item_id.astype('int')

88310     761140
88311     643980
88312     670290
88313     767400
88314     773570
           ...  
120440    773640
120441    733530
120442    610660
120443    658870
120444    681550
Name: item_id, Length: 32133, dtype: int32

##### price

In [20]:
#Observemos los valores no numericos en la columna 'price'
mask_filter = pd.to_numeric(df_games['price'], errors='coerce').isna()
no_numeric_values = df_games.loc[mask_filter, 'price'].unique()
print(no_numeric_values)

['Free To Play' 'Free to Play' None 'Free' 'Free Demo' 'Play for Free!'
 'Install Now' 'Play WARMACHINE: Tactics Demo' 'Free Mod' 'Install Theme'
 'Third-party' 'Play Now' 'Free HITMAN™ Holiday Pack' 'Play the Demo'
 'Starting at $499.00' 'Starting at $449.00' 'Free to Try' 'Free Movie'
 'Free to Use']


In [21]:
#convertimos los valores no numericos a 0 (cero) para representar a los juegos gratis 
df_games['price'] = pd.to_numeric(df_games['price'], errors= 'coerce')
df_games['price'].fillna(0, inplace=True)

In [22]:
#Cambiamos el tipo de dato en la columna 'price'
df_games['price'].astype(int)

88310     4
88311     0
88312     0
88313     0
88314     2
         ..
120440    1
120441    4
120442    1
120443    4
120444    4
Name: price, Length: 32133, dtype: int32

##### release_date

In [23]:
# Ahora transformaremos la columna release_date a tipo to_datetime para mejor manipulacion 
df_games['release_date'] = pd.to_datetime(df_games['release_date'], format= '%Y-%m-%d', errors='coerce')

In [24]:
# Creamos una columna nueva con esos valores, pero solo extraeremos el año y cambiamos el tipo de dato
df_games['Year'] = df_games['release_date'].dt.year.astype('Int64')

In [25]:
# Buscamos cuales son los valores faltantes, ademas cuantos de esos valores existen 
mask_filter = df_games['Year'].isna()
diferent_Value = df_games.loc[mask_filter, 'Year'].unique()
print(f"Valores unicos faltantes {diferent_Value}, Numero de valores faltantes {mask_filter.sum()}")

Valores unicos faltantes <IntegerArray>
[<NA>]
Length: 1, dtype: Int64, Numero de valores faltantes 2351


In [26]:
# LLenamos los valores NAN con la mediana
def llenar_nan(df, columna):
    mediana = df_games['Year'].median()
    df[columna] = df[columna].fillna(mediana)
    return df 

df_games = llenar_nan(df_games, 'Year')

In [27]:
# Para el analisis necesitamos solo el anio, para ello eliminamos la columna 'release_date'
df_games = df_games.drop(columns=['release_date'])

##### Name 

In [28]:
# Econtramos el valore faltante de la columna Name 
df_games[df_games['name'].isna()]

Unnamed: 0,genres,name,tags,specs,price,item_id,developer,Year
90890,"[Action, Indie]",,"[Action, Indie]","[Single-player, Game demo]",0.0,317160.0,unknow,2014


In [29]:
# Eliminamos el valor faltante
df_games = df_games.dropna(subset=['name'])

##### genres y tags 

In [30]:
# Sabemos que genres tiene mas valores faltantes que tags entonces,
# Vamos a rellenar los datos entre las dos columnas que son parecidas 
df_games['genres'].fillna(df_games['tags'], inplace=True)
df_games['tags'].fillna(df_games['genres'], inplace=True)

In [31]:
# Observemos que valores faltantes quedaron 
df_games[['genres', 'tags']].isna().sum()

genres    138
tags      138
dtype: int64

#####  specs 

In [74]:
# Remplazaremos los valores faltantes a Desconocido(Unknow)
df_games['specs'].replace(['', None], 'unknow', inplace=True)

##### Convertimos el archivo en parquet

In [32]:
df_games.to_parquet("steam_games.parquet")