#### ETL SOBRE STEAM_GAMES

In [3]:
import pandas as pd
import json
import ast
import warnings
import pyarrow as pa
import pyarrow.parquet as pq
warnings.filterwarnings('ignore')

In [4]:
# Load the Steam games dump. Each line of the file is parsed as its own JSON
# document; the parsed records are collected in a list that is turned into a
# pandas DataFrame in a single step.
steam_games = 'datos_json/output_steam_games.json'

# The encoding is passed explicitly because the data contains non-ASCII titles
# and the platform default encoding is not guaranteed to be UTF-8
# (presumably the dump is UTF-8 — confirm against the data source).
# Iterating the file object streams it line by line instead of first loading
# every line into memory with readlines().
with open(steam_games, encoding='utf-8') as archivo:
    listado = [json.loads(linea) for linea in archivo]

dfSteamGames = pd.DataFrame(listado)

# Count the null values per column of the raw frame
dfSteamGames.isnull().sum()

publisher       96362
genres          91593
app_name        88312
title           90360
url             88310
release_date    90377
tags            88473
reviews_url     88312
specs           88980
price           89687
early_access    88310
id              88312
developer       91609
dtype: int64

In [5]:
# Discard the rows in which every column is null, then renumber the index
filas_con_datos = dfSteamGames.dropna(how='all')
games = filas_con_datos.reset_index(drop=True)

games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32130,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS"""
32131,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada
32132,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich
32133,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns"


In [6]:
# Remove duplicated games: for every repeated 'id' only the first row is kept
games = games.drop_duplicates(subset=['id'], keep='first')

In [7]:
# Review notes for this stage (kept as a bare string), followed by the dtypes.
'''  
OBSERVACIONES

id y price estan como objeto, debería transformarlos a int
release_date esta como objeto, podria transformarse a dato tipo date?
genres, tags y specs tienen datos anidados, deberia desanidarlos?


'''
# Show the dtype of every column of `games`
games.dtypes

publisher       object
genres          object
app_name        object
title           object
url             object
release_date    object
tags            object
reviews_url     object
specs           object
price           object
early_access    object
id              object
developer       object
dtype: object

In [8]:
# Un-nest 'genres' so that each genre of a game gets its own row, and renumber
# the index right away, since explode() repeats the original label on every
# row it generates
steam_games = games.explode('genres').reset_index(drop=True)

steam_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,Kotoshiro,Action,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
1,Kotoshiro,Casual,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
2,Kotoshiro,Indie,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
3,Kotoshiro,Simulation,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
4,Kotoshiro,Strategy,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro


In [9]:
# Null count per column of the exploded frame
steam_games.isnull().sum()

publisher       15594
genres           3283
app_name            3
title            2051
url                 0
release_date     2102
tags              185
reviews_url         1
specs             941
price            3346
early_access        0
id                  1
developer        3479
dtype: int64

In [10]:
# Index labels of the rows whose 'genres' value is null
generos_nulos = steam_games[steam_games['genres'].isnull()].index

# The values stored above are index LABELS, so the rows are looked up with
# .loc. .iloc is positional and only returns the same rows while the index is
# a freshly reset 0..n-1 range.
steam_games.loc[generos_nulos]


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
17,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,
41,,,Icarus Six Sixty Six,,http://store.steampowered.com/app/724910/Icaru...,,[Casual],http://steamcommunity.com/app/724910/reviews/?...,"[Single-player, HTC Vive, Tracked Motion Contr...",Free,False,724910,
70,,,After Life VR,,http://store.steampowered.com/app/772590/After...,,"[Early Access, Indie, VR]",http://steamcommunity.com/app/772590/reviews/?...,"[Single-player, HTC Vive, Tracked Motion Contr...",4.99,True,772590,
71,,,Kitty Hawk,,http://store.steampowered.com/app/640250/Kitty...,,"[Early Access, Action, Adventure, Indie, Casual]",http://steamcommunity.com/app/640250/reviews/?...,"[Single-player, Steam Leaderboards, HTC Vive, ...",2.99,True,640250,
75,,,Mortars VR,,http://store.steampowered.com/app/711440/Morta...,,"[Early Access, Strategy, Action, Indie, Casual...",http://steamcommunity.com/app/711440/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",0.99,True,711440,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74796,,,Robotpencil Presents: Exercise: Brushwork,Robotpencil Presents: Exercise: Brushwork,http://store.steampowered.com/app/775640/Robot...,2018-01-03,"[Design & Illustration, Tutorial]",http://steamcommunity.com/app/775640/reviews/?...,,3.99,False,775640,
74797,,,Robotpencil Presents: Creative Composition,Robotpencil Presents: Creative Composition,http://store.steampowered.com/app/777930/Robot...,2018-01-03,"[Design & Illustration, Tutorial]",http://steamcommunity.com/app/777930/reviews/?...,,3.99,False,777930,
74798,,,The Gamble House,The Gamble House,http://store.steampowered.com/app/775370/The_G...,2016-11-19,[Movie],http://steamcommunity.com/app/775370/reviews/?...,[Captions available],4.99,False,775370,
74799,,,Kalen Chock Presents: 2017 Free Tutorial,Kalen Chock Presents: 2017 Free Tutorial,http://store.steampowered.com/app/777950/Kalen...,2018-01-03,"[Design & Illustration, Tutorial]",http://steamcommunity.com/app/777950/reviews/?...,,Free,False,777950,


#### En base a lo observado:

- Hay 3283 registros con gran cantidad de nulos. Al analizarlos en profundidad se observa que algunas columnas repiten información (por ejemplo, `title` y `app_name`), por lo que se elige aplicar el método `fillna` para rellenar los valores nulos de una columna con los de la columna que sí contiene el dato.

In [11]:
# Fill the missing titles with the value of 'app_name' from the same row;
# titles that are already present are left untouched
titulo = steam_games['title']
steam_games['title'] = titulo.where(titulo.notna(), steam_games['app_name'])

In [12]:
# Index labels of the rows whose 'id' is null; these rows block the cast of
# the column to an integer dtype
id_nulos = steam_games[steam_games['id'].isna()].index

# id_nulos holds index LABELS, so the rows are selected with .loc rather than
# the positional .iloc (both agree only while the index is a 0..n-1 range)
steam_games.loc[id_nulos]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
189,,,,,http://store.steampowered.com/,,,,,19.99,False,,


#### En base a lo observado:

- La fila 189 va a ser eliminada, ya que no tiene `id` y cerca del 80% de sus datos (10 de 13 columnas) son nulos.

In [13]:
# Drop the row(s) that have no id (label 189 in the inspected data).
# Filtering on the condition instead of the hard-coded label keeps the cell
# re-runnable (a second drop(index=189) raises KeyError) and independent of
# the row order of the input file.
steam_games = steam_games.dropna(subset=['id'])

#### 

In [14]:
# Cast the 'id' column to integer; every other column keeps its dtype
steam_games = steam_games.astype({'id': int})

In [15]:
# Parse 'price' as a number. Entries that are text (e.g. "Free To Play")
# cannot be parsed and become NaN; those, together with the prices that were
# already missing, are then replaced by 0.
precio_numerico = pd.to_numeric(steam_games["price"], errors='coerce')
steam_games["price"] = precio_numerico.fillna(0)

In [16]:
# Parse 'release_date' as datetime; values that cannot be parsed become NaT
fechas = pd.to_datetime(steam_games["release_date"], errors='coerce')
steam_games["release_date"] = fechas

# Keep the release year in its own column
steam_games["year"] = fechas.dt.year

In [17]:
# Fill the years that are still missing with the last known year above them
# (forward fill). The result is assigned back to the column: calling
# interpolate(..., inplace=True) on steam_games["year"] acts on a temporary
# Series, so under pandas Copy-on-Write the DataFrame is left unchanged, and
# interpolate(method='ffill') is deprecated in recent pandas versions.
# NOTE(review): this copies the year of the preceding row, which can belong to
# a different game — confirm that this is the intended imputation.
steam_games["year"] = steam_games["year"].ffill()

In [18]:
# Cast the 'year' column to integer; every other column keeps its dtype
steam_games = steam_games.astype({'year': int})

In [19]:
# Keep only the rows that have at least 4 non-null values.
# NOTE(review): url, price, early_access, id and year appear to have no nulls
# at this point, so this filter probably removes nothing — confirm the
# intended threshold.
steam_games = steam_games.dropna(thresh=4)

In [20]:
# Column dtypes of the frame after the type conversions
steam_games.dtypes

publisher               object
genres                  object
app_name                object
title                   object
url                     object
release_date    datetime64[ns]
tags                    object
reviews_url             object
specs                   object
price                  float64
early_access            object
id                       int32
developer               object
year                     int32
dtype: object

In [21]:
# Fraction of null values per column
steam_games.isna().mean()

publisher       0.208371
genres          0.043858
app_name        0.000027
title           0.000027
url             0.000000
release_date    0.038726
tags            0.002459
reviews_url     0.000000
specs           0.012561
price           0.000000
early_access    0.000000
id              0.000000
developer       0.046477
year            0.000000
dtype: float64

In [22]:
# Text columns that are normalised to lower case
columnas_a_minusculas = ['publisher', 'genres', 'app_name','title','developer']

# Lower-case each listed column with the vectorised .str accessor
for columna in columnas_a_minusculas:
    steam_games[columna] = steam_games[columna].str.lower()

In [23]:
# Display the frame after lower-casing the text columns
steam_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer,year
0,kotoshiro,action,lost summoner kitty,lost summoner kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,kotoshiro,2018
1,kotoshiro,casual,lost summoner kitty,lost summoner kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,kotoshiro,2018
2,kotoshiro,indie,lost summoner kitty,lost summoner kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,kotoshiro,2018
3,kotoshiro,simulation,lost summoner kitty,lost summoner kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,kotoshiro,2018
4,kotoshiro,strategy,lost summoner kitty,lost summoner kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,kotoshiro,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74829,laush studio,racing,russian roads,russian roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,laush dmitriy sergeevich,2018
74830,laush studio,simulation,russian roads,russian roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,laush dmitriy sergeevich,2018
74831,sixnails,casual,exit 2 - directions,exit 2 - directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns",2017
74832,sixnails,indie,exit 2 - directions,exit 2 - directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns",2017


In [24]:
# Columns that the queries may need; only these go into the final frame
columnas_finales = ['id', 'title', 'genres', 'price', 'year']
steamGamesFinal = steam_games[columnas_finales]

steamGamesFinal

Unnamed: 0,id,title,genres,price,year
0,761140,lost summoner kitty,action,4.99,2018
1,761140,lost summoner kitty,casual,4.99,2018
2,761140,lost summoner kitty,indie,4.99,2018
3,761140,lost summoner kitty,simulation,4.99,2018
4,761140,lost summoner kitty,strategy,4.99,2018
...,...,...,...,...,...
74829,610660,russian roads,racing,1.99,2018
74830,610660,russian roads,simulation,1.99,2018
74831,658870,exit 2 - directions,casual,4.99,2017
74832,658870,exit 2 - directions,indie,4.99,2017


In [None]:
# Export the final frame to CSV without writing the index column
ruta_csv = 'csv_limpios/steam_games.csv'
steamGamesFinal.to_csv(ruta_csv, index=False)

In [26]:
# Display the final frame once more
steamGamesFinal

Unnamed: 0,id,title,genres,price,year
0,761140,lost summoner kitty,action,4.99,2018
1,761140,lost summoner kitty,casual,4.99,2018
2,761140,lost summoner kitty,indie,4.99,2018
3,761140,lost summoner kitty,simulation,4.99,2018
4,761140,lost summoner kitty,strategy,4.99,2018
...,...,...,...,...,...
74829,610660,russian roads,racing,1.99,2018
74830,610660,russian roads,simulation,1.99,2018
74831,658870,exit 2 - directions,casual,4.99,2017
74832,658870,exit 2 - directions,indie,4.99,2017


In [27]:
#steamGamesFinal2 = steamGamesFinal.sample(n=25000, random_state=324)

In [28]:
#steamGamesFinal2

Unnamed: 0,id,title,genres,price,year
46032,586570,metanet hunter cd,action,4.99,2017
48754,555630,a magical high school girl / 魔法の女子高生,adventure,9.99,2016
51863,497090,amihailu in dreamland,indie,0.00,2015
68958,239430,q.u.b.e: director's cut,indie,9.99,2014
55176,383580,flesh eaters,indie,1.99,2016
...,...,...,...,...,...
45374,587410,hmm founder's pack - gold edition,strategy,79.99,2017
9139,298641,brawlhalla - collectors pack,indie,99.99,2015
46614,509696,rocksmith® 2014 edition – remastered – booker ...,casual,2.99,2017
68603,308020,fantasy grounds - call of cthulhu: shadows of ...,indie,19.99,2014


In [None]:
# Convert the DataFrame to an Arrow table so it can be written as Parquet.
# preserve_index=False leaves the pandas index out of the file, matching the
# CSV export (index=False). Without it, an index that is not a plain
# RangeIndex (rows were dropped earlier, leaving gaps) is stored as an extra
# column in the Parquet file.
table = pa.Table.from_pandas(steamGamesFinal, preserve_index=False)

# Target Parquet file, written with snappy compression
parquet_file = 'datos_parquet/steam_games.parquet'
pq.write_table(table, parquet_file, compression='snappy')

In [None]:
#sg = pd.read_parquet('datos_parquet/steam_games.parquet')
#sg