## Convertir .json a .parquet

In [1]:
import pandas as pd
import pickle
import ast
import json



# Accumulate one dictionary per successfully parsed line
data = []


# Each line of the file is parsed as an independent JSON document.
# Iterating over the handle streams the file lazily instead of
# materializing every line up front (as readlines() would), and the explicit
# encoding keeps decoding independent of the platform's default locale.
with open("data/steam_games.json", encoding="utf-8") as f:
    for line in f:
        try:
            row_dict = json.loads(line)  # Parse the JSON data in each line
            data.append(row_dict)

        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep loading the rest
            print(f"Skipping invalid JSON data: {line}")

In [2]:
# Build the tabular view: one row per parsed record
df = pd.DataFrame(data=data)

In [3]:
# List the column names found across the parsed records
df.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')

In [4]:
# Summarize the dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 11.9+ MB


In [5]:
# Preview the first rows (head() returns 5 rows by default)
df.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


In [6]:
# Keep only fully populated rows: a row is discarded as soon as ANY of its
# columns is missing. Reassigning (rather than mutating in place) keeps the
# step explicit.
df = df.dropna(axis=0, how='any')

In [7]:
# Re-check the summary after dropping the rows with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22530 entries, 88310 to 120443
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     22530 non-null  object
 1   genres        22530 non-null  object
 2   app_name      22530 non-null  object
 3   title         22530 non-null  object
 4   url           22530 non-null  object
 5   release_date  22530 non-null  object
 6   tags          22530 non-null  object
 7   reviews_url   22530 non-null  object
 8   specs         22530 non-null  object
 9   price         22530 non-null  object
 10  early_access  22530 non-null  object
 11  id            22530 non-null  object
 12  developer     22530 non-null  object
dtypes: object(13)
memory usage: 2.4+ MB


### Corregir el problema antes de guardar en formato .parquet
`Could not convert 'Free To Play' with type str: tried to convert to double`

Ahora 'Free To Play' se guarda como 0.

In [10]:
# Convert the 'price' column to a numeric dtype.
# Plain numbers are parsed directly. A text label that embeds a dollar amount
# (for example "... $4.99") keeps that amount instead of being silently
# zeroed. Any remaining non-numeric label (such as 'Free To Play') becomes 0.
price_numeric = pd.to_numeric(df['price'], errors='coerce')
price_embedded = pd.to_numeric(
    df['price']
    .astype(str)
    .str.extract(r'\$\s*(\d[\d,]*(?:\.\d+)?)', expand=False)
    .str.replace(',', '', regex=False),  # drop thousands separators
    errors='coerce',
)
df['price'] = price_numeric.fillna(price_embedded).fillna(0)

In [11]:
# Check the dtype of 'price' after the conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22530 entries, 88310 to 120443
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     22530 non-null  object 
 1   genres        22530 non-null  object 
 2   app_name      22530 non-null  object 
 3   title         22530 non-null  object 
 4   url           22530 non-null  object 
 5   release_date  22530 non-null  object 
 6   tags          22530 non-null  object 
 7   reviews_url   22530 non-null  object 
 8   specs         22530 non-null  object 
 9   price         22530 non-null  float64
 10  early_access  22530 non-null  object 
 11  id            22530 non-null  object 
 12  developer     22530 non-null  object 
dtypes: float64(1), object(12)
memory usage: 2.4+ MB


## Grabar dataframe a .parquet

In [12]:
# Destination of the Parquet export (a path relative to the working directory)
parquet_file_path = 'steam_games.parquet'

# Write the DataFrame to Parquet; index=False leaves the row index out of the file
df.to_parquet(parquet_file_path, index=False)

In [13]:
# Read the exported Parquet file back into a new DataFrame
dfp = pd.read_parquet('steam_games.parquet')

In [14]:
# Show 5 random rows; the fixed seed makes the same rows appear on every
# Restart & Run All instead of a different sample each time.
dfp.sample(5, random_state=42)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
17643,Lost Spell,"[Adventure, Indie]",Hidden: On the trail of the Ancients,Hidden: On the trail of the Ancients,http://store.steampowered.com/app/352580/Hidde...,2015-08-05,"[Adventure, Indie, Horror, Point & Click, Psyc...",http://steamcommunity.com/app/352580/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",9.99,False,352580,Lost Spell
8016,Blender Games,"[Action, Adventure, Casual, Indie]",Bitcoin Collector,Bitcoin Collector,http://store.steampowered.com/app/704190/Bitco...,2017-09-13,"[Difficult, Casual, Indie, Platformer, 2D, Sin...",http://steamcommunity.com/app/704190/reviews/?...,"[Single-player, Steam Achievements]",0.99,False,704190,Blender Games
14507,AGM PLAYISM,"[Action, Indie, RPG]",Dungeons & Darkness,Dungeons &amp; Darkness,http://store.steampowered.com/app/479990/Dunge...,2016-09-29,"[Action, RPG, Indie, Magic, Dungeon Crawler, D...",http://steamcommunity.com/app/479990/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",9.99,False,479990,Yamiuchi Project
927,Nyu Media,[Action],eXceed 2nd - Vampire REX,eXceed 2nd - Vampire REX,http://store.steampowered.com/app/207380/eXcee...,2012-08-02,"[Bullet Hell, Shoot 'Em Up, Anime, Action, Fem...",http://steamcommunity.com/app/207380/reviews/?...,"[Single-player, Steam Trading Cards]",5.99,False,207380,Tennen-sozai
1276,SEGA,[Action],Aliens: Colonial Marines Sawed-off Double Barr...,Aliens: Colonial Marines Sawed-off Double Barr...,http://store.steampowered.com/app/219447/Alien...,2013-05-28,[Action],http://steamcommunity.com/app/219447/reviews/?...,"[Single-player, Multi-player, Co-op, Downloada...",29.99,False,219447,Gearbox Software


In [1]:
# Wow, the file now weighs only 2.6 MB

In [15]:
# When only specific columns are needed, request just those from the file

df_parquet = pd.read_parquet(
    'steam_games.parquet',
    columns=['genres', 'release_date'],
)

In [16]:
# Preview the last 5 rows of the column subset
df_parquet.tail(5)

Unnamed: 0,genres,release_date
22525,"[Action, Adventure, Casual, Indie]",2018-01-04
22526,"[Casual, Indie, Simulation, Strategy]",2018-01-04
22527,"[Casual, Indie, Strategy]",2018-01-04
22528,"[Indie, Racing, Simulation]",2018-01-04
22529,"[Casual, Indie]",2017-09-02


In [17]:
# Summarize the frame that was read with a column subset
df_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22530 entries, 0 to 22529
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        22530 non-null  object
 1   release_date  22530 non-null  object
dtypes: object(2)
memory usage: 352.2+ KB
