## Convertir .json a .parquet

In [11]:
import pandas as pd
import pickle
import ast
import json


# Parse the Steam games dump line by line (each line is treated as one JSON
# document) and collect the resulting dicts for a single DataFrame build.
data = []

# Pin the encoding: without it open() falls back to the locale's default codec,
# so titles containing characters such as "®" or "™" could decode differently
# (or fail) depending on the machine running the notebook.
with open("data/steam_games.json", encoding="utf-8") as f:
    # Iterate the file object directly so lines are streamed instead of the
    # whole file being materialised as a list by readlines().
    for line in f:
        try:
            row_dict = json.loads(line)  # Parse the JSON data in this line
        except json.JSONDecodeError:
            # Best effort: report the malformed line and keep going.
            print(f"Skipping invalid JSON data: {line}")
        else:
            data.append(row_dict)

# Build the DataFrame once from the accumulated records; a key missing from a
# record shows up as NaN in that row.
df = pd.DataFrame(data)


In [12]:
# Print the plain-text repr of the frame; pandas elides the middle rows and wraps wide column sets.
# NOTE(review): a bare `df` (or `df.head()`) as the last expression would use the notebook's rich table display instead.
print(df)

              publisher                                 genres  \
0                   NaN                                    NaN   
1                   NaN                                    NaN   
2                   NaN                                    NaN   
3                   NaN                                    NaN   
4                   NaN                                    NaN   
...                 ...                                    ...   
120440  Ghost_RUS Games  [Casual, Indie, Simulation, Strategy]   
120441           Sacada              [Casual, Indie, Strategy]   
120442     Laush Studio            [Indie, Racing, Simulation]   
120443         SIXNAILS                        [Casual, Indie]   
120444              NaN                                    NaN   

                        app_name                     title  \
0                            NaN                       NaN   
1                            NaN                       NaN   
2                    

In [13]:
# List the column labels, which come from the keys of the parsed JSON records.
df.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')

In [14]:
# Summarise row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 11.9+ MB


## Grabar dataframe a .parquet

In [17]:
# Drop the 'price' column before writing.
# NOTE(review): presumably its values cannot be serialised to Parquet as-is -- TODO confirm.
# errors='ignore' keeps this cell idempotent: `df` is rebound here, so running
# the cell a second time would otherwise raise KeyError ('price' already gone).
df = df.drop(columns='price', errors='ignore')

# Output path of the Parquet file, relative to the working directory
parquet_file_path = 'steam_games.parquet'

# Write the frame without its index (a plain RangeIndex, so nothing is lost)
df.to_parquet(parquet_file_path, index=False)

In [19]:
# Read the Parquet file back into a separate DataFrame.
dfp = pd.read_parquet('steam_games.parquet')

In [24]:
# Peek at five random rows of the reloaded frame. The fixed random_state makes
# the same rows come back on every run, so the displayed output is reproducible.
dfp.sample(5, random_state=42)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,early_access,id,developer
64545,,,,,,,,,,,,
41025,,,,,,,,,,,,
22090,,,,,,,,,,,,
99790,Bethesda Softworks,[Action],Dishonored®: Death of the Outsider™,Dishonored®: Death of the Outsider™,http://store.steampowered.com/app/614570/Disho...,2017-09-14,"[Stealth, Action, Violent, First-Person, Femal...",http://steamcommunity.com/app/614570/reviews/?...,"[Single-player, Steam Achievements, Full contr...",False,614570.0,Arkane Studios
75856,,,,,,,,,,,,


In [25]:
# The file is only about 3.4 MB on disk; note that the 'price' column was dropped before writing.
!ls -GFlash steam_games.parquet

3.4M -rw-rw-r-- 1 krelar 3.4M Sep  5 03:03 steam_games.parquet


In [30]:
# When only specific columns are needed, request just those via `columns`.
df_parquet = pd.read_parquet(
    'steam_games.parquet',
    columns=['genres', 'release_date'],
)

In [28]:
# Show the last rows of the two-column frame.
df_parquet.tail()

Unnamed: 0,genres,release_date
120440,"[Casual, Indie, Simulation, Strategy]",2018-01-04
120441,"[Casual, Indie, Strategy]",2018-01-04
120442,"[Indie, Racing, Simulation]",2018-01-04
120443,"[Casual, Indie]",2017-09-02
120444,,


In [34]:
# Summarise the column-subset frame: row count, non-null counts, dtypes and memory usage.
df_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        28852 non-null  object
 1   release_date  30068 non-null  object
dtypes: object(2)
memory usage: 1.8+ MB
