In [3]:
import pandas as pd
import ast
import gzip

## Ingesta de datos (Extracción)

In [4]:
# Path to the gzip-compressed users/items file
file_path = '../Dataset/users_items.json.gz'
data = []

# Stream the file in text mode and parse one record per line
with gzip.open(file_path, 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            # Each line is parsed with ast.literal_eval into a Python object
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not only ValueError) on
            # malformed text; report the offending line and keep loading
            # instead of aborting the whole ingestion.
            print(f"Error en la línea: {line}")
        else:
            data.append(json_data)

# Build a DataFrame from the list of parsed records
df_items = pd.DataFrame(data)
df_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [5]:
# Display the (rows, columns) dimensions of df_items.
df_items.shape

(88310, 5)

Se verifica si hay registros nulos

In [6]:
# Count the missing (NaN/None) values in each column of df_items.
df_items.isna().sum()

user_id        0
items_count    0
steam_id       0
user_url       0
items          0
dtype: int64

Se eliminan columnas que no son necesarias para la rúbrica

In [7]:
# Drop the columns that are not needed downstream.
# The result is rebound (no inplace mutation) and missing labels are ignored,
# so re-running this cell on an already-trimmed frame does not raise KeyError.
df_items = df_items.drop(columns=['steam_id', 'user_url'], errors='ignore')

In [8]:
# Show the first row of df_items as a quick look at its current columns.
df_items.head(1)

Unnamed: 0,user_id,items_count,items
0,76561197970982479,277,"[{'item_id': '10', 'item_name': 'Counter-Strik..."


Se expanden los valores de la columna 'items'

In [9]:
# Turn every element of the 'items' list into its own row, then renumber the
# rows so the index is a clean 0..n-1 range.
df_explode = df_items.explode('items')
df_explode = df_explode.reset_index(drop=True)

# Flatten each item dictionary into one column per key.
items = pd.json_normalize(df_explode['items'])
items = items.reset_index(drop=True)

# Put the per-user columns next to the flattened item columns; both frames
# share the same positional index, so rows line up one-to-one.
df_items_full = pd.concat(
    [df_explode.drop(columns=['items']), items],
    axis=1,
)
df_items_full.shape


(5170015, 6)

Se eliminan columnas innecesarias

In [10]:
# Drop the item identifier columns.
# The result is rebound (no inplace mutation) and missing labels are ignored,
# so re-running this cell on an already-trimmed frame does not raise KeyError.
df_items_full = df_items_full.drop(columns=['item_id', 'item_name'], errors='ignore')
df_items_full.head(2)

Unnamed: 0,user_id,items_count,playtime_forever,playtime_2weeks
0,76561197970982479,277,6.0,0.0
1,76561197970982479,277,0.0,0.0


Se eliminan los duplicados


In [11]:
# Keep only the first occurrence of each fully identical row.
# NOTE(review): rows are compared on every remaining column; if the item
# identifier columns were dropped upstream, different items with the same
# playtime values for one user collapse into a single row — confirm that
# this is the intended result.
df_items_ok = df_items_full[~df_items_full.duplicated(keep='first')]
df_items_ok

Unnamed: 0,user_id,items_count,playtime_forever,playtime_2weeks
0,76561197970982479,277,6.0,0.0
1,76561197970982479,277,0.0,0.0
2,76561197970982479,277,7.0,0.0
8,76561197970982479,277,4733.0,0.0
9,76561197970982479,277,1853.0,0.0
...,...,...,...,...
5170008,76561198329548331,7,43.0,43.0
5170009,76561198329548331,7,0.0,0.0
5170011,76561198329548331,7,3.0,3.0
5170012,76561198329548331,7,4.0,4.0


Se exporta el dataset en formato parquet para su posterior consumo en la API

In [12]:
# Write the de-duplicated frame to a parquet file.
# NOTE(review): the index is not reset before writing, so with the default
# `index` setting pandas will presumably store the non-contiguous index in the
# file as well — consider `index=False` if consumers do not need it.
df_items_ok.to_parquet('../Dataset/user_items.parquet')