In [1]:
import pandas as pd 
import json
import gzip
import ast
import re 
import pyarrow as pa
import pyarrow.parquet as pq 

In [2]:
row = []  # one parsed record (dict) per non-empty line of the raw file

with open('datasets_json/australian_users_items.json', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline at EOF); literal_eval would raise on them.
        if not line.strip():
            continue
        # Each line is parsed as a Python literal with ast.literal_eval. The result is
        # already plain Python containers, so the former json.dumps -> json.loads
        # round-trip was redundant work on every line and has been removed.
        row.append(ast.literal_eval(line))

# Build a DataFrame from the list of parsed records and preview it
items = pd.DataFrame(row)
items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [3]:
# (rows, columns) of the frame built from the raw records
items.shape

(88310, 5)

TRANSFORMACIÓN DEL DATASET

In [4]:
# The 'items' column holds nested lists of dicts. json_normalize uses 'items' as the
# record path, so the keys of those dicts become individual columns, while the
# fields listed as meta are carried alongside each record.
user_fields = ['steam_id', 'items_count', 'user_id', 'user_url']

items_normalized = pd.json_normalize(row, record_path='items', meta=user_fields)
items_normalized.head()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,steam_id,items_count,user_id,user_url
0,10,Counter-Strike,6,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,20,Team Fortress Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,30,Day of Defeat,7,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
3,40,Deathmatch Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
4,50,Half-Life: Opposing Force,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...


In [5]:
# For every column, collect the distinct Python types found among its values
nombres_columnas = list(items_normalized.columns)

tipo_data = {
    "columna": nombres_columnas,
    "tipos_de_datos": [
        items_normalized[nombre].apply(type).unique() for nombre in nombres_columnas
    ],
}

# Summarise the survey as a small DataFrame (one row per column)
analisis_dtype = pd.DataFrame(tipo_data)
analisis_dtype

Unnamed: 0,columna,tipos_de_datos
0,item_id,[<class 'str'>]
1,item_name,[<class 'str'>]
2,playtime_forever,[<class 'int'>]
3,playtime_2weeks,[<class 'int'>]
4,steam_id,[<class 'str'>]
5,items_count,[<class 'int'>]
6,user_id,[<class 'str'>]
7,user_url,[<class 'str'>]


In [6]:
# Column dtypes, row count and memory usage of the normalized frame
items_normalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5153209 entries, 0 to 5153208
Data columns (total 8 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   item_id           object
 1   item_name         object
 2   playtime_forever  int64 
 3   playtime_2weeks   int64 
 4   steam_id          object
 5   items_count       object
 6   user_id           object
 7   user_url          object
dtypes: int64(2), object(6)
memory usage: 314.5+ MB


In [7]:
# With the 'items' column un-nested, nulls and duplicates can be handled
# without damaging any data of interest. First, list the rows flagged as duplicates.
mascara_duplicados = items_normalized.duplicated()
duplicates = items_normalized[mascara_duplicados]
duplicates

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,steam_id,items_count,user_id,user_url
164294,20,Team Fortress Classic,5,0,76561198084006094,109,Nikiad,http://steamcommunity.com/id/Nikiad
164295,50,Half-Life: Opposing Force,0,0,76561198084006094,109,Nikiad,http://steamcommunity.com/id/Nikiad
164296,70,Half-Life,0,0,76561198084006094,109,Nikiad,http://steamcommunity.com/id/Nikiad
164297,130,Half-Life: Blue Shift,0,0,76561198084006094,109,Nikiad,http://steamcommunity.com/id/Nikiad
164298,220,Half-Life 2,198,0,76561198084006094,109,Nikiad,http://steamcommunity.com/id/Nikiad
...,...,...,...,...,...,...,...,...
4898223,213670,South Park™: The Stick of Truth™,725,0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...
4898224,221910,The Stanley Parable,53,0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...
4898225,261030,The Walking Dead: Season Two,253,0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...
4898226,273110,Counter-Strike Nexon: Zombies,0,0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...


In [8]:
# Remove the duplicated rows; drop_duplicates keeps the first occurrence of each
# duplicated row by default (keep='first').
items_normalized = items_normalized.drop_duplicates()
items_normalized

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,steam_id,items_count,user_id,user_url
0,10,Counter-Strike,6,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,20,Team Fortress Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,30,Day of Defeat,7,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
3,40,Deathmatch Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
4,50,Half-Life: Opposing Force,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
...,...,...,...,...,...,...,...,...
5153204,346330,BrainBread 2,0,0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5153205,373330,All Is Dust,0,0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5153206,388490,One Way To Die: Steam Edition,3,3,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5153207,521570,You Have 10 Seconds 2,4,4,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...


In [9]:
items_normalized.isna().sum() # After removing the duplicated rows, count null values per column to confirm there are none.

item_id             0
item_name           0
playtime_forever    0
playtime_2weeks     0
steam_id            0
items_count         0
user_id             0
user_url            0
dtype: int64

Una vez que eliminamos los duplicados y comprobamos que no hay valores nulos en el dataframe, proseguimos a trabajar con la columna 'playtime_forever', ya que uno de los endpoints está directamente relacionado con ella. En principio vemos que el dataframe contiene más de 5 millones de registros, lo cual hace que los tiempos de carga sean largos; por este motivo vamos a eliminar los registros cuyo valor en la columna 'playtime_forever' sea igual a 0.

In [10]:
# Frequency of each 'playtime_forever' value, to see how many rows have 0
items_normalized['playtime_forever'].value_counts()

playtime_forever
0        1847730
1         101586
2          34391
3          31530
4          29127
          ...   
76541          1
64676          1
53328          1
44029          1
34753          1
Name: count, Length: 48861, dtype: int64

In [11]:
# Keep only the rows whose 'playtime_forever' is different from 0,
# then rebuild the index so it runs consecutively from 0 again.
con_tiempo_jugado = items_normalized['playtime_forever'] != 0
items_normalized = items_normalized.loc[con_tiempo_jugado].reset_index(drop=True)

In [12]:
# Same frequency table after filtering, to confirm the 0 value is gone
items_normalized['playtime_forever'].value_counts()

playtime_forever
1        101586
2         34391
3         31530
4         29127
5         27612
          ...  
76541         1
64676         1
53328         1
44029         1
34753         1
Name: count, Length: 48860, dtype: int64

Teniendo en cuenta que la columna 'playtime_forever' está en minutos, la convertiremos a horas y guardaremos el resultado en una nueva columna llamada 'playtime_hours'.

In [15]:
# Add 'playtime_hours': 'playtime_forever' divided by 60, rounded to 2 decimals
horas_jugadas = items_normalized['playtime_forever'] / 60
items_normalized['playtime_hours'] = horas_jugadas.round(2)

In [16]:
items_normalized['playtime_hours'].value_counts() # frequency of each hours-played value

playtime_hours
0.02       101586
0.03        34391
0.05        31530
0.07        29127
0.08        27612
            ...  
1275.68         1
1077.93         1
888.80          1
733.82          1
579.22          1
Name: count, Length: 48860, dtype: int64

In [17]:
# Re-check columns and dtypes now that 'playtime_hours' has been added
items_normalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3246375 entries, 0 to 3246374
Data columns (total 9 columns):
 #   Column            Dtype  
---  ------            -----  
 0   item_id           object 
 1   item_name         object 
 2   playtime_forever  int64  
 3   playtime_2weeks   int64  
 4   steam_id          object 
 5   items_count       object 
 6   user_id           object 
 7   user_url          object 
 8   playtime_hours    float64
dtypes: float64(1), int64(2), object(6)
memory usage: 222.9+ MB


Una vez que tenemos la columna 'playtime_hours', se procede a eliminar las columnas 'playtime_forever' y 'playtime_2weeks' ya que son innecesarias.

In [18]:
# Drop the columns that are no longer needed. The two raw playtime columns are
# superseded by 'playtime_hours'. 'user_url' is dropped here as well: the recorded
# output of the following info() cell lists 6 columns without 'user_url', but no
# visible cell removed it, so a fresh "Restart & Run All" produced a different
# frame (and different CSV/Parquet files) than the one documented below.
columnas_innecesarias = ['playtime_2weeks', 'playtime_forever', 'user_url']
items_normalized = items_normalized.drop(columnas_innecesarias, axis=1)

In [25]:
items_normalized.info() # confirm that the columns were removed successfully

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3246375 entries, 0 to 3246374
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   item_id         object 
 1   item_name       object 
 2   steam_id        object 
 3   items_count     object 
 4   user_id         object 
 5   playtime_hours  float64
dtypes: float64(1), object(5)
memory usage: 148.6+ MB


In [26]:
# (rows, columns) after filtering rows and dropping columns
items_normalized.shape

(3246375, 6)

Como podemos observar, pasamos de 5 millones de registros x 8 columnas a 3.2 millones x 6 columnas. Esto reduce considerablemente el tamaño de nuestro dataframe.

PROCEDEMOS A GUARDAR EL DATAFRAME COMO CSV


In [27]:
# Write the DataFrame to CSV; index=False prevents the index from being
# written as an additional column.
items_normalized.to_csv(
    'dataset_limpio/items_normalized.csv',
    index=False,
    encoding='utf-8',
)

AHORA CONVERTIMOS EL DATAFRAME.CSV A FORMATO PARQUET

In [28]:
# Read the CSV file back. 'user_id' contains both all-digit ids and text handles,
# so its dtype is pinned to str: with read_csv's default chunked type inference an
# all-digit chunk could be parsed as integers, leaving a mixed int/str object
# column that pyarrow cannot convert to a single Arrow type.
# NOTE(review): the other columns are still re-inferred from the CSV text, so
# values held as strings in memory (e.g. 'item_id') may come back as integers.
items_normalized = pd.read_csv(
    "dataset_limpio/items_normalized.csv",
    dtype={"user_id": str},
)

tabla = pa.Table.from_pandas(items_normalized)  # convert the DataFrame into a pyarrow Table
pq.write_table(tabla, "dataset_limpio/items_normalized.parquet")  # write the Table to a Parquet file