In [26]:
import pandas as pd
import ast
import gzip
import os

In [27]:
def descompimir_json(ruta, variable_anidada,
                     columnas=('item_id', 'playtime_forever', 'playtime_2weeks'),
                     encoding='MacRoman'):
    """Load a gzip-compressed, line-delimited record file into a flat DataFrame.

    Every non-blank line of the file is parsed with ``ast.literal_eval`` (the
    lines are therefore Python literals, which need not be strict JSON) and
    becomes one record. The list-valued column ``variable_anidada`` is then
    exploded so that each list element gets its own row, and the requested
    keys of each element are flattened into columns next to ``user_id``.

    Parameters
    ----------
    ruta : str or path-like
        Path of the gzip file to read.
    variable_anidada : str
        Name of the column that holds a list of dicts for each record.
    columnas : sequence of str, optional
        Keys of the nested dicts to keep as columns. Defaults to the three
        columns this function has always returned.
    encoding : str, optional
        Text encoding used to decode the file. The default is kept for
        backward compatibility.
        NOTE(review): confirm that 'MacRoman' really is the source encoding.

    Returns
    -------
    pandas.DataFrame
        ``user_id`` followed by ``columnas``, one row per nested element,
        indexed 0..n-1.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank line is not a valid Python literal.
    KeyError
        If ``variable_anidada``, ``user_id`` or any of ``columnas`` is missing
        from the parsed data.
    """
    filas = []
    with gzip.open(ruta, 'rt', encoding=encoding) as archivo:
        # Stream the file line by line instead of loading it all with readlines().
        for linea in archivo:
            contenido = linea.strip()
            # Blank lines (e.g. a trailing newline at EOF) are not records;
            # feeding them to literal_eval would raise SyntaxError.
            if contenido:
                filas.append(ast.literal_eval(contenido))

    df = pd.DataFrame(filas)

    # One row per element of the nested list. The exploded frame repeats the
    # original index labels, so rebuild a clean 0..n-1 index right away; the
    # positional concat below relies on it.
    df = df.explode(variable_anidada).reset_index(drop=True)

    # Flatten the nested dicts and keep only the requested keys.
    data_plana = pd.json_normalize(df[variable_anidada])[list(columnas)]

    return pd.concat([df['user_id'], data_plana], axis=1)

In [28]:
# Join the path from its components: os.path.join on a single, already-joined
# string returns it unchanged, so the original call did nothing.
path_data = os.path.join('..', 'data', 'original', 'users_items.json.gz')
# Flatten the nested 'items' list into one row per element.
user_items = descompimir_json(path_data, 'items')

In [29]:
# Snapshot of the freshly loaded frame, taken before any cleaning step runs.
backup=user_items.copy()

In [30]:
# Restore point: uncomment the next line to reset user_items to the snapshot held in `backup`.
#user_items=backup.copy()

In [31]:
# Peek at the snapshot to confirm the flattened layout: the same user_id repeats once per item.
backup.head(3)

Unnamed: 0,user_id,item_id,playtime_forever,playtime_2weeks
0,76561197970982479,10,6.0,0.0
1,76561197970982479,20,0.0,0.0
2,76561197970982479,30,7.0,0.0


In [32]:
# Same preview on the working frame; no cleaning has run yet, so it matches the snapshot.
user_items.head(3)

Unnamed: 0,user_id,item_id,playtime_forever,playtime_2weeks
0,76561197970982479,10,6.0,0.0
1,76561197970982479,20,0.0,0.0
2,76561197970982479,30,7.0,0.0


In [33]:
# Check dtypes and row count before cleaning; item_id is still an object column at this point.
user_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170015 entries, 0 to 5170014
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   item_id           object 
 2   playtime_forever  float64
 3   playtime_2weeks   float64
dtypes: float64(2), object(2)
memory usage: 157.8+ MB


In [34]:
# Cast the item identifier from object to float64; a float column can hold
# the missing ids that are still present at this stage.
user_items = user_items.astype({'item_id': float})

In [35]:
# Confirm the cast: item_id should now be reported as float64.
user_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170015 entries, 0 to 5170014
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   item_id           float64
 2   playtime_forever  float64
 3   playtime_2weeks   float64
dtypes: float64(3), object(1)
memory usage: 157.8+ MB


In [36]:
def _one_decimal(value):
    """Render a float with exactly one digit after the decimal point."""
    return '%.1f' % value


# Show floats with one decimal in every table rendered from here on.
pd.set_option('display.float_format', _one_decimal)

In [37]:
# Summary statistics of the numeric columns, with the playtimes still in their raw units.
user_items.describe()

Unnamed: 0,item_id,playtime_forever,playtime_2weeks
count,5153209.0,5153209.0,5153209.0
mean,178448.1,991.5,9.1
std,131859.8,5418.2,140.4
min,10.0,0.0,0.0
25%,34460.0,0.0,0.0
50%,214420.0,34.0,0.0
75%,266430.0,355.0,0.0
max,530720.0,642773.0,19967.0


In [38]:
# Scale both playtime columns down by 60 (presumably minutes -> hours; confirm
# against the data dictionary). Not idempotent: re-running this cell divides again.
playtime_cols = ['playtime_forever', 'playtime_2weeks']
user_items[playtime_cols] = user_items[playtime_cols].div(60)

In [39]:
# Re-run the summary to verify that the playtime columns were scaled down by 60.
user_items.describe()

Unnamed: 0,item_id,playtime_forever,playtime_2weeks
count,5153209.0,5153209.0,5153209.0
mean,178448.1,16.5,0.2
std,131859.8,90.3,2.3
min,10.0,0.0,0.0
25%,34460.0,0.0,0.0
50%,214420.0,0.6,0.0
75%,266430.0,5.9,0.0
max,530720.0,10712.9,332.8


In [40]:
# Count missing values per column to decide which rows have to be dropped.
user_items.isna().sum()

user_id                 0
item_id             16806
playtime_forever    16806
playtime_2weeks     16806
dtype: int64

In [41]:
# Discard rows without a playtime_forever value; the surviving rows keep
# their original index labels.
user_items = user_items.dropna(subset=['playtime_forever'])

In [42]:
# Confirm the row count after dropping incomplete rows; the index keeps its original labels,
# so it is no longer reported as a RangeIndex.
user_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5153209 entries, 0 to 5170013
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   item_id           float64
 2   playtime_forever  float64
 3   playtime_2weeks   float64
dtypes: float64(3), object(1)
memory usage: 196.6+ MB


In [43]:
# Verify that no missing values remain in any column.
user_items.isna().sum()

user_id             0
item_id             0
playtime_forever    0
playtime_2weeks     0
dtype: int64

In [44]:
# Keep only the first occurrence of each (user, item, total playtime) triple.
dedup_keys = ['user_id', 'item_id', 'playtime_forever']
user_items = user_items.drop_duplicates(subset=dedup_keys)

In [45]:
# Export the cleaned users_items data as a gzip-compressed CSV containing only
# the basic columns needed downstream; the index is not written.
user_items.to_csv(
    '../data/limpio/users_items.csv.gz',
    index=False,
    encoding='utf-8',
    compression='gzip',
)