In [2]:
import pandas as pd
import ast
import gzip
import os

In [3]:
def descompimir_json(ruta, variable_anidada):
    """Load a gzip-compressed file of nested records into a flat DataFrame.

    The file is read as MacRoman-encoded text. Every non-blank line is parsed
    with ``ast.literal_eval`` (so each line must be a Python literal, typically
    a dict) and becomes one row. The column ``variable_anidada`` is then
    exploded so that each of its elements gets its own row, and the fields
    ``item_id``, ``playtime_forever`` and ``playtime_2weeks`` of those elements
    are flattened into columns next to ``user_id``.

    Parameters
    ----------
    ruta : str or path-like
        Path to the gzip-compressed input file.
    variable_anidada : str
        Name of the column that holds the nested list to explode.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id``, ``playtime_forever`` and
        ``playtime_2weeks``, one row per exploded element, with a fresh
        0..n-1 index.
    """
    filas = []
    with gzip.open(ruta, 'rt', encoding='MacRoman') as archivo:
        # Stream the file line by line instead of materialising it with
        # readlines(); the input is large and only one line is needed at a time.
        for linea in archivo:
            # Blank lines (e.g. a trailing newline) are not valid literals and
            # would make literal_eval raise SyntaxError, so skip them.
            if not linea.strip():
                continue
            filas.append(ast.literal_eval(linea))

    df = pd.DataFrame(filas)

    # One row per element of the nested column; drop=True discards the
    # repeated pre-explode index instead of adding it as a column.
    df = df.explode(variable_anidada).reset_index(drop=True)

    # Flatten the nested dicts and keep only the fields of interest.
    data_plana = pd.json_normalize(df[variable_anidada])[
        ['item_id', 'playtime_forever', 'playtime_2weeks']
    ]

    # Both pieces share the same 0..n-1 index, so a column-wise concat lines
    # each user_id up with its item fields.
    return pd.concat([df['user_id'], data_plana], axis=1)

In [4]:
# Read the raw gzip-compressed dump and flatten its nested 'items' column.
path_data = '../data/original/users_items.json.gz'
user_items = descompimir_json(ruta=path_data, variable_anidada='items')

In [5]:
# Keep an independent copy of the freshly loaded frame so later cleaning
# steps can be undone without re-reading the source file.
backup = user_items.copy(deep=True)

In [6]:
# Uncomment to restore the freshly loaded frame from the backup copy:
# user_items = backup.copy()

In [7]:
# Preview the first three rows of the flattened frame.
user_items.head(3)

Unnamed: 0,user_id,item_id,playtime_forever,playtime_2weeks
0,76561197970982479,10,6.0,0.0
1,76561197970982479,20,0.0,0.0
2,76561197970982479,30,7.0,0.0


In [8]:
# Inspect the column dtypes and memory footprint of the loaded frame.
user_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170015 entries, 0 to 5170014
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   item_id           object 
 2   playtime_forever  float64
 3   playtime_2weeks   float64
dtypes: float64(2), object(2)
memory usage: 157.8+ MB


In [9]:
# Cast item_id from object to float64. The column still holds missing values
# at this point (see the null counts further down), which a plain integer
# dtype cannot represent.
user_items = user_items.astype({'item_id': 'float64'})

In [10]:
# Re-check the dtypes after casting item_id.
user_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170015 entries, 0 to 5170014
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   item_id           float64
 2   playtime_forever  float64
 3   playtime_2weeks   float64
dtypes: float64(3), object(1)
memory usage: 157.8+ MB


In [11]:
# Render floats with a single decimal place in displayed frames.
pd.set_option('display.float_format', '{:.1f}'.format)

In [12]:
# Summary statistics of the numeric columns.
user_items.describe()

Unnamed: 0,item_id,playtime_forever,playtime_2weeks
count,5153209.0,5153209.0,5153209.0
mean,178448.1,991.5,9.1
std,131859.8,5418.2,140.4
min,10.0,0.0,0.0
25%,34460.0,0.0,0.0
50%,214420.0,34.0,0.0
75%,266430.0,355.0,0.0
max,530720.0,642773.0,19967.0


In [13]:
# Divide both playtime columns by 60 (presumably minutes -> hours; the later
# cleaning steps treat the result as hours).
# NOTE: re-running this cell divides the values again.
cols_playtime = ['playtime_forever', 'playtime_2weeks']
user_items[cols_playtime] = user_items[cols_playtime].div(60)

In [14]:
# Summary statistics again, now on the rescaled playtime columns.
user_items.describe()

Unnamed: 0,item_id,playtime_forever,playtime_2weeks
count,5153209.0,5153209.0,5153209.0
mean,178448.1,16.5,0.2
std,131859.8,90.3,2.3
min,10.0,0.0,0.0
25%,34460.0,0.0,0.0
50%,214420.0,0.6,0.0
75%,266430.0,5.9,0.0
max,530720.0,10712.9,332.8


In [15]:
# Count the missing values in each column.
user_items.isna().sum()

user_id                 0
item_id             16806
playtime_forever    16806
playtime_2weeks     16806
dtype: int64

In [16]:
# Discard the rows that have no playtime_forever value.
user_items = user_items.dropna(subset=['playtime_forever'])

In [17]:
# Confirm the row count and dtypes after removing rows with missing playtime.
user_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5153209 entries, 0 to 5170013
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   item_id           float64
 2   playtime_forever  float64
 3   playtime_2weeks   float64
dtypes: float64(3), object(1)
memory usage: 196.6+ MB


In [18]:
# Verify that no missing values remain.
user_items.isna().sum()

user_id             0
item_id             0
playtime_forever    0
playtime_2weeks     0
dtype: int64

In [19]:
# Keep a single row per (user_id, item_id, playtime_forever) combination.
claves_unicas = ['user_id', 'item_id', 'playtime_forever']
user_items = user_items.drop_duplicates(subset=claves_unicas)

In [20]:
# Many user records have 0 hours played; they are deleted afterwards because
# they are of no use for the analysis.
sin_juego = user_items['playtime_forever'].eq(0)
user_items.loc[sin_juego]

Unnamed: 0,user_id,item_id,playtime_forever,playtime_2weeks
1,76561197970982479,20.0,0.0,0.0
3,76561197970982479,40.0,0.0,0.0
4,76561197970982479,50.0,0.0,0.0
5,76561197970982479,60.0,0.0,0.0
6,76561197970982479,70.0,0.0,0.0
...,...,...,...,...
5170003,76561198326700687,519170.0,0.0,0.0
5170004,76561198326700687,358390.0,0.0,0.0
5170005,76561198326700687,521570.0,0.0,0.0
5170009,76561198329548331,346330.0,0.0,0.0


In [21]:
# Keep only the records with a strictly positive total playtime.
user_items = user_items.loc[user_items['playtime_forever'].gt(0)]

In [22]:
# Render floats with ten decimal places so small fractional playtimes stay
# distinguishable in displayed output.
pd.set_option('display.float_format', '{:.10f}'.format)

In [41]:
# Ten most frequent playtime_forever values.
user_items['playtime_forever'].value_counts().head(10)

playtime_forever
0.0166666667    101586
0.0333333333     34391
0.0500000000     31530
0.0666666667     29127
0.0833333333     27612
0.1000000000     26233
0.1166666667     25051
0.1333333333     23961
0.1500000000     22532
0.1666666667     21946
Name: count, dtype: int64

In [39]:
# Review the (user, game) pairs with less than 1 hour played; they are deleted
# afterwards because they are considered insignificant for the analysis.
(
    user_items
    .loc[user_items['playtime_forever'] < 1, ['user_id', 'item_id', 'playtime_forever']]
    .groupby(['user_id', 'item_id'])
    .sum()
    .sort_values('playtime_forever')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,playtime_forever
user_id,item_id,Unnamed: 2_level_1
lukutis,22120.0000000000,0.0166666667
mattcolt,238070.0000000000,0.0166666667
GungeonRighteer,39120.0000000000,0.0166666667
GungeonRighteer,248570.0000000000,0.0166666667
76561198035011178,33910.0000000000,0.0166666667
...,...,...
76561198002557471,313630.0000000000,0.9833333333
Sparon,265630.0000000000,0.9833333333
76561198073794421,282440.0000000000,0.9833333333
76561198039700455,239160.0000000000,0.9833333333


In [42]:
# Display the current state of user_items (pandas truncates large frames).
user_items

Unnamed: 0,user_id,item_id,playtime_forever,playtime_2weeks
0,76561197970982479,10.0000000000,0.1000000000,0.0000000000
2,76561197970982479,30.0000000000,0.1166666667,0.0000000000
8,76561197970982479,300.0000000000,78.8833333333,0.0000000000
9,76561197970982479,240.0000000000,30.8833333333,0.0000000000
10,76561197970982479,3830.0000000000,5.5500000000,0.0000000000
...,...,...,...,...
5170007,76561198329548331,304930.0000000000,11.2833333333,11.2833333333
5170008,76561198329548331,227940.0000000000,0.7166666667,0.7166666667
5170011,76561198329548331,388490.0000000000,0.0500000000,0.0500000000
5170012,76561198329548331,521570.0000000000,0.0666666667,0.0666666667


In [45]:
# Delete the records with less than 1 hour of total playtime. The comparison
# is >= so that records with exactly 1 hour are kept: only playtimes strictly
# below 1 hour are meant to be removed.
user_items = user_items[user_items['playtime_forever'] >= 1]

In [46]:
# Display the frame after applying the minimum-playtime filter.
user_items

Unnamed: 0,user_id,item_id,playtime_forever,playtime_2weeks
8,76561197970982479,300.0000000000,78.8833333333,0.0000000000
9,76561197970982479,240.0000000000,30.8833333333,0.0000000000
10,76561197970982479,3830.0000000000,5.5500000000,0.0000000000
11,76561197970982479,2630.0000000000,1.2500000000,0.0000000000
12,76561197970982479,3900.0000000000,5.6333333333,0.0000000000
...,...,...,...,...
5169481,76561198320038728,442080.0000000000,2.5500000000,0.0000000000
5169797,76561198320136420,273350.0000000000,1.7166666667,0.0000000000
5169804,ArkPlays7,730.0000000000,75.9500000000,28.3000000000
5169805,ArkPlays7,346110.0000000000,10.3833333333,0.0000000000


In [47]:
# Export the cleaned users_items data as a gzip-compressed CSV containing only
# the basic fields needed later.
ruta_salida = '../data/limpio/users_items.csv.gz'
# Create the output folder if it is missing, so the export does not fail with
# FileNotFoundError after the whole cleaning pipeline has already run.
os.makedirs(os.path.dirname(ruta_salida), exist_ok=True)
with gzip.open(ruta_salida, 'wb') as file:
    user_items.to_csv(file, index=False, encoding='utf-8')