In [20]:

import pandas as pd
import numpy as np
import ast
import gzip
import json
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re

# CARGA DE DATOS

* Carga del json comprimido

In [21]:
def descompimir_json(ruta, variable_anidada, encoding='MacRoman'):
    """Load a gzip-compressed, line-delimited file of records into a flat DataFrame.

    Every non-blank line of the file is parsed with ``ast.literal_eval``, so each
    line must be a valid Python literal (a dict per line). The column
    ``variable_anidada`` is then exploded to one row per element and every
    element (a dict) is expanded into its own columns.

    Parameters
    ----------
    ruta : str or path-like
        Path to the gzip-compressed file.
    variable_anidada : str
        Name of the column holding the nested list of dicts to flatten.
    encoding : str, default 'MacRoman'
        Text encoding used to decode the file. The default preserves the
        value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per element of the nested column; the nested column itself
        is replaced by the columns obtained from its dicts.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank line is not a valid Python literal.
    """
    with gzip.open(ruta, 'rt', encoding=encoding) as archivo:
        # Iterate the handle lazily (no readlines()) so the whole decompressed
        # text is not held in memory next to the parsed records. Blank lines
        # are skipped because literal_eval would raise on them.
        filas = [ast.literal_eval(linea) for linea in archivo if linea.strip()]

    # drop=True discards the old index directly; it also avoids a failure when
    # the data already contains a column named "index".
    df = pd.DataFrame(filas).explode(variable_anidada).reset_index(drop=True)

    # Expand each nested dict into columns and align them row by row with the
    # exploded frame (both share the fresh 0..n-1 index).
    df = pd.concat([df, pd.json_normalize(df[variable_anidada])], axis=1)

    return df.drop(columns=variable_anidada)

In [22]:
# Load the compressed users/items file, flattening the nested 'items' column.
user_items = descompimir_json(ruta="../Datasets/users_items.json.gz", variable_anidada='items')

In [23]:
# Work on a copy of 'user_items' so the file does not have to be decompressed
# again every time the cells below are re-run.
user_items_cleaned = user_items.copy(deep=True)

In [24]:
# Preview the working copy; as the last expression of the cell it is rendered as the cell output.
user_items_cleaned

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,50,Half-Life: Opposing Force,0.0,0.0
...,...,...,...,...,...,...,...,...
5170010,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,373330,All Is Dust,0.0,0.0
5170011,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,388490,One Way To Die: Steam Edition,3.0,3.0
5170012,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,521570,You Have 10 Seconds 2,4.0,4.0
5170013,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,519140,Minds Eyes,3.0,3.0


In [25]:
# Count the missing values in each column.
user_items_cleaned.isna().sum()

user_id                 0
items_count             0
steam_id                0
user_url                0
item_id             16806
item_name           16806
playtime_forever    16806
playtime_2weeks     16806
dtype: int64

* Todos los valores nulos se refieren a las mismas filas, por lo que serán eliminados.

In [26]:
# Drop every row that contains at least one missing value. The result is
# reassigned rather than mutated with inplace=True, matching the reassignment
# style used by the other transformation cells.
user_items_cleaned = user_items_cleaned.dropna()

In [27]:
# Inspect the frame again after the rows with missing values were dropped.
user_items_cleaned

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,50,Half-Life: Opposing Force,0.0,0.0
...,...,...,...,...,...,...,...,...
5170009,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,346330,BrainBread 2,0.0,0.0
5170010,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,373330,All Is Dust,0.0,0.0
5170011,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,388490,One Way To Die: Steam Edition,3.0,3.0
5170012,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,521570,You Have 10 Seconds 2,4.0,4.0


* Elimino columnas irrelevantes para el análisis

In [28]:
# Remove the columns that are not needed further on.
user_items_cleaned = user_items_cleaned.drop(
    ['user_url', 'playtime_2weeks', 'steam_id', 'items_count'],
    axis='columns',
)

In [29]:
# Cast the values of the 'item_id' column to float.
user_items_cleaned = user_items_cleaned.astype({'item_id': float})

In [30]:
# Inspect the frame after the column drop and the 'item_id' cast.
user_items_cleaned

Unnamed: 0,user_id,item_id,item_name,playtime_forever
0,76561197970982479,10.0,Counter-Strike,6.0
1,76561197970982479,20.0,Team Fortress Classic,0.0
2,76561197970982479,30.0,Day of Defeat,7.0
3,76561197970982479,40.0,Deathmatch Classic,0.0
4,76561197970982479,50.0,Half-Life: Opposing Force,0.0
...,...,...,...,...
5170009,76561198329548331,346330.0,BrainBread 2,0.0
5170010,76561198329548331,373330.0,All Is Dust,0.0
5170011,76561198329548331,388490.0,One Way To Die: Steam Edition,3.0
5170012,76561198329548331,521570.0,You Have 10 Seconds 2,4.0


* Convierto la columna 'playtime_forever' de minutos a horas

In [31]:
# Divide 'playtime_forever' by 60 (minutes -> hours, per the note above this cell).
user_items_cleaned['playtime_forever'] = user_items_cleaned['playtime_forever'].div(60)

In [32]:
# Inspect the frame after 'playtime_forever' was divided by 60.
user_items_cleaned

Unnamed: 0,user_id,item_id,item_name,playtime_forever
0,76561197970982479,10.0,Counter-Strike,0.100000
1,76561197970982479,20.0,Team Fortress Classic,0.000000
2,76561197970982479,30.0,Day of Defeat,0.116667
3,76561197970982479,40.0,Deathmatch Classic,0.000000
4,76561197970982479,50.0,Half-Life: Opposing Force,0.000000
...,...,...,...,...
5170009,76561198329548331,346330.0,BrainBread 2,0.000000
5170010,76561198329548331,373330.0,All Is Dust,0.000000
5170011,76561198329548331,388490.0,One Way To Die: Steam Edition,0.050000
5170012,76561198329548331,521570.0,You Have 10 Seconds 2,0.066667


* Exporto el dataframe a csv y lo comprimo en gz

In [33]:
# Plain CSV export of the cleaned frame.
user_items_cleaned.to_csv('../Datasets/users_item_cleaned.csv', index=False)

# Gzip-compressed export of the SAME cleaned frame. The previous version wrote
# the raw 'user_items' frame into this "cleaned" file, so none of the cleaning
# steps reached it. pandas handles the gzip compression itself, so no manual
# binary file handle is needed.
user_items_cleaned.to_csv(
    '../Datasets/user_items_cleaned.csv.gz',
    index=False,
    encoding='utf-8',
    compression='gzip',
)