In [2]:
import pandas as pd
import ast
import json


%load_ext autoreload
%autoreload 2
import utils

import warnings
warnings.filterwarnings("ignore")

In [2]:
#pip install textblob

## Dataset `australian_users_items`

### Extracción y primera exploración

In [3]:
# Ruta al dataset australian_user_reviews
ruta_items = 'data/australian_users_items.json'

# Se lee de cada línea del dataset
filas_items = []
with open(ruta_items) as f:
    for line in f.readlines():
        filas_items.append(ast.literal_eval(line))

# Se convierte en dataframe
df_items = pd.DataFrame(filas_items)
df_items

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


Se revisa el tipo de dato de cada columna y si hay nulos.

In [4]:
# Se revisan tipos de datos y existencias de nulos
utils.verificar_tipo_datos(df_items)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,items_count,[<class 'int'>],100.0,0.0,0
2,steam_id,[<class 'str'>],100.0,0.0,0
3,user_url,[<class 'str'>],100.0,0.0,0
4,items,[<class 'list'>],100.0,0.0,0


La columna 'items' es una lista, así que se explora la misma para conocer su estructura.

In [5]:
# se observa el tipo de dato en 'early_access'
df_items['items'][0]

[{'item_id': '10',
  'item_name': 'Counter-Strike',
  'playtime_forever': 6,
  'playtime_2weeks': 0},
 {'item_id': '20',
  'item_name': 'Team Fortress Classic',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '30',
  'item_name': 'Day of Defeat',
  'playtime_forever': 7,
  'playtime_2weeks': 0},
 {'item_id': '40',
  'item_name': 'Deathmatch Classic',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '50',
  'item_name': 'Half-Life: Opposing Force',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '60',
  'item_name': 'Ricochet',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '70',
  'item_name': 'Half-Life',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '130',
  'item_name': 'Half-Life: Blue Shift',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '300',
  'item_name': 'Day of Defeat: Source',
  'playtime_forever': 4733,
  'playtime_2weeks': 0},
 {'item_id': '240',
  'item_name': 'Counter-Strike: S

Este conjunto contiene 5 columnas y 88309 filas, no tiene nulos. Las columnas que contiene son:

* **user_id**: contiene el id del usuario.
* **items_count**: contiene un número entero del cual no se dispone de información sobre su significado.
* **steam_id**: es un número de id de stram del cuál no se dispone información. Se asume que el valor único del item.
* **user_url**: es la url del perfil del usuario
* **items**: contiene una lista de uno o mas diccionarios de los items que consume cada usuario. Cada diccionario tiene las siguientes claves:
  * **item_id**: es un número con el id del item.
  * **item_name**: es el nombre del contenido que consume.
  * **playtime_forever**: es un número entero del cuál no se dispone información.
  * **playtime_2weeks**: es un número entero del cuál no se dispone información.

### Tratamiento de la columna 'items'

In [6]:
# Se transforma a columnas cada elemento de las listas
df_items2 = pd.json_normalize(df_items['items'])
df_items2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7752,7753,7754,7755,7756,7757,7758,7759,7760,7761
0,"{'item_id': '10', 'item_name': 'Counter-Strike...","{'item_id': '20', 'item_name': 'Team Fortress ...","{'item_id': '30', 'item_name': 'Day of Defeat'...","{'item_id': '40', 'item_name': 'Deathmatch Cla...","{'item_id': '50', 'item_name': 'Half-Life: Opp...","{'item_id': '60', 'item_name': 'Ricochet', 'pl...","{'item_id': '70', 'item_name': 'Half-Life', 'p...","{'item_id': '130', 'item_name': 'Half-Life: Bl...","{'item_id': '300', 'item_name': 'Day of Defeat...","{'item_id': '240', 'item_name': 'Counter-Strik...",...,,,,,,,,,,
1,"{'item_id': '10', 'item_name': 'Counter-Strike...","{'item_id': '80', 'item_name': 'Counter-Strike...","{'item_id': '100', 'item_name': 'Counter-Strik...","{'item_id': '300', 'item_name': 'Day of Defeat...","{'item_id': '30', 'item_name': 'Day of Defeat'...","{'item_id': '40', 'item_name': 'Deathmatch Cla...","{'item_id': '60', 'item_name': 'Ricochet', 'pl...","{'item_id': '240', 'item_name': 'Counter-Strik...","{'item_id': '280', 'item_name': 'Half-Life: So...","{'item_id': '360', 'item_name': 'Half-Life Dea...",...,,,,,,,,,,
2,"{'item_id': '1200', 'item_name': 'Red Orchestr...","{'item_id': '1230', 'item_name': 'Mare Nostrum...","{'item_id': '1280', 'item_name': 'Darkest Hour...","{'item_id': '1520', 'item_name': 'DEFCON', 'pl...","{'item_id': '220', 'item_name': 'Half-Life 2',...","{'item_id': '320', 'item_name': 'Half-Life 2: ...","{'item_id': '340', 'item_name': 'Half-Life 2: ...","{'item_id': '360', 'item_name': 'Half-Life Dea...","{'item_id': '380', 'item_name': 'Half-Life 2: ...","{'item_id': '400', 'item_name': 'Portal', 'pla...",...,,,,,,,,,,
3,"{'item_id': '10', 'item_name': 'Counter-Strike...","{'item_id': '20', 'item_name': 'Team Fortress ...","{'item_id': '30', 'item_name': 'Day of Defeat'...","{'item_id': '40', 'item_name': 'Deathmatch Cla...","{'item_id': '50', 'item_name': 'Half-Life: Opp...","{'item_id': '60', 'item_name': 'Ricochet', 'pl...","{'item_id': '70', 'item_name': 'Half-Life', 'p...","{'item_id': '130', 'item_name': 'Half-Life: Bl...","{'item_id': '80', 'item_name': 'Counter-Strike...","{'item_id': '100', 'item_name': 'Counter-Strik...",...,,,,,,,,,,
4,"{'item_id': '300', 'item_name': 'Day of Defeat...","{'item_id': '20', 'item_name': 'Team Fortress ...","{'item_id': '50', 'item_name': 'Half-Life: Opp...","{'item_id': '70', 'item_name': 'Half-Life', 'p...","{'item_id': '130', 'item_name': 'Half-Life: Bl...","{'item_id': '10', 'item_name': 'Counter-Strike...","{'item_id': '30', 'item_name': 'Day of Defeat'...","{'item_id': '40', 'item_name': 'Deathmatch Cla...","{'item_id': '60', 'item_name': 'Ricochet', 'pl...","{'item_id': '80', 'item_name': 'Counter-Strike...",...,,,,,,,,,,


aprovecho para no incluir url

In [11]:
# Se agrega el 'user_id' y 'user_url' a las columnas separadas 
df_items3 = pd.concat([df_items[['user_id', 'items_count', 'steam_id']], df_items2], axis=1)
df_items3.head()

MemoryError: Unable to allocate 5.11 GiB for an array with shape (7762, 88310) and data type object

In [None]:
# Tamaño del lote para el procesamiento
batch_size = 100  # Puedes ajustar este valor según tus recursos y requerimientos

# Lista para almacenar columnas no nulas
non_null_columns = []

# Procesamiento en lotes
for batch_start in range(0, df_items2.shape[1], batch_size):
    batch_end = min(batch_start + batch_size, df_items2.shape[1])
    batch = df_items2.iloc[:, batch_start:batch_end]
    
    # Encuentra las columnas no nulas en el lote actual
    batch_non_null_columns = batch.columns[batch.notna().any()]
    
    # Agregar columnas no nulas al acumulador
    non_null_columns.extend(batch_non_null_columns)

# Crear un DataFrame resultante con las columnas no nulas
df_cleaned = df_items2[non_null_columns]
df_cleaned

In [None]:
# Ruta al dataset australian_user_reviews
ruta_items = 'data/australian_users_items.json'

# Se lee de cada línea del dataset
filas_items = []
with open(ruta_items) as f:
    for line in f.readlines():
        filas_items.append(ast.literal_eval(line))

# Se convierte en dataframe
df_items = pd.DataFrame(filas_items)
df_items

In [3]:
# Define el tamaño de los fragmentos que deseas cargar
chunk_size = 50  # Puedes ajustar este valor según tus necesidades

# Crea un generador de fragmentos usando read_json con chunksize
json_generator = pd.read_json('data/australian_users_items.json', lines=True, chunksize=chunk_size)
json_generator

<pandas.io.json._json.JsonReader at 0x2075a0fbbb0>

In [12]:
utils.verificar_tipo_datos(df_items2)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,0,"[<class 'dict'>, <class 'NoneType'>]",80.97,19.03,16806
1,1,"[<class 'dict'>, <class 'NoneType'>]",77.23,22.77,20110
2,2,"[<class 'dict'>, <class 'NoneType'>]",75.22,24.78,21882
3,3,"[<class 'dict'>, <class 'NoneType'>]",73.50,26.50,23403
4,4,"[<class 'dict'>, <class 'NoneType'>]",71.93,28.07,24791
...,...,...,...,...,...
7757,7757,"[<class 'NoneType'>, <class 'dict'>]",0.00,100.00,88309
7758,7758,"[<class 'NoneType'>, <class 'dict'>]",0.00,100.00,88309
7759,7759,"[<class 'NoneType'>, <class 'dict'>]",0.00,100.00,88309
7760,7760,"[<class 'NoneType'>, <class 'dict'>]",0.00,100.00,88309


In [12]:
null_columns = df_items2.columns[df_items2.isna().all()]
null_columns

Index([], dtype='object')

In [9]:
# Se utiliza pd.melt para transformar las columnas en filas conservando el 'user_id' y 'user_url'
df_items2 = pd.melt(df_items2, id_vars=['user_id', 'items_count', 'steam_id'], 
                       value_vars=list(range(7761)),
                       value_name='items')
df_items2.head()

MemoryError: Unable to allocate 5.11 GiB for an array with shape (7764, 88310) and data type object