<b>Ingesta de Datos</b>

In [1]:
import json
import gzip
import pandas as pd
import ast
import pdb
import numpy as np
from datetime import datetime as dt

In [2]:
# Extraction code for users_items; it also un-nests the `items` column.
def items_reader(archive):
    """Read a users/items dump and flatten it to one row per (user, item).

    Each non-blank line of ``archive`` is parsed with ``ast.literal_eval``,
    so every line is expected to hold one Python-literal dict (the file is
    presumably not strict JSON, e.g. single-quoted keys -- verify against
    the dataset). Every entry of the record's ``items`` list becomes one
    output row that repeats the user-level fields.

    Rows are collected in a list local to the call, so calling the function
    several times never duplicates data from an earlier call.

    Parameters
    ----------
    archive : str or path-like
        Path of the UTF-8 text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``items_count``, ``steam_id``, ``item_id``,
        ``item_name`` and ``playtime_forever``. The frame is empty (columns
        only) when the file does not exist; a message is printed in that
        case instead of raising.
    """
    columns = ['user_id', 'items_count', 'steam_id',
               'item_id', 'item_name', 'playtime_forever']
    rows = []

    try:
        # Open the file in text mode.
        with open(archive, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                text = line.strip()
                if not text:
                    # Blank lines carry no record.
                    continue

                try:
                    # Turn the line into a Python dict.
                    record = ast.literal_eval(text)
                except (ValueError, TypeError, SyntaxError,
                        MemoryError, RecursionError) as error:
                    # Report the malformed line and keep going instead of
                    # stopping the whole ingestion in a debugger.
                    print(f"Error al cargar JSON en la línea {line_number}: {error}")
                    continue

                if not record:
                    continue

                try:
                    for item in record['items']:
                        rows.append([
                            record['user_id'],
                            record['items_count'],
                            record['steam_id'],
                            item['item_id'],
                            item['item_name'],
                            item['playtime_forever'],
                        ])
                except (KeyError, TypeError) as error:
                    # The record lacks an expected field; items appended
                    # before the failure are kept.
                    print(f"Error al cargar JSON en la línea {line_number}: {error!r}")
    except UnicodeDecodeError as error:
        # Decoding failed while reading; rows parsed so far are still returned.
        print(error)
    except FileNotFoundError:
        print(f"El archivo {archive} no fue encontrado.")

    return pd.DataFrame(rows, columns=columns)

In [3]:
# Forward slashes are accepted by Windows, Linux and macOS alike; the previous
# backslash-separated path only resolved on Windows.
items_archive = "../datasets/australian_users_items.json"
df_items = items_reader(items_archive)
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5153209 entries, 0 to 5153208
Data columns (total 6 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_id           object
 1   items_count       int64 
 2   steam_id          object
 3   item_id           object
 4   item_name         object
 5   playtime_forever  int64 
dtypes: int64(2), object(4)
memory usage: 235.9+ MB


In [4]:
# Preview the first three rows of the flattened items table.
df_items.head(3)

Unnamed: 0,user_id,items_count,steam_id,item_id,item_name,playtime_forever
0,76561197970982479,277,76561197970982479,10,Counter-Strike,6
1,76561197970982479,277,76561197970982479,20,Team Fortress Classic,0
2,76561197970982479,277,76561197970982479,30,Day of Defeat,7


In [5]:
# Extraction code for users_reviews; it also un-nests the `reviews` column.
def reviews_reader(archive):
    """Read a user-reviews dump and flatten it to one row per review.

    Each non-blank line of ``archive`` is parsed with ``ast.literal_eval``,
    so every line is expected to hold one Python-literal dict. Every entry
    of the record's ``reviews`` list becomes one output row, and only the
    year of its ``posted`` date is kept.

    Reviews whose ``posted`` text does not match ``'Posted %B %d, %Y.'`` or
    that lack one of the required fields are skipped on a best-effort basis;
    the number of skipped reviews is printed once at the end so the loss is
    visible. Note that ``%B`` matches month names of the current locale.

    Rows are collected in a list local to the call, so calling the function
    several times never duplicates data from an earlier call.

    Parameters
    ----------
    archive : str or path-like
        Path of the UTF-8 text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id``, ``recommend``, ``review`` and
        ``year_posted``. The frame is empty (columns only) when the file
        does not exist; a message is printed in that case instead of raising.
    """
    columns = ['user_id', 'item_id', 'recommend', 'review', 'year_posted']
    rows = []
    skipped_reviews = 0

    try:
        # Open the file in text mode.
        with open(archive, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                text = line.strip()
                if not text:
                    # Blank lines carry no record.
                    continue

                try:
                    # Turn the line into a Python dict.
                    record = ast.literal_eval(text)
                except (ValueError, TypeError, SyntaxError,
                        MemoryError, RecursionError) as error:
                    # Report the malformed line and keep going instead of
                    # stopping the whole ingestion in a debugger.
                    print(f"Error al cargar JSON en la línea {line_number}: {error}")
                    continue

                if not record:
                    continue

                try:
                    user_id = record['user_id']
                    reviews = list(record['reviews'])
                except (KeyError, TypeError) as error:
                    print(f"Error al cargar JSON en la línea {line_number}: {error!r}")
                    continue

                for review in reviews:
                    try:
                        posted = dt.strptime(review['posted'], 'Posted %B %d, %Y.')
                        rows.append([
                            user_id,
                            review['item_id'],
                            review['recommend'],
                            review['review'],
                            posted.year,
                        ])
                    except (KeyError, TypeError, ValueError):
                        # Unparseable date or missing field: skip the review
                        # but keep count of it.
                        skipped_reviews += 1
    except UnicodeDecodeError as error:
        # Decoding failed while reading; rows parsed so far are still returned.
        print(error)
    except FileNotFoundError:
        print(f"El archivo {archive} no fue encontrado.")

    if skipped_reviews:
        print(f"Reseñas omitidas por fecha o campos inválidos: {skipped_reviews}")

    return pd.DataFrame(rows, columns=columns)

In [6]:
# Forward slashes are accepted by Windows, Linux and macOS alike; the previous
# backslash-separated path only resolved on Windows.
reviews_archive = "../datasets/australian_user_reviews.json"
df_reviews = reviews_reader(reviews_archive)
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49186 entries, 0 to 49185
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      49186 non-null  object
 1   item_id      49186 non-null  object
 2   recommend    49186 non-null  bool  
 3   review       49186 non-null  object
 4   year_posted  49186 non-null  int64 
dtypes: bool(1), int64(1), object(3)
memory usage: 1.5+ MB


In [7]:
# Preview the first three rows of the flattened reviews table.
df_reviews.head(3)

Unnamed: 0,user_id,item_id,recommend,review,year_posted
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,2011
1,76561197970982479,22200,True,It's unique and worth a playthrough.,2011
2,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,2011


In [8]:
# Extraction code for the steam games dataset.
# The gzip file is read line by line; each line is decoded as one JSON object
# and enriched with a `release_year` field derived from `release_date`.
# Forward slashes keep the path valid on Windows, Linux and macOS.
archive_steam = "../datasets/steam_games.json.gz"
data = []

with gzip.open(archive_steam, 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            obj = json.loads(line.strip())
        except json.JSONDecodeError as e:
            # Report the undecodable line and continue with the next one
            # rather than halting the run in a debugger.
            print(f"Error al decodificar JSON en línea: {line.strip()}")
            print(str(e))
            continue

        try:
            obj['release_year'] = pd.to_datetime(obj['release_date']).year
        except Exception:
            # Best effort: a missing or unparseable release date yields NaN.
            # `Exception` (not a bare except) lets Ctrl+C still stop the loop.
            obj['release_year'] = np.nan

        data.append(obj)

# Build a DataFrame from the collected records.
df_steam = pd.DataFrame(data)


  obj['release_year'] = pd.to_datetime(obj['release_date']).year


In [9]:
# Inspect the last three rows of the steam games table.
df_steam.tail(3)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer,release_year
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich,2018.0
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns",2017.0
120444,,,Maze Run VR,,http://store.steampowered.com/app/681550/Maze_...,,"[Early Access, Adventure, Indie, Action, Simul...",http://steamcommunity.com/app/681550/reviews/?...,"[Single-player, Stats, Steam Leaderboards, HTC...",4.99,True,681550,,


In [10]:
# Summarize the columns, non-null counts and dtypes of the steam games table.
df_steam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  object 
 11  id            32133 non-null  object 
 12  developer     28836 non-null  object 
 13  release_year  29894 non-null  float64
dtypes: float64(1), object(13)
memory usage: 12.9+ MB


<b>Guardar dataframes </b>

In [11]:
# Persist the three DataFrames. Forward slashes keep the paths valid on
# Windows, Linux and macOS (the backslash form only resolved on Windows).
# NOTE(review): the New_datasets directory is assumed to exist already -- confirm.
df_steam.to_csv('../datasets/New_datasets/user_steams.csv', index=False)
df_items.to_parquet('../datasets/New_datasets/user_items.parquet', index=False)
df_reviews.to_parquet('../datasets/New_datasets/user_reviews.parquet', index=False)