In [1]:
import pandas as pd
import gzip
import ast
import pyarrow as pa
import pyarrow.parquet as pq

## Ingesta de datos (Extracción)

In [3]:
# Path to the gzip-compressed user reviews file
file_path = 'Dataset/user_reviews.json.gz'
data = []

# Read the file line by line; every line is parsed into one record.
# NOTE(review): ast.literal_eval is used instead of a JSON parser, presumably
# because the lines are Python literals rather than strict JSON — confirm.
with gzip.open(file_path, 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError for malformed or blank lines and
            # ValueError for non-literal expressions; report the line and skip
            # it instead of aborting the whole load.
            print(f"Error en la línea: {line}")
            continue
        data.append(json_data)

# Build a DataFrame from the list of parsed records
df_original = pd.DataFrame(data)
df_original.head(3)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."


## Transformación de los datos
Se desanida la columna 'reviews', que es la que se necesita para el análisis.


In [4]:
# explode() turns every element of the per-user 'reviews' list into its own row
df_review = df_original.explode('reviews')

# Expand each review dict into one column per key and place those columns
# next to the remaining user columns.
df_review = pd.concat(
    [df_review.drop(['reviews'], axis=1), df_review['reviews'].apply(pd.Series)],
    axis=1,
)

# NOTE(review): a column labelled 0 presumably appears for rows whose 'reviews'
# value is not a dict (e.g. users with an empty list) — confirm. It is reused
# as the placeholder for the sentiment score.
df_review = df_review.rename(columns={0: 'sentiment_analysis'})

# Drop the review fields that are not expected to be needed
df_review = df_review.drop(['funny', 'posted', 'last_edited', 'helpful'], axis=1)

# Initialise the sentiment column; the real score is computed in a later cell
df_review['sentiment_analysis'] = 0
df_review.head(3)

Unnamed: 0,user_id,user_url,item_id,recommend,review,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,True,Simple yet with great replayability. In my opi...,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,True,It's unique and worth a playthrough.,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,True,Great atmosphere. The gunplay can be a bit chu...,0


Revisión de valores nulos en 'item_id'

In [5]:
# Tally rows with a missing 'item_id' (True) against rows that have one (False)
item_id_is_missing = df_review['item_id'].isna()
item_id_is_missing.value_counts()


item_id
False    59305
True        28
Name: count, dtype: int64

Se eliminan los valores nulos de la columna 'item_id'

In [6]:
# Keep only the rows that have a value in every required column
required_columns = ['item_id']
df_review = df_review.dropna(axis='index', how='any', subset=required_columns)
df_review.shape

(59305, 6)

## Análisis de sentimientos

In [7]:
# Para resolver este punto se importan las librerias necesarias
from textblob import TextBlob

In [9]:
# Función para asignar el valor de sentimiento
def analisis_sentimiento(review: str) -> int:
    """Classify the sentiment of a review text as an integer score.

    The score is derived from the sign of the polarity reported by TextBlob
    for the given text.

    Args:
        review: Review text to classify. Missing, empty or non-string values
            (``None``, ``''``, ``NaN``, ...) are treated as having no text.

    Returns:
        0 when the polarity is negative, 1 when it is exactly zero or there is
        no text to analyse, and 2 otherwise (positive).
    """
    # Non-text values such as NaN are truthy, so an emptiness check alone would
    # let them reach TextBlob; treat anything that is not a non-empty string as neutral.
    if not isinstance(review, str) or not review:
        return 1

    # Read the polarity once and branch on its sign
    polarity = TextBlob(review).sentiment.polarity
    if polarity < 0:
        return 0  # negative sentiment
    if polarity == 0:
        return 1  # neutral sentiment
    return 2  # positive sentiment


In [10]:
# Print the column dtypes and non-null counts of the review frame
df_review.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59305 entries, 0 to 25798
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             59305 non-null  object
 1   user_url            59305 non-null  object
 2   item_id             59305 non-null  object
 3   recommend           59305 non-null  object
 4   review              59305 non-null  object
 5   sentiment_analysis  59305 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 3.2+ MB


In [11]:
# Score every review text and store the result in the 'sentiment_analysis' column
sentiment_scores = df_review['review'].apply(analisis_sentimiento)
df_review['sentiment_analysis'] = sentiment_scores
df_review

Unnamed: 0,user_id,user_url,item_id,recommend,review,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,True,Simple yet with great replayability. In my opi...,2
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,True,It's unique and worth a playthrough.,2
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,True,Great atmosphere. The gunplay can be a bit chu...,2
1,js41637,http://steamcommunity.com/id/js41637,251610,True,I know what you think when you see this title ...,2
1,js41637,http://steamcommunity.com/id/js41637,227300,True,For a simple (it's actually not all that simpl...,0
...,...,...,...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,70,True,a must have classic from steam definitely wort...,2
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,362890,True,this game is a perfect remake of the original ...,2
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,273110,True,had so much fun plaing this and collecting res...,2
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,730,True,:D,2


In [12]:
# Spot-check 20 reviews that were scored as neutral (1); the fixed seed keeps
# the displayed sample identical across kernel restarts.
neutral_reviews = df_review[df_review['sentiment_analysis'] == 1]
neutral_reviews.sample(20, random_state=42)

Unnamed: 0,user_id,user_url,item_id,recommend,review,sentiment_analysis
10091,76561198091404694,http://steamcommunity.com/profiles/76561198091...,252490,True,Ran around naked with a terrorist mask on.Shou...,1
21642,DerpingKoala,http://steamcommunity.com/id/DerpingKoala,224260,True,This is funy,1
20943,76561198058784699,http://steamcommunity.com/profiles/76561198058...,400,True,I like cake!,1
3894,76561198039506014,http://steamcommunity.com/profiles/76561198039...,233250,False,No.,1
13233,inconsequential,http://steamcommunity.com/id/inconsequential,285900,True,This is basically what jelly babys do when you...,1
9438,76561198085294887,http://steamcommunity.com/profiles/76561198085...,730,True,Otimo jogo,1
11313,TheWatchdogEM,http://steamcommunity.com/id/TheWatchdogEM,317360,True,Can I Have My 5 Minutes Back,1
8139,Ryan264,http://steamcommunity.com/id/Ryan264,286160,True,♥♥♥♥ MY CARDDDDDDDDDDDDDS (flips table),1
7587,Breadi,http://steamcommunity.com/id/Breadi,440,True,"Adoro esse jogo. Jogo o dia todo, melhor jogo.",1
5171,76561198064394417,http://steamcommunity.com/profiles/76561198064...,17390,True,I made a human10/10,1


In [13]:
# Show every review row that belongs to one specific user
user_mask = df_review['user_id'].eq('zyr0n1c')
df_review.loc[user_mask]

Unnamed: 0,user_id,user_url,item_id,recommend,review,sentiment_analysis
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,620,True,Fantastic Game! It allows one to think really ...,0
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,230410,True,Fantastic game! Lots of gamemodes and large va...,2
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,72850,True,It's been a long way since Elder Scrolls start...,2
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,730,True,"After playing 500 hours on this fantastic FPS,...",0
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,208090,True,Great game. Good physics l0l,2
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,17470,True,Dis game is action packed and thrilling,0
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,4000,True,Great Game! With lots and lots of props to pla...,2
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,440,True,THis FPS Shooter game really packs a punch.I'm...,0
24058,zyr0n1c,http://steamcommunity.com/id/zyr0n1c,8980,True,A Gem.It's level of stupidity is just overwhem...,0


Últimas transformaciones

In [14]:
# Remove the 'user_url' and 'review' columns from the working frame
df_review = df_review.drop(['user_url', 'review'], axis='columns')

In [15]:
# Display the review frame in its current state
df_review

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
0,76561197970982479,1250,True,2
0,76561197970982479,22200,True,2
0,76561197970982479,43110,True,2
1,js41637,251610,True,2
1,js41637,227300,True,0
...,...,...,...,...
25797,76561198312638244,70,True,2
25797,76561198312638244,362890,True,2
25798,LydiaMorley,273110,True,2
25798,LydiaMorley,730,True,2


Se exporta el dataset para su uso en la API

In [18]:
# Write the frame with to_parquet so the file content matches its .parquet
# extension; a CSV writer here would produce a file Parquet readers cannot open.
# The index is kept (pandas default), as it was in the previous export.
df_review.to_parquet("Dataset/user_review.parquet")