In [1]:
import pandas as pd
import numpy as np
import ast
import gzip
import json
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from textblob import TextBlob

# CARGA DE DATOS

* Carga del json comprimido

In [2]:
user_reviews_gz = '../Datasets/user_reviews.json.gz'
filas = []

# Each line is parsed with ast.literal_eval instead of json.loads
# (presumably the lines are Python-literal dicts rather than strict JSON — TODO confirm).
with gzip.open(user_reviews_gz, 'rt', encoding='utf-8') as archivo:
    # Iterate the handle directly so the decompressed file is streamed line by
    # line instead of being fully materialised by readlines().
    for line in archivo:
        if not line.strip():
            # A blank line (e.g. a trailing newline) would make literal_eval raise.
            continue
        diccionario = ast.literal_eval(line)
        filas.append(diccionario)

# Build the DataFrame: one row per parsed line.
user_review = pd.DataFrame(filas)

In [3]:
# Display the DataFrame built from the compressed reviews file.
user_review

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


* Se observa que la columna 'reviews' contiene listas de diccionarios, donde cada elemento de la lista representa un producto <br>
 que el usuario ha reseñado. Para abordar esto, se utilizará el método `explode`, que genera una fila por cada elemento de la lista.<br>
 Posteriormente, se generará un nuevo DataFrame a partir de los diccionarios de la columna 'reviews' y se concatenará por columnas<br>
 con el DataFrame desanidado, alineando ambos por su índice.

In [4]:
# Un-nest the 'reviews' column: first one row per list element, then one
# column per key of each review dict, glued side by side onto the exploded frame.
data_user_1 = user_review.explode('reviews')
data_user_2 = data_user_1['reviews'].apply(pd.Series)
user_review_explode = pd.concat(
    objs=[data_user_1, data_user_2],
    axis='columns',
)

In [5]:
# Create a new column holding the four-digit year found in the 'posted' text.
# The pattern is a raw string: '\d' inside a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
# expand=False makes str.extract return a Series, which is what a single-column
# assignment expects.
# NOTE(review): 'posted' values without a year (e.g. 'Posted May 31.') become NaN
# here — confirm whether they should be backfilled instead of being dropped later.
user_review_explode['year_posted'] = user_review_explode['posted'].str.extract(r'(\d{4})', expand=False)

In [6]:
# Drop the columns that are not used in the rest of the notebook.
# Reassigning instead of inplace=True, together with errors='ignore', keeps the
# cell re-runnable: a second execution no longer raises KeyError for columns
# that are already gone.
# NOTE(review): the integer column 0 is presumably a by-product of
# apply(pd.Series) on rows without a review dict — verify; errors='ignore' also
# covers a dataset where that column is never produced.
user_review_explode = user_review_explode.drop(
    columns=['user_url', 'funny', 'helpful', 'last_edited', 'reviews', 'posted', 0],
    errors='ignore',
)

In [7]:
# Display the un-nested frame after dropping the unused columns.
user_review_explode

Unnamed: 0,user_id,item_id,recommend,review,year_posted
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,2011
0,76561197970982479,22200,True,It's unique and worth a playthrough.,2011
0,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,2011
1,js41637,251610,True,I know what you think when you see this title ...,2014
1,js41637,227300,True,For a simple (it's actually not all that simpl...,2013
...,...,...,...,...,...
25797,76561198312638244,70,True,a must have classic from steam definitely wort...,
25797,76561198312638244,362890,True,this game is a perfect remake of the original ...,
25798,LydiaMorley,273110,True,had so much fun plaing this and collecting res...,
25798,LydiaMorley,730,True,:D,


In [8]:
# Count the missing values in each column.
user_review_explode.isna().sum()

user_id            0
item_id           28
recommend         28
review            28
year_posted    10147
dtype: int64

In [9]:
# Drop every row that has a missing value in any of the required columns.
required_columns = ['year_posted', 'item_id', 'review', 'recommend']
user_review_explode = user_review_explode.dropna(subset=required_columns)

In [10]:
# Print the index range, per-column non-null counts and dtypes of the frame.
user_review_explode.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49186 entries, 0 to 25780
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      49186 non-null  object
 1   item_id      49186 non-null  object
 2   recommend    49186 non-null  object
 3   review       49186 non-null  object
 4   year_posted  49186 non-null  object
dtypes: object(5)
memory usage: 2.3+ MB


In [11]:
# Turn the 'recommend' column into integers: the 'True'/'False' labels are
# replaced by 1/0 and the column is then cast to int.
recommend_mapping = {'False': 0, 'True': 1}
recommend_flags = user_review_explode['recommend'].replace(recommend_mapping)
user_review_explode['recommend'] = recommend_flags.astype(int)

In [12]:
def get_sentiment(text, threshold=0.1):
    """Classify the sentiment of a review using TextBlob's polarity score.

    Parameters
    ----------
    text : str
        Review text. Non-string values (None, NaN, ...) and blank strings are
        labelled neutral instead of being passed to TextBlob.
    threshold : float, default 0.1
        Half-width of the neutral band around a polarity of zero.

    Returns
    -------
    int
        0 when polarity < -threshold (negative),
        2 when polarity > threshold (positive),
        1 otherwise (neutral).
    """
    # TextBlob rejects non-string input; a missing or blank review carries no
    # sentiment signal, so it is labelled neutral rather than crashing the apply.
    if not isinstance(text, str) or not text.strip():
        return 1

    polarity = TextBlob(text).sentiment.polarity
    if polarity < -threshold:
        return 0
    if polarity > threshold:
        return 2
    return 1

In [13]:
# Add a 'sentiment_analysis' column by running get_sentiment on every review text.
review_sentiments = user_review_explode['review'].apply(get_sentiment)
user_review_explode['sentiment_analysis'] = review_sentiments

In [14]:
# Display the frame with the new 'sentiment_analysis' column.
user_review_explode

Unnamed: 0,user_id,item_id,recommend,review,year_posted,sentiment_analysis
0,76561197970982479,1250,1,Simple yet with great replayability. In my opi...,2011,2
0,76561197970982479,22200,1,It's unique and worth a playthrough.,2011,2
0,76561197970982479,43110,1,Great atmosphere. The gunplay can be a bit chu...,2011,1
1,js41637,251610,1,I know what you think when you see this title ...,2014,2
1,js41637,227300,1,For a simple (it's actually not all that simpl...,2013,1
...,...,...,...,...,...,...
25764,wayfeng,730,1,its FUNNNNNNNN,2015,1
25765,76561198251004808,253980,1,Awesome fantasy game if you don't mind the gra...,2015,2
25769,72947282842,730,1,Prettyy Mad Game,2015,0
25771,ApxLGhost,730,1,AMAZING GAME 10/10,2015,2


In [15]:
# Remove duplicate rows first, then any rows that still contain null values.
user_review_explode = (
    user_review_explode
    .drop_duplicates()
    .dropna()
)

In [None]:
# Export the cleaned table as a CSV file, leaving the index out.
output_csv = 'user_reviews.csv'
user_review_explode.to_csv(output_csv, index=False)