### LIBRERIAS

In [2]:
import pandas as pd
import json
import ast
import warnings
import pyarrow as pa
import pyarrow.parquet as pq
warnings.filterwarnings('ignore')

#### ETL SOBRE USER_REVIEWS

In [3]:
# Se carga el json de user_reviews y se realiza una lectura linea a linea que se irá almacenando 
# en una lista para luego convertirla en un dataframe pandas
user_reviews = 'datos_json/australian_user_reviews.json'

listado = []

with open(user_reviews, 'r', encoding= 'utf-8') as archivo:
    for linea in archivo.readlines():
        listado.append(ast.literal_eval(linea))

# Se crea un dataframe con el listado de lineas
dfUserReviews = pd.DataFrame(listado)

# Se guarda en formato csv para mejorar tiempos de carga y flujo de trabajo.
dfUserReviews.to_csv('json_a_csv/australian_user_reviews.csv', index=False)



In [4]:
#Se carga el csv previamente creado.
userReviews = pd.read_csv("json_a_csv/australian_user_reviews.csv")

userReviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [5]:
# Se verifica que no haya nulos
userReviews.isnull().any()

user_id     False
user_url    False
reviews     False
dtype: bool

In [6]:
# Se importa la funcion literal_eval para evaluar expresiones literales de tipo str
from ast import literal_eval

# Se aplica el literal_eval a toda la columna reviews convirtiendo las cadenas en estructuras de datos
userReviews['reviews'] = userReviews['reviews'].apply(literal_eval)

# Se aplica el metodo explode a la columna reviews para dividir las listas en filas separadas y se replican las demas columnas para cada fila resultante.
filasReviews = userReviews.explode('reviews')

# Se desglosan las columnas anidadas utilizando el metodo json_normalize.
desgloseColumnas = pd.json_normalize(filasReviews['reviews'])

# Se reinician los indices para posteriormente realizar un join
filasReviews = filasReviews.reset_index(drop=True)

# Se combina filasReviews y desgloseColumnas
userReviews = filasReviews.join(desgloseColumnas)

# Se reinician los indices luego de realizar el join
userReviews = userReviews.reset_index(drop=True)

# Se eliminan las columnas user_url y reviews
userReviewsFinal = userReviews.drop(['user_url','reviews'], axis=1)

In [7]:
# Se hace una revision para ver si existen duplicados
userReviewsFinal[userReviewsFinal.duplicated()]

Unnamed: 0,user_id,funny,posted,last_edited,item_id,helpful,recommend,review
1114,bokkkbokkk,,"Posted September 24, 2015.",,346110,1 of 1 people (100%) found this review helpful,True,yep
2894,ImSeriouss,,"Posted January 10, 2014.",,218620,1 of 3 people (33%) found this review helpful,True,"Good graphics, fun heists! A bit laggy"
2895,ImSeriouss,,"Posted January 10, 2014.",,105600,0 of 2 people (0%) found this review helpful,True,So fun! DEFINITELY NOT RIP OFF OF MINECRAFT! e...
2896,ImSeriouss,,"Posted December 17, 2014.",,570,No ratings yet,True,bobo pinoy
2897,ImSeriouss,,"Posted January 13, 2014.",,211820,No ratings yet,True,If you want to play this game.. expect glithes...
...,...,...,...,...,...,...,...,...
44456,76561198092022514,,Posted July 3.,,422400,No ratings yet,True,Muy entretenido y una coleccion de armas prome...
44457,76561198092022514,,Posted June 1.,,218620,No ratings yet,True,"Tiene una jugabilidad y tematica muy buena :D,..."
44458,76561198092022514,,"Posted August 17, 2014.",,261820,No ratings yet,True,"Buen juego, no importa el desarrrollo que tien..."
44459,76561198092022514,,"Posted February 17, 2014.",,224260,No ratings yet,True,exelente aporte :D¡¡¡ es una buen mod basado e...


In [8]:
# Se verifica si los duplicados observados previamente son correctos utilizando el primer user_id duplicado
userReviewsFinal[userReviewsFinal['user_id']=='bokkkbokkk']

Unnamed: 0,user_id,funny,posted,last_edited,item_id,helpful,recommend,review
1113,bokkkbokkk,,"Posted September 24, 2015.",,346110,1 of 1 people (100%) found this review helpful,True,yep
1114,bokkkbokkk,,"Posted September 24, 2015.",,346110,1 of 1 people (100%) found this review helpful,True,yep


In [9]:
# Se aplica el metodo drop_duplicates 
userReviewsFinal= userReviewsFinal.drop_duplicates()

In [10]:
# Se consulta la información disponible del dataframe
userReviewsFinal.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58459 entries, 0 to 59332
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      58459 non-null  object
 1   funny        58431 non-null  object
 2   posted       58431 non-null  object
 3   last_edited  58431 non-null  object
 4   item_id      58431 non-null  object
 5   helpful      58431 non-null  object
 6   recommend    58431 non-null  object
 7   review       58431 non-null  object
dtypes: object(8)
memory usage: 4.0+ MB


In [11]:
# Se modifica el tipo de la columna item_id
userReviewsFinal['item_id'] = userReviewsFinal['item_id'].astype('Int64')

In [12]:
# Se observa la media de nulos de cada columna.
userReviewsFinal.isnull().mean()

user_id        0.000000
funny          0.000479
posted         0.000479
last_edited    0.000479
item_id        0.000479
helpful        0.000479
recommend      0.000479
review         0.000479
dtype: float64

In [13]:
# Se almacenan los indices de los nulos en la columna id que no permiten realizar la transformacion del tipo de dato
nulos = userReviewsFinal[userReviewsFinal['recommend'].isna()].index

# Se utiliza la variable nulos donde almacene el indice de nulos para ubicar las filas correspondientes
userReviewsFinal.iloc[nulos]

Unnamed: 0,user_id,funny,posted,last_edited,item_id,helpful,recommend,review
137,gdxsd,,,,,,,
177,76561198094224872,,,,,,,
2560,76561198074079694,,"Posted October 17, 2015.",,208090.0,No ratings yet,True,Best game with gore. =P
10205,PintoProAlto,1 person found this review funny,"Posted April 21, 2014.",,570.0,3 of 3 people (100%) found this review helpful,True,belezura d jogo
14008,76561198064975700,,"Posted June 26, 2012.",,440.0,No ratings yet,True,'tis good
15777,mtmoss,,"Posted January 9, 2014.",,252490.0,1 of 1 people (100%) found this review helpful,True,"Rust is stil in uts early stages. For now, i h..."
19609,zrustz16,,"Posted July 7, 2014.",,304930.0,4 of 6 people (67%) found this review helpful,True,"Jogo divertido, graficos quadrados s2, tem um ..."
20664,WardPearce,,Posted March 18.,,373360.0,3 of 5 people (60%) found this review helpful,True,"Very, Very Fun. One of the best games I have e..."
25593,BM5K,,"Posted July 7, 2014.",,242920.0,No ratings yet,True,If you're into civilization-building (note the...
26815,matrix12384,,"Posted May 24, 2013.","Last edited January 1, 2014.",570.0,No ratings yet,True,"Dota 2, hours of fun spent playing this game w..."


In [14]:
# Se analiza el dataset resultante a partir de los nulos y se procede a eliminar las filas 137 y 177 ya que las componen un 90% de datos nulos.
userReviewsFinal.drop(index=137,inplace=True)
userReviewsFinal.drop(index=177,inplace=True)

In [17]:
# Se procede a eliminar las columnas funny, Last_edited y helpful ya que no serán usadas por la API ni por el Sentiment Analysis.
userReviewsFinal = userReviewsFinal.drop(['funny','last_edited','helpful'], axis= 1)

# Se extrae el dato de la fecha de la columna posted y se lo transforma a datetime.
userReviewsFinal["posted"] = pd.to_datetime(userReviewsFinal["posted"].astype(str).str.replace(r'Posted |,|\.', '', regex=True), errors='coerce')

# Se aplica una interpolación cruzada para completar los valores faltantes.
userReviewsFinal['posted'] = userReviewsFinal['posted'].interpolate(method='linear', limit_area='inside')

# Se extrae el año de la columna posted y se crea una columna 
userReviewsFinal['review_year'] = userReviewsFinal['posted'].dt.year

# Se aplica el metodo drop a la columna posted ya que no será usada posteriormente.
userReviewsFinal.drop('posted', axis=1, inplace=True)

userReviewsFinal

Unnamed: 0,user_id,item_id,recommend,review,review_year
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,2011
1,76561197970982479,22200,True,It's unique and worth a playthrough.,2011
2,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,2011
3,js41637,251610,True,I know what you think when you see this title ...,2014
4,js41637,227300,True,For a simple (it's actually not all that simpl...,2013
...,...,...,...,...,...
59328,76561198312638244,70,True,a must have classic from steam definitely wort...,2015
59329,76561198312638244,362890,True,this game is a perfect remake of the original ...,2015
59330,LydiaMorley,273110,True,had so much fun plaing this and collecting res...,2015
59331,LydiaMorley,730,True,:D,2015


### Sentiment Analyzer

Al dataframe de User Reviews se le va a realizar un analisis de sentimiento para poder determinar en base a un modelo estadistico 

##### Importaciones

In [18]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to C:\Users\Nano
[nltk_data]     L3AN\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [19]:
# Se almacenan los indices de los nulos en la columna review
reviewsNulos = userReviewsFinal[userReviewsFinal['review'].isnull()].index

# Se utiliza iloc para realizar una lectura y analisis de los nulos
userReviewsFinal.iloc[reviewsNulos]

Unnamed: 0,user_id,item_id,recommend,review,review_year
2562,Halio99,730,True,You either die a noob; repetitively. Or live l...,2015
10207,PintoProAlto,4000,True,melhor que pinto no lixo,2014
14011,H4unted_P3n,440,True,Team Fortress 2 is a excellent FPS game that i...,2014
15779,mtmoss,286100,True,Quite a good fun little game for a free to pla...,2014
19611,zrustz16,265630,True,Muito bom !,2014
20666,WardPearce,319630,True,This is one the best game I have played.The be...,2014
25595,BZ4LIFE,407250,False,Aweful gameplay; players take 2 seconds to res...,2014
26817,76561198052485823,99900,True,that's right Free to play!and have funny to yo...,2013
27254,roitethere,224260,True,"What more could you ask for a zombie game, it ...",2015
29046,SkateboardSquats,730,True,Step 1: Play gameStep 2: Get skinsStep 3: ????...,2014


In [20]:
# Se normaliza el texto a utilizar en el analisis de sentimiento.
userReviewsFinal['review'] = userReviewsFinal['review'].str.lower()

# Se transforma el tipo de la columna review a str.
userReviewsFinal['review'] = userReviewsFinal['review'].astype(str)

userReviewsFinal

Unnamed: 0,user_id,item_id,recommend,review,review_year
0,76561197970982479,1250,True,simple yet with great replayability. in my opi...,2011
1,76561197970982479,22200,True,it's unique and worth a playthrough.,2011
2,76561197970982479,43110,True,great atmosphere. the gunplay can be a bit chu...,2011
3,js41637,251610,True,i know what you think when you see this title ...,2014
4,js41637,227300,True,for a simple (it's actually not all that simpl...,2013
...,...,...,...,...,...
59328,76561198312638244,70,True,a must have classic from steam definitely wort...,2015
59329,76561198312638244,362890,True,this game is a perfect remake of the original ...,2015
59330,LydiaMorley,273110,True,had so much fun plaing this and collecting res...,2015
59331,LydiaMorley,730,True,:d,2015


In [21]:
# Se crea una funcion para aplicar puntaje al texto ingresado.
def puntaje(texto):
    # Se verifica que texto no este vacio.
    if texto: # Se calcula el puntaje compuesto mediante la funcion polarity_scores.
        compound_score = sia.polarity_scores(texto)['compound']
        
        # Se evalúa el puntaje compuesto y devuelve el puntaje correspondiente.
        if compound_score >= 0.05:
            return 2  # Positivo
        elif compound_score <= -0.05:
            return 0  # Negativo
        else:
            return 1  # Neutral
    else:
        return 1  # valor por defecto si no hay reseña

# Se crea la columna sentiment_analysis donde se cargan los puntajes aplicados al texto de la columna review 
userReviewsFinal['sentiment_analysis'] = userReviewsFinal['review'].apply(puntaje)

# Se elimina la columna review aplicando el metodo drop.
userReviewsFinal.drop('review', axis=1, inplace=True)

In [22]:
# Se realiza un recuento de cada puntaje para entender su distribución
userReviewsFinal['sentiment_analysis'].value_counts()


sentiment_analysis
2    36843
1    12132
0     9482
Name: count, dtype: int64

In [23]:
# Se verifica la media de nulos en el dataset.
userReviewsFinal.isna().mean()

user_id               0.000000
item_id               0.000445
recommend             0.000445
review_year           0.000000
sentiment_analysis    0.000000
dtype: float64

In [24]:
# Para evitar problemas futuros en la interpretación de datos al cargar el archivo csv se procede a eliminar los nulos en la columna item_id
#userReviewsFinal[userReviewsFinal['item_id'].isna() == True].count()

#userReviewsFinal["item_id"].fillna(0, inplace=True)

#userReviewsFinal[userReviewsFinal['item_id'] == 0]

In [27]:
userReviewsFinal.count()


user_id               58457
item_id               58431
recommend             58431
review_year           58457
sentiment_analysis    58457
dtype: int64

In [28]:
userReviewsFinal[userReviewsFinal['item_id'] != 0]

Unnamed: 0,user_id,item_id,recommend,review_year,sentiment_analysis
0,76561197970982479,1250,True,2011,2
1,76561197970982479,22200,True,2011,2
2,76561197970982479,43110,True,2011,2
3,js41637,251610,True,2014,2
4,js41637,227300,True,2013,2
...,...,...,...,...,...
59328,76561198312638244,70,True,2015,2
59329,76561198312638244,362890,True,2015,2
59330,LydiaMorley,273110,True,2015,2
59331,LydiaMorley,730,True,2015,2


In [29]:
userReviewsFinal.rename(columns={'item_id': 'id'}, inplace=True)

In [30]:
userReviewsFinal.isna().mean()

user_id               0.000000
id                    0.000445
recommend             0.000445
review_year           0.000000
sentiment_analysis    0.000000
dtype: float64

In [31]:
userReviewsFinal

Unnamed: 0,user_id,id,recommend,review_year,sentiment_analysis
0,76561197970982479,1250,True,2011,2
1,76561197970982479,22200,True,2011,2
2,76561197970982479,43110,True,2011,2
3,js41637,251610,True,2014,2
4,js41637,227300,True,2013,2
...,...,...,...,...,...
59328,76561198312638244,70,True,2015,2
59329,76561198312638244,362890,True,2015,2
59330,LydiaMorley,273110,True,2015,2
59331,LydiaMorley,730,True,2015,2


In [32]:
# Se guarda el dataset en csv para su posterior consumo
userReviewsFinal.to_csv('csv_limpios/user_reviews.csv', index= False)

In [33]:
# Se comprime en parquet

# Se convierte el DataFrame a una tabla de Arrow para posteriormente comprimirlo en parquet
table = pa.Table.from_pandas(userReviewsFinal)

# Especifica el nombre del archivo Parquet y comprime con snappy
parquet_file = 'datos_parquet/user_reviews.parquet'
pq.write_table(table, parquet_file, compression='snappy')