# ETL australian_user_reviews.json 

## Importando Librerías

In [1]:
import json
import re
import ast
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

## Extracción  y lectura del dataset en formato json

In [2]:
#extraccion del json
row = [] #lista vacia para ir guardando las filas

with open ("data/australian_user_reviews.json", 'r', encoding='utf-8') as file: #utilizo with para que el archivo se abra y cierre
    for line in file.readlines(): #bucle para ir leyendo filas y luego agregarlas a row
        row.append(ast.literal_eval(line)) # interpreta las lineas del json y transforma en objeto de python

# genero el dataframe 
reviews = pd.DataFrame(row)
reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


## Transformación del dataset 

### Desanidar la lista con diccionarios en la columna "reviews"

In [3]:
#Utilizo la función explode para explotar la columna y desanidar los datos
exploded = reviews.explode('reviews')
exploded 
#Conservo el resultado en una variable

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20..."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011...."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011..."
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014...."
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2..."
...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 10.', 'la..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 8.', 'las..."
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '1 person found this review funny', ..."
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '', 'posted': 'Posted July 20.', 'la..."


In [4]:
#normalizo o aplano los datos 
normalizado = pd.json_normalize(exploded['reviews'].dropna())
normalizado

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...
59300,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59301,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59302,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59303,,Posted July 20.,,730,No ratings yet,True,:D


In [5]:
#reseteo los indices para que no se desordenen las filas
normalizado.reset_index(inplace=True)
normalizado

Unnamed: 0,index,funny,posted,last_edited,item_id,helpful,recommend,review
0,0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...
59300,59300,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59301,59301,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59302,59302,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59303,59303,,Posted July 20.,,730,No ratings yet,True,:D


In [6]:
exploded.reset_index(inplace=True)
exploded

Unnamed: 0,index,user_id,user_url,reviews
0,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20..."
1,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011...."
2,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011..."
3,1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014...."
4,1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2..."
...,...,...,...,...
59328,25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 10.', 'la..."
59329,25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 8.', 'las..."
59330,25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '1 person found this review funny', ..."
59331,25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '', 'posted': 'Posted July 20.', 'la..."


In [7]:
#Concateno con el data orignal y elimino la columna original "reviews" anidada
reviews_ok = pd.concat([exploded, normalizado], axis=1)
reviews_ok = reviews_ok.drop(columns = ['reviews'])
reviews_ok

Unnamed: 0,index,user_id,user_url,index.1,funny,posted,last_edited,item_id,helpful,recommend,review
0,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0.0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1.0,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,2.0,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,1,js41637,http://steamcommunity.com/id/js41637,3.0,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,1,js41637,http://steamcommunity.com/id/js41637,4.0,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...,...,...
59328,25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,,
59329,25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,,
59330,25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,,,,,,
59331,25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,,,,,,


In [8]:
#elimino el doble index para que solo quede una sola manera de ordenar por indice
reviews_ok= reviews_ok.drop(columns="index")
reviews_ok

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,
59330,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,,,,,
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,,,,,


In [9]:
tipo_data = {"columna":[],"tipos_de_datos":[]} #genero un diccionario vacio para ir almacenando lo que genere el bucle

for columna in reviews_ok.columns: #un bucle que va recorriendo 
    tipo_data["columna"].append(columna)
    tipo_data["tipos_de_datos"].append(reviews_ok[columna].apply(type).unique())

analisis= pd.DataFrame(tipo_data)
analisis

Unnamed: 0,columna,tipos_de_datos
0,user_id,[<class 'str'>]
1,user_url,[<class 'str'>]
2,funny,"[<class 'str'>, <class 'float'>]"
3,posted,"[<class 'str'>, <class 'float'>]"
4,last_edited,"[<class 'str'>, <class 'float'>]"
5,item_id,"[<class 'str'>, <class 'float'>]"
6,helpful,"[<class 'str'>, <class 'float'>]"
7,recommend,"[<class 'bool'>, <class 'float'>]"
8,review,"[<class 'str'>, <class 'float'>]"


### Busqueda de duplicados y nulos

#### En este dataframe la busqueda y eliminación de nulos ser realiza después de la normalización, debido a la gran cantidad de información contenida en la columna anidada "reviews", a modo de no perder datos sin haberlos analizado

In [10]:
#se utiliza la variable duplicados para guardar la busqueda y poder comparar
duplicados= reviews_ok.loc[reviews_ok.duplicated()]
duplicados

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
1112,bobseagull,http://steamcommunity.com/id/bobseagull,,"Posted September 24, 2015.",,346110,1 of 1 people (100%) found this review helpful,True,yep
2894,ImSeriouss,http://steamcommunity.com/id/ImSeriouss,,"Posted January 13, 2014.",,211820,No ratings yet,True,If you want to play this game.. expect glithes...
2895,ImSeriouss,http://steamcommunity.com/id/ImSeriouss,,"Posted January 10, 2014.",,440,No ratings yet,True,Really good game! fun! Good for people who wan...
2896,ImSeriouss,http://steamcommunity.com/id/ImSeriouss,,"Posted March 19, 2012.",,42680,No ratings yet,True,Good but a bit overdone. Still love it though.
3582,76561198062039159,http://steamcommunity.com/profiles/76561198062...,,"Posted December 11, 2015.",,730,0 of 1 people (0%) found this review helpful,True,I rate it R8/Revolver
...,...,...,...,...,...,...,...,...,...
59327,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,,,,,,


### Se analiza si es correcto eliminar los duplicados

In [11]:
reviews_ok = reviews_ok.drop_duplicates(keep='first')
reviews_ok

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...
59323,76561198306599751,http://steamcommunity.com/profiles/76561198306...,,,,,,,
59324,Ghoustik,http://steamcommunity.com/id/Ghoustik,,,,,,,
59325,76561198310819422,http://steamcommunity.com/profiles/76561198310...,,,,,,,
59326,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,,,,,,


In [12]:
nulos= reviews_ok.isnull().sum()
nulos

user_id         0
user_url        0
funny          18
posted         18
last_edited    18
item_id        18
helpful        18
recommend      18
review         18
dtype: int64

In [13]:
reviews_ok = reviews_ok.dropna().reset_index(drop=True)
reviews_ok

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...
59156,Fuckfhaisjnsnsjakaka,http://steamcommunity.com/id/Fuckfhaisjnsnsjakaka,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59157,3214213216,http://steamcommunity.com/id/3214213216,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59158,ChrisCoroner,http://steamcommunity.com/id/ChrisCoroner,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59159,CaptainAmericaCw,http://steamcommunity.com/id/CaptainAmericaCw,,Posted July 20.,,730,No ratings yet,True,:D


## Transformación de los datos de tipo fecha en "posted" 

### El formato de fecha en la columna es Posted November 5, 2011. Mientras que debería estar en el mismo formato YYYY-MM-DD, que en los demás dataset a fin de facilitar las consultas 

In [14]:
#Primer paso es extraer del la columna posted la fecha que esta en tipo de dato string, entonces puedo transformarla
reviews_ok['posted'] = reviews_ok['posted'].str.extract(r'Posted ([\w\s\d,]+)')
reviews_ok.head(3)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"November 5, 2011",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"July 15, 2011",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"April 21, 2011",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...


In [15]:
#Reemplazo la palabra Posted por espacio vacio
reviews_ok['posted'] = reviews_ok['posted'].replace({'Posted': ''}, regex=True)
reviews_ok.head(3)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"November 5, 2011",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"July 15, 2011",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"April 21, 2011",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...


In [16]:
#Transformo ahora la columna a tipo de dato datetime
reviews_ok['posted'] = pd.to_datetime(reviews_ok['posted'], errors='coerce')
reviews_ok.head(3)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,2011-11-05,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,2011-07-15,,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,2011-04-21,,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...


## Guardo el dataframe para las consultas transformandolo a tipo de archivo parquet para optimizar su utilización

In [17]:
reviews_ok = reviews_ok.copy()

In [18]:
guardar = 'data/user_review_limpio.csv'
reviews_ok.to_csv(guardar, index=False, encoding='utf-8')

In [19]:
#Transformo el archivo csv a parquet
#Leo el archivo csv
reviews_ok= pd.read_csv("data/user_review_limpio.csv") 

#Indico donde quiero guardar el parquet y con que nombre
output_file= "data/user_review.parquet"

#Transformo a traves de una tabla el archivo csv en parquet
table = pa.Table.from_pandas(reviews_ok)
pq.write_table(table,output_file)