# ETL USER_REVIEWS

In [1]:
import pandas as pd
import numpy as np
import ast

def load_json_lines(new_file_path):
    data = []
    with open(new_file_path, "r", encoding="utf-8") as file:
        for line in file:
            data.append(ast.literal_eval(line))
    return pd.DataFrame(data)
#Carga archivos formato csv
data_reviews = load_json_lines("./user_reviews.json/australian_user_reviews.json")

In [2]:
data_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [3]:
#Usamos la función explode para desanidar los datos
exploded_data_reviews = data_reviews.explode('reviews')
exploded_data_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20..."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011...."
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011..."
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014...."
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2..."
...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 10.', 'la..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 8.', 'las..."
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '1 person found this review funny', ..."
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '', 'posted': 'Posted July 20.', 'la..."


In [4]:
# Normaliza los datos
normalizar_data = pd.json_normalize(exploded_data_reviews['reviews'])
normalizar_data

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...
59328,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59329,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59330,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59331,,Posted July 20.,,730,No ratings yet,True,:D


In [5]:
normalizar_data.tail()

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review
59328,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59329,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59330,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59331,,Posted July 20.,,730,No ratings yet,True,:D
59332,,Posted July 2.,,440,No ratings yet,True,so much fun :D


In [6]:
normalizar_data.isnull().sum()

funny          28
posted         28
last_edited    28
item_id        28
helpful        28
recommend      28
review         28
dtype: int64

In [7]:
#reseteo los indices para que no se desordenen las filas
normalizar_data.reset_index(inplace=True)
normalizar_data

Unnamed: 0,index,funny,posted,last_edited,item_id,helpful,recommend,review
0,0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...
59328,59328,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59329,59329,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59330,59330,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59331,59331,,Posted July 20.,,730,No ratings yet,True,:D


In [8]:
normalizar_data.isnull().sum()

index           0
funny          28
posted         28
last_edited    28
item_id        28
helpful        28
recommend      28
review         28
dtype: int64

In [9]:
exploded_data_reviews.reset_index(inplace=True)
exploded_data_reviews

Unnamed: 0,index,user_id,user_url,reviews
0,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20..."
1,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted July 15, 2011...."
2,0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted April 21, 2011..."
3,1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014...."
4,1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted September 8, 2..."
...,...,...,...,...
59328,25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 10.', 'la..."
59329,25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"{'funny': '', 'posted': 'Posted July 8.', 'las..."
59330,25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '1 person found this review funny', ..."
59331,25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,"{'funny': '', 'posted': 'Posted July 20.', 'la..."


In [10]:
# Concatena el DataFrame normalizado con el DataFrame original
data_reviews = pd.concat([data_reviews, normalizar_data], axis=1)

# Elimina la columna "reviews" anidada
data_reviews = data_reviews.drop(columns=['reviews'])


In [11]:
data_reviews.head()

Unnamed: 0,user_id,user_url,index,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,evcentric,http://steamcommunity.com/id/evcentric,2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,doctr,http://steamcommunity.com/id/doctr,3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,maplemage,http://steamcommunity.com/id/maplemage,4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [12]:
tipo_data = {"columna":[],"tipos_de_datos":[]} #genero un diccionario vacio para que almacene al bucle

for columna in data_reviews: #un bucle que va recorriendo 
    tipo_data["columna"].append(columna)
    tipo_data["tipos_de_datos"].append(data_reviews[columna].apply(type).unique())

analisis= pd.DataFrame(tipo_data)
analisis

Unnamed: 0,columna,tipos_de_datos
0,user_id,"[<class 'str'>, <class 'float'>]"
1,user_url,"[<class 'str'>, <class 'float'>]"
2,index,[<class 'int'>]
3,funny,"[<class 'str'>, <class 'float'>]"
4,posted,"[<class 'str'>, <class 'float'>]"
5,last_edited,"[<class 'str'>, <class 'float'>]"
6,item_id,"[<class 'str'>, <class 'float'>]"
7,helpful,"[<class 'str'>, <class 'float'>]"
8,recommend,"[<class 'bool'>, <class 'float'>]"
9,review,"[<class 'str'>, <class 'float'>]"


## Proceso de extracción de valores Duplicados o Nulos

### Descripción:
 En este DataFrame, el proceso de identificación y eliminación de valores nulos se lleva a cabo posteriormente a la normalización. La razón de esta secuencia es que la columna anidada “reviews” alberga una vasta cantidad de información valiosa. Al posponer la eliminación de valores nulos hasta después de la normalización, nos aseguramos de no descartar prematuramente datos que podrían ser esenciales para nuestro análisis. Este enfoque maximiza la integridad de los datos y permite una exploración más completa y precisa del conjunto de datos.

In [13]:
#La variable ‘duplicates’ almacena los resultados de la búsqueda para facilitar comparaciones y análisis posteriores, maximizando así la eficiencia del proceso de exploración de datos.
duplicates= data_reviews.loc[data_reviews.duplicated()]
duplicates

Unnamed: 0,user_id,user_url,index,funny,posted,last_edited,item_id,helpful,recommend,review


In [28]:
#eliminar valores duplicados
data_reviews = data_reviews.drop_duplicates(keep='first')
data_reviews

Unnamed: 0,user_id,user_url,index,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,,2011-11-05,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,1,,2011-07-15,,22200,No ratings yet,True,It's unique and worth a playthrough.
2,evcentric,http://steamcommunity.com/id/evcentric,2,,2011-04-21,,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,doctr,http://steamcommunity.com/id/doctr,3,,2014-06-24,,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,maplemage,http://steamcommunity.com/id/maplemage,4,,2013-09-08,,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...,...
25785,76561198306599751,http://steamcommunity.com/profiles/76561198306...,25794,,2014-07-28,,8500,1 of 4 people (25%) found this review helpful,False,Waste of time and money. Can't ever get the st...
25786,Ghoustik,http://steamcommunity.com/id/Ghoustik,25795,,2014-06-23,,238460,1 of 1 people (100%) found this review helpful,True,This game touched me in my specle place. 10/10...
25787,76561198310819422,http://steamcommunity.com/profiles/76561198310...,25796,1 person found this review funny,NaT,,391220,0 of 2 people (0%) found this review helpful,True,i liek
25788,76561198312638244,http://steamcommunity.com/profiles/76561198312...,25797,,NaT,,209870,0 of 3 people (0%) found this review helpful,True,blackman retribution


In [14]:
data_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      25799 non-null  object
 1   user_url     25799 non-null  object
 2   index        59333 non-null  int64 
 3   funny        59305 non-null  object
 4   posted       59305 non-null  object
 5   last_edited  59305 non-null  object
 6   item_id      59305 non-null  object
 7   helpful      59305 non-null  object
 8   recommend    59305 non-null  object
 9   review       59305 non-null  object
dtypes: int64(1), object(9)
memory usage: 4.5+ MB


In [15]:
value_nulls= data_reviews.isnull().sum()
value_nulls

user_id        33534
user_url       33534
index              0
funny             28
posted            28
last_edited       28
item_id           28
helpful           28
recommend         28
review            28
dtype: int64

In [16]:
data_reviews = data_reviews.dropna().reset_index(drop=True)
data_reviews

Unnamed: 0,user_id,user_url,index,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,evcentric,http://steamcommunity.com/id/evcentric,2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,doctr,http://steamcommunity.com/id/doctr,3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,maplemage,http://steamcommunity.com/id/maplemage,4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...,...
25785,76561198306599751,http://steamcommunity.com/profiles/76561198306...,25794,,"Posted July 28, 2014.",,8500,1 of 4 people (25%) found this review helpful,False,Waste of time and money. Can't ever get the st...
25786,Ghoustik,http://steamcommunity.com/id/Ghoustik,25795,,"Posted June 23, 2014.",,238460,1 of 1 people (100%) found this review helpful,True,This game touched me in my specle place. 10/10...
25787,76561198310819422,http://steamcommunity.com/profiles/76561198310...,25796,1 person found this review funny,Posted June 30.,,391220,0 of 2 people (0%) found this review helpful,True,i liek
25788,76561198312638244,http://steamcommunity.com/profiles/76561198312...,25797,,Posted June 2.,,209870,0 of 3 people (0%) found this review helpful,True,blackman retribution


## Cambiar formato de la fecha ubicada en la columna posted 

La columna de fecha actualmente muestra las fechas en el formato “Posted November 5, 2011”. Sin embargo, para facilitar las consultas y mantener la consistencia con otros conjuntos de datos, sería preferible que las fechas estuvieran en el formato “YYYY-MM-DD”.

In [17]:
#El primer paso consiste en extraer la fecha, que actualmente es una cadena de texto, de la columna ‘posted’. Posteriormente, podemos convertirla a un formato de fecha adecuado.
data_reviews['posted'] = data_reviews['posted'].str.extract(r'Posted ([\w\s\d,]+)')
data_reviews.head()

Unnamed: 0,user_id,user_url,index,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,,"November 5, 2011",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,1,,"July 15, 2011",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,evcentric,http://steamcommunity.com/id/evcentric,2,,"April 21, 2011",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,doctr,http://steamcommunity.com/id/doctr,3,,"June 24, 2014",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,maplemage,http://steamcommunity.com/id/maplemage,4,,"September 8, 2013",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [18]:
#Reemplazo la palabra Posted por espacio vacio
data_reviews['posted'] = data_reviews['posted'].replace({'Posted': ''}, regex=True)
data_reviews.head(3)

Unnamed: 0,user_id,user_url,index,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,,"November 5, 2011",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,1,,"July 15, 2011",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,evcentric,http://steamcommunity.com/id/evcentric,2,,"April 21, 2011",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...


In [19]:
#Transformo ahora la columna a tipo de dato datetime
data_reviews['posted'] = pd.to_datetime(data_reviews['posted'], errors='coerce')
data_reviews.head()

Unnamed: 0,user_id,user_url,index,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,,2011-11-05,,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,1,,2011-07-15,,22200,No ratings yet,True,It's unique and worth a playthrough.
2,evcentric,http://steamcommunity.com/id/evcentric,2,,2011-04-21,,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,doctr,http://steamcommunity.com/id/doctr,3,,2014-06-24,,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,maplemage,http://steamcommunity.com/id/maplemage,4,,2013-09-08,,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [20]:
# Calculamos cada columna del DataFrame con el fin de verificar que esté limpia
for col in data_reviews.columns:
    print(f"Columna: {col}")

    # Imprimimos la frecuencia de los valores en cada columna
    print(data_reviews[col].value_counts())

    # Contamos los valores faltantes en cada columna
    print(f"Valores faltantes: {data_reviews[col].isnull().sum()}")

Columna: user_id
user_id
76561198100326818    3
76561198045953692    3
blablabla174         3
76561198027488037    3
76561198051777058    3
                    ..
SakurasouNo          1
goneckahorse         1
coutlindo            1
superdedicated       1
LydiaMorley          1
Name: count, Length: 25477, dtype: int64
Valores faltantes: 0
Columna: user_url
user_url
http://steamcommunity.com/profiles/76561198100326818    3
http://steamcommunity.com/profiles/76561198045953692    3
http://steamcommunity.com/id/blablabla174               3
http://steamcommunity.com/profiles/76561198027488037    3
http://steamcommunity.com/profiles/76561198051777058    3
                                                       ..
http://steamcommunity.com/id/SakurasouNo                1
http://steamcommunity.com/id/goneckahorse               1
http://steamcommunity.com/id/coutlindo                  1
http://steamcommunity.com/id/superdedicated             1
http://steamcommunity.com/id/LydiaMorley             

In [23]:
data_reviews['posted'] = data_reviews['posted'].fillna('')

En esta linea corregimos errores asociados a la columna posted 

In [31]:

# 1. Eliminar filas con datos nulos en la columna 'posted'
data_reviews = data_reviews.dropna(subset=['posted'])

# 2. Eliminar filas duplicadas en el DataFrame
data_reviews = data_reviews.drop_duplicates()

# 3. Eliminar fechas fuera de rango en la columna 'posted'
fecha_min = pd.Timestamp('2000-01-01')  # Fecha mínima razonable
fecha_max = pd.Timestamp.today()  # Fecha máxima razonable
data_reviews = data_reviews[data_reviews['posted'].between(fecha_min, fecha_max)]

## Creación codigo de validacones de optimidad 

Para el caso de este dataset vimos errores en la columna posted gracias a la validación de parametros por lo se vio necesario volver a realizar la limpieza hasta tener condiciones optimas

In [32]:
def verificar_limpieza(df):
    # Verificar si hay datos duplicados
    duplicados = df.duplicated().any()
    
    # Verificar si hay datos nulos y en qué columnas
    nulos = df.isnull().any()
    columnas_nulos = nulos[nulos == True].index.tolist()
    
    # Si no hay datos duplicados ni nulos, la limpieza es óptima
    if not duplicados and not nulos.any():
        print("La limpieza del DataFrame es óptima.")
    else:
        print("La limpieza del DataFrame no es óptima.")
        if duplicados:
            print("Existen datos duplicados en el DataFrame.")
        if nulos.any():
            print(f"Existen datos nulos en las siguientes columnas: {columnas_nulos}")

# Usar la función para verificar la limpieza de data_reviews
verificar_limpieza(data_reviews)

La limpieza del DataFrame es óptima.


In [33]:
def verificar_columna(df, columna):
    # Verificar si hay datos nulos
    nulos = df[columna].isnull().sum()
    print(f"Valores nulos en '{columna}': {nulos}")
    
    # Verificar si hay datos duplicados
    duplicados = df[columna].duplicated().sum()
    print(f"Valores duplicados en '{columna}': {duplicados}")
    
    # Verificar el tipo de datos
    tipo_datos = df[columna].dtype
    print(f"Tipo de datos en '{columna}': {tipo_datos}")
    
    # Si la columna es de tipo object (cadena), verificar si hay cadenas vacías
    if tipo_datos == 'object':
        vacios = (df[columna] == '').sum()
        print(f"Cadenas vacías en '{columna}': {vacios}")
    
    # Si la columna es de tipo fecha, verificar si hay fechas fuera de un rango razonable
    if tipo_datos == 'datetime64[ns]':
        fecha_min = pd.Timestamp('2000-01-01')  # Fecha mínima razonable
        fecha_max = pd.Timestamp.today()  # Fecha máxima razonable
        fuera_rango = (~df[columna].between(fecha_min, fecha_max)).sum()
        print(f"Fechas fuera de rango en '{columna}': {fuera_rango}")

# Usar la función para verificar la columna 'posted' de data_reviews
verificar_columna(data_reviews, 'posted')

Valores nulos en 'posted': 0
Valores duplicados en 'posted': 18908
Tipo de datos en 'posted': datetime64[ns]
Fechas fuera de rango en 'posted': 0


## Guardar el dataset limpio 

In [34]:
# Copiamos el dataset para no afectar el original 
data_reviews = data_reviews.copy()

In [35]:
#   Guardamos en formato CSV
data_reviews.to_csv('steam_reviews_cleaned.csv', index=False)
#Guardar en formato json
data_reviews.to_json('steam_reviews_cleaned.json', orient='records', lines=True)
# Guardar parquet
data_reviews.to_parquet('steam_reviews_cleaned.parquet', index=False)