In [2]:
import pandas as pd
import utils as ut
import ast
import importlib

In [125]:
importlib.reload(ut)

<module 'utils' from '/home/mauro/HENRY FT17/Proyectos/PI 1/Steam-Rec-System/ETL/utils.py'>

In [126]:
# Especifica la ruta del archivo JSON
json_path = "../data/originals/user_reviews.json"

reviews = []
with open(json_path, 'r', encoding='utf-8') as f:
    # loop for insert data into list
    for line in f.readlines():
        reviews.append(ast.literal_eval(line))

reviews_df = pd.DataFrame(reviews)



#### The first record is reviewed to see its content and the names of the columns.

In [127]:
print(reviews_df.iloc[0])

user_id                                     76561197970982479
user_url    http://steamcommunity.com/profiles/76561197970...
reviews     [{'funny': '', 'posted': 'Posted November 5, 2...
Name: 0, dtype: object


In [128]:
# This step is applied to convert the lists into strings and thus be able to generate an overview of the data.
reviews_df['reviews'] = reviews_df['reviews'].apply(str)

In [129]:
ut.data_overview(reviews_df)


Total rows:  25799

Total full null rows:  0

Total duplicated rows: 313


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,user_id,[<class 'str'>],100.0,25799,0.0,0
1,user_url,[<class 'str'>],100.0,25799,0.0,0
2,reviews,[<class 'str'>],100.0,25799,0.0,0


#### Obtain a dataset with the information from the unnested 'reviews' column.

In [130]:
# Convertir la columna 'reviews' de cadena a una lista de diccionarios
reviews_df['reviews'] = reviews_df['reviews'].apply(ast.literal_eval)

# Obtener todas las claves únicas de los diccionarios en la columna 'reviews'
all_keys = set()
for review_list in reviews_df['reviews']:
    for review_dict in review_list:
        all_keys.update(review_dict.keys())

# Crear columnas adicionales en el DataFrame para cada clave única
for key in all_keys:
    reviews_df[key] = reviews_df['reviews'].apply(lambda x: [d.get(key, None) for d in x])

# Expandir las listas de diccionarios en nuevas filas
df_expanded = reviews_df.explode('reviews')



# Expandir las listas de valores en las columnas
for col in reviews_df.columns.difference(['user_id', 'user_url', 'reviews']):
    df_expanded[col] = df_expanded[col].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# Reorganizar las columnas para tener las originales seguidas de las nuevas
new_columns_order = list(reviews_df.columns) + [col for col in df_expanded.columns if col not in reviews_df.columns]
df_expanded = df_expanded[new_columns_order]


In [131]:
# Eliminar las columnas 'reviews', 'user_id' y 'user_url'
df_expanded.drop(['reviews', 'user_id', 'user_url'], axis=1, inplace=True)

In [132]:
ut.data_overview(df_expanded)


Total rows:  59333

Total full null rows:  28

Total duplicated rows: 33875


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,last_edited,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
1,item_id,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
2,funny,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
3,posted,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
4,helpful,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
5,recommend,"[<class 'bool'>, <class 'NoneType'>]",99.95,59305,0.05,28
6,review,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28


#### We proceed to remove nulls and duplicates.

In [133]:
df_expanded.dropna(inplace=True)
df_expanded.drop_duplicates(inplace=True)

In [134]:
ut.data_overview(df_expanded)


Total rows:  25457

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,last_edited,[<class 'str'>],100.0,25457,0.0,0
1,item_id,[<class 'str'>],100.0,25457,0.0,0
2,funny,[<class 'str'>],100.0,25457,0.0,0
3,posted,[<class 'str'>],100.0,25457,0.0,0
4,helpful,[<class 'str'>],100.0,25457,0.0,0
5,recommend,[<class 'bool'>],100.0,25457,0.0,0
6,review,[<class 'str'>],100.0,25457,0.0,0


#### The presence of string values 'None' in the columns is checked.

In [135]:
ut.check_none_values(df_expanded)

       Columna  Porcentaje None
0  last_edited              0.0
1      item_id              0.0
2        funny              0.0
3       posted              0.0
4      helpful              0.0
5    recommend              0.0
6       review              0.0


In [136]:
total_empty_records = ut.count_empty_strings(df_expanded)
total_empty_records

44786

#### Empty strings will be imputed with 'Not specified'.

In [137]:
df_expanded_imputed = df_expanded.replace("", "Not specified")

In [138]:
total_empty_records = ut.count_empty_strings(df_expanded_imputed)
total_empty_records

0

#### We are reviewing the 'last_edited' column to analyze how to represent the date. This column is crucial for the functionality to be developed.

In [139]:
unique_values = df_expanded_imputed['posted'].unique()
unique_values

array(['Posted November 5, 2011.', 'Posted June 24, 2014.',
       'Posted February 3.', ..., 'Posted May 22, 2013.',
       'Posted August 24, 2013.', 'Posted August 10.'], dtype=object)

In [140]:
unique_values_without_last_edited = df_expanded_imputed['posted'][~df_expanded_imputed['posted'].str.contains('Posted')].unique()
unique_values_without_last_edited

array([], dtype=object)

#### Note that all specified values in the 'last edited' column follow a particular format. If the year is not specified, it looks like this: "Last edited April 4". With a specified year, it looks like this: "Last edited June 24, 2014."

In [141]:
import re
def contar_fechas(df):
    # Inicializar contadores
    con_anio = 0
    sin_anio = 0

    # Iterar sobre las filas del DataFrame
    for indice, fila in df.iterrows():
        # Utilizar expresiones regulares para verificar el formato de la fecha
        match_con_anio = re.match(r'Posted (\w+ \d+, \d+)', fila['posted'])
        match_sin_anio = re.match(r'Posted (\w+ \d+)', fila['posted'])

        if match_con_anio:
            con_anio += 1
        elif match_sin_anio:
            sin_anio += 1
    # Imprimir resultados
    print(f'Fechas con año: {con_anio}')
    print(f'Fechas sin año: {sin_anio}')

# Llamar a la función con tu DataFrame
contar_fechas(df_expanded_imputed)



Fechas con año: 20823
Fechas sin año: 4634


#### Records with unspecified years in the 'Posted' column are being removed.

In [143]:
# Crear una máscara booleana para los registros con año
mask_con_anio = df_expanded_imputed['posted'].str.match(r'Posted (\w+ \d+, \d+)', na=False)

# Filtrar el DataFrame para mantener solo los registros con año
df_expanded_con_anio = df_expanded_imputed[mask_con_anio]


In [145]:
contar_fechas(df_expanded_con_anio)


Fechas con año: 20823
Fechas sin año: 0


#### Basic ETL completed, data types have been successfully adjusted, and there are no null values or duplicates. It is exported to CSV to facilitate the subsequent handling of the dataset.


##### The column 'reviews,'contain a list with json dict. They will be kept as strings and handled accordingly as needed.

In [146]:
path = r'../data/generated/'
df_expanded_con_anio.to_csv(path + 'reviews.csv', index=False)