In [1]:
import pandas as pd
import utils as ut
import ast
import importlib

In [112]:
importlib.reload(ut)

<module 'utils' from '/home/mauro/HENRY FT17/Proyectos/PI 1/Steam-Rec-System/ETL/utils.py'>

In [113]:
json_path = "../data/originals/user_reviews.json"

reviews = []
with open(json_path, 'r', encoding='utf-8') as f:
    # Loop to insert data into list
    for line in f.readlines():
        reviews.append(ast.literal_eval(line))

reviews_df = pd.DataFrame(reviews)



#### The first record is reviewed to see its content and the names of the columns.

In [114]:
print(reviews_df.iloc[0])

user_id                                     76561197970982479
user_url    http://steamcommunity.com/profiles/76561197970...
reviews     [{'funny': '', 'posted': 'Posted November 5, 2...
Name: 0, dtype: object


In [115]:
# This step is applied to convert the lists into strings and thus be able to generate an overview of the data.
reviews_df['reviews'] = reviews_df['reviews'].apply(str)

In [116]:
ut.data_overview(reviews_df)


Total rows:  25799

Total full null rows:  0

Total duplicated rows: 313


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,user_id,[<class 'str'>],100.0,25799,0.0,0
1,user_url,[<class 'str'>],100.0,25799,0.0,0
2,reviews,[<class 'str'>],100.0,25799,0.0,0


#### Obtain a dataset with the information from the unnested 'reviews' column.

In [117]:
# Convert the 'reviews' column from string to a list of dictionaries
reviews_df['reviews'] = reviews_df['reviews'].apply(ast.literal_eval)

# Get all unique keys from dictionaries in the 'reviews' column
all_keys = set()
for review_list in reviews_df['reviews']:
    for review_dict in review_list:
        all_keys.update(review_dict.keys())

# Create additional columns in the DataFrame for each unique key
for key in all_keys:
    reviews_df[key] = reviews_df['reviews'].apply(lambda x: [d.get(key, None) for d in x])

# Expand the dictionaries in the 'reviews' column into new rows
df_expanded = reviews_df.explode('reviews')

# Expand the lists of values in the columns
for col in reviews_df.columns.difference(['user_id', 'user_url', 'reviews']):
    df_expanded[col] = df_expanded[col].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# Reorganize the columns to have the original ones followed by the new ones
new_columns_order = list(reviews_df.columns) + [col for col in df_expanded.columns if col not in reviews_df.columns]
df_expanded = df_expanded[new_columns_order]


In [118]:
# Drop the columns 'reviews', 'user_id', and 'user_url'
df_expanded.drop(['reviews', 'user_id', 'user_url'], axis=1, inplace=True)

In [119]:
ut.data_overview(df_expanded)


Total rows:  59333

Total full null rows:  28

Total duplicated rows: 33875


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,funny,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
1,item_id,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
2,recommend,"[<class 'bool'>, <class 'NoneType'>]",99.95,59305,0.05,28
3,helpful,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
4,review,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
5,posted,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28
6,last_edited,"[<class 'str'>, <class 'NoneType'>]",99.95,59305,0.05,28


#### We proceed to remove nulls and duplicates.

In [120]:
df_expanded.dropna(inplace=True)
df_expanded.drop_duplicates(inplace=True)

In [121]:
ut.data_overview(df_expanded)


Total rows:  25457

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,funny,[<class 'str'>],100.0,25457,0.0,0
1,item_id,[<class 'str'>],100.0,25457,0.0,0
2,recommend,[<class 'bool'>],100.0,25457,0.0,0
3,helpful,[<class 'str'>],100.0,25457,0.0,0
4,review,[<class 'str'>],100.0,25457,0.0,0
5,posted,[<class 'str'>],100.0,25457,0.0,0
6,last_edited,[<class 'str'>],100.0,25457,0.0,0


#### The presence of string values 'None' in the columns is checked.

In [122]:
ut.check_none_values(df_expanded)

       Columna  Porcentaje None
0        funny              0.0
1      item_id              0.0
2    recommend              0.0
3      helpful              0.0
4       review              0.0
5       posted              0.0
6  last_edited              0.0


In [123]:
total_empty_records = ut.count_empty_strings(df_expanded)
total_empty_records

44786

#### Empty strings will be imputed with 'Not specified'.

In [124]:
df_expanded_imputed = df_expanded.replace("", "Not specified")

In [125]:
total_empty_records = ut.count_empty_strings(df_expanded_imputed)
total_empty_records

0

#### We are reviewing the 'last_edited' column to analyze how to represent the date. This column is crucial for the functionality to be developed.

In [126]:
unique_values = df_expanded_imputed['posted'].unique()
unique_values

array(['Posted November 5, 2011.', 'Posted June 24, 2014.',
       'Posted February 3.', ..., 'Posted May 22, 2013.',
       'Posted August 24, 2013.', 'Posted August 10.'], dtype=object)

In [127]:
unique_values_without_last_edited = df_expanded_imputed['posted'][~df_expanded_imputed['posted'].str.contains('Posted')].unique()
unique_values_without_last_edited

array([], dtype=object)

#### Note that all specified values in the 'posted' column follow a particular format. If the year is not specified, it looks like this: "Posted April 4". With a specified year, it looks like this: "Posted June 24, 2014."

In [128]:
import re

def count_dates(df):
    # Initialize counters
    with_year = 0
    without_year = 0

    # Iterate over the rows of the DataFrame
    for index, row in df.iterrows():
        # Use regular expressions to check the date format
        match_with_year = re.match(r'Posted (\w+ \d+, \d+)', row['posted'])
        match_without_year = re.match(r'Posted (\w+ \d+)', row['posted'])

        if match_with_year:
            with_year += 1
        elif match_without_year:
            without_year += 1

    # Print results
    print(f'Dates with year: {with_year}')
    print(f'Dates without year: {without_year}')

# Call the function with your DataFrame
count_dates(df_expanded_imputed)

Dates with year: 20823
Dates without year: 4634


#### Records with unspecified years in the 'Posted' column are being removed.

In [139]:
# Create a boolean mask for records with a year
mask_with_year = df_expanded_imputed['posted'].str.match(r'Posted (\w+ \d+, \d+)', na=False)

# Filter the DataFrame to keep only records with a year
df_with_year = df_expanded_imputed.loc[mask_with_year]


In [140]:
count_dates(df_with_year)


Dates with year: 20823
Dates without year: 0


In [141]:
df_with_year['posted'] = pd.to_datetime(df_with_year['posted'], format='%B %d, %Y', errors='coerce')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_year['posted'] = pd.to_datetime(df_with_year['posted'], format='%B %d, %Y', errors='coerce')


In [142]:
df_with_year.iloc[1250]

funny                          46 people found this review funny
item_id                                                   291550
recommend                                                   True
helpful          45 of 56 people (80%) found this review helpful
review         Everything about the game is so...crispy. The ...
posted                                                       NaT
last_edited                                        Not specified
Name: 1653, dtype: object

In [144]:
import datetime


filtered_df = df_with_year[df_with_year['posted'] == datetime.strptime("November 5, 2011.", '%M %d, %Y')]
filtered_df

AttributeError: module 'datetime' has no attribute 'strptime'

In [43]:
ut.data_overview(df_with_year)


Total rows:  20823

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,funny,[<class 'str'>],100.0,20823,0.0,0
1,item_id,[<class 'str'>],100.0,20823,0.0,0
2,recommend,[<class 'bool'>],100.0,20823,0.0,0
3,helpful,[<class 'str'>],100.0,20823,0.0,0
4,review,[<class 'str'>],100.0,20823,0.0,0
5,posted,[<class 'datetime.date'>],100.0,20823,0.0,0
6,last_edited,[<class 'str'>],100.0,20823,0.0,0


#### Basic ETL completed, data types have been successfully adjusted, and there are no null values or duplicates. It is exported to CSV to facilitate the subsequent handling of the dataset.


##### The 'last_edited' column contains dates, but they won't be processed as they are not relevant for the queries to be developed.

In [44]:
path = r'../data/generated/'
df_with_year.to_csv(path + 'reviews.csv', index=False)