In [1]:
import pandas as pd
import utils as ut
import ast
import json
import importlib

In [2]:
importlib.reload(ut)

<module 'utils' from '/home/mauro/HENRY FT17/Proyectos/PI 1/Steam-Rec-System/ETL/utils.py'>

In [3]:
items = []

with open(r"../data/originals/users_items.json", 'r', encoding='utf-8') as f:
    # Loop for insert data into list
    for line in f.readlines():
        data = ast.literal_eval(line)          
        items.append(data)

# Convert the list of dictionaries to a DataFrame
items_df = pd.DataFrame(items)

In [4]:
items_df.columns

Index(['user_id', 'items_count', 'steam_id', 'user_url', 'items'], dtype='object')

#### The first record is reviewed to see its content and the names of the columns.

In [5]:
print(items_df.iloc[0])


user_id                                        76561197970982479
items_count                                                  277
steam_id                                       76561197970982479
user_url       http://steamcommunity.com/profiles/76561197970...
items          [{'item_id': '10', 'item_name': 'Counter-Strik...
Name: 0, dtype: object


In [6]:
# This step is applied to convert the lists into strings and thus be able to generate an overview of the data.
items_df['items'] = items_df['items'].apply(str)

In [7]:
# The conversion is verified.
print(items_df['items'].apply(type).unique())

[<class 'str'>]


In [8]:
ut.data_overview(items_df)


Total rows:  88310

Total full null rows:  0

Total duplicated rows: 657


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,user_id,[<class 'str'>],100.0,88310,0.0,0
1,items_count,[<class 'int'>],100.0,88310,0.0,0
2,steam_id,[<class 'str'>],100.0,88310,0.0,0
3,user_url,[<class 'str'>],100.0,88310,0.0,0
4,items,[<class 'str'>],100.0,88310,0.0,0


#### There are no nulls, so we proceed to remove duplicates.

In [9]:
items_df.drop_duplicates(inplace=True)

In [10]:
ut.data_overview(items_df)


Total rows:  87653

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,user_id,[<class 'str'>],100.0,87653,0.0,0
1,items_count,[<class 'int'>],100.0,87653,0.0,0
2,steam_id,[<class 'str'>],100.0,87653,0.0,0
3,user_url,[<class 'str'>],100.0,87653,0.0,0
4,items,[<class 'str'>],100.0,87653,0.0,0


#### The presence of string values 'None' in the columns is checked.

In [11]:
ut.check_none_values(items_df)

       Columna  Porcentaje None
0      user_id              0.0
1  items_count              0.0
2     steam_id              0.0
3     user_url              0.0
4        items              0.0


#### Process to unnest the column of items, which contains nested data.

In [12]:
# Create an empty DataFrame to store the results
new_columns = ["item_id", "item_name", "playtime_forever", "playtime_2weeks"]
items_column_df = pd.DataFrame(columns=items_df.columns.tolist() + new_columns)

# Iterate over the rows of the original DataFrame
for index, row in items_df.iterrows():
    # Evaluate the string in the "items" column as a literal Python expression
    try:
        item_list = ast.literal_eval(row["items"])
    except (ValueError, SyntaxError):
        item_list = []

    # Create new columns with the unnested values
    if isinstance(item_list, list) and item_list:
        item_data = item_list[0]  # Take the first element of the list
        for col in new_columns:
            items_column_df.at[index, col] = item_data.get(col, None)
    else:
        # If the list is empty or not valid, set null values in the new columns
        for col in new_columns:
            items_column_df.at[index, col] = None

In [13]:
items_column_df.columns

Index(['user_id', 'items_count', 'steam_id', 'user_url', 'items', 'item_id',
       'item_name', 'playtime_forever', 'playtime_2weeks'],
      dtype='object')

In [14]:
ut.data_overview(items_column_df)


Total rows:  87653

Total full null rows:  16714

Total duplicated rows: 50717


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,user_id,[<class 'float'>],0.0,0,100.0,87653
1,items_count,[<class 'float'>],0.0,0,100.0,87653
2,steam_id,[<class 'float'>],0.0,0,100.0,87653
3,user_url,[<class 'float'>],0.0,0,100.0,87653
4,items,[<class 'float'>],0.0,0,100.0,87653
5,item_id,"[<class 'str'>, <class 'NoneType'>]",80.93,70939,19.07,16714
6,item_name,"[<class 'str'>, <class 'NoneType'>]",80.93,70939,19.07,16714
7,playtime_forever,"[<class 'int'>, <class 'NoneType'>]",80.93,70939,19.07,16714
8,playtime_2weeks,"[<class 'int'>, <class 'NoneType'>]",80.93,70939,19.07,16714


#### Obtain a dataset with the information from the original dataset and the unnested 'items' columns.

In [29]:
final_df = items_df.copy()


final_df["item_id"] = items_column_df["item_id"]
final_df["item_name"] = items_column_df["item_name"]
final_df["playtime_forever"] = items_column_df["playtime_forever"]
final_df["playtime_2weeks"] = items_column_df["playtime_2weeks"]

In [30]:
ut.data_overview(final_df)


Total rows:  87653

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,user_id,[<class 'str'>],100.0,87653,0.0,0
1,items_count,[<class 'int'>],100.0,87653,0.0,0
2,steam_id,[<class 'str'>],100.0,87653,0.0,0
3,user_url,[<class 'str'>],100.0,87653,0.0,0
4,items,[<class 'str'>],100.0,87653,0.0,0
5,item_id,"[<class 'str'>, <class 'NoneType'>]",80.93,70939,19.07,16714
6,item_name,"[<class 'str'>, <class 'NoneType'>]",80.93,70939,19.07,16714
7,playtime_forever,"[<class 'int'>, <class 'NoneType'>]",80.93,70939,19.07,16714
8,playtime_2weeks,"[<class 'int'>, <class 'NoneType'>]",80.93,70939,19.07,16714


In [31]:
ut.check_none_values(final_df)

            Columna  Porcentaje None
0           user_id              0.0
1       items_count              0.0
2          steam_id              0.0
3          user_url              0.0
4             items              0.0
5           item_id              0.0
6         item_name              0.0
7  playtime_forever              0.0
8   playtime_2weeks              0.0


#### Duplicate values are removed from the final dataset. The null values were imputed as 'Not specified' for str cases and the mean for numeric cases

In [32]:
# Imputar valores nulos en las columnas item_id e item_name con "Not specified"
final_df['item_id'].fillna('Not specified', inplace=True)
final_df['item_name'].fillna('Not specified', inplace=True)

# Imputar valores nulos en las columnas playtime_forever y playtime_2weeks con la media
mean_playtime_forever = final_df['playtime_forever'].mean()
mean_playtime_2weeks = final_df['playtime_2weeks'].mean()

final_df['playtime_forever'].fillna(mean_playtime_forever, inplace=True)
final_df['playtime_2weeks'].fillna(mean_playtime_2weeks, inplace=True)

final_df = final_df.drop_duplicates()
final_df = final_df.reset_index(drop=True)

In [37]:
ut.data_overview(final_df)


Total rows:  87653

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,user_id,[<class 'str'>],100.0,87653,0.0,0
1,items_count,[<class 'int'>],100.0,87653,0.0,0
2,steam_id,[<class 'str'>],100.0,87653,0.0,0
3,user_url,[<class 'str'>],100.0,87653,0.0,0
4,items,[<class 'str'>],100.0,87653,0.0,0
5,item_id,[<class 'str'>],100.0,87653,0.0,0
6,item_name,[<class 'str'>],100.0,87653,0.0,0
7,playtime_forever,[<class 'float'>],100.0,87653,0.0,0
8,playtime_2weeks,[<class 'float'>],100.0,87653,0.0,0


In [36]:
print(final_df.iloc[0])


print(final_df['item_id'].dtypes)


user_id                                             76561197970982479
items_count                                                       277
steam_id                                            76561197970982479
user_url            http://steamcommunity.com/profiles/76561197970...
items               [{'item_id': '10', 'item_name': 'Counter-Strik...
item_id                                                            10
item_name                                              Counter-Strike
playtime_forever                                                  6.0
playtime_2weeks                                                   0.0
Name: 0, dtype: object
object


In [39]:
# Verifica qué valores de la columna 'item_id' no son strings
non_str_values = final_df['item_id'].apply(lambda x: type(x) != str)

# Filtra el DataFrame para mostrar solo las filas con valores no string en 'item_id'
non_str_items = final_df[non_str_values]

# Muestra el resultado
print(non_str_items)

Empty DataFrame
Columns: [user_id, items_count, steam_id, user_url, items, item_id, item_name, playtime_forever, playtime_2weeks]
Index: []


#### Basic ETL completed, data types have been successfully adjusted, and there are no null values or duplicates. The 'items' column has been unnested. It is exported to CSV to facilitate the subsequent handling of the dataset.

In [34]:
path = r'../data/generated/'
final_df.to_csv(path + 'items.csv', index=False)