# ETL + basic EDA

In [1]:
import pandas as pd
import utils as ut
import importlib
import json

In [2]:
importlib.reload(ut)

<module 'utils' from '/home/mauro/HENRY DATA FT17/Etapa Proyectos/Repo PI 1/Steam-Rec-System/ETL/utils.py'>

## Dataset to dataframe

In [3]:
path = r'../data/originals/steam_games.json'

with open(path) as file:
    content = file.read()

json_obj = [json.loads(line) for line in content.split('\n') if line.strip()]

games_df = pd.json_normalize(json_obj)

sample_df = games_df.sample(n=3, random_state=42)
print(sample_df)

      publisher genres app_name title  url release_date tags reviews_url  \
76613       NaN    NaN      NaN   NaN  NaN          NaN  NaN         NaN   
58237       NaN    NaN      NaN   NaN  NaN          NaN  NaN         NaN   
73020       NaN    NaN      NaN   NaN  NaN          NaN  NaN         NaN   

      specs price early_access   id developer  
76613   NaN   NaN          NaN  NaN       NaN  
58237   NaN   NaN          NaN  NaN       NaN  
73020   NaN   NaN          NaN  NaN       NaN  


#### There are columns that have the data type "list"; it is converted to "string" to analyze duplicates (Hash not applicable to lists).


In [4]:
# Convert the columns from lists to strings.
for column in games_df.columns: 
    if any(isinstance(elemento, list) for elemento in games_df[column]):   
        games_df[column] = games_df[column].apply(lambda x: str(x) if isinstance(x, list) else x)

In [5]:
# This function wouldn't work without the previous step of converting lists to strings
ut.data_overview(games_df)


Total rows:  120445

Total full null rows:  88310

Total duplicated rows: 88309


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,publisher,"[<class 'float'>, <class 'str'>]",20.0,24083,80.0,96362
1,genres,"[<class 'float'>, <class 'str'>]",23.95,28852,76.05,91593
2,app_name,"[<class 'float'>, <class 'str'>]",26.68,32133,73.32,88312
3,title,"[<class 'float'>, <class 'str'>]",24.98,30085,75.02,90360
4,url,"[<class 'float'>, <class 'str'>]",26.68,32135,73.32,88310
5,release_date,"[<class 'float'>, <class 'str'>]",24.96,30068,75.04,90377
6,tags,"[<class 'float'>, <class 'str'>]",26.54,31972,73.46,88473
7,reviews_url,"[<class 'float'>, <class 'str'>]",26.68,32133,73.32,88312
8,specs,"[<class 'float'>, <class 'str'>]",26.12,31465,73.88,88980
9,price,"[<class 'float'>, <class 'str'>]",25.54,30758,74.46,89687


#### Remove records with 80% null data.

In [6]:
# 80%
threshold = 0.8

# Calculate the minimum number of non-null values required per row
threshold_value = int((1 - threshold) * len(games_df.columns))

games_df.dropna(thresh=threshold_value, inplace=True)

In [7]:
ut.data_overview(games_df)


Total rows:  32135

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,publisher,"[<class 'str'>, <class 'float'>]",74.94,24083,25.06,8052
1,genres,"[<class 'str'>, <class 'float'>]",89.78,28852,10.22,3283
2,app_name,"[<class 'str'>, <class 'float'>]",99.99,32133,0.01,2
3,title,"[<class 'str'>, <class 'float'>]",93.62,30085,6.38,2050
4,url,[<class 'str'>],100.0,32135,0.0,0
5,release_date,"[<class 'str'>, <class 'float'>]",93.57,30068,6.43,2067
6,tags,"[<class 'str'>, <class 'float'>]",99.49,31972,0.51,163
7,reviews_url,"[<class 'str'>, <class 'float'>]",99.99,32133,0.01,2
8,specs,"[<class 'str'>, <class 'float'>]",97.92,31465,2.08,670
9,price,"[<class 'float'>, <class 'str'>]",95.71,30758,4.29,1377


#### Rows with null values are removed if the column has 95% or more non-null elements.

In [8]:
games_df.dropna(subset=['app_name'], inplace=True)
games_df.dropna(subset=['reviews_url'], inplace=True)
games_df.dropna(subset=['id'], inplace=True)
games_df.dropna(subset=['tags'], inplace=True)
games_df.dropna(subset=['price'], inplace=True)
games_df.dropna(subset=['specs'], inplace=True)

In [9]:
ut.data_overview(games_df)


Total rows:  29976

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,publisher,"[<class 'str'>, <class 'float'>]",75.5,22632,24.5,7344
1,genres,"[<class 'str'>, <class 'float'>]",91.17,27330,8.83,2646
2,app_name,[<class 'str'>],100.0,29976,0.0,0
3,title,"[<class 'str'>, <class 'float'>]",93.56,28045,6.44,1931
4,url,[<class 'str'>],100.0,29976,0.0,0
5,release_date,"[<class 'str'>, <class 'float'>]",93.54,28041,6.46,1935
6,tags,[<class 'str'>],100.0,29976,0.0,0
7,reviews_url,[<class 'str'>],100.0,29976,0.0,0
8,specs,[<class 'str'>],100.0,29976,0.0,0
9,price,"[<class 'float'>, <class 'str'>]",100.0,29976,0.0,0


### "Checking for possible values in the price column."

In [10]:
games_df['price'][games_df['price'].notnull()].unique()

array([4.99, 'Free To Play', 'Free to Play', 0.99, 2.99, 3.99, 9.99,
       18.99, 29.99, 'Free', 10.99, 1.59, 14.99, 1.99, 59.99, 8.99, 6.99,
       7.99, 39.99, 19.99, 7.49, 12.99, 5.99, 2.49, 15.99, 1.25, 24.99,
       17.99, 61.99, 3.49, 11.99, 13.99, 'Free Demo', 'Play for Free!',
       34.99, 74.76, 1.49, 32.99, 99.99, 14.95, 69.99, 16.99, 79.99,
       49.99, 5.0, 44.99, 13.98, 109.99, 149.99, 771.71, 'Install Now',
       21.99, 89.99, 'Play WARMACHINE: Tactics Demo', 0.98, 139.92, 4.29,
       64.99, 'Free Mod', 54.99, 74.99, 'Install Theme', 0.89,
       'Third-party', 0.5, 'Play Now', 299.99, 1.29, 119.99, 3.0, 15.0,
       5.49, 23.99, 20.99, 1.39, 'Free HITMAN™ Holiday Pack', 36.99, 4.49,
       2.0, 4.0, 9.0, 234.99, 1.95, 1.5, 6.66, 27.99, 10.49, 26.99,
       399.99, 31.99, 20.0, 40.0, 3.33, 129.99, 199.99, 22.99, 38.85,
       71.7, 59.95, 995.0, 27.49, 3.39, 6.0, 19.95, 499.99, 16.06, 4.68,
       131.4, 44.98, 202.76, 1.0, 2.3, 0.95, 172.24, 249.99, 2.97, 10.96,
   

#### The unique non-numeric values are filtered from the 'price' column.

In [11]:
def get_non_numeric_values(dataframe, column):
    non_numeric_values = dataframe[column].loc[~dataframe[column].apply(pd.to_numeric, errors='coerce').notnull()]    
    unique_non_numeric_values = non_numeric_values.drop_duplicates().to_dict()

    nan_count = dataframe[column].isna().sum()

    return unique_non_numeric_values, nan_count

column_to_check = 'price'
non_numeric_values_dict, nan_count = get_non_numeric_values(games_df, column_to_check)


print("NaN Qty:", nan_count)

print("\n\nNon numeric values:")
non_numeric_values_dict

NaN Qty: 0


Non numeric values:


{88311: 'Free To Play',
 88312: 'Free to Play',
 88321: 'Free',
 89230: 'Free Demo',
 89279: 'Play for Free!',
 90715: 'Install Now',
 91181: 'Play WARMACHINE: Tactics Demo',
 91624: 'Free Mod',
 92142: 'Install Theme',
 92228: 'Third-party',
 92336: 'Play Now',
 93899: 'Free HITMAN™ Holiday Pack',
 111044: 'Play the Demo',
 117676: 'Free Movie'}

In [12]:
games_df['price'].replace(non_numeric_values_dict.values() , 0, inplace=True)

#### The non-numeric values are successfully removed from the 'price' column.

In [13]:
column_to_check = 'price'
non_numeric_values_dict = get_non_numeric_values(games_df, column_to_check)

non_numeric_values_dict

({}, 0)

In [14]:
games_df['price'] = games_df['price'].round(2)

ut.data_overview(games_df)


Total rows:  29976

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,publisher,"[<class 'str'>, <class 'float'>]",75.5,22632,24.5,7344
1,genres,"[<class 'str'>, <class 'float'>]",91.17,27330,8.83,2646
2,app_name,[<class 'str'>],100.0,29976,0.0,0
3,title,"[<class 'str'>, <class 'float'>]",93.56,28045,6.44,1931
4,url,[<class 'str'>],100.0,29976,0.0,0
5,release_date,"[<class 'str'>, <class 'float'>]",93.54,28041,6.46,1935
6,tags,[<class 'str'>],100.0,29976,0.0,0
7,reviews_url,[<class 'str'>],100.0,29976,0.0,0
8,specs,[<class 'str'>],100.0,29976,0.0,0
9,price,[<class 'float'>],100.0,29976,0.0,0


### Imputed Values 'Not specified' for str

In [15]:
# The rows that still have a percentage of null elements are checked.

columns_to_check = ['publisher', 'genres', 'title', 'release_date', 'developer']

# We will check in the columns whose data is a string to see if the floats found are only NaN elements.

float_rows = games_df[columns_to_check].map(lambda x: isinstance(x, (float, int)))
nan_check = games_df[columns_to_check][float_rows].map(lambda x: pd.isna(x))

# Summary of cases
summary_table = pd.DataFrame({
    'Column': nan_check.columns,
    'All NaN': nan_check.all(),   
})

print(summary_table)

                    Column  All NaN
publisher        publisher     True
genres              genres     True
title                title     True
release_date  release_date     True
developer        developer     True


In [16]:
# In columns where the data type should always be a string, it was identified that the found floats were simply intrusive NaNs, so they are being deleted.

columns_to_inpute = ['publisher', 'genres', 'title', 'release_date', 'developer']

games_df[columns_to_inpute] = games_df[columns_to_inpute].fillna("Not specified")


In [17]:
ut.data_overview(games_df)


Total rows:  29976

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,publisher,[<class 'str'>],100.0,29976,0.0,0
1,genres,[<class 'str'>],100.0,29976,0.0,0
2,app_name,[<class 'str'>],100.0,29976,0.0,0
3,title,[<class 'str'>],100.0,29976,0.0,0
4,url,[<class 'str'>],100.0,29976,0.0,0
5,release_date,[<class 'str'>],100.0,29976,0.0,0
6,tags,[<class 'str'>],100.0,29976,0.0,0
7,reviews_url,[<class 'str'>],100.0,29976,0.0,0
8,specs,[<class 'str'>],100.0,29976,0.0,0
9,price,[<class 'float'>],100.0,29976,0.0,0


#### String checks are performed due to the potential presence of the word 'None'

In [18]:
ut.check_none_values(games_df)

         Columna  Porcentaje None
0      publisher         0.030024
1         genres         0.000000
2       app_name         0.000000
3          title         0.000000
4            url         0.000000
5   release_date         0.000000
6           tags         0.000000
7    reviews_url         0.000000
8          specs         0.000000
9          price         0.000000
10  early_access         0.000000
11            id         0.000000
12     developer         0.000000


In [19]:
def print_rows_with_none_values(df):
    none_rows = df[df.apply(lambda row: any(x == "None" for x in row), axis=1)]
    print(none_rows)

# A look is taken at the columns where 'publisher' is 'None'.
print_rows_with_none_values(games_df)

       publisher                                             genres  \
91623       None                                          ['Indie']   
93368       None  ['Action', 'Indie', 'Massively Multiplayer', '...   
95281       None                                         ['Casual']   
98116       None                      ['Adventure', 'Indie', 'RPG']   
103125      None                ['Action', 'Free to Play', 'Indie']   
106359      None            ['Action', 'Adventure', 'Indie', 'RPG']   
109236      None                   ['Action', 'Adventure', 'Indie']   
115284      None                         ['Simulation', 'Strategy']   
116737      None                      ['Action', 'Casual', 'Indie']   

                                     app_name  \
91623   Glorkian Warrior: The Trials Of Glork   
93368                      Divergence: Online   
95281              Interstellar Logistics Inc   
98116                      SuperCluster: Void   
103125                                  Scra

In [20]:
games_df['publisher'] = games_df['publisher'].replace("None", "Not specified")

#### It is checked that the 'None' values have disappeared.

In [21]:
print_rows_with_none_values(games_df)

Empty DataFrame
Columns: [publisher, genres, app_name, title, url, release_date, tags, reviews_url, specs, price, early_access, id, developer]
Index: []


In [22]:
ut.check_none_values(games_df)

         Columna  Porcentaje None
0      publisher              0.0
1         genres              0.0
2       app_name              0.0
3          title              0.0
4            url              0.0
5   release_date              0.0
6           tags              0.0
7    reviews_url              0.0
8          specs              0.0
9          price              0.0
10  early_access              0.0
11            id              0.0
12     developer              0.0


In [23]:
ut.data_overview(games_df)


Total rows:  29976

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,publisher,[<class 'str'>],100.0,29976,0.0,0
1,genres,[<class 'str'>],100.0,29976,0.0,0
2,app_name,[<class 'str'>],100.0,29976,0.0,0
3,title,[<class 'str'>],100.0,29976,0.0,0
4,url,[<class 'str'>],100.0,29976,0.0,0
5,release_date,[<class 'str'>],100.0,29976,0.0,0
6,tags,[<class 'str'>],100.0,29976,0.0,0
7,reviews_url,[<class 'str'>],100.0,29976,0.0,0
8,specs,[<class 'str'>],100.0,29976,0.0,0
9,price,[<class 'float'>],100.0,29976,0.0,0


In [26]:
# Verificamos los valores actuales en 'release_date' que no son convertibles a datetime
non_convertible_dates = []
for date_str in games_df['release_date']:
    try:
        pd.to_datetime(date_str)
    except ValueError:
        non_convertible_dates.append(date_str)

# Mostramos algunos ejemplos de fechas no convertibles
if non_convertible_dates:
    print("Ejemplos de fechas no convertibles:")
    for date_str in non_convertible_dates[:5]:
        print(date_str)
else:
    print("Todos los valores en 'release_date' son convertibles.")

Ejemplos de fechas no convertibles:
Not specified
Not specified
Not specified
Not specified
Not specified


#### Basic ETL completed, data types have been successfully adjusted, and there are no more null values or duplicates. It is exported to CSV to facilitate the subsequent handling of the dataset.


##### According to the data dictionary, the columns 'genres,' 'specs,' and 'tags' contain lists. They will be kept as strings and handled accordingly as needed.

In [24]:
path = r'../data/generated/'
games_df.to_csv(path + 'steam_games.csv', index=False)

In [25]:
games_df.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')