# ETL

In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import re
import ast

## steam_games.json.gz

In [142]:
# Read the gzip-compressed, line-delimited JSON file of games into a DataFrame.
# pandas opens and decompresses the file itself, so no explicit file handle is needed.
df_games = pd.read_json(
    r'E:\Data Science\PI1\Datasets\steam_games.json.gz',
    lines=True,
    compression='gzip',
    encoding='utf-8',
)


In [143]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 11.9+ MB


In [144]:
# Remove every row that contains at least one null value.
# NOTE(review): this runs before the unused columns are dropped, so a null in a
# column that is discarded later (e.g. 'publisher') also removes the row — confirm this is intended.
df_games = df_games.dropna()

In [145]:
#Se revisan los valores nulos
df_games.isnull().sum()

publisher       0
genres          0
app_name        0
title           0
url             0
release_date    0
tags            0
reviews_url     0
specs           0
price           0
early_access    0
id              0
developer       0
dtype: int64

In [146]:
#Vemos las columnas que tenemos para ver cuales van a ser utiles
df_games.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')

In [147]:
# Drop the columns that are not going to be used
unused_columns = ['url', 'publisher', 'reviews_url', 'specs', 'early_access']
df_games = df_games.drop(columns=unused_columns)

In [148]:
# Count the rows in which 'title' and 'app_name' hold the same value
df_games['title'].eq(df_games['app_name']).sum()

22179

In [149]:
# Most titles are identical to the app names, so the 'title' column is going to be dropped;
# this counts how many non-null titles there are.

df_games['title'].notnull().sum()

22530

In [150]:
# 'title' duplicates 'app_name' for most rows, so it is removed
df_games = df_games.drop(columns=['title'])

In [151]:
# Cast 'id' to a 64-bit integer so it can be worked with later on
df_games['id'] = df_games['id'].astype('int64')


In [152]:
def _parse_price(value):
    """Turn one raw 'price' entry into a number.

    Non-string values are returned unchanged. A string that contains a dollar
    amount (e.g. '$4.99', with optional thousands separators) yields that amount
    as a float; any other string is treated as free and yields 0.0.
    """
    if not isinstance(value, str):
        return value
    # Only a figure preceded by '$' is trusted as a price; other digits in the text are ignored.
    amount = re.search(r'\$\s*(\d[\d,]*(?:\.\d+)?)', value)
    if amount is None:
        return 0.0
    return float(amount.group(1).replace(',', ''))


# Replace textual prices with numbers so the column can be cast to float afterwards.
# Previously every string became 0, which also zeroed strings that carry a real amount.
df_games['price'] = df_games['price'].apply(_parse_price)

In [153]:
# Check that no strings remain in the 'price' column so its dtype can be changed
type_counts = df_games['price'].apply(type).value_counts()

print(type_counts)

price
<class 'float'>    22530
Name: count, dtype: int64


In [154]:
# Cast the 'price' column to float64
df_games['price'] = df_games['price'].astype('float64')

In [155]:
# Check which Python types are present in the 'release_date' column
type_counts = df_games['release_date'].apply(type).value_counts()

print(type_counts)

release_date
<class 'str'>    22530
Name: count, dtype: int64


In [158]:
# Remove the 'SOON' and 'SOON™' values from 'release_date': they mark games that are yet to be released.
# List of the values to remove
valores = ['SOON', 'SOON™']

# Keep only the rows whose 'release_date' is not one of those values.
# .copy() makes the result an independent frame, so adding the 'release_year'
# column to it afterwards does not raise SettingWithCopyWarning.
df_games = df_games[~df_games['release_date'].isin(valores)].copy()

In [160]:
import re

def extract_four_numbers(s):
    """Extract a four-digit year from a release-date string.

    Args:
        s: The raw release date. Non-string values are returned unchanged.

    Returns:
        The leading four digits when the (stripped) text starts with them;
        otherwise the last standalone four-digit run found anywhere in the text;
        otherwise, when no such run exists, the last four characters of the
        original string (the historical fallback).
    """
    if not isinstance(s, str):
        return s
    text = s.strip()
    if re.match(r'\d{4}', text):
        return text[:4]
    # Look for a run of exactly four digits, so surrounding whitespace or a
    # trailing note such as "(beta)" no longer yields a truncated non-year.
    years = re.findall(r'(?<!\d)\d{4}(?!\d)', text)
    if years:
        return years[-1]
    return s[-4:]

# Apply the function to the 'release_date' column and store the result in a new 'release_year' column
df_games['release_year'] = df_games['release_date'].apply(extract_four_numbers)


In [164]:
# The year has been extracted into 'release_year', so the raw date column is removed
df_games = df_games.drop(columns=['release_date'])


In [169]:
def _title_case(name):
    """Return `name` in title case when it is a string; any other value is passed through untouched."""
    if not isinstance(name, str):
        return name
    return name.title()


# Normalize the capitalization of the app names
df_games['app_name'] = df_games['app_name'].apply(_title_case)

In [170]:
df_games

Unnamed: 0,genres,app_name,tags,price,id,developer,release_year
88310,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,"[Strategy, Action, Indie, Casual, Simulation]",4.99,761140,Kotoshiro,2018
88311,"[Free to Play, Indie, RPG, Strategy]",Ironbound,"[Free to Play, Strategy, Indie, RPG, Card Game...",0.00,643980,Secret Level SRL,2018
88312,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,"[Free to Play, Simulation, Sports, Casual, Ind...",0.00,670290,Poolians.com,2017
88313,"[Action, Adventure, Casual]",弹炸人2222,"[Action, Adventure, Casual]",0.99,767400,彼岸领域,2017
88315,"[Action, Adventure, Simulation]",Battle Royale Trainer,"[Action, Adventure, Simulation, FPS, Shooter, ...",3.99,772540,Trickjump Games Ltd,2018
...,...,...,...,...,...,...,...
120439,"[Action, Adventure, Casual, Indie]",Kebab It Up!,"[Action, Indie, Casual, Violent, Adventure]",1.99,745400,Bidoniera Games,2018
120440,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,"[Strategy, Indie, Casual, Simulation]",1.99,773640,"Nikita ""Ghost_RUS""",2018
120441,"[Casual, Indie, Strategy]",Logistical: South Africa,"[Strategy, Indie, Casual]",4.99,733530,Sacada,2018
120442,"[Indie, Racing, Simulation]",Russian Roads,"[Indie, Simulation, Racing]",1.99,610660,Laush Dmitriy Sergeevich,2018


In [171]:
# Write the Parquet file needed by the later stages.
# A forward slash is used as separator: a backslash is only a path separator on
# Windows, and this matches how the user_reviews Parquet path is written.
df_games.to_parquet('Datasets/steam_games.parquet')

## user_items

In [113]:
# Parse the gzip-compressed file line by line with ast.literal_eval
# (presumably each line is a Python-literal dict rather than strict JSON — verify against the file).
data1 = []

with gzip.open(r'E:\Data Science\PI1\Datasets\users_items.json.gz', 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            data1.append(ast.literal_eval(line))
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError, not only ValueError, for malformed or
            # blank lines; report the offending line and carry on with the next one.
            print(f"Error en la línea: {line}")

df_users = pd.DataFrame(data1)

In [193]:
# Explode the 'items' column so that each element of the list gets a row of its own
df_expandido=df_users.explode('items')

In [194]:
# Normalize the exploded 'items' values into a flat DataFrame of their own
df_descompuesto=pd.json_normalize(df_expandido['items'])

In [195]:
# Drop the columns that are not going to be used
df_descompuesto = df_descompuesto.drop(columns=['item_name', 'playtime_2weeks'])

In [196]:
# Join the two previous DataFrames side by side.
# Both indexes are reset first so that the rows are paired by position.
df_final = pd.concat(
    [frame.reset_index(drop=True) for frame in (df_expandido, df_descompuesto)],
    axis=1,
)

In [197]:
# Drop the 'items' column that was expanded
df_user_items=df_final.drop('items', axis=1)

In [198]:
# 'steam_id' and 'user_url' are not used downstream, so they are removed
df_user_items = df_user_items.drop(columns=['steam_id', 'user_url'])

In [200]:
# Discard every row that has at least one missing value
df_user_items=df_user_items.dropna()

In [201]:
df_user_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5153209 entries, 0 to 5170013
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   item_id           object 
 3   playtime_forever  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 196.6+ MB


In [208]:
# Cast 'item_id' to a 64-bit integer
df_user_items['item_id'] = df_user_items['item_id'].astype('int64')

In [209]:
#Se verifican valores faltantes
df_user_items.isnull().sum()

user_id             0
items_count         0
item_id             0
playtime_forever    0
dtype: int64

In [212]:
df_user_items


Unnamed: 0,user_id,items_count,item_id,playtime_forever
0,76561197970982479,277,10,6.0
1,76561197970982479,277,20,0.0
2,76561197970982479,277,30,7.0
3,76561197970982479,277,40,0.0
4,76561197970982479,277,50,0.0
...,...,...,...,...
5170009,76561198329548331,7,346330,0.0
5170010,76561198329548331,7,373330,0.0
5170011,76561198329548331,7,388490,3.0
5170012,76561198329548331,7,521570,4.0


In [210]:
# Persist the cleaned user/item table.
# A forward slash is used as separator: a backslash is only a path separator on
# Windows, and this matches how the user_reviews Parquet path is written.
df_user_items.to_parquet('Datasets/users_items.parquet')

Se resamplea este dataset para que no haya problemas en el deploy, por lo que se revisan los valores atípicos en las horas de juego.

In [260]:
# Work on a copy so the sample can be filtered without modifying df_user_items
df_user_items_sample=df_user_items.copy()

In [261]:
df_user_items_sample.count()

user_id             5153209
items_count         5153209
item_id             5153209
playtime_forever    5153209
dtype: int64

In [262]:
df_user_items_sample['playtime_forever'].describe()

count    5.153209e+06
mean     9.914951e+02
std      5.418204e+03
min      0.000000e+00
25%      0.000000e+00
50%      3.400000e+01
75%      3.550000e+02
max      6.427730e+05
Name: playtime_forever, dtype: float64

In [263]:
# Keep only the rows whose playtime is strictly below the third quartile.
# The threshold is computed from the unfiltered df_user_items instead of being a
# number copied from a describe() output, so it follows the data and the cell
# gives the same result when it is re-run.
playtime_q3 = df_user_items['playtime_forever'].quantile(0.75)
df_user_items_sample = df_user_items_sample[df_user_items_sample['playtime_forever'] < playtime_q3]

In [264]:
df_user_items_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3863832 entries, 0 to 5170013
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   item_id           int64  
 3   playtime_forever  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 147.4+ MB


In [265]:
# Persist the reduced sample.
# A forward slash is used as separator: a backslash is only a path separator on
# Windows, and this matches how the user_reviews Parquet path is written.
df_user_items_sample.to_parquet('Datasets/users_items_sample.parquet')

## user_reviews

In [266]:
# Parse the gzip-compressed file line by line with ast.literal_eval
# (presumably each line is a Python-literal dict rather than strict JSON — verify against the file).
data2 = []

with gzip.open(r'E:\Data Science\PI1\Datasets\user_reviews.json.gz', 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            data2.append(ast.literal_eval(line))
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError, not only ValueError, for malformed or
            # blank lines; report the offending line and carry on with the next one.
            print(f"Error en la línea: {line}")

df_reviews = pd.DataFrame(data2)

In [290]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  25799 non-null  object
 1   reviews  25799 non-null  object
dtypes: object(2)
memory usage: 403.2+ KB


In [267]:
# 'user_url' is not needed for the analysis, so it is removed
df_reviews = df_reviews.drop(columns=['user_url'])


In [275]:
# Explode the 'reviews' column so that each element of the list gets a row of its own
df_expandido = df_reviews.explode('reviews')


In [276]:
# Normalize the exploded 'reviews' values into a flat DataFrame of their own
df_descompuesto = pd.json_normalize(df_expandido['reviews'])


In [277]:
# Drop the columns that are not going to be useful
unused_review_columns = ['funny', 'posted', 'last_edited', 'helpful']
df_descompuesto = df_descompuesto.drop(columns=unused_review_columns)

In [278]:
# Join the exploded user rows with the normalized review columns side by side.
# Both indexes are reset first so that the rows are paired by position.
df_final = pd.concat(
    [frame.reset_index(drop=True) for frame in (df_expandido, df_descompuesto)],
    axis=1,
)

In [279]:
# Drop the 'reviews' column that was expanded
df_user_reviews = df_final.drop('reviews', axis=1)

In [291]:
df_user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    59333 non-null  object
 1   item_id    59305 non-null  object
 2   recommend  59305 non-null  object
 3   review     59305 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [289]:
df_user_reviews.isnull().sum()

user_id       0
item_id      28
recommend    28
review       28
dtype: int64

In [287]:
df_user_reviews[df_user_reviews['item_id'].isnull()]

Unnamed: 0,user_id,item_id,recommend,review
137,gdxsd,,,
177,76561198094224872,,,
2559,76561198021575394,,,
10080,cmuir37,,,
13767,Jaysteeny,,,
15493,ML8989,,,
19184,76561198079215291,,,
20223,76561198079342142,,,
25056,76561198061996985,,,
26257,76561198108286351,,,


In [293]:
# Those rows have null values in all the review columns, so they are removed
df_user_reviews = df_user_reviews.dropna()

In [295]:
#Se verifica que no haya valores nulos
df_user_reviews.isnull().sum()

user_id      0
item_id      0
recommend    0
review       0
dtype: int64

In [296]:
from textblob import TextBlob

def sentiment(review, threshold=0.5):
    """Classify a review as negative (0), neutral (1) or positive (2).

    The label is derived from TextBlob's polarity score for the text.

    Args:
        review: The review text to analyse.
        threshold: Absolute polarity that must be exceeded for a review to be
            labelled positive or negative. The default of 0.5 reproduces the
            previous `1 + round(polarity)` behaviour, in which a polarity of
            exactly +/-0.5 rounds to 0 and therefore counts as neutral.

    Returns:
        int: 0 when polarity < -threshold, 2 when polarity > threshold, else 1.
    """
    polarity = TextBlob(review).sentiment.polarity
    if polarity > threshold:
        return 2
    if polarity < -threshold:
        return 0
    return 1

Aquí se realizará el análisis de sentimiento de manera muy simple. Hay que tener en cuenta que se pueden usar otros métodos, pero TextBlob lo simplifica demasiado.

In [297]:
# Store the result of sentiment() for each review text in a new 'sentiment' column
df_user_reviews['sentiment']=df_user_reviews['review'].apply(sentiment)

In [300]:
# The 'review' column is dropped: it is no longer needed now that the sentiment analysis is available
df_user_reviews = df_user_reviews.drop(columns=['review'])

In [301]:
df_user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59305 entries, 0 to 59332
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    59305 non-null  object
 1   item_id    59305 non-null  object
 2   recommend  59305 non-null  object
 3   sentiment  59305 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [302]:
# Write the user reviews table (with the 'sentiment' column) to a Parquet file
df_user_reviews.to_parquet('Datasets/user_reviews.parquet')