# ETL

In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import re
import ast

## steam_games.json.gz

In [142]:
# Read the gzip-compressed, line-delimited JSON dump of Steam games into a DataFrame.
with gzip.open(r'E:\Data Science\PI1\Datasets\steam_games.json.gz', mode='rt', encoding='utf-8') as games_file:
    df_games = pd.read_json(games_file, lines=True)


In [143]:
# Summarise column dtypes and non-null counts of the raw games table.
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 11.9+ MB


In [144]:
# Drop every row that has at least one missing value.
# NOTE(review): this also discards rows whose only gaps are in columns that are
# removed a few cells later (e.g. 'publisher') — consider dropna(subset=[...])
# restricted to the columns that are actually kept.
df_games.dropna(inplace=True)

In [145]:
# Confirm that no missing values remain in any column.
df_games.isna().sum()

publisher       0
genres          0
app_name        0
title           0
url             0
release_date    0
tags            0
reviews_url     0
specs           0
price           0
early_access    0
id              0
developer       0
dtype: int64

In [146]:
# List the available columns to decide which ones are useful.
df_games.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')

In [147]:
# Remove the columns that are not used in later stages.
df_games.drop(columns=['url', 'publisher', 'reviews_url', 'specs', 'early_access'], inplace=True)

In [148]:
# Count the rows where 'title' and 'app_name' hold exactly the same value.
df_games['title'].eq(df_games['app_name']).sum()

22179

In [149]:
# Most titles are identical to the app name (compare the match count with the
# non-null total computed here), so the 'title' column is redundant and is dropped.
df_games['title'].notna().sum()

22530

In [150]:
# 'title' duplicates 'app_name', so it is removed.
df_games.drop(columns=['title'], inplace=True)

In [151]:
# Cast 'id' to a 64-bit integer so it can be used as a key later on.
df_games['id'] = df_games['id'].astype('int64')


In [152]:
def _price_to_number(value):
    """Convert one raw ``price`` entry into a number.

    Non-string values are returned unchanged. A string that carries an
    explicit dollar amount (anything shaped like ``$<number>``) yields that
    amount as a float; any other string (free-to-play style labels, etc.)
    yields 0, as before.
    """
    if not isinstance(value, str):
        return value
    amount = re.search(r'\$\s*(\d[\d,]*(?:\.\d+)?)', value)
    if amount is None:
        return 0
    # Thousands separators are stripped before the numeric conversion.
    return float(amount.group(1).replace(',', ''))


# Replace textual prices with numbers: previously every string became 0, which
# also zeroed out strings that embed a real dollar amount.
df_games['price'] = df_games['price'].apply(_price_to_number)

In [153]:
# Verify that no strings are left in 'price' before changing its dtype:
# tally the Python type of every value in the column.
type_counts = df_games['price'].map(type).value_counts()
print(type_counts)

price
<class 'float'>    22530
Name: count, dtype: int64


In [154]:
# Store 'price' as float64.
df_games['price'] = df_games['price'].astype('float64')

In [155]:
# Inspect which Python types occur in the 'release_date' column.
type_counts = df_games['release_date'].map(type).value_counts()
print(type_counts)

release_date
<class 'str'>    22530
Name: count, dtype: int64


In [158]:
# Remove games that have not been released yet: their 'release_date' holds the
# placeholder 'SOON' or 'SOON™' instead of a date.
valores = ['SOON', 'SOON™']

# Keep only the rows whose 'release_date' is not one of the placeholders.
# .copy() detaches the result from the original frame so that the column
# assignments performed afterwards do not raise SettingWithCopyWarning.
df_games = df_games[~df_games['release_date'].isin(valores)].copy()

In [160]:
import re

def extract_four_numbers(s):
    """Extract the four-digit year from a release-date string.

    The lookup order keeps the original behaviour for well-formed dates:
    a string that starts with four digits yields those digits, otherwise a
    string that ends with four digits yields those. As a fallback, the first
    standalone four-digit run anywhere in the text is used.

    Parameters
    ----------
    s : str
        Raw release-date text. Surrounding whitespace is ignored.

    Returns
    -------
    str or None
        The year as a four-character string, or None when ``s`` is not a
        string or contains no four-digit run (previously the last four
        characters were returned even when they were not digits).
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if re.match(r'\d{4}', s):
        return s[:4]
    if re.search(r'\d{4}$', s):
        return s[-4:]
    # Fallback for dates with trailing text: a run of exactly four digits.
    found = re.search(r'(?<!\d)\d{4}(?!\d)', s)
    return found.group(0) if found else None

# Derive a new 'release_year' column by running extract_four_numbers on every
# 'release_date' value.
df_games['release_year'] = df_games['release_date'].map(extract_four_numbers)


In [164]:
# 'release_date' has been replaced by 'release_year', so it is removed.
df_games = df_games.drop(columns=['release_date'])


In [169]:
# Title-case every string in 'app_name'; non-string values pass through untouched.
df_games['app_name'] = df_games['app_name'].map(
    lambda name: name if not isinstance(name, str) else name.title()
)

In [170]:
# Display the cleaned games table.
df_games

Unnamed: 0,genres,app_name,tags,price,id,developer,release_year
88310,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,"[Strategy, Action, Indie, Casual, Simulation]",4.99,761140,Kotoshiro,2018
88311,"[Free to Play, Indie, RPG, Strategy]",Ironbound,"[Free to Play, Strategy, Indie, RPG, Card Game...",0.00,643980,Secret Level SRL,2018
88312,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,"[Free to Play, Simulation, Sports, Casual, Ind...",0.00,670290,Poolians.com,2017
88313,"[Action, Adventure, Casual]",弹炸人2222,"[Action, Adventure, Casual]",0.99,767400,彼岸领域,2017
88315,"[Action, Adventure, Simulation]",Battle Royale Trainer,"[Action, Adventure, Simulation, FPS, Shooter, ...",3.99,772540,Trickjump Games Ltd,2018
...,...,...,...,...,...,...,...
120439,"[Action, Adventure, Casual, Indie]",Kebab It Up!,"[Action, Indie, Casual, Violent, Adventure]",1.99,745400,Bidoniera Games,2018
120440,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,"[Strategy, Indie, Casual, Simulation]",1.99,773640,"Nikita ""Ghost_RUS""",2018
120441,"[Casual, Indie, Strategy]",Logistical: South Africa,"[Strategy, Indie, Casual]",4.99,733530,Sacada,2018
120442,"[Indie, Racing, Simulation]",Russian Roads,"[Indie, Simulation, Racing]",1.99,610660,Laush Dmitriy Sergeevich,2018


In [171]:
# Write the Parquet file needed by the later stages.
# A forward slash is used as the separator: it resolves to the same location on
# Windows, also works on other platforms (a backslash would become part of the
# file name there), and matches how the reviews Parquet path is written.
df_games.to_parquet('Datasets/steam_games.parquet')

## users_items.json.gz

In [113]:
# Each line of the file is a Python-literal dict (not strict JSON), so it is
# parsed with ast.literal_eval rather than json.loads.
data1 = []

with gzip.open(r'E:\Data Science\PI1\Datasets\users_items.json.gz', 'rt', encoding='utf-8') as file:
    for line in file:
        # Blank lines carry no record; skip them quietly.
        if not line.strip():
            continue
        try:
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not only ValueError) on malformed
            # text; report the offending line and keep loading the rest.
            print(f"Error en la línea: {line}")
            continue
        data1.append(json_data)

df_users = pd.DataFrame(data1)

In [21]:
# Display the loaded users table; 'items' holds a list of dicts per user.
df_users

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


In [22]:
# Produce one row per element of each user's 'items' list.
df_expandido = df_users.explode(column='items')

In [24]:
# Flatten every item dict into its own set of columns.
df_descompuesto = pd.json_normalize(data=df_expandido['items'])

In [27]:
# Remove the item columns that will not be used.
df_descompuesto.drop(columns=['item_name', 'playtime_2weeks'], inplace=True)

In [29]:
# Place the user columns and the flattened item columns side by side; both
# indexes are reset first so that rows are paired by position.
df_final = pd.concat(
    [
        df_expandido.reset_index(drop=True),
        df_descompuesto.reset_index(drop=True),
    ],
    axis='columns',
)

In [44]:
# The raw 'items' dicts are redundant once flattened.
df_user_items = df_final.drop(columns='items')

In [45]:
# Discard the user metadata columns that are not needed.
df_user_items.drop(columns=['steam_id', 'items_count', 'user_url'], inplace=True)

In [47]:
# Persist the flattened user/item table as Parquet.
df_user_items.to_parquet(r'E:\Data Science\PI1\Datasets\users_items.parquet')

## user_reviews.json.gz

In [3]:
# Each line of the file is a Python-literal dict (not strict JSON), so it is
# parsed with ast.literal_eval rather than json.loads.
data2 = []

with gzip.open(r'E:\Data Science\PI1\Datasets\user_reviews.json.gz', 'rt', encoding='utf-8') as file:
    for line in file:
        # Blank lines carry no record; skip them quietly.
        if not line.strip():
            continue
        try:
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not only ValueError) on malformed
            # text; report the offending line and keep loading the rest.
            print(f"Error en la línea: {line}")
            continue
        data2.append(json_data)

df_reviews = pd.DataFrame(data2)

In [4]:
# 'user_url' is not needed for the reviews table.
df_reviews.drop(columns='user_url', inplace=True)


In [5]:
# Produce one row per element of each user's 'reviews' list.
df_expandido = df_reviews.explode(column='reviews')


In [6]:
# Flatten every review dict into its own set of columns.
df_descompuesto = pd.json_normalize(data=df_expandido['reviews'])


In [7]:
# Remove the review columns that will not be useful.
df_descompuesto.drop(columns=['funny', 'posted', 'last_edited'], inplace=True)

In [8]:
# Place the user columns and the flattened review columns side by side; both
# indexes are reset first so that rows are paired by position.
df_final = pd.concat(
    [
        df_expandido.reset_index(drop=True),
        df_descompuesto.reset_index(drop=True),
    ],
    axis='columns',
)

In [9]:
# The raw 'reviews' dicts are redundant once flattened.
df_user_reviews = df_final.drop(columns='reviews')

In [10]:
#preprocesamiento de la columna 'review'
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from textblob import TextBlob

In [11]:
# Download the NLTK data packages used by this notebook's text preprocessing
# (tokenizer models, WordNet, and the stop-word lists).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\juan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\juan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Create the lemmatizer and the set of English stop words.
# NOTE(review): stop_words is not referenced by any of the preprocessing code
# visible in this notebook — stop words are never actually removed.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [14]:
# Build 'preprocessed_review' from 'review' in a single pipeline:
#   1. lower-case the text,
#   2. cast to str (missing reviews become the text 'nan'),
#   3. strip punctuation,
#   4. tokenize,
#   5. lemmatize every token so the result can feed the sentiment analysis.
df_user_reviews['preprocessed_review'] = (
    df_user_reviews['review']
    .str.lower()
    .astype(dtype='str')
    .apply(lambda text: text.translate(str.maketrans('', '', string.punctuation)))
    .apply(word_tokenize)
    .apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
)

In [15]:
# Text preprocessing
def preprocess_review(review):
    """Turn one review string into a list of lemmatized tokens.

    The text is lower-cased, stripped of every character in
    ``string.punctuation``, tokenized with ``word_tokenize`` and each token is
    passed through ``lemmatizer.lemmatize``.
    """
    without_punctuation = review.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    return [lemmatizer.lemmatize(token) for token in word_tokenize(without_punctuation)]

In [16]:
# Preprocess every review; missing reviews are marked with the sentinel value 1.
df_user_reviews['preprocessed_review'] = df_user_reviews['review'].apply(
    lambda text: preprocess_review(text) if not pd.isnull(text) else 1
)

In [18]:
# Sentiment analysis
def sentiment_analysis(preprocess_review):
    """Score one review as 0, 1 or 2 from its TextBlob polarity.

    Parameters
    ----------
    preprocess_review : str, list of str, or missing marker
        Review text, or a list/tuple of tokens (re-joined with spaces), or a
        marker for a missing review: the sentinel ``1``, ``None``, NaN, or any
        other non-text value.

    Returns
    -------
    int
        ``round(polarity) + 1``. Missing or blank reviews yield 1; previously
        token lists and None/NaN made TextBlob raise.

    NOTE(review): with Python's round(), only polarities beyond +/-0.5 leave
    the middle class, so most reviews score 1 — confirm this threshold is the
    intended one.
    """
    # Token lists produced by the preprocessing step are turned back into text.
    if isinstance(preprocess_review, (list, tuple)):
        preprocess_review = ' '.join(str(token) for token in preprocess_review)

    # Sentinel 1, None, NaN, other non-text values and blank text are neutral.
    if not isinstance(preprocess_review, str) or not preprocess_review.strip():
        return 1

    analysis = TextBlob(preprocess_review)
    return round(analysis.sentiment.polarity) + 1

In [20]:
# Inspect the preprocessed reviews.
df_user_reviews['preprocessed_review']

0        [simple, yet, with, great, replayability, in, ...
1                 [it, unique, and, worth, a, playthrough]
2        [great, atmosphere, the, gunplay, can, be, a, ...
3        [i, know, what, you, think, when, you, see, th...
4        [for, a, simple, it, actually, not, all, that,...
                               ...                        
59328    [a, must, have, classic, from, steam, definite...
59329    [this, game, is, a, perfect, remake, of, the, ...
59330    [had, so, much, fun, plaing, this, and, collec...
59331                                                  [d]
59332                                   [so, much, fun, d]
Name: preprocessed_review, Length: 59333, dtype: object

In [21]:
# Overwrite 'preprocessed_review' with the lower-cased review text, cast to str
# (missing reviews become the text 'nan').
df_user_reviews['preprocessed_review'] = (
    df_user_reviews['review'].str.lower().astype('str')
)

In [22]:
# Run the sentiment analysis over 'preprocessed_review' and store the scores in
# a new 'sentiment' column.
df_user_reviews['sentiment'] = df_user_reviews['preprocessed_review'].map(sentiment_analysis)

Unnamed: 0,user_id,item_id,helpful,recommend,review,preprocessed_review,sentiment
0,76561197970982479,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,simple yet with great replayability. in my opi...,1
1,76561197970982479,22200,No ratings yet,True,It's unique and worth a playthrough.,it's unique and worth a playthrough.,1
2,76561197970982479,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,great atmosphere. the gunplay can be a bit chu...,1
3,js41637,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,i know what you think when you see this title ...,1
4,js41637,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,for a simple (it's actually not all that simpl...,1
...,...,...,...,...,...,...,...
59328,76561198312638244,70,No ratings yet,True,a must have classic from steam definitely wort...,a must have classic from steam definitely wort...,1
59329,76561198312638244,362890,No ratings yet,True,this game is a perfect remake of the original ...,this game is a perfect remake of the original ...,1
59330,LydiaMorley,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,had so much fun plaing this and collecting res...,1
59331,LydiaMorley,730,No ratings yet,True,:D,:d,1


In [33]:
# Persist the reviews table (with the sentiment column) as Parquet.
# NOTE(review): this path is relative to the working directory, while the
# read-back below uses an absolute path — confirm both point to the same file.
df_user_reviews.to_parquet('Datasets/user_reviews.parquet')


In [29]:
# Read the Parquet file back and display it as a sanity check.
df_prueba_reviews = pd.read_parquet(r'E:\Data Science\PI1\Datasets\user_reviews.parquet')
df_prueba_reviews

Unnamed: 0,user_id,item_id,helpful,recommend,review,preprocessed_review,sentiment
0,76561197970982479,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,simple yet with great replayability. in my opi...,1
1,76561197970982479,22200,No ratings yet,True,It's unique and worth a playthrough.,it's unique and worth a playthrough.,1
2,76561197970982479,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,great atmosphere. the gunplay can be a bit chu...,1
3,js41637,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,i know what you think when you see this title ...,1
4,js41637,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,for a simple (it's actually not all that simpl...,1
...,...,...,...,...,...,...,...
59328,76561198312638244,70,No ratings yet,True,a must have classic from steam definitely wort...,a must have classic from steam definitely wort...,1
59329,76561198312638244,362890,No ratings yet,True,this game is a perfect remake of the original ...,this game is a perfect remake of the original ...,1
59330,LydiaMorley,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,had so much fun plaing this and collecting res...,1
59331,LydiaMorley,730,No ratings yet,True,:D,:d,1
