# ETL

In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import re
import ast

## steam_games.json.gz

In [5]:
with gzip.open(r'E:\Data Science\PI1\Datasets\steam_games.json.gz','rt',encoding='utf-8') as f:
    df_games= pd.read_json(f, lines=True)


In [6]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 11.9+ MB


In [8]:
#Se eliminan todas las filas que estan vacías por completo
df_games.dropna(how='all', axis=0,inplace=True)

In [10]:
#Se revisan los valores nulos
df_games.isnull().sum()

publisher       8052
genres          3283
app_name           2
title           2050
url                0
release_date    2067
tags             163
reviews_url        2
specs            670
price           1377
early_access       0
id                 2
developer       3299
dtype: int64

In [12]:
df_games.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')

In [13]:
#Se eleiminan las columnas que no se van a utilizar 
df_games.drop(['url', 'publisher', 'reviews_url','specs', 'price', 'early_access'], axis=1, inplace=True)

In [15]:
#Se comparan los valores de las columnas 'title' y 'app_name' para ver si son iguales
(df_games['title']==df_games['app_name']).sum()

29530

In [16]:
#Debido a que la mayoría de los titulos son iguales a los nombres de las apps, se decide eliminar la columna 'title' 

df_games['title'].notnull().sum()

30085

In [17]:
df_games.drop(['title'], axis=1, inplace=True)

In [19]:
# Se crea el parquet necesario para las posteriores etapas
df_games.to_parquet(r'E:\Data Science\PI1\Datasets\steam_games.parquet')

## user_items

In [20]:
data1 = []

with gzip.open(r'E:\Data Science\PI1\Datasets\users_items.json.gz', 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
            data1.append(json_data)
        except ValueError as e:
            print(f"Error en la línea: {line}")
            continue

df_users = pd.DataFrame(data1)

In [21]:
df_users

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


In [22]:
df_expandido=df_users.explode('items')

In [24]:
df_descompuesto=pd.json_normalize(df_expandido['items'])

In [27]:
#Se elimina las columnas que no se van a utilizar
df_descompuesto.drop(['item_name','playtime_2weeks'],axis=1,inplace=True)

In [29]:
df_final=pd.concat([df_expandido.reset_index(drop=True), df_descompuesto.reset_index(drop=True)], axis=1)

In [44]:
df_user_items=df_final.drop('items', axis=1)

In [45]:
df_user_items.drop(['steam_id','items_count','user_url'],axis=1,inplace=True)

In [47]:
df_user_items.to_parquet(r'E:\Data Science\PI1\Datasets\users_items.parquet')

## user_reviews

In [2]:
data2=[]

with gzip.open(r'E:\Data Science\PI1\Datasets\user_reviews.json.gz', 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
            data2.append(json_data)
        except ValueError as e:
            print(f"Error en la línea: {line}")
            continue

df_reviews = pd.DataFrame(data2)

In [3]:
df_reviews.drop('user_url',axis=1,inplace=True)


In [4]:
df_expandido = df_reviews.explode('reviews')


In [5]:
df_descompuesto = pd.json_normalize(df_expandido['reviews'])


In [6]:
#Se eliminan las columnas que no van a ser útiles
df_descompuesto.drop(['funny','posted','last_edited'], axis=1, inplace=True)

In [7]:
df_final = pd.concat([df_expandido.reset_index(drop=True), df_descompuesto.reset_index(drop=True)], axis=1)

In [8]:
df_user_reviews = df_final.drop('reviews', axis=1)

In [62]:
#preprocesamiento de la columna 'review'
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [11]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\juan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\juan\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [12]:
#Se define el lematizador y las stop words 
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [64]:
#Se pasana a minúsculas todas las reviews en una nueva columna llamada 'prepocesed_review'
df_user_reviews['preprocessed_review'] = df_user_reviews['review'].str.lower()

#Se eliminan los signos de puntuación
df_user_reviews['preprocessed_review'] = df_user_reviews['preprocessed_review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# Se tokeniza la columna 'prepocesed_review'
df_user_reviews['preprocessed_review']=df_user_reviews['preprocessed_review'].apply(word_tokenize)

#se lematiza la columna 'preprocesed_review'para poder utilizarla en el análisis de sentimientos
df_user_reviews['preprocessed_review']=df_user_reviews['preprocessed_review'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [65]:
#Análisis de sentimientos

In [70]:
df_user_reviews['preprocessed_review'] 

0        [simple, yet, with, great, replayability, in, ...
1                 [it, unique, and, worth, a, playthrough]
2        [great, atmosphere, the, gunplay, can, be, a, ...
3        [i, know, what, you, think, when, you, see, th...
4        [for, a, simple, it, actually, not, all, that,...
                               ...                        
59328    [a, must, have, classic, from, steam, definite...
59329    [this, game, is, a, perfect, remake, of, the, ...
59330    [had, so, much, fun, plaing, this, and, collec...
59331                                                  [d]
59332                                   [so, much, fun, d]
Name: preprocessed_review, Length: 59333, dtype: object

In [9]:
from textblob import TextBlob

Unnamed: 0,user_id,item_id,helpful,recommend,review
0,76561197970982479,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...
59328,76561198312638244,70,No ratings yet,True,a must have classic from steam definitely wort...
59329,76561198312638244,362890,No ratings yet,True,this game is a perfect remake of the original ...
59330,LydiaMorley,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59331,LydiaMorley,730,No ratings yet,True,:D


In [61]:
df_user_reviews.to_parquet(r'E:\Data Science\PI1\Datasets\user_reviews.parquet')

# Endpoints

def PlayTimeGenre( genero : str ): Debe devolver año con mas horas jugadas para dicho género.
Ejemplo de retorno: {"Año de lanzamiento con más horas jugadas para Género X" : 2013}

In [None]:
def PlaytimeGenre(genero:str)
    
    return

def UserForGenre( genero : str ): Debe devolver el usuario que acumula más horas jugadas para el género dado y una lista de la acumulación de horas jugadas por año.

Ejemplo de retorno: {"Usuario con más horas jugadas para Género X" : us213ndjss09sdf, "Horas jugadas":[{Año: 2013, Horas: 203}, {Año: 2012, Horas: 100}, {Año: 2011, Horas: 23}]}

In [None]:
def UserForGenre(genero:str):
    
    return

def UsersRecommend( año : int ): Devuelve el top 3 de juegos MÁS recomendados por usuarios para el año dado. (reviews.recommend = True y comentarios positivos/neutrales)

Ejemplo de retorno: [{"Puesto 1" : X}, {"Puesto 2" : Y},{"Puesto 3" : Z}]

In [None]:
def UsersRecommend(año:int):
    
    return  

def UsersWorstDeveloper( año : int ): Devuelve el top 3 de desarrolladoras con juegos MENOS recomendados por usuarios para el año dado. (reviews.recommend = False y comentarios negativos)

Ejemplo de retorno: [{"Puesto 1" : X}, {"Puesto 2" : Y},{"Puesto 3" : Z}]

In [None]:
def UserWorstDeveloper(año:int):
    
    return

def sentiment_analysis( empresa desarrolladora : str ): Según la empresa desarrolladora, se devuelve un diccionario con el nombre de la desarrolladora como llave y una lista con la cantidad total de registros de reseñas de usuarios que se encuentren categorizados con un análisis de sentimiento como valor.

Ejemplo de retorno: {'Valve' : [Negative = 182, Neutral = 120, Positive = 278]}

In [None]:
def sentiment_analysis(empresa_desarrolladora:str):
    
    return

In [69]:
df_user_final.columns


Index(['user_id', 'items_count', 'steam_id', 'user_url', 'item_id',
       'item_name', 'playtime_forever', 'playtime_2weeks'],
      dtype='object')

In [75]:
prueba=df_games.copy()

In [82]:
prueba=prueba.dropna(subset=['genres'])
prueba['genres'].isnull().sum()


0

In [83]:
# Supongamos que 'df_games' es tu DataFrame, 'genres' es la columna que contiene las listas y 'Action' es el valor que estás buscando
mask = prueba['genres'].apply(lambda x: 'Action' in x)

# Ahora puedes usar 'mask' para filtrar tu DataFrame
df_filtered = prueba[mask]

In [88]:
df_filtered['genres'](10000)

TypeError: 'Series' object is not callable

In [71]:
df_review_final.columns

Index(['user_id', 'user_url', 'funny', 'posted', 'last_edited', 'item_id',
       'helpful', 'recommend', 'review'],
      dtype='object')