# ETL

Aquí se harán las transformaciones necesarias para obtener datasets que optimicen las consultas.

In [126]:
import pandas as pd
import numpy as np
import ast

In [63]:
# Load the cleaned Steam games dataset (Datasets/steam_games_clean.parquet) into df_games.
df_games=pd.read_parquet(r'Datasets/steam_games_clean.parquet')

In [64]:
# Load the cleaned user-items dataset (Datasets/users_items_clean.parquet) into df_users.
df_users=pd.read_parquet(r'Datasets/users_items_clean.parquet')

In [65]:
# Load the cleaned user-reviews dataset (Datasets/user_reviews_clean.parquet) into df_reviews.
df_reviews=pd.read_parquet(r'Datasets/user_reviews_clean.parquet')

In [66]:
# Print df_users' column names, dtypes and memory usage.
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3865427 entries, 0 to 3865426
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int16  
 2   item_id           int32  
 3   playtime_forever  float16
dtypes: float16(1), int16(1), int32(1), object(1)
memory usage: 59.0+ MB


In [67]:
# Print df_games' column names, non-null counts, dtypes and memory usage.
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27442 entries, 0 to 27441
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        27442 non-null  object 
 1   app_name      27442 non-null  object 
 2   tags          27442 non-null  object 
 3   price         27442 non-null  float64
 4   id            27442 non-null  int32  
 5   developer     27442 non-null  object 
 6   release_year  27442 non-null  object 
dtypes: float64(1), int32(1), object(5)
memory usage: 1.4+ MB


In [68]:
# Print df_reviews' column names, non-null counts, dtypes and memory usage.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    59305 non-null  object
 1   item_id    59305 non-null  int32 
 2   recommend  59305 non-null  bool  
 3   review     59305 non-null  object
dtypes: bool(1), int32(1), object(2)
memory usage: 1.2+ MB


## Análisis de sentimiento
En esta sección, antes de realizar otras transformaciones, se realizará el análisis de sentimiento mediante la librería **TextBlob** para simplificar el proceso. Este análisis también se puede realizar con otras librerías más complejas o más elaboradas.

In [69]:
from textblob import TextBlob

# Helper that maps a review's text to a sentiment label in the 0-2 range.
def sentiment(review):
    """Classify a review as negative (0), neutral (1) or positive (2).

    The label is ``1 + round(polarity)``, where ``polarity`` is the TextBlob
    sentiment polarity of the text (documented by TextBlob as a float in
    [-1.0, 1.0]). Python's ``round`` rounds halves to the nearest even
    integer, so a polarity of exactly +/-0.5 is still labelled neutral.

    Args:
        review: The review text. Missing values (``None``, NaN), non-string
            values and blank strings are accepted and labelled neutral.

    Returns:
        int: 0, 1 or 2.
    """
    # A missing, non-text or blank review carries no signal: label it neutral
    # instead of letting TextBlob raise on a non-string argument.
    if not isinstance(review, str) or not review.strip():
        return 1
    polarity = TextBlob(review).sentiment.polarity
    return 1 + round(polarity)

In [70]:
# Label every review text with its sentiment score and store it in a new column.
df_reviews['sentiment'] = df_reviews['review'].map(sentiment)

In [71]:
# Discard the raw review text column from df_reviews.
df_reviews = df_reviews.drop(columns=['review'])

In [72]:
# Display df_reviews to inspect its current columns.
df_reviews

Unnamed: 0,user_id,item_id,recommend,sentiment
0,76561197970982479,1250,True,1
1,76561197970982479,22200,True,1
2,76561197970982479,43110,True,1
3,js41637,251610,True,1
4,js41637,227300,True,1
...,...,...,...,...
59300,76561198312638244,70,True,1
59301,76561198312638244,362890,True,1
59302,LydiaMorley,273110,True,1
59303,LydiaMorley,730,True,2


## Transformación
Una vez finalizado el análisis de sentimiento, vamos a transformar y unir los datasets para que la API y el modelo de similitud del coseno puedan utilizarlos.

### Dataset developers

In [73]:
# Print df_reviews' column names, non-null counts, dtypes and memory usage.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    59305 non-null  object
 1   item_id    59305 non-null  int32 
 2   recommend  59305 non-null  bool  
 3   sentiment  59305 non-null  int64 
dtypes: bool(1), int32(1), int64(1), object(1)
memory usage: 1.2+ MB


In [74]:
# Count reviews per (item_id, sentiment) pair, then pivot the sentiment labels
# into columns so that each row describes one item.
df_item_sentiment_counts = (
    df_reviews
    .groupby('item_id')['sentiment']
    .value_counts()
    .unstack()
    .reset_index()
)

In [75]:
# Display the per-item sentiment counts table.
df_item_sentiment_counts

sentiment,item_id,0,1,2
0,10,1.0,49.0,7.0
1,20,,16.0,1.0
2,30,,3.0,1.0
3,40,,1.0,
4,50,,4.0,
...,...,...,...,...
3677,521340,,,2.0
3678,521430,,1.0,
3679,521570,1.0,1.0,
3680,521990,,1.0,


In [76]:
# Count reviews per (item_id, recommend) pair, then pivot the recommend flag
# values into columns so that each row describes one item.
df_item_recommend_count = (
    df_reviews
    .groupby('item_id')['recommend']
    .value_counts()
    .unstack()
    .reset_index()
)


In [77]:
# Join both per-item count tables on their shared item_id key.
# NOTE(review): the integer labels 0/1 and the boolean labels False/True appear to
# be treated as overlapping column names, so pandas suffixes them
# ('0_x', '1_x', 'False_y', 'True_y') while the label 2 is kept as is — confirm.
df_counts = df_item_sentiment_counts.merge(df_item_recommend_count, on='item_id')

In [78]:
# Missing values in the counts table are replaced with 0.
df_counts = df_counts.fillna(0)

In [79]:
# Attach the per-item counts to the game attributes needed for the developer
# dataset, matching the games' 'id' column against the counts' 'item_id' column.
df_developers = df_games[['price', 'developer', 'release_year', 'id']].merge(
    df_counts,
    left_on='id',
    right_on='item_id',
)

In [80]:
# Remove the 'id' column from df_developers.
df_developers = df_developers.drop(columns='id')

In [81]:
# Give the count columns readable names. Note the mixed key types: some source
# labels are suffixed strings while the label for positive sentiment is the integer 2.
df_developers = df_developers.rename(
    columns={
        '0_x': 'Negative',
        '1_x': 'Neutral',
        2: 'Positive',
        'False_y': 'False',
        'True_y': 'True',
    }
)

In [82]:
# Store every count column as a 16-bit integer.
df_developers = df_developers.astype(
    {column: 'int16' for column in ('Negative', 'Neutral', 'Positive', 'False', 'True')}
)

In [83]:
# Print df_users' column names, dtypes and memory usage.
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3865427 entries, 0 to 3865426
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int16  
 2   item_id           int32  
 3   playtime_forever  float16
dtypes: float16(1), int16(1), int32(1), object(1)
memory usage: 59.0+ MB


In [84]:
# Write df_developers to Datasets/developers.parquet.
df_developers.to_parquet(r'Datasets/developers.parquet')

### Dataset data_ML

In [118]:
# Work on a copy so that df_games itself is not modified by the steps that follow.
df=df_games.copy()

In [120]:
# Remove the columns that will not be used.
df = df.drop(columns=['developer', 'price', 'release_year'])


In [128]:
# To keep things simple, only the first entry of each game's genres collection is kept
# (None when the collection is empty or missing).
# A value that is already a plain string is left untouched: indexing it would
# silently cut it down to its first character if this cell were run twice.
df['genres'] = df['genres'].apply(
    lambda x: x if isinstance(x, str) else (x[0] if x is not None and len(x) > 0 else None)
)

In [130]:
# Keep only the first entry of each game's tags collection (None when the
# collection is empty or missing).
# A value that is already a plain string is left untouched: indexing it would
# silently cut it down to its first character if this cell were run twice.
df['tags'] = df['tags'].apply(
    lambda x: x if isinstance(x, str) else (x[0] if x is not None and len(x) > 0 else None)
)

In [134]:
# Print df's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27442 entries, 0 to 27441
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   genres    27442 non-null  object
 1   app_name  27442 non-null  object
 2   tags      27442 non-null  object
 3   id        27442 non-null  int32 
dtypes: int32(1), object(3)
memory usage: 750.5+ KB


In [135]:
# Write df to Datasets/data_ML.parquet.
df.to_parquet(r'Datasets/data_ML.parquet')

# Prueba de ENDPOINTS

In [91]:
# Show the rows whose developer is exactly 'Valve'.
df_developers.loc[df_developers['developer'] == 'Valve']

Unnamed: 0,price,developer,release_year,item_id,Negative,Neutral,Positive,False,True
1,9.99,Valve,1998,70,0,56,6,1,61
140,0.0,Valve,2010,630,0,83,10,3,90
204,19.98,Valve,2011,620,2,318,54,3,371
306,14.99,Valve,2012,730,52,3368,339,281,3478
1049,0.0,Valve,2016,323910,0,6,0,3,3
2757,9.99,Valve,2010,300,0,27,6,2,31
2785,19.98,Valve,2009,550,9,664,79,21,731
2866,19.98,Valve,2008,500,0,30,3,1,32
2890,9.99,Valve,2007,400,0,95,8,0,103
2914,0.0,Valve,2007,440,44,3256,436,125,3611


In [92]:
# List the distinct developer names present in df_developers.
df_developers['developer'].unique()

array(['Stainless Games Ltd', 'Valve', 'Outerlight Ltd.', ...,
       'Neversoft', 'Malfador Machinations', 'Strategy First'],
      dtype=object)

def developer( desarrollador : str ): Cantidad de items y porcentaje de contenido Free por año según empresa desarrolladora. 

def developer_reviews_analysis( desarrolladora : str ): Según el desarrollador, se devuelve un diccionario con el nombre del desarrollador como llave y una lista con la cantidad total de registros de reseñas de usuarios que se encuentren categorizados con un análisis de sentimiento como valor positivo o negativo.

In [93]:
# Sample developer name, converted to title case, then displayed.
desarrollador = "valve".title()
desarrollador

'Valve'

In [94]:
# Total positive and negative review counts for the selected developer.
# The names are compared case-insensitively so the lookup does not depend on how
# the input was capitalised: title-casing the input alone cannot match developer
# names that are not themselves written in title case.
df_developers.loc[
    df_developers['developer'].str.casefold() == desarrollador.casefold(),
    ['Positive', 'Negative'],
].sum()

Positive    1009
Negative     113
dtype: int64