# Diseño de Dataframes para los Endpoints

## importacion, limpieza y normalizacion de las tablas a utilizar

* importamos la librerias

In [348]:
import pandas as pd

* hacemos la carga de archivos

In [349]:
df_games = pd.read_parquet("../Datasets_finales/games_steam.parquet")
df_items = pd.read_parquet("../Datasets_finales/user_items.parquet")
df_reviews = pd.read_parquet("../Datasets_endpoints/user_reviews_sentiment.parquet")

In [350]:
df_items.head(2)

Unnamed: 0,item_id,playtime_forever,user_id,items_count,steam_id
0,10,6.0,76561197970982479,277,76561197970982479
1,20,0.0,76561197970982479,277,76561197970982479


In [351]:
df_reviews.head(2)

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
0,76561197970982479,1250,True,2
1,76561197970982479,22200,True,2


In [352]:
df_games.head(2)

Unnamed: 0,genres,app_name,release_date,price,item_id,developer
0,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,2018-01-04,4.99,761140,Kotoshiro
1,"[Free to Play, Indie, RPG, Strategy]",Ironbound,2018-01-04,free,643980,Secret Level SRL


### tablas finales para el uso de las consultas
![Tabla dividido por las solicitudes de las consulta](../_src/Tabla_Endpoints.jpg)

### Limpieza y Normalizacion de Games

* **funcion para tomar el primer elemento de la lista de generos**

In [353]:
def primer_genre(lista):
    if isinstance(lista, object) and lista is not None:
        return lista[0]
    else:
        return None

* hacemos la busqueda de duplicados

In [354]:
df_games[df_games.duplicated(subset=["app_name","release_date","developer"],keep=False)]

Unnamed: 0,genres,app_name,release_date,price,item_id,developer
1067,"[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,2012-09-07,19.99,200260,"Rocksteady Studios,Feral Interactive (Mac)"
1507,"[Adventure, Casual, Indie]",The Dream Machine: Chapter 4,2013-08-05,4.99,93304,Cockroach Inc.
1508,"[Adventure, Casual, Indie]",The Dream Machine: Chapter 4,2013-08-05,,94304,Cockroach Inc.
10348,,Escape Room,NaT,0.99,654070,
13419,,Escape Room,NaT,free,758210,
13892,[Action],Wolfenstein II: The New Colossus,2017-10-26,59.99,612880,Machine Games
14571,[Action],Wolfenstein II: The New Colossus,2017-10-26,59.99,612880,Machine Games
30176,[Action],Aliens: Colonial Marines - Reconnaissance Pack,2013-05-07,,224850,Gearbox Software
30177,[Action],Aliens: Colonial Marines - Reconnaissance Pack,2013-05-07,29.99,219441,Gearbox Software
30959,"[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,2012-09-07,19.99,200260,"Rocksteady Studios,Feral Interactive (Mac)"


* borramos los juegos duplicados que esten coincidiendo con el nombre,fecha y desarrollador

In [355]:
df_games.drop_duplicates(subset=["app_name","release_date","developer"],keep="first",inplace=True)

In [356]:
df_games[df_games.duplicated(subset=["app_name","release_date","developer"],keep=False)]

Unnamed: 0,genres,app_name,release_date,price,item_id,developer


* sacamos el primer genero de cada juego

In [357]:
df_games["genre"] = df_games['genres'].apply(primer_genre)

* sacamos el año de release_date y los nulos lo ponemos en 0 y convertimos en enteros a los años

In [358]:
df_games["año"] = df_games["release_date"].dt.year
df_games["año"] = df_games["año"].fillna(0)
df_games["año"] = df_games["año"].astype(int)

* vamos a renombrar id_item a item_id y reordenar las columnas

In [359]:
columnas_a_eliminar_games = ["genres","app_name","release_date"]
columnas_a_eliminar_items = ["items_count","steam_id"]
df_games = df_games.drop(columnas_a_eliminar_games,axis=1)
df_items = df_items.drop(columnas_a_eliminar_items,axis=1)

In [360]:
ordenar_games = ["item_id","price","genre","año","developer"]
df_games = df_games[ordenar_games]
df_games.head(5)

Unnamed: 0,item_id,price,genre,año,developer
0,761140,4.99,Action,2018,Kotoshiro
1,643980,free,Free to Play,2018,Secret Level SRL
2,670290,free,Casual,2017,Poolians.com
3,767400,0.99,Action,2017,彼岸领域
4,773570,2.99,,0,


* convertimos a minusculas la columna de developer y genre por si se pone mal alguna letra en formato mayuscula-minuscula

In [361]:
df_games["developer"] = df_games["developer"].str.lower()
df_games["genre"] = df_games["genre"].str.lower()

In [362]:
df_games.head(5)

Unnamed: 0,item_id,price,genre,año,developer
0,761140,4.99,action,2018,kotoshiro
1,643980,free,free to play,2018,secret level srl
2,670290,free,casual,2017,poolians.com
3,767400,0.99,action,2017,彼岸领域
4,773570,2.99,,0,


### Limpieza y Normalizacion de Items

* hacemos un chequeo de los nulos de la tabla items y borramos los datos nulos que seas necesarios

In [363]:
df_items[df_items["item_id"].isna()].head(5)

Unnamed: 0,item_id,playtime_forever,user_id
3733,,,Wackky
3849,,,76561198079601835
6019,,,hellom8o
6523,,,starkillershadow553
7237,,,darkenkane


In [364]:
df_items[df_items["playtime_forever"].isna()].head(5)

Unnamed: 0,item_id,playtime_forever,user_id
3733,,,Wackky
3849,,,76561198079601835
6019,,,hellom8o
6523,,,starkillershadow553
7237,,,darkenkane


* tanto item_id como playtime_forever tienen nulos, lo mas importante es limpiar los item_id ya que es importante al igual que user_id.

In [365]:
df_items.dropna(subset="item_id",inplace=True)

* hacemos un chequeo de los tipo de formato que estan los datos, y vamos a convertir el item_id en enteros

In [366]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5094082 entries, 0 to 5110794
Data columns (total 3 columns):
 #   Column            Dtype  
---  ------            -----  
 0   item_id           object 
 1   playtime_forever  float64
 2   user_id           object 
dtypes: float64(1), object(2)
memory usage: 155.5+ MB


In [367]:
df_items["item_id"] = df_items["item_id"].astype(int)

* hacemos una normalizacion a minuscula de los User

In [368]:
df_items["user_id"] = df_items["user_id"].str.lower()

* Eliminacion de usuarios las cuales superan las horas maximas de juego, por irregularidades que se descubrieron haciendo el EDA

In [369]:
df_horastotales = df_items.groupby("user_id").agg(
    horas_totales = ("playtime_forever","sum")
).sort_values(by="horas_totales",ascending=False).reset_index()

In [370]:
df_horastotales

Unnamed: 0,user_id,horas_totales
0,rebas_as_f-t,4660393.0
1,shinomegami,3303502.0
2,sp3ctre,2822860.0
3,downsyndromekid,2790419.0
4,terminally-chill,1734007.0
...,...,...
70907,76561198063149846,0.0
70908,76561198068996116,0.0
70909,76561198085192494,0.0
70910,76561198054414666,0.0


* hacemos la union por usuarios y sumatoria de horas jugadas en total y posteriormente limpiamos a los que descubrimos que superaban las horas desde que se creo steam

In [371]:
df_horastotales = df_horastotales[df_horastotales["horas_totales"] < 160110]
df_horastotales.drop("horas_totales",axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_horastotales.drop("horas_totales",axis=1,inplace=True)


* ahora hacemos la union de la tabla corregida con la de items, para solo dejar a los usuarios que no son outliers

In [372]:
df_items = pd.merge(df_items,df_horastotales,on="user_id",how='inner')

In [373]:
df_items

Unnamed: 0,item_id,playtime_forever,user_id
0,10,0.0,js41637
1,80,0.0,js41637
2,100,0.0,js41637
3,300,220.0,js41637
4,30,0.0,js41637
...,...,...,...
3520634,346330,0.0,76561198329548331
3520635,373330,0.0,76561198329548331
3520636,388490,3.0,76561198329548331
3520637,521570,4.0,76561198329548331


* ya quedó listo nuestra tabla items

In [374]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3520639 entries, 0 to 3520638
Data columns (total 3 columns):
 #   Column            Dtype  
---  ------            -----  
 0   item_id           int32  
 1   playtime_forever  float64
 2   user_id           object 
dtypes: float64(1), int32(1), object(1)
memory usage: 67.2+ MB


### Limpieza y normalizacion de Reviews

* Hacemos un chequeo de los nulos y hacemos la limpieza

In [375]:
df_reviews[df_reviews["item_id"].isna()].head()

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
137,gdxsd,,,1
177,76561198094224872,,,1
2558,76561198021575394,,,1
9956,cmuir37,,,1
13528,Jaysteeny,,,1


In [376]:
df_reviews.dropna(subset=["item_id"],inplace=True)
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58430 entries, 0 to 58457
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             58430 non-null  object
 1   item_id             58430 non-null  object
 2   recommend           58430 non-null  object
 3   sentiment_analysis  58430 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 2.2+ MB


* convertimos user_id a minusculas

In [377]:
df_reviews["user_id"] = df_reviews["user_id"].str.lower()

* ya tenemos la tabla limpio de nulos, a lo que procedemos a convertir los item_id en formato entero 

In [378]:
df_reviews["item_id"] = df_reviews["item_id"].astype(int)
df_reviews.head()

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
0,76561197970982479,1250,True,2
1,76561197970982479,22200,True,2
2,76561197970982479,43110,True,2
3,js41637,251610,True,2
4,js41637,227300,True,2


### **ya tenemos todas las tablas limpias hasta donde se puede para no evitar perdida de datos y la normalización para la futura unión de las tablas**

## Diseño de Endpoints para las consultas de la API

### Consulta 1: Def developer(desarrollador: str), Debe devolver Cantidad de items y porcentaje de contenido Free por año según empresa desarrolladora.

* copiamos la tabla de games y creamos el endpoint para la consulta 1

In [379]:
df_developer = df_games.copy()
df_developer

Unnamed: 0,item_id,price,genre,año,developer
0,761140,4.99,action,2018,kotoshiro
1,643980,free,free to play,2018,secret level srl
2,670290,free,casual,2017,poolians.com
3,767400,0.99,action,2017,彼岸领域
4,773570,2.99,,0,
...,...,...,...,...,...
32128,773640,1.99,casual,2018,"nikita ""ghost_rus"""
32129,733530,4.99,casual,2018,sacada
32130,610660,1.99,indie,2018,laush dmitriy sergeevich
32131,658870,4.99,casual,2017,"xropi,stev3ns"


* no vamos a necesitar la columna genero ya que no lo vamos a utilizar

In [380]:
df_developer.drop("genre",axis=1,inplace=True)
df_developer.head(5)

Unnamed: 0,item_id,price,año,developer
0,761140,4.99,2018,kotoshiro
1,643980,free,2018,secret level srl
2,670290,free,2017,poolians.com
3,767400,0.99,2017,彼岸领域
4,773570,2.99,0,


* hacemos un chequeo de nulos en las columnas developer y año

In [381]:
df_developer[df_developer["developer"].isna()].head(5)

Unnamed: 0,item_id,price,año,developer
4,773570,2.99,0,
11,724910,free,0,
19,772590,4.99,0,
20,640250,2.99,0,
22,711440,0.99,0,


In [382]:
df_developer[df_developer["año"] == 0]

Unnamed: 0,item_id,price,año,developer
4,773570,2.99,0,
10,768570,,0,qucheza
11,724910,free,0,
19,772590,4.99,0,
20,640250,2.99,0,
...,...,...,...,...
32085,755830,,0,"greyson richey,nicholas lives"
32086,708070,,0,rechargecomplete
32095,250440,,0,the amiable
32121,772180,,0,versovr


* Limpiamos los valores nulos de developer y los años que tienen 0

In [383]:
df_developer.dropna(subset=["developer"],inplace=True)
df_developer = df_developer[df_developer["año"] != 0]
df_developer.reset_index(drop=True)

Unnamed: 0,item_id,price,año,developer
0,761140,4.99,2018,kotoshiro
1,643980,free,2018,secret level srl
2,670290,free,2017,poolians.com
3,767400,0.99,2017,彼岸领域
4,772540,3.99,2018,trickjump games ltd
...,...,...,...,...
28524,745400,1.99,2018,bidoniera games
28525,773640,1.99,2018,"nikita ""ghost_rus"""
28526,733530,4.99,2018,sacada
28527,610660,1.99,2018,laush dmitriy sergeevich


In [384]:
df_developer.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28529 entries, 0 to 32131
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   item_id    28529 non-null  int32 
 1   price      27555 non-null  object
 2   año        28529 non-null  int32 
 3   developer  28529 non-null  object
dtypes: int32(2), object(2)
memory usage: 891.5+ KB


* **Ya terminamos de hacer la tabla que se va a usar en la consulta 1**

### Consulta 2: def userdata (user_id: str), Debe devolver cantidad de dinero gastado por el usuario, el porcentaje de recomendación en base a reviews.recommend y cantidad de items.

In [385]:
df_userdata_review = df_reviews.copy() 
df_userdata_items = df_items.copy()
df_userdata_games = df_games.copy()

In [386]:
df_userdata_games.head(2)

Unnamed: 0,item_id,price,genre,año,developer
0,761140,4.99,action,2018,kotoshiro
1,643980,free,free to play,2018,secret level srl


In [387]:
df_userdata_items.head(2)

Unnamed: 0,item_id,playtime_forever,user_id
0,10,0.0,js41637
1,80,0.0,js41637


In [388]:
df_userdata_review.head(2)

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
0,76561197970982479,1250,True,2
1,76561197970982479,22200,True,2


* **columnas a utilizar de cada tabla:**
- games : price, item_id
- items : item_id y user_id
- review : item_id, ,user_id,recommend

* **la tabla final debe quedar:**
- user_id,item_id,price,recommend

* procedemos a hacer la limpieza de columnas que no vayamos a utilizar

In [389]:
a_eliminar_games =["genre","año","developer"]
df_userdata_games.drop(a_eliminar_games,axis=1,inplace=True)
df_userdata_items.drop("playtime_forever",axis=1,inplace=True)
df_userdata_review.drop("sentiment_analysis",axis=1,inplace=True)

* hacemos un chequeo de nulos y normalizacion de datos para evitar confictos a la de unir las tablas

In [390]:
df_userdata_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32128 entries, 0 to 32132
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   item_id  32128 non-null  int32 
 1   price    30753 non-null  object
dtypes: int32(1), object(1)
memory usage: 627.5+ KB


In [391]:
df_userdata_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3520639 entries, 0 to 3520638
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   item_id  int32 
 1   user_id  object
dtypes: int32(1), object(1)
memory usage: 40.3+ MB


In [392]:
df_userdata_review.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58430 entries, 0 to 58457
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    58430 non-null  object
 1   item_id    58430 non-null  int32 
 2   recommend  58430 non-null  object
dtypes: int32(1), object(2)
memory usage: 1.6+ MB


* primero vamos a hacer la union de la tabla por coincidentes del id del item

In [393]:
df_userdata_union = pd.merge(df_userdata_games,df_userdata_items,on="item_id") 
df_userdata_union.head(10)

Unnamed: 0,item_id,price,user_id
0,282010,9.99,i_did_911_just_saying
1,282010,9.99,76561197962104795
2,282010,9.99,red_pride
3,282010,9.99,nstokesy
4,282010,9.99,76561198017341667
5,282010,9.99,wallfinstein
6,282010,9.99,ryno69445
7,282010,9.99,76561198003546877
8,282010,9.99,duanea85
9,282010,9.99,76561198025815476


* chequeamos si quedó bien la union de la tabla de uno a muchos y se ve que quedó perfecto

In [394]:
df_userdata_union[df_userdata_union["user_id"] == "utnerd24"].head(5)

Unnamed: 0,item_id,price,user_id


In [395]:
df_userdata_union[df_userdata_union["item_id"] == 282010].head(5)

Unnamed: 0,item_id,price,user_id
0,282010,9.99,i_did_911_just_saying
1,282010,9.99,76561197962104795
2,282010,9.99,red_pride
3,282010,9.99,nstokesy
4,282010,9.99,76561198017341667


* al hacer un analisis visual vemos que tanto item_id como user_id son relaciones muchos a muchos. ahora vamos a hacer la ultima union de la tabla con reviews

In [396]:
df_userdata_final = pd.merge(df_userdata_union,df_userdata_review,on=["user_id"])
df_userdata_final

Unnamed: 0,item_id_x,price,user_id,item_id_y,recommend
0,282010,9.99,i_did_911_just_saying,214420,True
1,282010,9.99,i_did_911_just_saying,365670,True
2,282010,9.99,i_did_911_just_saying,440,True
3,282010,9.99,i_did_911_just_saying,4000,True
4,282010,9.99,i_did_911_just_saying,221260,True
...,...,...,...,...,...
3477294,80,9.99,943525,298110,False
3477295,80,9.99,76561198312638244,233270,True
3477296,80,9.99,76561198312638244,130,True
3477297,80,9.99,76561198312638244,70,True


In [397]:
df_userdata_final[(df_userdata_final["item_id_x"] == df_userdata_final["item_id_y"])]

Unnamed: 0,item_id_x,price,user_id,item_id_y,recommend
95,282010,9.99,instigatorau,282010,True
207,70,9.99,gamerfag,70,True
224,70,9.99,76561198020928326,70,True
454,70,9.99,meeeedie,70,True
650,70,9.99,kewlkatzz,70,True
...,...,...,...,...,...
3474233,80,9.99,killeramateur,80,True
3475761,80,9.99,76561198023508728,80,False
3476101,80,9.99,green290,80,True
3477119,80,9.99,174gamecuman700kngkakak,80,True


In [398]:
df_userdata_final.reset_index(drop=True,inplace=True)
df_userdata_final.head(10)

Unnamed: 0,item_id_x,price,user_id,item_id_y,recommend
0,282010,9.99,i_did_911_just_saying,214420,True
1,282010,9.99,i_did_911_just_saying,365670,True
2,282010,9.99,i_did_911_just_saying,440,True
3,282010,9.99,i_did_911_just_saying,4000,True
4,282010,9.99,i_did_911_just_saying,221260,True
5,282010,9.99,red_pride,70000,True
6,282010,9.99,red_pride,212680,True
7,282010,9.99,76561198017341667,247730,False
8,282010,9.99,76561198017341667,213670,True
9,282010,9.99,76561198017341667,219640,True


In [399]:
df_userdata_final.loc[(df_userdata_final["item_id_x"] != df_userdata_final["item_id_y"]),"recommend"] = None

In [400]:
df_userdata_final.drop("item_id_x",axis=1,inplace=True)
df_userdata_final.rename(columns={"item_id_y":"item_id"},inplace=True)

In [401]:
df_userdata_final

Unnamed: 0,price,user_id,item_id,recommend
0,9.99,i_did_911_just_saying,214420,
1,9.99,i_did_911_just_saying,365670,
2,9.99,i_did_911_just_saying,440,
3,9.99,i_did_911_just_saying,4000,
4,9.99,i_did_911_just_saying,221260,
...,...,...,...,...
3477294,9.99,943525,298110,
3477295,9.99,76561198312638244,233270,
3477296,9.99,76561198312638244,130,
3477297,9.99,76561198312638244,70,


In [402]:
df_userdata_final[df_userdata_final["user_id"] == "utnerd24"]

Unnamed: 0,price,user_id,item_id,recommend


* **Ya terminamos la tabla de user_data donde finalmente se pudo hacer la union de las 3 tablas** 

### Consulta 3: def userforgenre (genre: str), Debe devolver el usuario que acumula más horas jugadas para el género dado y una lista de la acumulación de horas jugadas por año de lanzamiento.

In [403]:
df_userforgenre_games = df_games.copy()
df_userforgenre_items = df_items.copy()

In [404]:
df_userforgenre_games.head()

Unnamed: 0,item_id,price,genre,año,developer
0,761140,4.99,action,2018,kotoshiro
1,643980,free,free to play,2018,secret level srl
2,670290,free,casual,2017,poolians.com
3,767400,0.99,action,2017,彼岸领域
4,773570,2.99,,0,


In [405]:
df_userforgenre_items.head()

Unnamed: 0,item_id,playtime_forever,user_id
0,10,0.0,js41637
1,80,0.0,js41637
2,100,0.0,js41637
3,300,220.0,js41637
4,30,0.0,js41637


* borramos las columnas que no vayamos a utilizar y chequeamos como quedó

In [406]:
df_userforgenre_games.drop(["developer","price"],axis=1,inplace=True)
df_userforgenre_games

Unnamed: 0,item_id,genre,año
0,761140,action,2018
1,643980,free to play,2018
2,670290,casual,2017
3,767400,action,2017
4,773570,,0
...,...,...,...
32128,773640,casual,2018
32129,733530,casual,2018
32130,610660,indie,2018
32131,658870,casual,2017


* hacemos la union de las tablas y ordenamos por item_id

In [407]:
df_userforgenre_final = pd.merge(df_userforgenre_games,df_userforgenre_items,on='item_id',sort=True)
df_userforgenre_final.head(5)

Unnamed: 0,item_id,genre,año,playtime_forever,user_id
0,10,action,2000,0.0,js41637
1,10,action,2000,0.0,riot-punch
2,10,action,2000,0.0,cadmusthreepointoh
3,10,action,2000,328.0,weiedkrsat
4,10,action,2000,580.0,fr0stedline


* convertimos los valores nulos de genre en "desconocido" y posteriormente borramos los que tengan con genre desconocido
* eliminamos las filas que tienen año 0

In [408]:
df_userforgenre_final["genre"] = df_userforgenre_final["genre"].fillna("desconocido")
df_userforgenre_final= df_userforgenre_final[(df_userforgenre_final["genre"] != "desconocido")]
df_userforgenre_final = df_userforgenre_final[(df_userforgenre_final["año"] != 0)]

* reseteamos el index para tener un mejor orden en el index

In [409]:
df_userforgenre_final.reset_index(drop=True)
df_userforgenre_final

Unnamed: 0,item_id,genre,año,playtime_forever,user_id
0,10,action,2000,0.0,js41637
1,10,action,2000,0.0,riot-punch
2,10,action,2000,0.0,cadmusthreepointoh
3,10,action,2000,328.0,weiedkrsat
4,10,action,2000,580.0,fr0stedline
...,...,...,...,...,...
2917198,527510,casual,2016,32.0,76561198038296985
2917199,527810,action,2016,0.0,kiokizz
2917200,527810,action,2016,2.0,lexby
2917201,527810,action,2016,0.0,manuelpr039


* borramos la columna item_id porque ya no nos hace falta

In [410]:
df_userforgenre_final.drop("item_id",axis=1,inplace=True)

* hacemos un chequeo de como quedó la tabla y quedó todo perfecto

In [411]:
df_userforgenre_final = df_userforgenre_final[(df_userforgenre_final["playtime_forever"] != 0.0) & (df_userforgenre_final["año"] >= 2003)]

In [412]:
df_userforgenre_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1846382 entries, 11995 to 2917200
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   genre             object 
 1   año               int32  
 2   playtime_forever  float64
 3   user_id           object 
dtypes: float64(1), int32(1), object(2)
memory usage: 63.4+ MB


borramos todos las filas donde no tienen horas jugadas por año los jugadores

### Consulta 4: def best_developer_year (año : int), Devuelve el top 3 de desarrolladores con juegos MÁS recomendados por usuarios para el año dado. (reviews.recommend = True y comentarios positivos)

In [413]:
df_best_developer_year_review = df_reviews.copy()
df_best_developer_year_games = df_games.copy()

In [414]:
df_best_developer_year_review.head(2)

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
0,76561197970982479,1250,True,2
1,76561197970982479,22200,True,2


In [415]:
df_best_developer_year_games.head(2)

Unnamed: 0,item_id,price,genre,año,developer
0,761140,4.99,action,2018,kotoshiro
1,643980,free,free to play,2018,secret level srl


In [416]:
df_best_developer_year_games.drop(["genre","price"],axis=1,inplace=True)
df_best_developer_year_games.head(5)

Unnamed: 0,item_id,año,developer
0,761140,2018,kotoshiro
1,643980,2018,secret level srl
2,670290,2017,poolians.com
3,767400,2017,彼岸领域
4,773570,0,


In [417]:
df_best_developer_year_games = df_best_developer_year_games[(df_best_developer_year_games["año"] != 0) & (df_best_developer_year_games["developer"].notna())]
df_best_developer_year_games.head(5)

Unnamed: 0,item_id,año,developer
0,761140,2018,kotoshiro
1,643980,2018,secret level srl
2,670290,2017,poolians.com
3,767400,2017,彼岸领域
5,772540,2018,trickjump games ltd


In [418]:
# df_best_developer_year_review.drop("user_id",axis=1,inplace=True)

In [419]:
df_best_developer_year_review[df_best_developer_year_review["item_id"] == 221040]

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
627,unholyfallenangel,221040,True,2
4000,sdgsdfgsdfhshdgh,221040,True,1
8544,archangel147,221040,True,2
10315,scyana4,221040,True,0
11136,76561198045470612,221040,True,2
20040,akivararyuzaki,221040,True,1
20864,76561198100422868,221040,True,1
22626,29998000,221040,True,1
26013,76561198084056692,221040,True,1
27666,defalt14,221040,True,0


In [420]:
df_best_developer_year_games[df_best_developer_year_games["developer"] == "capcom"]

Unnamed: 0,item_id,año,developer
267,21660,2009,capcom
287,21680,2008,capcom
288,21670,2009,capcom
365,45700,2008,capcom
1318,221040,2013,capcom
...,...,...,...
31186,45796,2011,capcom
31187,45795,2011,capcom
31188,45793,2011,capcom
31189,45792,2011,capcom


* hacemos la union de la tabla de games y reviews

In [421]:
df_best_developer_year_final = pd.merge(df_best_developer_year_games,df_best_developer_year_review,on="item_id",sort=True)

* reseteamos el index y borramos el item_id

In [422]:
df_best_developer_year_final.reset_index(drop=True,inplace=True)

* hacemos una vista final de como quedó la tabla

In [423]:
df_best_developer_year_final[df_best_developer_year_final["item_id"] == 221040]

Unnamed: 0,item_id,año,developer,user_id,recommend,sentiment_analysis
26928,221040,2013,capcom,unholyfallenangel,True,2
26929,221040,2013,capcom,sdgsdfgsdfhshdgh,True,1
26930,221040,2013,capcom,archangel147,True,2
26931,221040,2013,capcom,scyana4,True,0
26932,221040,2013,capcom,76561198045470612,True,2
26933,221040,2013,capcom,akivararyuzaki,True,1
26934,221040,2013,capcom,76561198100422868,True,1
26935,221040,2013,capcom,29998000,True,1
26936,221040,2013,capcom,76561198084056692,True,1
26937,221040,2013,capcom,defalt14,True,0


### consulta 5: def developer_reviews_analysis (developer: str), se devuelve un diccionario con el nombre del desarrollador como llave y una lista con la cantidad total de registros de reseñas de usuarios que se encuentren categorizados con un análisis de sentimiento como valor positivo o negativo.

* borramos las filas con sentiment_analyst neutrales que es 1 porque no lo usamos en la consulta 4 ni 5

In [424]:
df_best_developer_year_final = df_best_developer_year_final[df_best_developer_year_final["sentiment_analysis"] != 1]
df_best_developer_year_final

Unnamed: 0,item_id,año,developer,user_id,recommend,sentiment_analysis
0,10,2000,valve,bennysaputra,True,2
2,10,2000,valve,76561198040188061,True,2
3,10,2000,valve,mayshowganmore,True,2
4,10,2000,valve,bestintheworldthund3r,True,2
6,10,2000,valve,76561198072207162,True,2
...,...,...,...,...,...,...
49336,521340,2016,vladimir maslov,76561198021048954,True,2
49337,521430,2016,david mulder,76561198075141715,True,2
49338,521570,2016,tamationgames,tfhuawgscvg,True,0
49339,521570,2016,tamationgames,76561198071122396,True,0


# Exportamos los Endpoints

* endpoint de la consulta 1

In [425]:
df_developer.to_parquet("../Datasets_endpoints/endpoint_Developer.parquet",compression="brotli")

* endpoint de la consulta 2

In [426]:
df_userdata_final.to_parquet("../Datasets_endpoints/endpoint_userdata.parquet",compression="brotli")

* endpoint de la consulta 3

In [427]:
df_userforgenre_final.to_parquet("../Datasets_endpoints/enpoint_userforgenre.parquet",compression="brotli")

* endpoint de la consulta 4 y 5

In [428]:
df_best_developer_year_final.to_parquet("../Datasets_endpoints/endpoint_games_reviews.parquet",compression="brotli")