## Modelado de Datos para Funciones de Machine Learning

Importación de librerias necesarias:

In [52]:
import pandas as pd
import numpy as np
import ast
import FuncionExtra as f

import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity

### Modelo de recomendación item - item:

Generamos un modelo de recomendación en base a las **tags** de cada item.

In [53]:
steam_games = pd.read_csv('../steam_games_limpio.csv', encoding='utf-8')



In [54]:
steam_games.drop_duplicates(subset= 'item_id', inplace= True)

Extraemos solo las columnas que utilizaremos para nuestro modelo de recomendación:

In [55]:
steam_games = steam_games[['item_id', 'app_name', 'tags']]

Visualizamos el DataFrame resultante:

In [56]:
steam_games

Unnamed: 0,item_id,app_name,tags
0,761140.0,Lost Summoner Kitty,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim..."
5,643980.0,Ironbound,"['Free to Play', 'Strategy', 'Indie', 'RPG', '..."
9,670290.0,Real Pool 3D - Poolians,"['Free to Play', 'Simulation', 'Sports', 'Casu..."
14,767400.0,弹炸人2222,"['Action', 'Adventure', 'Casual']"
17,773570.0,Log Challenge,"['Action', 'Indie', 'Casual', 'Sports']"
...,...,...,...
74821,773640.0,Colony On Mars,"['Strategy', 'Indie', 'Casual', 'Simulation']"
74825,733530.0,LOGistICAL: South Africa,"['Strategy', 'Indie', 'Casual']"
74828,610660.0,Russian Roads,"['Indie', 'Simulation', 'Racing']"
74831,658870.0,EXIT 2 - Directions,"['Indie', 'Casual', 'Puzzle', 'Singleplayer', ..."


Información general:

In [57]:
steam_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32133 entries, 0 to 74833
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   item_id   32132 non-null  float64
 1   app_name  32131 non-null  object 
 2   tags      31970 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1004.2+ KB


Porcentaje de nulos:

In [58]:
f.porcentaje_valores_nulos(steam_games)

La columna item_id tiene un  0.00 % de valores nulos
La columna app_name tiene un  0.01 % de valores nulos
La columna tags tiene un  0.51 % de valores nulos


In [59]:
steam_games

Unnamed: 0,item_id,app_name,tags
0,761140.0,Lost Summoner Kitty,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim..."
5,643980.0,Ironbound,"['Free to Play', 'Strategy', 'Indie', 'RPG', '..."
9,670290.0,Real Pool 3D - Poolians,"['Free to Play', 'Simulation', 'Sports', 'Casu..."
14,767400.0,弹炸人2222,"['Action', 'Adventure', 'Casual']"
17,773570.0,Log Challenge,"['Action', 'Indie', 'Casual', 'Sports']"
...,...,...,...
74821,773640.0,Colony On Mars,"['Strategy', 'Indie', 'Casual', 'Simulation']"
74825,733530.0,LOGistICAL: South Africa,"['Strategy', 'Indie', 'Casual']"
74828,610660.0,Russian Roads,"['Indie', 'Simulation', 'Racing']"
74831,658870.0,EXIT 2 - Directions,"['Indie', 'Casual', 'Puzzle', 'Singleplayer', ..."


Eliminamos los nulos, por más que su porcentaje sea bajo:

In [60]:
steam_games = steam_games.dropna()

Vemos el tipo de dato de 'steam_games':

In [63]:
f.tipo_datos(steam_games)

Unnamed: 0,nombre_columna,tipo_dato
0,item_id,[<class 'int'>]
1,app_name,[<class 'str'>]
2,tags,[<class 'str'>]


Cambiamos el tipo de dato de la columna 'item_id' de float a int:

In [64]:
steam_games['item_id'] = steam_games['item_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  steam_games['item_id'] = steam_games['item_id'].astype(int)


Verificamos:

In [65]:
f.tipo_datos(steam_games)

Unnamed: 0,nombre_columna,tipo_dato
0,item_id,[<class 'int'>]
1,app_name,[<class 'str'>]
2,tags,[<class 'str'>]


Cambiamos el tipo de dato de la columna 'tags' a list:

In [66]:
steam_games['tags'] = steam_games['tags'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  steam_games['tags'] = steam_games['tags'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])


Verificamos:

In [67]:
type(steam_games['tags'][0])

list

Separamos las etiquetas:

In [68]:
games = steam_games['tags'].apply(lambda x: ','.join(map(str, x)))

games = games.str.split(',', expand=True)

Verificamos:

In [69]:
games

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,Strategy,Action,Indie,Casual,Simulation,,,,,,,,,,,,,,,
5,Free to Play,Strategy,Indie,RPG,Card Game,Trading Card Game,Turn-Based,Fantasy,Tactical,Dark Fantasy,Board Game,PvP,2D,Competitive,Replay Value,Character Customization,Female Protagonist,Difficult,Design & Illustration,
9,Free to Play,Simulation,Sports,Casual,Indie,Multiplayer,,,,,,,,,,,,,,
14,Action,Adventure,Casual,,,,,,,,,,,,,,,,,
17,Action,Indie,Casual,Sports,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74821,Strategy,Indie,Casual,Simulation,,,,,,,,,,,,,,,,
74825,Strategy,Indie,Casual,,,,,,,,,,,,,,,,,
74828,Indie,Simulation,Racing,,,,,,,,,,,,,,,,,
74831,Indie,Casual,Puzzle,Singleplayer,Atmospheric,Relaxing,,,,,,,,,,,,,,


Generamos Dummies:

In [70]:
games = pd.get_dummies(games)

In [71]:
games

Unnamed: 0,0_2D,0_2D Fighter,0_360 Video,0_3D Platformer,0_4 Player Local,0_4X,0_Action,0_Action RPG,0_Adventure,0_America,...,19_Underwater,19_Utilities,19_VR,19_Video Production,19_Violent,19_Visual Novel,19_Voxel,19_Walking Simulator,19_Wargame,19_Zombies
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
17,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74821,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
74825,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
74828,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
74831,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Concatenamos ambos DataFrames:

In [72]:
steam_games_final = pd.concat([steam_games, games], axis= 1)

In [73]:
steam_games_final

Unnamed: 0,item_id,app_name,tags,0_2D,0_2D Fighter,0_360 Video,0_3D Platformer,0_4 Player Local,0_4X,0_Action,...,19_Underwater,19_Utilities,19_VR,19_Video Production,19_Violent,19_Visual Novel,19_Voxel,19_Walking Simulator,19_Wargame,19_Zombies
0,761140,Lost Summoner Kitty,"[Strategy, Action, Indie, Casual, Simulation]",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,643980,Ironbound,"[Free to Play, Strategy, Indie, RPG, Card Game...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,670290,Real Pool 3D - Poolians,"[Free to Play, Simulation, Sports, Casual, Ind...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14,767400,弹炸人2222,"[Action, Adventure, Casual]",False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
17,773570,Log Challenge,"[Action, Indie, Casual, Sports]",False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74821,773640,Colony On Mars,"[Strategy, Indie, Casual, Simulation]",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
74825,733530,LOGistICAL: South Africa,"[Strategy, Indie, Casual]",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
74828,610660,Russian Roads,"[Indie, Simulation, Racing]",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
74831,658870,EXIT 2 - Directions,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Reindexamos:

In [74]:
steam_games_final.reset_index(drop=True, inplace=True)

Generamos el coeficiente del coseno:

In [75]:
similitud = cosine_similarity(steam_games_final.iloc[:,3:])

In [78]:
similitud.shape

(31969, 31969)

In [79]:
similitud

array([[1.        , 0.10259784, 0.18257419, ..., 0.        , 0.        ,
        0.36514837],
       [0.10259784, 1.        , 0.09365858, ..., 0.        , 0.        ,
        0.09365858],
       [0.18257419, 0.09365858, 1.        , ..., 0.23570226, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.23570226, ..., 1.        , 0.23570226,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23570226, 1.        ,
        0.        ],
       [0.36514837, 0.09365858, 0.        , ..., 0.        , 0.        ,
        1.        ]])

Dividimos el DataFrame con la finalidad de poder montarlo en Render:

In [80]:
cantidad_filas = len(steam_games_final)

filas_divididas = cantidad_filas // 10

modelo_render = steam_games_final.iloc[:filas_divididas]

In [81]:
modelo_render.shape

(3196, 5457)

In [82]:
similitud_modelo_render = cosine_similarity(modelo_render.iloc[:,3:])

In [83]:
modelo_render.to_parquet('Data/modelo_render.parquet')

In [88]:
def recommend_games(id):

    game = modelo_render[modelo_render['item_id'].astype(int) == id]

    idx = game.index[0]

    
    sample_size = 2000  
    df_sample = modelo_render.sample(n=sample_size, random_state=42)  

    
    similitud = cosine_similarity([modelo_render.iloc[idx, 3:]], df_sample.iloc[:, 3:])

    
    similitud = similitud[0]

   
    juegos_similares = [(i, similitud[i]) for i in range(len(similitud)) if i != idx]
    juegos_similares = sorted(juegos_similares, key=lambda x: x[1], reverse=True)

    
    ind_juegos_similares = [i[0] for i in juegos_similares[:5]]

    
    nombre_juegos_similares = df_sample['app_name'].iloc[ind_juegos_similares].tolist()

    return {'Juegos_similares': nombre_juegos_similares}

In [89]:
recommend_games(767400)

{'Juegos_similares': ['Biozone',
  'Atomic Adam: Episode 1',
  'Scooby Doo! & Looney Tunes Cartoon Universe: Adventure',
  'Raid Mode Character: Albert Wesker',
  'Raid Mode: Album Storage B']}