## Modelado de Datos para Funciones de Machine Learning

Importación de librerias necesarias:

In [28]:
import pandas as pd
import numpy as np
import ast
import FuncionExtra as f

import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator

#import pyarrow as pa
#import pyarrow.parquet as pq

### Modelo de recomendación item - item:

Generamos un modelo de recomendación en base a las **tags** de cada item.

In [29]:
steam_games = pd.read_csv('steam_games_limpio.csv', encoding='utf-8')



In [30]:
steam_games.drop_duplicates(subset= 'id', inplace= True)

Extraemos solo las columnas que utilizaremos para nuestro modelo de recomendación:

In [31]:
steam_games = steam_games[['id', 'app_name', 'tags']]

Visualizamos el DataFrame resultante:

In [32]:
steam_games

Unnamed: 0,id,app_name,tags
0,761140.0,Lost Summoner Kitty,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim..."
5,643980.0,Ironbound,"['Free to Play', 'Strategy', 'Indie', 'RPG', '..."
9,670290.0,Real Pool 3D - Poolians,"['Free to Play', 'Simulation', 'Sports', 'Casu..."
14,767400.0,弹炸人2222,"['Action', 'Adventure', 'Casual']"
17,773570.0,Log Challenge,"['Action', 'Indie', 'Casual', 'Sports']"
...,...,...,...
74821,773640.0,Colony On Mars,"['Strategy', 'Indie', 'Casual', 'Simulation']"
74825,733530.0,LOGistICAL: South Africa,"['Strategy', 'Indie', 'Casual']"
74828,610660.0,Russian Roads,"['Indie', 'Simulation', 'Racing']"
74831,658870.0,EXIT 2 - Directions,"['Indie', 'Casual', 'Puzzle', 'Singleplayer', ..."


Información general:

In [33]:
steam_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32133 entries, 0 to 74833
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        32132 non-null  float64
 1   app_name  32131 non-null  object 
 2   tags      31970 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1004.2+ KB


Porcentaje de nulos:

In [34]:
f.porcentaje_valores_nulos(steam_games)

La columna id tiene un  0.00 % de valores nulos
La columna app_name tiene un  0.01 % de valores nulos
La columna tags tiene un  0.51 % de valores nulos


In [35]:
steam_games

Unnamed: 0,id,app_name,tags
0,761140.0,Lost Summoner Kitty,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim..."
5,643980.0,Ironbound,"['Free to Play', 'Strategy', 'Indie', 'RPG', '..."
9,670290.0,Real Pool 3D - Poolians,"['Free to Play', 'Simulation', 'Sports', 'Casu..."
14,767400.0,弹炸人2222,"['Action', 'Adventure', 'Casual']"
17,773570.0,Log Challenge,"['Action', 'Indie', 'Casual', 'Sports']"
...,...,...,...
74821,773640.0,Colony On Mars,"['Strategy', 'Indie', 'Casual', 'Simulation']"
74825,733530.0,LOGistICAL: South Africa,"['Strategy', 'Indie', 'Casual']"
74828,610660.0,Russian Roads,"['Indie', 'Simulation', 'Racing']"
74831,658870.0,EXIT 2 - Directions,"['Indie', 'Casual', 'Puzzle', 'Singleplayer', ..."


Eliminamos los nulos, por más que su porcentaje sea bajo:

In [36]:
steam_games = steam_games.dropna()

Vemos el tipo de dato de la columna **tags**:

In [37]:
type(steam_games['tags'][0])

str

Lo cambiamos a lista:

In [38]:
steam_games['tags'] = steam_games['tags'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  steam_games['tags'] = steam_games['tags'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])


Verificamos:

In [39]:
type(steam_games['tags'][0])

list

Separamos las etiquetas:

In [40]:
games = steam_games['tags'].apply(lambda x: ','.join(map(str, x)))
games = games.str.split(',', expand=True)


Verificamos:

In [41]:
games

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,Strategy,Action,Indie,Casual,Simulation,,,,,,,,,,,,,,,
5,Free to Play,Strategy,Indie,RPG,Card Game,Trading Card Game,Turn-Based,Fantasy,Tactical,Dark Fantasy,Board Game,PvP,2D,Competitive,Replay Value,Character Customization,Female Protagonist,Difficult,Design & Illustration,
9,Free to Play,Simulation,Sports,Casual,Indie,Multiplayer,,,,,,,,,,,,,,
14,Action,Adventure,Casual,,,,,,,,,,,,,,,,,
17,Action,Indie,Casual,Sports,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74821,Strategy,Indie,Casual,Simulation,,,,,,,,,,,,,,,,
74825,Strategy,Indie,Casual,,,,,,,,,,,,,,,,,
74828,Indie,Simulation,Racing,,,,,,,,,,,,,,,,,
74831,Indie,Casual,Puzzle,Singleplayer,Atmospheric,Relaxing,,,,,,,,,,,,,,


Generamos Dummies:

In [42]:
games = pd.get_dummies(games)

In [43]:
games

Unnamed: 0,0_2D,0_2D Fighter,0_360 Video,0_3D Platformer,0_4 Player Local,0_4X,0_Action,0_Action RPG,0_Adventure,0_America,...,19_Underwater,19_Utilities,19_VR,19_Video Production,19_Violent,19_Visual Novel,19_Voxel,19_Walking Simulator,19_Wargame,19_Zombies
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74821,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74825,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Concatenamos ambos DataFrames:

In [44]:
steam_games_final = pd.concat([steam_games, games], axis= 1)

In [45]:
steam_games_final

Unnamed: 0,id,app_name,tags,0_2D,0_2D Fighter,0_360 Video,0_3D Platformer,0_4 Player Local,0_4X,0_Action,...,19_Underwater,19_Utilities,19_VR,19_Video Production,19_Violent,19_Visual Novel,19_Voxel,19_Walking Simulator,19_Wargame,19_Zombies
0,761140.0,Lost Summoner Kitty,"[Strategy, Action, Indie, Casual, Simulation]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,643980.0,Ironbound,"[Free to Play, Strategy, Indie, RPG, Card Game...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,670290.0,Real Pool 3D - Poolians,"[Free to Play, Simulation, Sports, Casual, Ind...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,767400.0,弹炸人2222,"[Action, Adventure, Casual]",0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
17,773570.0,Log Challenge,"[Action, Indie, Casual, Sports]",0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74821,773640.0,Colony On Mars,"[Strategy, Indie, Casual, Simulation]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74825,733530.0,LOGistICAL: South Africa,"[Strategy, Indie, Casual]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74828,610660.0,Russian Roads,"[Indie, Simulation, Racing]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74831,658870.0,EXIT 2 - Directions,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Reindexamos:

In [46]:
steam_games_final.reset_index(drop=True, inplace=True)

Guardamos el dataset en un archivo parquet:

In [47]:
steam_games_final.to_parquet('steamgames_items_items.parquet')

Generamos el coeficiente del coseno:

In [52]:
similitud = cosine_similarity(steam_games_final.iloc[:,3:])

In [53]:
similitud.shape

(31969, 31969)

In [54]:
similitud

array([[1.        , 0.10259784, 0.18257419, ..., 0.        , 0.        ,
        0.36514837],
       [0.10259784, 1.        , 0.09365858, ..., 0.        , 0.        ,
        0.09365858],
       [0.18257419, 0.09365858, 1.        , ..., 0.23570226, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.23570226, ..., 1.        , 0.23570226,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23570226, 1.        ,
        0.        ],
       [0.36514837, 0.09365858, 0.        , ..., 0.        , 0.        ,
        1.        ]])

Pasamos a DataFrame la matriz generada:

In [55]:
similitud_df = pd.DataFrame(similitud, index= steam_games['app_name'], columns=steam_games['app_name'])

In [56]:
similitud_df.head()

app_name,Lost Summoner Kitty,Ironbound,Real Pool 3D - Poolians,弹炸人2222,Log Challenge,Battle Royale Trainer,SNOW - All Access Basic Pass,SNOW - All Access Pro Pass,SNOW - All Access Legend Pass,Race,...,The spy who shot me™,Raining blocks,Bravium,BAE 2,Kebab it Up!,Colony On Mars,LOGistICAL: South Africa,Russian Roads,EXIT 2 - Directions,Maze Run VR
app_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lost Summoner Kitty,1.0,0.102598,0.182574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.258199,0.0,0.0,0.0,0.0,0.223607,0.258199,0.0,0.0,0.365148
Ironbound,0.102598,1.0,0.093659,0.0,0.0,0.0,0.114708,0.114708,0.114708,0.0,...,0.132453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093659
Real Pool 3D - Poolians,0.182574,0.093659,1.0,0.0,0.0,0.0,0.204124,0.204124,0.204124,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235702,0.0,0.0
弹炸人2222,0.0,0.0,0.0,1.0,0.57735,0.408248,0.0,0.0,0.0,0.0,...,0.666667,0.0,0.0,0.0,0.516398,0.288675,0.333333,0.0,0.0,0.235702
Log Challenge,0.0,0.0,0.0,0.57735,1.0,0.176777,0.5,0.5,0.5,0.0,...,0.288675,0.0,0.0,0.0,0.67082,0.5,0.57735,0.0,0.0,0.0


Probamos un juego para tener una referencia cuando probemos nuestra función:

In [97]:
similitud_df['Shadow Ops: Red Mercury'].sort_values(ascending=False)

app_name
Shadow Ops: Red Mercury                          1.000000
Call of Duty®: Advanced Warfare - Season Pass    0.750000
QUAKE II Mission Pack: The Reckoning             0.750000
Project: Snowblind                               0.750000
Call of Duty®: Ghosts - Wolf Skin                0.707107
                                                   ...   
Space Geekz - The Crunchy Flakes Conspiracy      0.000000
Speebot                                          0.000000
Gold Rush: The Game                              0.000000
BAFL - Brakes Are For Losers                     0.000000
Maze Run VR                                      0.000000
Name: Shadow Ops: Red Mercury, Length: 31969, dtype: float64

Ordenamos los indices de los juegos más similares de manera descendente, y luego seleccionamos los primeros 5:

In [58]:
similitud = np.argsort(-similitud, axis=1)[:, 1:6]

Guardamos lamatriz en un objeto Numpy Binario:

In [59]:
np.save('similitud.npy', similitud)

Creamos nuestra función:

In [4]:
def game_recomendation(id):
    cosine_sim = np.load('./similitud.npy')

    idx = steam_games_final[steam_games_final['id'] == float(id)].index[0]

    rec_indices = cosine_sim[idx]
    rec_games = steam_games_final.iloc[rec_indices]['app_name']

    print(f'TOP 5 juegos similares a {id}:')
    print('-----' * 8)

    recomendaciones = []  # Lista para almacenar las recomendaciones

    for count, game_id in enumerate(rec_games, start=1):
        recomendaciones.append(f'Número {count}: {game_id}')

        # Limitar a 5 recomendaciones
        if count == 5:
            break


Buscamos el **id** del juego que antes habiamos probado:

In [109]:
steam_games[steam_games['app_name'] == 'Shadow Ops: Red Mercury']

Unnamed: 0,id,app_name,tags
68414,286770.0,Shadow Ops: Red Mercury,"[Action, FPS, First-Person, Shooter]"


Y podemos verificar que nuestro sistema de recomendación funciona perfectamente:

In [3]:
game_recomendation(286770)

NameError: name 'np' is not defined