# Recomendación Juego

In [14]:
import pandas as pd 
import numpy as np
import nltk

sistema de recomendación item-item:

def recomendacion_juego( id de producto ): Ingresando el id de producto, deberíamos recibir una lista con 5 juegos recomendados similares al ingresado.

## Carga de data

In [15]:
## Cargamos la data necesaria para solucionar este problema.

# steam_games_col = ['id','app_name','publisher','genres','price','developer','release_year']
steam_games_col = ['id','app_name','genres']
# steam_colums = ['id','app_name','release_year']
steam_games = pd.read_csv('./Datasets/steam_games_cleaned.csv.gz',compression='gzip',usecols=steam_games_col)


user_items_cols = ['item_id']
user_items = pd.read_csv('./Datasets/user_items_cleaned.csv.gz',compression='gzip',usecols=user_items_cols)

user_review_col = ['item_id']
user_review = pd.read_csv('./Datasets/user_review_cleaned.csv.gz',compression='gzip', lineterminator='\n',usecols=user_review_col)


## renombraremos la columna id por steam_id
steam_games.rename(columns={'id':'item_id'}, inplace=True)


### Selección de usuarios 

- Dado que tenemos memoria limitada y espacio limitado vamos a seleccionar a los usuarios que se encuentran en steam y tambien hayan realizado algun comentario. Para ello realizaremos un merge con las tablas de información 

In [16]:
merge_1 = steam_games.merge(user_review, on='item_id',how='inner').groupby(['item_id']).first()

In [None]:
merge_1

Unnamed: 0_level_0,genres,app_name
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Counter-Strike
20,"['Action', 'FPS', 'Multiplayer', 'Classic', 'S...",Team Fortress Classic
30,"['FPS', 'World War II', 'Multiplayer', 'Action...",Day of Defeat
40,"['Action', 'FPS', 'Multiplayer', 'Classic', 'S...",Deathmatch Classic
50,"['FPS', 'Action', 'Sci-fi', 'Singleplayer', 'C...",Half-Life: Opposing Force
...,...,...
421770,"['Strategy', 'Massively Multiplayer', 'Indie',...",Pool Nation FX - Unlock Online
421890,"['Action', 'Casual', 'Simulation']",Avaris 2: The Return of the Empress
423120,"['Indie', 'RPG', 'Choose Your Own Adventure', ...",Community College Hero: Trial by Fire
423880,"['Free to Play', 'Anime', 'Visual Novel', 'Ind...",Carpe Diem


In [None]:
merge_2 = merge_1.merge(user_items, on='item_id', how='inner').groupby(['item_id']).first().reset_index()

In [None]:
merge_2

Unnamed: 0,item_id,genres,app_name
0,10,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Counter-Strike
1,20,"['Action', 'FPS', 'Multiplayer', 'Classic', 'S...",Team Fortress Classic
2,30,"['FPS', 'World War II', 'Multiplayer', 'Action...",Day of Defeat
3,40,"['Action', 'FPS', 'Multiplayer', 'Classic', 'S...",Deathmatch Classic
4,50,"['FPS', 'Action', 'Sci-fi', 'Singleplayer', 'C...",Half-Life: Opposing Force
...,...,...,...
2256,421630,"['Indie', 'RPG', 'Steampunk', 'Choose Your Own...",A Study in Steampunk: Choice by Gaslight
2257,421890,"['Action', 'Casual', 'Simulation']",Avaris 2: The Return of the Empress
2258,423120,"['Indie', 'RPG', 'Choose Your Own Adventure', ...",Community College Hero: Trial by Fire
2259,423880,"['Free to Play', 'Anime', 'Visual Novel', 'Ind...",Carpe Diem


## top 10 genero

In [None]:
from collections import Counter
from itertools import chain

def select_n_best(caracteristica, n = 10 ):
    
    # Dividir las palabras en cada fila y contar la frecuencia de cada palabra
    conteo_palabras = Counter(chain.from_iterable(caracteristica.str.split(', ')))

    # Obtener las n palabras más comunes
    palabras_mas_comunes = conteo_palabras.most_common(n)

    # print("Palabras más comunes:")
    # for palabra, frecuencia in palabras_mas_comunes:
    #     print(f"{palabra}: {frecuencia}")
    
    return([palabras_mas_comunes[i][0] for i in range(n)])

In [None]:
## Tomamos los n generos mas significativos
palabras_mas_relevantes = select_n_best(steam_games['genres'])

## convertir lista en texto
text_palabras = (', ').join(palabras_mas_relevantes)

def drop_not_important(genres):
  return(', ').join([palabra for palabra in genres.split(', ') if palabra.capitalize() in palabras_mas_relevantes])

merge_2['genres'] = merge_2['genres'].apply(drop_not_important)

## concatenar features

In [None]:
merge_2

Unnamed: 0,item_id,genres,app_name
0,10,,Counter-Strike
1,20,,Team Fortress Classic
2,30,,Day of Defeat
3,40,,Deathmatch Classic
4,50,,Half-Life: Opposing Force
...,...,...,...
2256,421630,,A Study in Steampunk: Choice by Gaslight
2257,421890,,Avaris 2: The Return of the Empress
2258,423120,,Community College Hero: Trial by Fire
2259,423880,,Carpe Diem


In [None]:
cols_concatenate = ['app_name','genres']

merge_2['features'] = merge_2[cols_concatenate].apply(lambda row: ', '.join(row), axis=1)

merge_2

Unnamed: 0,item_id,genres,app_name,features
0,10,,Counter-Strike,"Counter-Strike,"
1,20,,Team Fortress Classic,"Team Fortress Classic,"
2,30,,Day of Defeat,"Day of Defeat,"
3,40,,Deathmatch Classic,"Deathmatch Classic,"
4,50,,Half-Life: Opposing Force,"Half-Life: Opposing Force,"
...,...,...,...,...
2256,421630,,A Study in Steampunk: Choice by Gaslight,"A Study in Steampunk: Choice by Gaslight,"
2257,421890,,Avaris 2: The Return of the Empress,"Avaris 2: The Return of the Empress,"
2258,423120,,Community College Hero: Trial by Fire,"Community College Hero: Trial by Fire,"
2259,423880,,Carpe Diem,"Carpe Diem,"


In [None]:
consulta_06 = merge_2[['item_id','features']]
consulta_06.to_csv('recomendacion_juego.csv.gz',compression='gzip', index=False)

## Carga tabla consulta

In [17]:
consulta_06 = pd.read_csv('recomendacion_juego.csv.gz', compression='gzip')

## Lectura

In [18]:

import nltk
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#Eliminaremos las stopwords

stop_words_steams = ['aaaaaa', 'ab', 'abbey','abe', 'abramenko']
stop = list(stopwords.words('english'))
stop += stop_words_steams


tf = TfidfVectorizer(stop_words=stop, token_pattern=r'\b[a-zA-Z]\w+\b' )

data_vector = tf.fit_transform(consulta_06['features'])

data_vector_df = pd.DataFrame(data_vector.toarray(), index=consulta_06['item_id'], columns = tf.get_feature_names_out())


In [19]:
data_vector_df.sort_values('absolute',ascending=False).head(10)

Unnamed: 0_level_0,absolute,absolution,abyss,abyssal,academy,ace,act,action,activity,actual,...,zestiria,zigfrak,ziggurat,zoeds,zombi,zombie,zombies,zomboid,zoombinis,zuma
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
320140,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57600,0.59481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Matriz de correlación

In [20]:
vector_similitud_coseno = cosine_similarity(data_vector_df.values)

cos_sim_df = pd.DataFrame(vector_similitud_coseno, index=data_vector_df.index, columns=data_vector_df.index)

In [21]:
cos_sim_df.iloc[0:10 ,0:10]

item_id,10,20,30,40,50,60,70,80,130,220
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.665381,0.0,0.0
20,0.0,1.0,0.0,0.340948,0.0,0.0,0.0,0.0,0.0,0.0
30,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0.0,0.340948,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.0,0.0,0.0,0.0,1.0,0.0,0.629223,0.0,0.398896,0.629223
60,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
70,0.0,0.0,0.0,0.0,0.629223,0.0,1.0,0.0,0.633951,1.0
80,0.665381,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
130,0.0,0.0,0.0,0.0,0.398896,0.0,0.633951,0.0,1.0,0.633951
220,0.0,0.0,0.0,0.0,0.629223,0.0,1.0,0.0,0.633951,1.0


## Consulta

In [22]:
item_id = 284950.0	 

juegos_similares = cos_sim_df.loc[item_id].nlargest(6)

top5 = juegos_similares.iloc[1:6]

top5

item_id
365900    0.384982
313040    0.351169
370480    0.345343
342970    0.337999
8970      0.327975
Name: 284950, dtype: float64

In [23]:

name_consulta = consulta_06.set_index('item_id').loc[item_id].values[0].split(',')[0]

In [24]:
name_consulta

'Pixel Puzzles: Japan'

In [25]:
consulta_06.set_index('item_id').loc[top5.index]

Unnamed: 0_level_0,features
item_id,Unnamed: 1_level_1
365900,"Pixel Dungeon,"
313040,"Pixel Hunter,"
370480,"Pixel Galaxy,"
342970,"Pixel Survivors,"
8970,"Axel & Pixel,"


In [26]:
resultado = consulta_06.set_index('item_id').loc[top5.index]['features'].apply(lambda x: x.split(',')[0]).values
print(f"Los juegos similares a {name_consulta} son :\n")
for name in resultado:
  print("\n",name)

Los juegos similares a Pixel Puzzles: Japan son :


 Pixel Dungeon

 Pixel Hunter

 Pixel Galaxy

 Pixel Survivors

 Axel & Pixel


In [29]:
def recomendacion_juego_v2(item_id :int):
  
  consulta_06 = pd.read_csv('recomendacion_juego.csv.gz',compression='gzip')
  
  nombre_juego = consulta_06.set_index('item_id').loc[item_id].values[0].split(',')[0]

  #Eliminaremos las stopwords

  stop_words_steams = ['aaaaaa', 'ab', 'abbey','abe', 'abramenko']
  stop = list(stopwords.words('english'))
  stop += stop_words_steams


  tf = TfidfVectorizer(stop_words=stop, token_pattern=r'\b[a-zA-Z]\w+\b' )

  data_vector = tf.fit_transform(consulta_06['features'])

  data_vector_df = pd.DataFrame(data_vector.toarray(), index=consulta_06['item_id'], columns = tf.get_feature_names_out())
    
  vector_similitud_coseno = cosine_similarity(data_vector_df.values)
  
  cos_sim_df = pd.DataFrame(vector_similitud_coseno, index=data_vector_df.index, columns=data_vector_df.index)
  
  ##top5
  juegos_similares = cos_sim_df.loc[item_id].nlargest(6)

  top5 = juegos_similares.iloc[1:6]
  
  
  resultado = consulta_06.set_index('item_id').loc[top5.index]['features'].apply(lambda x: x.split(',')[0]).values
  print(f"Los juegos similares a {nombre_juego} son :\n")
  for name in resultado:
    print("\n",name)
    
  resultado = consulta_06.set_index('item_id').loc[top5.index]['features'].apply(lambda x: x.split(',')[0]).values
    
  return list(resultado)
  

In [30]:
recomendacion_juego_v2(252490.0)

Los juegos similares a Rust son :


 Counter-Strike

 Team Fortress Classic

 Day of Defeat

 Deathmatch Classic

 Half-Life: Opposing Force


['Counter-Strike',
 'Team Fortress Classic',
 'Day of Defeat',
 'Deathmatch Classic',
 'Half-Life: Opposing Force']

Primero, se extrae el nombre del juego asociado al item_id proporcionado. Luego, se eliminan las stopwords y se utiliza la técnica TF-IDF para vectorizar las características de los juegos. Se calcula la similitud coseno entre los vectores de características, y se obtiene un DataFrame que almacena las similitudes entre los juegos.

La función identifica los cinco juegos más similares al juego dado y los presenta en orden descendente de similitud coseno. Finalmente, devuelve una lista con los nombres de los juegos recomendados.

Es importante destacar que la función podría mejorar su modularidad dividiendo algunas de sus operaciones en funciones más pequeñas, lo que facilitaría la comprensión y el mantenimiento del código. Además, se podría considerar el manejo de errores y excepciones para mejorar la robustez de la función.





