In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pinecone
from pinecone import Pinecone
from pinecone import Pinecone, ServerlessSpec
from sklearn.preprocessing import normalize
import warnings
import ast
from recommender import *

warnings.filterwarnings('ignore')

  from tqdm.autonotebook import tqdm


In [2]:
# Cargar los datos de juegos (descripciones)
df_juegos = pd.read_csv('../data/raw/boardgames_10000_juegos.csv')

In [3]:
df_juegos.columns

Index(['BGGId', 'Name', 'Year_Published', 'Description', 'Min_Players',
       'Max_Players', 'Min_Playtime', 'Max_Playtime', 'Average_Rating',
       'Bayesian_Average_Rating', 'Number_of_Ratings', 'Mechanics',
       'Categories'],
      dtype='object')

In [4]:
# Obtener la descripción de los juegos
df_descripciones = df_juegos[['BGGId','Name','Description', 'Mechanics', 'Categories', 'Average_Rating', 'Bayesian_Average_Rating']]  

In [5]:
df_descripciones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   BGGId                    10000 non-null  int64  
 1   Name                     10000 non-null  object 
 2   Description              9992 non-null   object 
 3   Mechanics                10000 non-null  object 
 4   Categories               10000 non-null  object 
 5   Average_Rating           10000 non-null  float64
 6   Bayesian_Average_Rating  10000 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 547.0+ KB


In [6]:
df_descripciones.dropna(inplace=True)
df_descripciones.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9992 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   BGGId                    9992 non-null   int64  
 1   Name                     9992 non-null   object 
 2   Description              9992 non-null   object 
 3   Mechanics                9992 non-null   object 
 4   Categories               9992 non-null   object 
 5   Average_Rating           9992 non-null   float64
 6   Bayesian_Average_Rating  9992 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 624.5+ KB


In [7]:
df_descripciones.sample(5)

Unnamed: 0,BGGId,Name,Description,Mechanics,Categories,Average_Rating,Bayesian_Average_Rating
4062,287749,AR Team,A cooperative hybrid game with augmented reali...,"['Area Movement', 'Cooperative Game']",['Electronic'],0.0,0.0
3466,297650,Arkeis: The Threat Beneath,The Sand Worm Expansion holds an additional st...,"['Action Points', 'Cooperative Game', 'Dice Ro...","['Expansion for Base-game', 'Adventure', 'Anci...",7.62194,5.55076
2377,242659,Game-o-Rama,Game-o-Rama is a boardgame themed around the a...,"['Area Movement', 'Pick-up and Deliver']",['Abstract Strategy'],7.25,0.0
701,220566,Siege of Stone,Publisher's blurb:&#10;&#10;The Ironfang Legio...,[],[],8.0,0.0
8553,5849,The Wonderful World of Music Game,Name that Disney tune from a toon. Electronic...,['Memory'],"[""Children's Game"", 'Electronic', 'Memory', 'M...",5.69643,0.0


In [8]:
# Convertir cadenas de texto que parecen listas a listas reales
df_descripciones['Mechanics'] = df_descripciones['Mechanics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df_descripciones['Categories'] = df_descripciones['Categories'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Verificar si las listas están correctamente formateadas
print(df_juegos[['Mechanics', 'Categories']].head())

                                           Mechanics  \
0                  ['Action Points', 'Dice Rolling']   
1                                                 []   
2                                                 []   
3  ['Dice Rolling', 'Grid Movement', 'Hexagon Gri...   
4                                                 []   

                                          Categories  
0  ['Ancient', 'Book', 'Medieval', 'Miniatures', ...  
1                                                 []  
2                                                 []  
3                        ['Wargame', 'World War II']  
4                                                 []  


In [9]:
# Convertir listas a cadenas de texto
df_descripciones['Mecanicas'] = df_descripciones['Mechanics'].apply(lambda x: ', '.join(x) if isinstance(x, list) and len(x) > 0 else "")
df_descripciones['Categorias'] = df_descripciones['Categories'].apply(lambda x: ', '.join(x) if isinstance(x, list) and len(x) > 0 else "")

# Combinar la descripción limpia con las mecánicas y categorías
df_descripciones['Descripcion_completa'] = df_descripciones['Description'] + " " + df_descripciones['Mecanicas'] + " " + df_descripciones['Categorias']

In [10]:
# Aplicar el preprocesamiento mejorado a las descripciones
df_descripciones['Descripcion_completa'] = df_descripciones['Descripcion_completa'].apply(preprocesar_descripcion)

In [11]:
df_descripciones.sample(5)

Unnamed: 0,BGGId,Name,Description,Mechanics,Categories,Average_Rating,Bayesian_Average_Rating,Mecanicas,Categorias,Descripcion_completa
9753,110306,Die interaktive Quiz-Show,With the DVD and a gimmick called GameDisk one...,[],[Trivia],0.0,0.0,,Trivia,dvd gimmick called gamedisk one play solitaire...
7788,136531,Glimpse the Beyond,From publisher blurb:&#10;&#10;The world of Gl...,[],[],0.0,0.0,,,publisher blurb world glimpse beyond much like...
477,182297,Once Upon A Time In Arabia,From publisher blurb:&#10;&#10;Put yourself at...,[],[],6.5,0.0,,,publisher blurb put heart story life danger po...
1286,158363,Black Dreams and Other Bad Juju,From publisher blurb:&#10;&#10;Black Dreams an...,[],[],0.0,0.0,,,publisher blurb black dream bad juju contains ...
9576,230088,Abandoned Village,From publisher blurb:&#10;&#10;This full color...,[],[],0.0,0.0,,,publisher blurb full color battlemap th series...


In [12]:
# Entrenar Doc2Vec usando los índices del DataFrame como etiquetas
documents = [TaggedDocument(doc.split(), [str(idx)]) for idx, doc in zip(df_descripciones.index, df_descripciones['Descripcion_completa'])]


In [13]:
model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, workers=4, seed=42)

In [14]:
# Cargar la configuración
config = cargar_configuracion()

# Obtener la API key
api_key = config['pinecone']['api_key']

In [15]:
# Inicializar Pinecone
pc = Pinecone(api_key=api_key)

In [16]:
# Nombre del índice a crear
index_name = "boardgames-recommendation"

In [None]:
# Crear el índice
pc.create_index(
    name=index_name,
    dimension=100,  # La dimensión de los embeddings
    metric="cosine",  # Similaridad del coseno
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

In [18]:
# Conectar al índice existente
index = pc.Index(index_name)

In [18]:
# Subir los nuevos embeddings normalizados
for idx, row in df_descripciones.iterrows():
    embedding_vector = model.dv[str(idx)]
    embedding_vector_normalizado = normalize([embedding_vector])[0]  # Normalizamos el embedding
    game_id = row['BGGId']
    index.upsert([(str(game_id), embedding_vector_normalizado.tolist())]) # Insertar el embedding en Pinecone

print("Embeddings subidos a Pinecone correctamente.")

Embeddings subidos a Pinecone correctamente.


In [19]:
# Prompt del usuario
prompt_usuario = "I enjoy strategy games with high replayability and complex decision-making."

In [20]:
# Ejecutar la recomendación basada solo en similaridad del coseno
recomendar_juegos_pinecone(prompt_usuario, model, index, df_descripciones, top_n=5)

Juegos recomendados:
1891 (Similaridad: 0.8310)
18SY (Similaridad: 0.8194)
The 15 Greatest Board Games in the World (Similaridad: 0.8159)
Kinder-Rommé: Bauernhoftiere (Similaridad: 0.8122)
Petropoli (Similaridad: 0.8117)


In [21]:
# Ejecutar la recomendación con la valoración
recomendar_juegos_pinecone_con_valoraciones(prompt_usuario, model, index, df_descripciones, top_n=5, w=0.7)

Juegos recomendados (priorizando valoraciones):
18SY (Similaridad ponderada: 0.8158, Similaridad: 0.8333, Valoración: 7.75)
Politiko: Sabah & Sarawak Expansion Set (Similaridad ponderada: 0.8147, Similaridad: 0.8360, Valoración: 7.65)
Piece Packing Pirates (Similaridad ponderada: 0.7936, Similaridad: 0.8404, Valoración: 6.84)
The 15 Greatest Board Games in the World (Similaridad ponderada: 0.7841, Similaridad: 0.8233, Valoración: 6.93)
The March on India, 1944 (Similaridad ponderada: 0.7691, Similaridad: 0.8235, Valoración: 6.42)


In [22]:
model.save("../model/model_doc2vec.bin")

In [23]:
df_descripciones.to_csv('../data/cleaned/data_for_recommender.csv', index=False)

In [24]:
df_descripciones

Unnamed: 0,BGGId,Name,Description,Mechanics,Categories,Average_Rating,Bayesian_Average_Rating,Mecanicas,Categorias,Descripcion_completa
0,51429,L'Art de la Guerre,From the publisher's web site:&#10;&#10;L&rsqu...,"[Action Points, Dice Rolling]","[Ancient, Book, Medieval, Miniatures, Wargame]",7.78723,5.54687,"Action Points, Dice Rolling","Ancient, Book, Medieval, Miniatures, Wargame",publisher web site l art de la guerre set rule...
1,103219,Power League III,User Summary&#10;Power League III is the third...,[],[],0.00000,0.00000,,,user summary power league iii third turbografx...
2,216316,Cooper Versus Cobbler,An Introductory Living Greyhawk Perrenland Reg...,[],[],0.00000,0.00000,,,introductory living greyhawk perrenland region...
3,292608,"Operation Compass: North Africa, December 1940","Operation Compass: North Africa, December 1940...","[Dice Rolling, Grid Movement, Hexagon Grid, Mo...","[Wargame, World War II]",7.62500,0.00000,"Dice Rolling, Grid Movement, Hexagon Grid, Mov...","Wargame, World War II",operation compass north africa december operat...
4,213696,Bounties over Brotton,A One-Round Living Greyhawk County of Urnst Re...,[],[],0.00000,0.00000,,,one round living greyhawk county urnst regiona...
...,...,...,...,...,...,...,...,...,...,...
9995,278789,No Rest for the Dead,Publisher's blurb:&#10;&#10;Unlock The Mystery...,[],[],0.00000,0.00000,,,publisher blurb unlock mystery locked tomb hir...
9996,274662,Dead Rock Spires,Publisher's blurb:&#10;&#10;Dead Rock Spires t...,[],[],0.00000,0.00000,,,publisher blurb dead rock spire take name natu...
9997,171399,Year of the Yuppie Game,A game for 25 to 40 year olds on their way up ...,[],[],0.00000,0.00000,,,game year old way ladder success yuppie young ...
9998,58036,Nytt ormspel,New Snake Game&#10;&#10;A roll and move game. ...,[],[Dice],0.00000,0.00000,,Dice,new snake game roll move game move piece numbe...
