In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pinecone
from pinecone import Pinecone
from pinecone import Pinecone, ServerlessSpec
from sklearn.preprocessing import normalize
import warnings
import ast
warnings.filterwarnings('ignore')

In [2]:
# Cargar los datos de juegos (descripciones)
df_juegos = pd.read_csv('../data_extraction/boardgames_10000_juegos.csv')

In [3]:
df_juegos.columns

Index(['BGGId', 'Name', 'Year_Published', 'Description', 'Min_Players',
       'Max_Players', 'Min_Playtime', 'Max_Playtime', 'Average_Rating',
       'Bayesian_Average_Rating', 'Number_of_Ratings', 'Mechanics',
       'Categories'],
      dtype='object')

In [4]:
# Obtener la descripción de los juegos
df_descripciones = df_juegos[['BGGId','Name','Description', 'Mechanics', 'Categories', 'Average_Rating', 'Bayesian_Average_Rating']]  

In [5]:
df_descripciones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   BGGId                    10000 non-null  int64  
 1   Name                     10000 non-null  object 
 2   Description              9992 non-null   object 
 3   Mechanics                10000 non-null  object 
 4   Categories               10000 non-null  object 
 5   Average_Rating           10000 non-null  float64
 6   Bayesian_Average_Rating  10000 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 547.0+ KB


In [6]:
df_descripciones.dropna(inplace=True)
df_descripciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9992 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   BGGId                    9992 non-null   int64  
 1   Name                     9992 non-null   object 
 2   Description              9992 non-null   object 
 3   Mechanics                9992 non-null   object 
 4   Categories               9992 non-null   object 
 5   Average_Rating           9992 non-null   float64
 6   Bayesian_Average_Rating  9992 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 624.5+ KB


In [7]:
df_descripciones.sample(5)

Unnamed: 0,BGGId,Name,Description,Mechanics,Categories,Average_Rating,Bayesian_Average_Rating
768,76000,Zork: Grand Inquisitor,From Wikipedia&#10;&#10;Zork: Grand Inquisitor...,[],[],8.34706,6.15561
874,40077,Jungle Life DVD game,"Sequel to the popular ""Wildlife DVD Boardgame""...",['Set Collection'],"['Animals', 'Educational', 'Travel', 'Trivia']",4.54167,0.0
7623,54324,Signs & Portents (Issue 70 - Jul 2009),From the Contents page:&#10;&#10; Mongoose...,[],[],6.0,0.0
2660,217249,Right Flank,A One-Round Living Greyhawk Perrenland Interac...,[],[],0.0,0.0
4157,235409,Irrgang,"Each playes has to get his pawn from ""Start"" t...","['Point to Point Movement', 'Roll / Spin and M...","[""Children's Game""]",0.0,0.0


In [8]:
# Asegúrate de descargar stopwords y wordnet de nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Función mejorada de preprocesamiento de texto
def preprocesar_descripcion(texto):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    # Eliminar entidades HTML y otros caracteres no deseados
    texto = re.sub(r'&#?\w+;', ' ', texto)  # Reemplazar entidades HTML como &#10; 
    texto = re.sub(r'\W+', ' ', texto)  # Eliminar caracteres no alfanuméricos
    texto = re.sub(r'\d+', '', texto)  # Eliminar números
    
    # Convertir a minúsculas, eliminar stopwords y lematizar (solo si el texto es válido)
    if isinstance(texto, str):
        palabras = [lemmatizer.lemmatize(palabra) for palabra in texto.lower().split() if palabra not in stop_words]
        return " ".join(palabras)
    else:
        return ""  # Devolver cadena vacía si no es un texto válido



[nltk_data] Downloading package stopwords to C:\Users\Laura
[nltk_data]     Ortiz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Laura
[nltk_data]     Ortiz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Convertir cadenas de texto que parecen listas a listas reales
df_descripciones['Mechanics'] = df_descripciones['Mechanics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df_descripciones['Categories'] = df_descripciones['Categories'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Verificar si las listas están correctamente formateadas
print(df_juegos[['Mechanics', 'Categories']].head())

                                           Mechanics  \
0                  ['Action Points', 'Dice Rolling']   
1                                                 []   
2                                                 []   
3  ['Dice Rolling', 'Grid Movement', 'Hexagon Gri...   
4                                                 []   

                                          Categories  
0  ['Ancient', 'Book', 'Medieval', 'Miniatures', ...  
1                                                 []  
2                                                 []  
3                        ['Wargame', 'World War II']  
4                                                 []  


In [10]:
# Convertir listas a cadenas de texto
df_descripciones['Mecanicas'] = df_descripciones['Mechanics'].apply(lambda x: ', '.join(x) if isinstance(x, list) and len(x) > 0 else "")
df_descripciones['Categorias'] = df_descripciones['Categories'].apply(lambda x: ', '.join(x) if isinstance(x, list) and len(x) > 0 else "")

# Combinar la descripción limpia con las mecánicas y categorías
df_descripciones['Descripcion_completa'] = df_descripciones['Description'] + " " + df_descripciones['Mecanicas'] + " " + df_descripciones['Categorias']

In [11]:
# Aplicar el preprocesamiento mejorado a las descripciones
df_descripciones['Descripcion_completa'] = df_descripciones['Descripcion_completa'].apply(preprocesar_descripcion)

In [12]:
df_descripciones.sample(5)

Unnamed: 0,BGGId,Name,Description,Mechanics,Categories,Average_Rating,Bayesian_Average_Rating,Mecanicas,Categorias,Descripcion_completa
3242,37065,Associate!,Associate! is a racy team game suitable for pa...,"[Paper-and-Pencil, Team-Based Game]","[Party Game, Print & Play, Word Game]",0.0,0.0,"Paper-and-Pencil, Team-Based Game","Party Game, Print & Play, Word Game",associate racy team game suitable party simila...
4450,290842,Drinking Quest: Liquor Before Honor,"From publisher blurb:&#10;&#10;Fight monsters,...",[],[],0.0,0.0,,,publisher blurb fight monster find treasure sl...
4804,232200,Genesis: Mounted Mapboard,The original Genesis comes with a paper map. ...,[],[],7.5,0.0,,,original genesis come paper map deluxe map x i...
8723,214873,The Signal From Tölva,User Summary&#10;&#10;A large sci-fi FPS with ...,[],[],2.5,0.0,,,user summary large sci fi fps intriguing world...
4508,222764,Basic Back to the Dungeon Role Playing Game Pl...,From publisher blurb:&#10;&#10;This is a basic...,[],[],0.0,0.0,,,publisher blurb basic version bttdrpg level se...


In [13]:
# Entrenar Doc2Vec usando los índices del DataFrame como etiquetas
documents = [TaggedDocument(doc.split(), [str(idx)]) for idx, doc in zip(df_descripciones.index, df_descripciones['Descripcion_completa'])]


In [14]:
model = Doc2Vec(documents, vector_size=50, window=2, min_count=1, workers=4, seed=42)

In [15]:
# Inicializar Pinecone
pc = Pinecone(api_key="e2659d52-b976-4624-b8b8-8de36f8ea15a")

In [16]:
# Nombre del índice que vamos a crear
index_name = "boardgames-recommendation"

In [17]:
# Crear el índice
pc.create_index(
    name=index_name,
    dimension=50,  # La dimensión de tus embeddings
    metric="cosine",  # Similaridad del coseno
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

In [18]:
# Conectar al índice existente
index = pc.Index(index_name)

# Subir los nuevos embeddings normalizados
for idx, row in df_descripciones.iterrows():
    embedding_vector = model.dv[str(idx)]
    embedding_vector_normalizado = normalize([embedding_vector])[0]  # Normalizamos el embedding
    game_id = row['BGGId']
    index.upsert([(str(game_id), embedding_vector_normalizado.tolist())]) # Insertar el embedding en Pinecone

print("Embeddings subidos a Pinecone correctamente.")

Embeddings subidos a Pinecone correctamente.


In [19]:
# Función para recomendar juegos utilizando Pinecone y valoraciones ponderadas
def recomendar_juegos_pinecone_con_valoraciones(prompt, model, index, df_juegos, top_n=5, w=0.7):
    # Generar el embedding del prompt usando Doc2Vec
    prompt_vector = model.infer_vector(prompt.split())

    # Normalizar el embedding del prompt
    prompt_vector_normalizado = normalize([prompt_vector])[0]

    # Realizar la consulta en Pinecone para obtener los juegos más similares
    result = index.query(vector=prompt_vector_normalizado.tolist(), top_k=top_n * 2, include_values=False)

    # Obtener las recomendaciones
    recomendaciones = []
    for match in result['matches']:
        game_id = match['id']
        score = match['score']  # Similaridad del coseno devuelta por Pinecone

        # Buscar el juego en el DataFrame por su ID
        game_row = df_juegos[df_juegos['BGGId'] == int(game_id)]
        if not game_row.empty:
            valoracion = game_row['Average_Rating'].values[0]
            
            if valoracion > 0:  # Filtrar juegos con valoración mayor que 0
                valoracion = valoracion / 10  # Normalizamos la valoración (si está en escala de 1 a 10)
                similaridad_ponderada = (score * w) + (valoracion * (1 - w))
                recomendaciones.append((game_row['Name'].values[0], similaridad_ponderada, score, valoracion))

    # Ordenar por similaridad ponderada de mayor a menor
    recomendaciones.sort(key=lambda x: x[1], reverse=True)

    # Mostrar los resultados
    print("Juegos recomendados:")
    for rec in recomendaciones[:top_n]:  # Solo mostramos el top_n
        print(f"{rec[0]} (Similaridad ponderada: {rec[1]:.4f}, Similaridad: {rec[2]:.4f}, Valoración: {rec[3]:.2f})")





In [20]:
# Prompt del usuario
prompt_usuario = 'I am looking for a game that is great for solo play and lasts about 30 minutes. I love games with a rich storytelling experience and immersive themes.'
# Ejecutar la recomendación
recomendar_juegos_pinecone_con_valoraciones(prompt_usuario, model, index, df_descripciones)

Juegos recomendados:
The Murder of Mr. Crow (Similaridad ponderada: 0.7427, Similaridad: 0.6325, Valoración: 1.00)
Ancestors Legacy (Similaridad ponderada: 0.5786, Similaridad: 0.5694, Valoración: 0.60)
Crazy Machines 2 (Similaridad ponderada: 0.5376, Similaridad: 0.5465, Valoración: 0.52)


In [21]:
def recomendar_juegos_pinecone_solo_similaridad(prompt, model, index, df_juegos, top_n=5):
    # Generar el embedding del prompt usando Doc2Vec
    prompt_vector = model.infer_vector(prompt.split())

    # Normalizar el embedding del prompt
    prompt_vector_normalizado = normalize([prompt_vector])[0]

    # Realizar la consulta en Pinecone para obtener los juegos más similares
    result = index.query(vector=prompt_vector_normalizado.tolist(), top_k=top_n, include_values=False)

    # Obtener las recomendaciones
    recomendaciones = []
    for match in result['matches']:
        game_id = match['id']
        score = match['score']  # Similaridad del coseno devuelta por Pinecone

        # Buscar el juego en el DataFrame por su ID
        game_row = df_juegos[df_juegos['BGGId'] == int(game_id)]
        if not game_row.empty:
            recomendaciones.append((game_row['Name'].values[0], score))

    # Ordenar por la similaridad de mayor a menor
    recomendaciones.sort(key=lambda x: x[1], reverse=True)

    # Mostrar los resultados
    print("Juegos recomendados (basado solo en la similaridad del coseno):")
    for rec in recomendaciones:
        print(f"{rec[0]} (Similaridad del coseno: {rec[1]:.4f})")



In [27]:
# Ejecutar la recomendación basada solo en similaridad del coseno
recomendar_juegos_pinecone_solo_similaridad(prompt_usuario, model, index, df_descripciones)

Juegos recomendados (basado solo en la similaridad del coseno):
The Murder of Mr. Crow (Similaridad del coseno: 0.6149)
SEGA 3D Classics Collection (Similaridad del coseno: 0.5976)
Hidden Evil (Similaridad del coseno: 0.5873)
Little Heroes (Similaridad del coseno: 0.5622)
Knights of the Round Table (Similaridad del coseno: 0.5428)


In [23]:
model.save("model_doc2vec.bin")

In [24]:
df_descripciones.to_csv('data_for_recommender.csv', index=False)

In [25]:
df_descripciones

Unnamed: 0,BGGId,Name,Description,Mechanics,Categories,Average_Rating,Bayesian_Average_Rating,Mecanicas,Categorias,Descripcion_completa
0,51429,L'Art de la Guerre,From the publisher's web site:&#10;&#10;L&rsqu...,"[Action Points, Dice Rolling]","[Ancient, Book, Medieval, Miniatures, Wargame]",7.78723,5.54687,"Action Points, Dice Rolling","Ancient, Book, Medieval, Miniatures, Wargame",publisher web site l art de la guerre set rule...
1,103219,Power League III,User Summary&#10;Power League III is the third...,[],[],0.00000,0.00000,,,user summary power league iii third turbografx...
2,216316,Cooper Versus Cobbler,An Introductory Living Greyhawk Perrenland Reg...,[],[],0.00000,0.00000,,,introductory living greyhawk perrenland region...
3,292608,"Operation Compass: North Africa, December 1940","Operation Compass: North Africa, December 1940...","[Dice Rolling, Grid Movement, Hexagon Grid, Mo...","[Wargame, World War II]",7.62500,0.00000,"Dice Rolling, Grid Movement, Hexagon Grid, Mov...","Wargame, World War II",operation compass north africa december operat...
4,213696,Bounties over Brotton,A One-Round Living Greyhawk County of Urnst Re...,[],[],0.00000,0.00000,,,one round living greyhawk county urnst regiona...
...,...,...,...,...,...,...,...,...,...,...
9995,278789,No Rest for the Dead,Publisher's blurb:&#10;&#10;Unlock The Mystery...,[],[],0.00000,0.00000,,,publisher blurb unlock mystery locked tomb hir...
9996,274662,Dead Rock Spires,Publisher's blurb:&#10;&#10;Dead Rock Spires t...,[],[],0.00000,0.00000,,,publisher blurb dead rock spire take name natu...
9997,171399,Year of the Yuppie Game,A game for 25 to 40 year olds on their way up ...,[],[],0.00000,0.00000,,,game year old way ladder success yuppie young ...
9998,58036,Nytt ormspel,New Snake Game&#10;&#10;A roll and move game. ...,[],[Dice],0.00000,0.00000,,Dice,new snake game roll move game move piece numbe...
