# Propuesta Proyecto RecSys: Recomendación Grupal de Juegos de Mesa (setup)

Link de dataset: https://www.kaggle.com/datasets/threnjen/board-games-database-from-boardgamegeek?select=games.csv

In [None]:
!pip install -q kaggle
!pip install pyreclab
!pip install implicit
!pip install surprise
!pip install elliot
!pip install torch torchvision
!pip install kagglehub

Tutorial usado: https://www.kaggle.com/discussions/general/74235

Tutorial adicional usado: https://www.youtube.com/watch?v=yEXkEUqK52Q

In [None]:
import kagglehub

# Kaggle "<owner>/<dataset>" handle of the BoardGameGeek dump used by this notebook.
dataset_handle = "threnjen/board-games-database-from-boardgamegeek"

# Download the latest version and keep the location returned by kagglehub.
path = kagglehub.dataset_download(dataset_handle)

# Show where the files ended up so the location can be reused further down.
print("Path to dataset files:", path)

La celda anterior imprime un "Path to dataset files"; esa ruta se debe copiar y pegar en la variable `path_to_dataset_files` de la celda de abajo:

In [None]:
import os

# Location pasted by hand from the output of the download cell. It is only a
# fallback: it points at one specific machine and one dataset version.
path_to_dataset_files = '/home/nico/.cache/kagglehub/datasets/threnjen/board-games-database-from-boardgamegeek/versions/4'

# Prefer the directory reported by kagglehub in this session (the `path`
# variable set by the download cell) when it exists, so the notebook also runs
# on machines where the pasted path above is not valid.
_downloaded_path = globals().get('path')
if isinstance(_downloaded_path, str) and os.path.isdir(_downloaded_path):
    path_to_dataset_files = _downloaded_path

# Remember the current directory so later cells can return to it
base_dir = os.getcwd()
# Move into the dataset directory; the CSV files are read with relative names
os.chdir(path_to_dataset_files)

# Importamos las librerias
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import pyreclab
import tempfile
import implicit
import random
from surprise import accuracy

Generamos los datos a utilizar como una muestra del dataset original, pues este es muy grande.

In [None]:
# Read the ratings and the per-game mechanics using relative file names
# (an earlier cell switched the working directory to the dataset folder).
user_ratings = pd.read_csv('user_ratings.csv')
mechanics_df = pd.read_csv('mechanics.csv')
# Return to the project base directory so the generated files are written there
os.chdir(base_dir)
print(base_dir)

# Fixed seed and sizes: the user split and the row samples are identical on
# every run instead of depending on set ordering / unseeded sampling.
SPLIT_SEED = 42
N_TRAIN_USERS = 8000
N_TEST_USERS = 3000
N_TRAIN_ROWS = 35000
N_TEST_ROWS = 9000

# Replace each username with an integer user id. Ids follow the order of first
# appearance in the file; rows without a username get the sentinel -1 and are
# therefore never selected below.
user_codes, unique_usernames = pd.factorize(user_ratings["Username"])
# username -> user id lookup, kept under its original name `d`
d = {name: idx for idx, name in enumerate(unique_usernames)}
user_ratings["Username"] = user_codes

# Split the users into disjoint training and testing groups
shuffled_users = np.random.default_rng(SPLIT_SEED).permutation(len(unique_usernames))
train = shuffled_users[:N_TRAIN_USERS].tolist()
test = shuffled_users[N_TRAIN_USERS:N_TRAIN_USERS + N_TEST_USERS].tolist()

train_rows = user_ratings[user_ratings["Username"].isin(train)]
test_rows = user_ratings[user_ratings["Username"].isin(test)]
# Cap the sample sizes so the cell does not fail when fewer rows are available
train_set = train_rows.sample(min(N_TRAIN_ROWS, len(train_rows)), random_state=SPLIT_SEED)
test_set = test_rows.sample(min(N_TEST_ROWS, len(test_rows)), random_state=SPLIT_SEED)

# Write the new training and testing CSV files (both include a header row)
train_set.to_csv("train.csv", index=False, sep=',', header=True)
test_set.to_csv("test.csv", index=False, sep=',', header=True)

# Evaluación de baselines

In [None]:
# Size of the top-N lists used from here on
top_n = 10
# Preview the first `top_n` rows of the test sample
test_set.iloc[:top_n]

In [None]:
# Sanity check: report the shape of each dataframe to confirm the samples have a workable size.
for label, frame in (
    ("Tamaño del dataset completo:", user_ratings),
    ("Tamaño del dataset de entrenamiento:", train_set),
    ("Tamaño del dataset de prueba:", test_set),
):
    print(label, frame.shape)

In [None]:
# Evaluate the UserKNN baseline.
# train.csv and test.csv are written with a header row, so the reader must be
# told to skip it (header=True); otherwise the header line is parsed as a
# rating. The same file layout is shared by every pyreclab call in this cell.
csv_format = dict(dlmchar=b',', header=True, usercol=2, itemcol=0, ratingcol=1)

myUserKnn = pyreclab.UserKnn(dataset='train.csv', **csv_format)
myUserKnn.train(k=7, similarity='pearson')
# Rating-prediction metrics
_, maeUK, rmseUK = myUserKnn.test(input_file='test.csv', **csv_format)
# Top-N ranking metrics
_, mapUK, ndcgUK = myUserKnn.testrec(input_file="test.csv", topn=top_n, **csv_format)

print(f"mae = {maeUK} y rmse = {rmseUK}")
print(f"map = {mapUK} y ndcg = {ndcgUK}")

In [None]:
# Evaluate the ItemKNN baseline.
# train.csv and test.csv are written with a header row, so the reader must be
# told to skip it (header=True); otherwise the header line is parsed as a
# rating. The same file layout is shared by every pyreclab call in this cell.
csv_format = dict(dlmchar=b',', header=True, usercol=2, itemcol=0, ratingcol=1)

myItemKnn = pyreclab.ItemKnn(dataset='train.csv', **csv_format)
myItemKnn.train(k=7, similarity='pearson')
# Rating-prediction metrics
_, maeIK, rmseIK = myItemKnn.test(input_file='test.csv', **csv_format)
# Top-N ranking metrics
_, mapIK, ndcgIK = myItemKnn.testrec(input_file="test.csv", topn=top_n, **csv_format)

print(f"mae = {maeIK} y rmse = {rmseIK}")
print(f"map = {mapIK} y ndcg = {ndcgIK}")

In [None]:
# Evaluate the SVD baseline.
# train.csv and test.csv are written with a header row, so the reader must be
# told to skip it (header=True); otherwise the header line is parsed as a
# rating. The same file layout is shared by every pyreclab call in this cell.
csv_format = dict(dlmchar=b',', header=True, usercol=2, itemcol=0, ratingcol=1)

mySVD = pyreclab.SVD(dataset='train.csv', **csv_format)
mySVD.train(factors=50, maxiter=80, lr=0.1, lamb=0.5)
# Rating-prediction metrics
_, maeSVD, rmseSVD = mySVD.test(input_file='test.csv', **csv_format)
# Top-N ranking metrics
_, mapSVD, ndcgSVD = mySVD.testrec(input_file="test.csv", topn=top_n, **csv_format)

print(f"mae = {maeSVD} y rmse = {rmseSVD}")
print(f"map = {mapSVD} y ndcg = {ndcgSVD}")

In [None]:
# Evaluate the Most Popular baseline (ranking metrics only).
# train.csv and test.csv are written with a header row, so the reader must be
# told to skip it (header=True); otherwise the header line is parsed as a
# rating. The same file layout is shared by every pyreclab call in this cell.
csv_format = dict(dlmchar=b',', header=True, usercol=2, itemcol=0, ratingcol=1)

myMP = pyreclab.MostPopular(dataset='train.csv', **csv_format)
myMP.train(progress=False)
_, mapMP, ndcgMP = myMP.testrec(input_file="test.csv", topn=top_n, **csv_format)

print(f"map = {mapMP} y ndcg = {ndcgMP}")

In [None]:
# Evaluate a random-rating baseline: every test rating is "predicted" with a
# value drawn uniformly from the rating scale.
rating_scale = (1, 10)

# Dedicated, seeded generator so the baseline metrics are reproducible and the
# global `random` state is left untouched.
random_baseline_rng = random.Random(42)

# Each tuple is (user id, item id, true rating, estimated rating, details),
# the 5-field layout consumed by surprise's accuracy functions.
# The columns are zipped directly: iterrows() would build one Series per row
# and upcast the integer ids to float.
predictions = [
    (userId, itemId, rating, random_baseline_rng.uniform(rating_scale[0], rating_scale[1]), None)
    for userId, itemId, rating in zip(
        test_set["Username"], test_set["BGGId"], test_set["Rating"]
    )
]

accuracy.rmse(predictions)
accuracy.mae(predictions)


In [None]:
!pip install lightfm

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k


In [None]:

# Build, for every game in the training sample, the list of mechanics it uses.
df = train_set

# Mechanic (feature) columns: every column of mechanics_df except the game id.
mechanic_cols = mechanics_df.columns.drop('BGGId')
cols_with_id = {col: idx for idx, col in enumerate(mechanic_cols)}

# Index the mechanics table by game id once, instead of scanning the whole
# table for every training row. If a game id is repeated, the first row wins.
mechanics_by_game = mechanics_df.drop_duplicates(subset='BGGId').set_index('BGGId')

# Feature set: game id -> names of the mechanics flagged with 1
item_styles = {}
for bgg_id in df['BGGId'].unique().tolist():
    if bgg_id in mechanics_by_game.index:
        flags = mechanics_by_game.loc[bgg_id]
        styles = flags.index[flags == 1].tolist()
    else:
        # Game absent from mechanics_df: keep the item, with no features
        styles = []
    item_styles[bgg_id] = styles

print(item_styles)

# Distinct items / users of the training sample and the numeric mechanic ids
itemslist = df['BGGId'].unique()
userslist = df['Username'].unique()
stylelist = list(range(len(cols_with_id)))

In [None]:
#Conjunto de features, pero numericos

# Numeric version of the feature set: item id -> list of mechanic ids,
# translated through the `cols_with_id` lookup.
item_styles_with_ids = {
    item_id: [cols_with_id[style] for style in styles]
    for item_id, styles in item_styles.items()
}
print(item_styles_with_ids)

In [None]:
# (user id, item id, rating) triples, one per training row. The columns are
# zipped directly instead of using iterrows(), which builds a Series per row
# and upcasts the integer ids to float.
interactions = list(zip(df['Username'], df['BGGId'], df['Rating']))

In [None]:
# Collect every mechanic name that appears on at least one training item
all_features = {feature for features in item_styles.values() for feature in features}

# Register the users, items and item features with the LightFM dataset helper
dataset = Dataset()
dataset.fit(users=userslist, items=itemslist, item_features=all_features)

In [None]:
# Build the interaction matrix and its companion weights matrix from the
# (user, item, rating) triples.
(interactions_matrix, weights_matrix) = dataset.build_interactions(interactions)

# Build the item-feature matrix from the (item id, mechanic names) pairs.
item_features = dataset.build_item_features(item_styles.items())

print(interactions_matrix)
# print(item_features)  # left disabled, as in the original cell

In [None]:
# Hybrid LightFM model: 30 latent components, WARP loss.
lightfm_params = {"no_components": 30, "loss": "warp"}
model = LightFM(**lightfm_params)

# Train on the interaction matrix together with the item features.
model.fit(interactions_matrix, item_features=item_features, epochs=10, num_threads=4)

In [None]:
def recommend(model, dataset, user_ids, n_items=5, item_feature_matrix=None):
    """Return the top-scored items for each user.

    Args:
        model: Fitted model exposing ``predict(user_id, item_ids, item_features=...)``.
        dataset: Object whose ``mapping()`` returns a tuple with the
            ``{raw item id: internal item index}`` dict at position 2.
        user_ids: Iterable of user indices. Each value is passed to
            ``model.predict`` unchanged, so these must already be the model's
            internal user indices (they are not translated through
            ``dataset.mapping()``).
        n_items: Maximum number of items returned per user.
        item_feature_matrix: Item-feature matrix forwarded to ``model.predict``.
            When omitted, the notebook-level ``item_features`` is used.

    Returns:
        dict mapping each user index to its list of raw item ids, best first.
    """
    if item_feature_matrix is None:
        # Backward-compatible default: the notebook-level item_features matrix
        item_feature_matrix = item_features

    # The item mapping does not depend on the user: build the reverse lookup
    # (internal index -> raw item id) once, outside the loop.
    item_mapping = dataset.mapping()[2]
    item_id_mapping = {v: k for k, v in item_mapping.items()}
    # Score every item known to the dataset
    item_ids = np.arange(len(item_mapping))

    recommendations_per_user = {}
    for user_id in user_ids:
        scores = model.predict(user_id, item_ids, item_features=item_feature_matrix)
        # Highest score first, truncated to the requested list length
        top_items = item_ids[np.argsort(-scores)][:n_items]
        recommended_items = [item_id_mapping[item] for item in top_items]
        print(f"User {user_id} recommended items: {recommended_items}")
        recommendations_per_user[user_id] = recommended_items

    return recommendations_per_user
    

In [None]:
# Ranking quality of the model measured on the same interactions it was
# trained on, using the top 5 items per user.
eval_kwargs = {"item_features": item_features, "k": 5}
train_precision = precision_at_k(model, interactions_matrix, **eval_kwargs).mean()
train_recall = recall_at_k(model, interactions_matrix, **eval_kwargs).mean()

print(f'Train precision at k: {train_precision}')
print(f'Train Recall: {train_recall}')

In [None]:
# Produce a long (1000-item) recommendation list for each of three users.
group_user_ids = [1, 2, 3]
recommendations_group = recommend(model, dataset, group_user_ids, n_items=1000)
print(type(userslist))
print(recommendations_group)


---
---