# Propuesta Proyecto RecSys: Recomendación Grupal de Juegos de Mesa (setup)

Link de dataset: https://www.kaggle.com/datasets/threnjen/board-games-database-from-boardgamegeek?select=games.csv

In [None]:
!pip install -q kaggle
!pip install pyreclab
!pip install implicit
!pip install surprise
!pip install elliot
!pip install kagglehub

Tutorial usado: https://www.kaggle.com/discussions/general/74235

Tutorial adicional usado: https://www.youtube.com/watch?v=yEXkEUqK52Q

In [None]:
import kagglehub

# Download the latest version of the BoardGameGeek dataset; the location
# returned by kagglehub is kept in `path` and reused by the next cell.
path = kagglehub.dataset_download("threnjen/board-games-database-from-boardgamegeek")

print("Path to dataset files:", path)

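Arriba de esto aparecerá un "Path to dataset files". La celda de abajo usa ese path automáticamente a través de la variable `path`; solo si se quiere fijar la ruta a mano hay que copiarla y pegarla en una de las líneas comentadas: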

In [None]:
# Use the location returned by kagglehub. The commented-out assignments below
# are examples of hard-coded locations that can be enabled instead.
path_to_dataset_files = path
#path_to_dataset_files = '/home/nico/.cache/kagglehub/datasets/threnjen/board-games-database-from-boardgamegeek/versions/4'
#path_to_dataset_files = '/root/.cache/kagglehub/datasets/threnjen/board-games-database-from-boardgamegeek/versions/4'

import os
# Remember the current working directory so it can be restored later
base_dir = os.getcwd()
# Switch to the directory that holds the dataset files
os.chdir(path_to_dataset_files)

# Importamos las librerias
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import pyreclab
import tempfile
import implicit
import random
from surprise import accuracy

from abc import ABC, abstractmethod
import random

Generamos los datos a utilizar como un muestreo del dataset original, pues este es muy grande.

In [None]:
# Leemos el csv y volvemos al directorio base del proyecto
user_ratings = pd.read_csv('user_ratings.csv')
mechanics_df = pd.read_csv('mechanics.csv')
# Volvemos al directorio base del proyecto
os.chdir(base_dir)
print(base_dir)

# Cambiamos username por un userid
a=list(set(list(user_ratings.Username)))
d = {a[i]: i for i in range(len(a))}
a_mod = [d[i] for i in list(user_ratings.Username)]
user_ratings["Username"] = a_mod

# Separamos training y testing
train     = list(set(user_ratings.Username))[:8000]
test      = list(set(user_ratings.Username))[8000:11000]
test_set  = user_ratings[user_ratings["Username"].isin(test)].sample(9000)
train_set = user_ratings[user_ratings["Username"].isin(train)].sample(35000)

# Generamos nuevo csv de training y testing
train_set.to_csv("train.csv", index=False, sep=',', header=True)
test_set.to_csv("test.csv", index=False, sep=',', header=True)

# Creación de grupos

## Código importado

In [None]:
# Groups generator from: https://github.com/barnap/group-recommenders-offline-evaluation/blob/main/synthetic_groups_generation/groups_generators.py

class GroupsGenerator(ABC):
    """Abstract base of the synthetic group generators, plus a factory for them."""

    @staticmethod
    def getGroupsGenerator(type):
        """Return a new generator for the given strategy name.

        :param type: "RANDOM", "SIMILAR", "DIVERGENT" or "SIMILAR_ONE_DIVERGENT".
        :return: a generator instance, or None when the name is not recognised.
        """
        # The concrete classes are defined after this one, so they are only
        # looked up when the factory is called.
        if type == "RANDOM":
            return RandomGroupsGenerator()
        elif type == "SIMILAR":
            return SimilarGroupsGenerator()
        elif type == "DIVERGENT":
            return DivergentGroupsGenerator()
        elif type == "SIMILAR_ONE_DIVERGENT":
            return MinorityGroupsGenerator()
        return None

    @staticmethod
    def compute_average_similarity(group, user_id_indexes, sim_matrix):
        """Return the mean similarity over all ordered pairs of distinct group members.

        :param group: user ids of the group members.
        :param user_id_indexes: user ids; the position of an id is its row/column in sim_matrix.
        :param sim_matrix: similarity matrix indexable as ``sim_matrix[i][j]``.
        :return: the mean pairwise similarity, or NaN when the group has no pair of distinct ids.
        :raises ValueError: if a group member is not present in user_id_indexes.
        """
        # Convert once (works for ndarrays and plain sequences) instead of
        # rebuilding the list for every pair of members.
        id_list = np.asarray(user_id_indexes).tolist()
        positions = [(member, id_list.index(member)) for member in group]
        similarities = [
            sim_matrix[row][col]
            for member_1, row in positions
            for member_2, col in positions
            if member_1 != member_2
        ]
        if not similarities:
            # Avoids numpy's "mean of empty slice" warning; the result is still NaN.
            return np.nan
        return np.mean(similarities)

    @abstractmethod
    def generateGroups(self, user_id_indexes, user_id_set, similarity_matrix, group_sizes_to_create, group_number_to_create):
        """Build synthetic groups; concrete strategies implement this.

        :param user_id_indexes: user ids ordered like the rows of similarity_matrix.
        :param user_id_set: collection of user ids to draw members from.
        :param similarity_matrix: user-user similarity matrix.
        :param group_sizes_to_create: iterable of group sizes.
        :param group_number_to_create: number of groups to build for each size.
        """
        pass


class RandomGroupsGenerator(GroupsGenerator):
    """Builds groups by drawing members at random, without looking at similarities."""

    def generateGroups(self, user_id_indexes, user_id_set, similarity_matrix, group_sizes_to_create, group_number_to_create):
        """Create ``group_number_to_create`` random groups for every requested size.

        :param user_id_indexes: user ids ordered like the rows of similarity_matrix.
        :param user_id_set: collection of user ids to draw members from.
        :param similarity_matrix: user-user similarity matrix (only used to report avg_similarity).
        :param group_sizes_to_create: iterable of group sizes.
        :param group_number_to_create: number of groups to build for each size.
        :return: list of dicts with keys group_size, group_similarity, group_members, avg_similarity.
        :raises ValueError: if a group size exceeds the number of available users.
        """
        # random.sample() requires a sequence: passing a set is rejected since
        # Python 3.11, so the population is materialised once up front.
        population = list(user_id_set)
        groups_list = list()
        for group_size in group_sizes_to_create:
            for _ in range(group_number_to_create):
                group = random.sample(population, group_size)
                groups_list.append(
                    {
                        "group_size": group_size,
                        "group_similarity": 'random',
                        "group_members": group,
                        "avg_similarity": GroupsGenerator.compute_average_similarity(group, user_id_indexes, similarity_matrix)
                    }
                )
            # Progress: total number of groups created so far
            print(len(groups_list))
        return groups_list


class SimilarGroupsGenerator(GroupsGenerator):
    """Builds groups in which every added member is similar to someone already in the group."""

    @staticmethod
    def select_user_for_sim_group(group, sim_matrix, user_id_indexes, sim_threshold=0.4):
        '''
        Helper for building similar groups. Given the members selected so far, randomly pick
        one of the remaining users whose similarity to at least one current member is
        >= sim_threshold.
        :param group: user ids already in the group.
        :param sim_matrix: user-user similarity matrix; row i belongs to user_id_indexes[i].
        :param user_id_indexes: array of user ids ordered like the rows of sim_matrix.
        :param sim_threshold: minimum similarity for a user to be a candidate.
        :return: the selected user id, or None when there is no candidate.
        '''
        # Build the id list once rather than once per member.
        id_list = user_id_indexes.tolist()
        ids_to_select_from = set()
        for member in group:
            member_index = id_list.index(member)
            indexes = np.where(sim_matrix[member_index] >= sim_threshold)[0].tolist()
            ids_to_select_from.update(user_id_indexes[index] for index in indexes)
        candidate_ids = ids_to_select_from.difference(group)
        if not candidate_ids:
            return None
        # random.sample() rejects sets since Python 3.11, hence the list().
        return random.sample(list(candidate_ids), 1)[0]

    def generateGroups(self, user_id_indexes, user_id_set, similarity_matrix, group_sizes_to_create, group_number_to_create):
        """Create ``group_number_to_create`` groups of similar users for every requested size.

        A group starts from one random user and grows one similar user at a time
        (threshold 0.5); groups that cannot reach the requested size are discarded.

        NOTE: the loop only stops once enough complete groups exist, so it does not
        terminate if the similarity data cannot produce them.

        :return: list of dicts with keys group_size, group_similarity, group_members, avg_similarity.
        """
        # random.sample() requires a sequence (sets are rejected since Python 3.11).
        population = list(user_id_set)
        groups_list = list()
        for group_size in group_sizes_to_create:
            groups_size_list = list()
            while len(groups_size_list) < group_number_to_create:
                group = random.sample(population, 1)
                while len(group) < group_size:
                    new_member = SimilarGroupsGenerator.select_user_for_sim_group(group, similarity_matrix,
                                                                                  user_id_indexes,
                                                                                  sim_threshold=0.5)
                    if new_member is None:
                        break
                    group.append(new_member)
                if len(group) == group_size:
                    groups_size_list.append(
                        {
                            "group_size": group_size,
                            "group_similarity": 'similar',
                            "group_members": group,
                            "avg_similarity": GroupsGenerator.compute_average_similarity(group, user_id_indexes, similarity_matrix)
                        }
                    )
            groups_list.extend(groups_size_list)
            # Progress: total number of groups created so far
            print(len(groups_list))
        return groups_list


class DivergentGroupsGenerator(GroupsGenerator):
    """Builds groups in which every added member is dissimilar to someone already in the group."""

    @staticmethod
    def select_user_for_divergent_group(group, sim_matrix, user_id_indexes, sim_threshold=0.0):
        '''
        Helper for building divergent groups. Given the members selected so far, randomly pick
        one of the remaining users whose similarity to at least one current member is
        < sim_threshold.
        :param group: user ids already in the group.
        :param sim_matrix: user-user similarity matrix; row i belongs to user_id_indexes[i].
        :param user_id_indexes: array of user ids ordered like the rows of sim_matrix.
        :param sim_threshold: similarity below which a user becomes a candidate.
        :return: the selected user id, or None when there is no candidate.
        '''
        # Build the id list once rather than once per member.
        id_list = user_id_indexes.tolist()
        ids_to_select_from = set()
        for member in group:
            member_index = id_list.index(member)
            indexes = np.where(sim_matrix[member_index] < sim_threshold)[0].tolist()
            ids_to_select_from.update(user_id_indexes[index] for index in indexes)
        candidate_ids = ids_to_select_from.difference(group)
        if not candidate_ids:
            return None
        # random.sample() rejects sets since Python 3.11, hence the list().
        return random.sample(list(candidate_ids), 1)[0]

    def generateGroups(self, user_id_indexes, user_id_set, similarity_matrix, group_sizes_to_create, group_number_to_create):
        """Create ``group_number_to_create`` groups of divergent users for every requested size.

        A group starts from one random user and grows one dissimilar user at a time
        (threshold -0.1); groups that cannot reach the requested size are discarded.

        NOTE: the loop only stops once enough complete groups exist, so it does not
        terminate if the similarity data cannot produce them.

        :return: list of dicts with keys group_size, group_similarity, group_members, avg_similarity.
        """
        # random.sample() requires a sequence (sets are rejected since Python 3.11).
        population = list(user_id_set)
        groups_list = list()
        for group_size in group_sizes_to_create:
            groups_size_list = list()
            while len(groups_size_list) < group_number_to_create:
                group = random.sample(population, 1)
                while len(group) < group_size:
                    new_member = DivergentGroupsGenerator.select_user_for_divergent_group(group, similarity_matrix,
                                                                                          user_id_indexes,
                                                                                          sim_threshold=-0.1)
                    if new_member is None:
                        break
                    group.append(new_member)
                if len(group) == group_size:
                    groups_size_list.append(
                        {
                            "group_size": group_size,
                            "group_similarity": 'divergent',
                            "group_members": group,
                            "avg_similarity": GroupsGenerator.compute_average_similarity(group, user_id_indexes, similarity_matrix)
                        }
                    )
            groups_list.extend(groups_size_list)
            # Progress: total number of groups created so far
            print(len(groups_list))
        return groups_list


class MinorityGroupsGenerator(GroupsGenerator):
    """Builds groups of similar users plus one member who is dissimilar to someone in the group."""

    def generateGroups(self, user_id_indexes, user_id_set, similarity_matrix, group_sizes_to_create, group_number_to_create):
        """Create ``group_number_to_create`` "similar + one divergent" groups for every requested size.

        A group starts from one random user, grows with similar users (threshold 0.5) up to
        ``group_size - 1`` members, and then receives one dissimilar user (threshold -0.1).
        Groups that do not end with exactly ``group_size`` members are discarded.

        NOTE: the loop only stops once enough complete groups exist, so it does not
        terminate if the similarity data cannot produce them.

        :return: list of dicts with keys group_size, group_similarity, group_members, avg_similarity.
        """
        # random.sample() requires a sequence (sets are rejected since Python 3.11).
        population = list(user_id_set)
        groups_list = list()
        for group_size in group_sizes_to_create:
            groups_size_list = list()
            while len(groups_size_list) < group_number_to_create:
                group = random.sample(population, 1)
                # Similar core of the group
                while len(group) < (group_size - 1):
                    new_member = SimilarGroupsGenerator.select_user_for_sim_group(group, similarity_matrix,
                                                                                  user_id_indexes,
                                                                                  sim_threshold=0.5)
                    if new_member is None:
                        break
                    group.append(new_member)

                # The single divergent member
                dissimilar_member = DivergentGroupsGenerator.select_user_for_divergent_group(group, similarity_matrix,
                                                                                              user_id_indexes,
                                                                                              sim_threshold=-0.1)
                if dissimilar_member is not None:
                    group.append(dissimilar_member)
                if len(group) == group_size:
                    groups_size_list.append(
                        {
                            "group_size": group_size,
                            "group_similarity": 'similar_one_divergent',
                            "group_members": group,
                            "avg_similarity": GroupsGenerator.compute_average_similarity(group, user_id_indexes, similarity_matrix)
                        }
                    )
            groups_list.extend(groups_size_list)
            # Progress: total number of groups created so far
            print(len(groups_list))
        return groups_list

## Código nuestro

Información para crear grupos

In [None]:
# Parameters of the synthetic group generation; the trailing comments list
# the other values each setting can take.
# Sizes of the groups to build
group_sizes_to_create = [4]        # [2, 3, 4, 5, 6, 7, 8]
# Strategy name handed to GroupsGenerator.getGroupsGenerator
group_similarity_to_create = "RANDOM"  # ["RANDOM", "SIMILAR", "DIVERGENT", "SIMILAR_ONE_DIVERGENT"]
# Number of groups to build for each size
group_number = 50000

Creación de los grupos

In [None]:
# Extraccion de un sample para poder manejarlo
user_ratings = user_ratings.sample(5000)

# Informacion del dataset
user_matrix = user_ratings.pivot_table(columns='BGGId', index='Username', values='Rating')
user_id_set = set(user_ratings['Username'])
user_id_indexes = user_matrix.index.values
user_matrix = user_matrix.fillna(0)
numpy_array = user_matrix.to_numpy()
sim_matrix = np.corrcoef(numpy_array)

#Creacion del generador
grpGenerator = GroupsGenerator.getGroupsGenerator(group_similarity_to_create)
grpList = grpGenerator.generateGroups(user_id_indexes, user_id_set, sim_matrix, group_sizes_to_create, group_number)

#display(pd.DataFrame.from_records(grpList))
pd.DataFrame.from_records(grpList).to_csv('synthetic_groups.csv', index=False)

# Evaluación de baselines para 1 usuario

In [None]:
# Trabajaremos con un top 10
top_n = 10
test_set.head(top_n)

In [None]:
# Revisemos el tamaño del dataset para asegurarnos de que tiene un tamaño trabajable:
print("Tamaño del dataset completo:", user_ratings.shape)
print("Tamaño del dataset de entrenamiento:", train_set.shape)
print("Tamaño del dataset de prueba:", test_set.shape)

In [None]:
# Evaluamos UserKNN
myUserKnn = pyreclab.UserKnn(dataset='train.csv', dlmchar=b',', header=False, usercol=2, itemcol=0, ratingcol=1)
myUserKnn.train(k=7, similarity='pearson')
_, maeUK, rmseUK = myUserKnn.test(input_file = 'test.csv', dlmchar = b',', header = False, usercol = 2, itemcol = 0, ratingcol = 1)
_, mapUK, ndcgUK = myUserKnn.testrec(input_file="test.csv", dlmchar=b',', header=False, usercol=2, itemcol=0, ratingcol=1, topn=top_n)

print(f"mae = {maeUK} y rmse = {rmseUK}")
print(f"map = {mapUK} y ndcg = {ndcgUK}")

In [None]:
# Evaluamos ItemKNN
myItemKnn = pyreclab.ItemKnn(dataset='train.csv', dlmchar=b',', header=False, usercol=2, itemcol=0, ratingcol=1)
myItemKnn.train(k=7, similarity='pearson')
_, maeIK, rmseIK = myItemKnn.test(input_file = 'test.csv', dlmchar = b',', header = False, usercol = 2, itemcol = 0, ratingcol = 1)
_, mapIK, ndcgIK = myItemKnn.testrec(input_file="test.csv", dlmchar=b',', header=False, usercol=2, itemcol=0, ratingcol=1, topn=top_n)

print(f"mae = {maeIK} y rmse = {rmseIK}")
print(f"map = {mapIK} y ndcg = {ndcgIK}")

In [None]:
# Evaluamos SVD
mySVD = pyreclab.SVD(dataset='train.csv', dlmchar=b',', header=False, usercol=2, itemcol=0, ratingcol=1)
mySVD.train(factors=50, maxiter=80, lr=0.1, lamb=0.5)
_, maeSVD, rmseSVD = mySVD.test(input_file = 'test.csv', dlmchar = b',', header = False, usercol = 2, itemcol = 0, ratingcol = 1)
_, mapSVD, ndcgSVD = mySVD.testrec(input_file="test.csv", dlmchar=b',', header=False, usercol=2, itemcol=0, ratingcol=1, topn=top_n)

print(f"mae = {maeSVD} y rmse = {rmseSVD}")
print(f"map = {mapSVD} y ndcg = {ndcgSVD}")

In [None]:
# Evaluamos Most Popular
myMP = pyreclab.MostPopular(dataset='train.csv', dlmchar=b',', header=False, usercol=2, itemcol=0, ratingcol=1)
myMP.train(progress=False)
_, mapMP, ndcgMP = myMP.testrec(input_file="test.csv", dlmchar=b',', header=False, usercol=2, itemcol=0, ratingcol=1, topn=top_n)

print(f"map = {mapMP} y ndcg = {ndcgMP}")

In [None]:
# Evaluamos Random ratings
predictions = []

rating_scale = (1, 10)

for _, row in test_set.iterrows():
    itemId = row["BGGId"]; rating = row["Rating"]; userId = row["Username"]
    random_rating = random.uniform(rating_scale[0], rating_scale[1])
    predictions.append((userId, itemId, rating, random_rating, None))

accuracy.rmse(predictions)
accuracy.mae(predictions)


# Recomendación multimodal para un usuario

Setup del metadata

In [None]:
!pip install lightfm

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from PIL import Image
import requests
from io import BytesIO

In [None]:

df = train_set
# Numeric id for every mechanic column
# (assumes BGGId is the first column of mechanics_df — TODO confirm)
cols_with_id = {col: idx for idx, col in enumerate(mechanics_df.columns[1:])}

# Feature set: game id -> names of the mechanics flagged with 1 for that game.
# mechanics_df is indexed by BGGId once and each distinct game is looked up a
# single time, instead of scanning the whole table for every training row.
# When a game has several rows, the first one is used.
mechanics_by_game = mechanics_df.drop_duplicates(subset='BGGId').set_index('BGGId')

item_styles = {}
for bgg_id in train_set['BGGId'].unique().tolist():
    if bgg_id in mechanics_by_game.index:
        flags = mechanics_by_game.loc[bgg_id]
        item_styles[bgg_id] = flags.index[(flags == 1).to_numpy()].tolist()
    else:
        # Game without a mechanics row: keep it, with no features
        item_styles[bgg_id] = []

print(item_styles)



itemslist = df['BGGId'].unique()
userslist = df['Username'].unique()
stylelist = list(range(len(cols_with_id)))

In [None]:
#Conjunto de features, pero numericos

# Same feature set as item_styles, with every mechanic name replaced by its numeric id
item_styles_with_ids = {
    item_id: [cols_with_id[style] for style in styles]
    for item_id, styles in item_styles.items()
}
print(item_styles_with_ids)

In [None]:
# (user, item, rating) triples of the training set.
# itertuples() keeps each column's dtype (iterrows() would turn the integer
# ids into floats) and is faster.
interactions = list(df[['Username', 'BGGId', 'Rating']].itertuples(index=False, name=None))

In [None]:
# Register every user, item and mechanic name with the LightFM dataset
dataset = Dataset()
all_features = {
    feature
    for features in item_styles.values()
    for feature in features
}

dataset.fit(users=userslist, items=itemslist, item_features=all_features)

In [None]:
# Build the interaction and weight matrices from the (user, item, rating) triples
interactions_matrix, weights_matrix = dataset.build_interactions(interactions)

# Build the item feature matrix from the (item id, mechanic names) pairs
item_features = dataset.build_item_features(item_styles.items())
print(interactions_matrix)
# print(item_features)

In [None]:
# LightFM model with 30 components and the WARP loss, trained for 10 epochs.
# Only interactions_matrix is passed to fit(); weights_matrix is not used here.
model = LightFM(no_components=30, loss='warp')
model.fit(interactions_matrix, item_features=item_features, epochs=10, num_threads=4)

In [None]:
def recommend(model, dataset, user_ids, n_items=5):
    """Return and print the top ``n_items`` items for each user.

    Reads the notebook globals ``interactions_matrix`` (for the number of items)
    and ``item_features``.

    :param model: trained model exposing ``predict(user, item_ids, item_features=...)``.
    :param dataset: object whose ``mapping()[2]`` maps original item ids to item indices.
    :param user_ids: values passed unchanged to ``model.predict`` as the user
        (presumably internal user indices rather than original ids — verify against caller).
    :param n_items: length of each recommendation list.
    :return: dict mapping every entry of ``user_ids`` to its list of original item ids.
    """
    n_items_total = interactions_matrix.shape[1]
    item_ids = np.arange(n_items_total)
    # Item index -> original item id. It is the same for every user, so it is
    # built once instead of once per user.
    item_id_mapping = {index: original_id for original_id, index in dataset.mapping()[2].items()}
    recommendations_per_user = {}

    for user_id in user_ids:
        scores = model.predict(user_id, item_ids, item_features=item_features)
        # Highest score first
        top_items = item_ids[np.argsort(-scores)][:n_items]
        recommended_items = [item_id_mapping[item] for item in top_items]
        print(f"User {user_id} recommended items: {recommended_items}")
        recommendations_per_user[user_id] = recommended_items

    return recommendations_per_user
    

In [None]:
# Precision@5 and recall@5, averaged over users, measured on the same
# interactions the model was trained on
train_precision = precision_at_k(model, interactions_matrix, item_features=item_features, k=5).mean()
train_recall = recall_at_k(model, interactions_matrix, item_features=item_features, k=5).mean()

print(f'Train precision at k: {train_precision}')
print(f'Train Recall: {train_recall}')

In [None]:
# Top-1000 recommendations for three users; recommend() hands the values
# 1, 2 and 3 to model.predict() unchanged.
recommendations_group = recommend(model, dataset, [1, 2, 3], n_items=1000)
print(type(userslist))
print(recommendations_group)


---
---

# Ponderación para grupos de usuarios con recomendaciones de metadata

In [None]:
# Load the synthetic groups from synthetic_groups.csv and preview the first rows
groups_df = pd.read_csv('synthetic_groups.csv')
groups_df.head()

Código adaptado del repositorio mencionado anteriormente. En este caso tomamos los ítems que, en promedio, son más preferidos por los miembros para recomendarlos al grupo.

In [None]:
def recommend_for_group(model, dataset, group_members, n_items=5):
    """Recommend the items with the highest average predicted score across a group.

    Reads the notebook globals ``interactions_matrix`` (for the number of items)
    and ``item_features``.

    :param model: trained model exposing ``predict(user, item_ids, item_features=...)``.
    :param dataset: object whose ``mapping()`` gives the user mapping at position 0
        and the item mapping at position 2 (original id -> index).
    :param group_members: original user ids of the group members.
    :param n_items: length of the recommendation list.
    :return: list of original item ids, best first; empty when no member could be scored.
    """
    import warnings

    n_items_total = interactions_matrix.shape[1]
    item_ids = np.arange(n_items_total)
    user_mapping = dataset.mapping()[0]

    group_scores = []
    for user_id in group_members:
        try:
            internal_user_id = user_mapping[user_id]
            scores = model.predict(internal_user_id, item_ids, item_features=item_features)
        except KeyError:
            # Member unknown to the dataset mapping: leave them out of the average
            continue
        except Exception as exc:
            # Still best-effort (the member is skipped), but no longer silent
            warnings.warn(f"Skipping user {user_id!r}: could not compute scores ({exc!r})")
            continue
        group_scores.append(scores)

    if not group_scores:
        return []

    # Average score of every item over the members that could be scored
    average_scores = np.mean(group_scores, axis=0)
    top_items = item_ids[np.argsort(-average_scores)][:n_items]

    # Item index -> original item id
    item_mapping = dataset.mapping()[2]
    item_id_mapping = {v: k for k, v in item_mapping.items()}
    recommended_items = [item_id_mapping[item] for item in top_items]

    return recommended_items

# Build one top-10 recommendation list per synthetic group and save them all to CSV
group_recommendations = []
for _, row in groups_df.iterrows():
    # When group_members is a string (the text of a Python list), it is
    # evaluated to recover the list.
    # NOTE(review): eval() executes arbitrary code. This is only acceptable while
    # synthetic_groups.csv is produced by this notebook; use a safe parser if the
    # file can come from anywhere else.
    group_members = eval(row['group_members']) if isinstance(row['group_members'], str) else row['group_members']
    recommendations = recommend_for_group(model, dataset, group_members, n_items=10)
    group_recommendations.append({
        'group_members': group_members,
        'recommendations': recommendations
    })

recommendations_df = pd.DataFrame(group_recommendations)
recommendations_df.to_csv('group_recommendations.csv', index=False)