In [None]:
! pip install -qU keras-rs

In [1]:
import os
# os.environ["KERAS_BACKEND"] = "jax" # TensorFlow
os.environ["KERAS_BACKEND"] = "tensorflow" # TensorFlow

import keras_rs
import keras
import tensorflow as tf
import sqlite3
import pandas as pd

# Datos

In [2]:
conn = sqlite3.connect('sr_gaming/datos/metacritics.db')


df_scores = pd.read_sql("SELECT id_usuario, id_juego, score FROM interacciones WHERE score > 0", conn)
df_scores.head()

Unnamed: 0,id_usuario,id_juego,score
0,attack-of-the-fanboy,baldurs-gate-3,100
1,gamesradar,baldurs-gate-3,100
2,gamingtrend,baldurs-gate-3,100
3,ggrecon,baldurs-gate-3,100
4,gfinity,baldurs-gate-3,100


In [3]:
unique_user_id = df_scores["id_usuario"].unique().tolist()
user_id_2_id_mapping = {t: i for i, t in enumerate(unique_user_id)}
id_2_user_user_mapping = {user_id_2_id_mapping[k]: k for k in user_id_2_id_mapping}

unique_item_id = df_scores["id_juego"].unique().tolist()
item_id_2_id_mapping = {t: i for i, t in enumerate(unique_item_id)}
id_2_item_id_mapping = {item_id_2_id_mapping[k]: k for k in item_id_2_id_mapping}

In [4]:
df_scores["id_usuario"] = df_scores["id_usuario"].apply(lambda x: user_id_2_id_mapping[x])
df_scores["id_juego"] = df_scores["id_juego"].apply(lambda x: item_id_2_id_mapping[x])

df_scores.head()

Unnamed: 0,id_usuario,id_juego,score
0,0,0,100
1,1,0,100
2,2,0,100
3,3,0,100
4,4,0,100


# Recuperación

In [5]:
dataset_inputs = df_scores['id_usuario'].to_numpy(dtype="float32")
dataset_outputs = {
    'item_id': df_scores['id_juego'].to_numpy(dtype="float32"),
    'score': df_scores['score'].to_numpy()/ 100.0
}

dataset = tf.data.Dataset.from_tensor_slices((dataset_inputs, dataset_outputs))
dataset = dataset.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

cut = int(df_scores.shape[0] * 0.8)
train_dataset = dataset.take(cut).batch(1024).cache()
test_dataset = dataset.skip(df_scores.shape[0] - cut).batch(1024).cache()

## Modelo

In [None]:
class RetrievalModel(keras.Model):
    """Modelo de recuperación.

    Su objetivo NO es predecir un score exacto, sino:

    Dado un usuario → encontrar los K ítems más relevantes
    usando similitud entre embeddings.

    Args:
      num_users: Número de usuarios para el embedding.
      num_candidates: Número de candidatos para el embedding..
      embedding_dimension: Dimensión del embedding.
    """

    def __init__(
        self,
        num_users,
        num_candidates,
        embedding_dimension=32, #hiperparámetro a optimizar dependiendo la cantidad de items
        k=10, #devuelve los K más similares
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Torre de consulta (users), sólo un embedding
        self.user_embedding = keras.layers.Embedding(num_users, embedding_dimension)

        # Torre de candidatos (items), sólo un embedding
        self.candidate_embedding = keras.layers.Embedding(num_candidates, embedding_dimension)

        # La capa que hace la recuperación
        # Tomas los embedi
        self.retrieval = keras_rs.layers.BruteForceRetrieval(k=k, return_scores=False)

        self.loss_fn = keras.losses.MeanSquaredError()

    def build(self, input_shape):
        self.user_embedding.build(input_shape)
        self.candidate_embedding.build(input_shape)
        # En este caso, los candidatos son directamente los embeddings del item.
        # Reusamos la variable para ahorrarnos pasos.
        self.retrieval.candidate_embeddings = self.candidate_embedding.embeddings
        self.retrieval.build(input_shape)
        super().build(input_shape)

    def call(self, inputs, training=False):
        user_embeddings = self.user_embedding(inputs)
        result = { "user_embeddings": user_embeddings }

        if not training:
            # Durante el entrenamiento no necesito calcular predicciones
            result["predictions"] = self.retrieval(user_embeddings)
        return result

    def compute_loss(self, x, y, y_pred, sample_weight, training=True):
        candidate_id = y["item_id"]
        score = y["score"]
        user_embeddings = y_pred["user_embeddings"]
        candidate_embeddings = self.candidate_embedding(candidate_id)

        labels = keras.ops.expand_dims(score, -1)

        # Calcula el score de afinidad multiplicando los dos embeddings
        scores = keras.ops.sum(
            keras.ops.multiply(user_embeddings, candidate_embeddings),
            axis=1,
            keepdims=True,
        )
        return self.loss_fn(labels, scores, sample_weight)

## Entrenamiento y evaluación

In [7]:
retrieval_model = RetrievalModel(len(unique_user_id) + 1, len(unique_item_id) + 1)
retrieval_model.compile(optimizer=keras.optimizers.Adagrad(learning_rate=0.1))

In [8]:
history = retrieval_model.fit(train_dataset, validation_data=test_dataset, validation_freq=5, epochs=10, batch_size=256)

Epoch 1/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.6038
Epoch 2/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6037
Epoch 3/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6037
Epoch 4/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6037
Epoch 5/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.6036 - val_loss: 0.5351
Epoch 6/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6036
Epoch 7/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6035
Epoch 8/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6035
Epoch 9/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6034
Epoch 10/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1

## Predicción

In [None]:
id_usuario = 42
predictions = retrieval_model.predict(keras.ops.convert_to_tensor([id_usuario]))
predictions = keras.ops.convert_to_numpy(predictions["predictions"])

print(f"Juegos recomendados para el lector {id_usuario}:")
for id in predictions[0]:
    print(id, id_2_item_id_mapping[id])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Libros recomendados para el lector 42:
13025 singstar-ultimate-party
13535 bomberman-act-zero
11873 death-by-cube
9864 stretch-panic
6573 watch-dogs-legion-bloodline
9883 powerstar-golf
8959 shattered-tale-of-the-forgotten-king
11007 violett
9967 bloo-kid-2
4116 warhammer-40000-rogue-trader


# Ranking

In [14]:
dataset_inputs = {
    "user_id": df_scores['id_usuario'].to_numpy(dtype="float32"),
    'item_id': df_scores['id_juego'].to_numpy(dtype="float32")
}

dataset_outputs = df_scores['score'].to_numpy()/ 100.0

dataset = tf.data.Dataset.from_tensor_slices((dataset_inputs, dataset_outputs))
dataset = dataset.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

cut = int(df_scores.shape[0] * 0.8)
train_dataset = dataset.take(cut).batch(1024).cache()
test_dataset = dataset.skip(df_scores.shape[0] - cut).batch(1024).cache()

## Modelo

In [15]:
class RankingModel(keras.Model):
    """Crea un modelo de ranking

    Args:
      num_users: Number of entries in the user embedding table.
      num_items: Number of entries in the candidate embedding table.
      embedding_dimension: Output dimension for user and movie embedding tables.
    """

    def __init__(
        self,
        num_users,
        num_items,
        embedding_dimension=32,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Embedding de los usuarios
        self.user_embedding = keras.layers.Embedding(num_users, embedding_dimension)

        # Embedding de los items
        self.item_embedding = keras.layers.Embedding(num_items, embedding_dimension)

        # Predicciones
        self.scores = keras.Sequential(
            [
                # Capas densas
                keras.layers.Dense(256, activation="relu"),
                keras.layers.Dense(64, activation="relu"),

                # Predicciones en la capa final
                keras.layers.Dense(1),
            ]
        )

    def call(self, inputs):
        user_id, movie_id = inputs["user_id"], inputs["item_id"]
        user_embeddings = self.user_embedding(user_id)
        item_embeddings = self.item_embedding(movie_id)
        return self.scores(keras.ops.concatenate([user_embeddings, item_embeddings], axis=1))

## Entrenamiento y evaluación

In [16]:
ranking_model = RankingModel(len(unique_user_id) + 1, len(unique_item_id) + 1)
ranking_model.compile(
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.RootMeanSquaredError()],
    optimizer=keras.optimizers.Adagrad(learning_rate=0.1),
)

In [17]:
history = ranking_model.fit(train_dataset, validation_data=test_dataset, validation_freq=5, epochs=10, batch_size=512)

Epoch 1/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0188 - root_mean_squared_error: 0.1373
Epoch 2/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0150 - root_mean_squared_error: 0.1226
Epoch 3/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.0148 - root_mean_squared_error: 0.1218
Epoch 4/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.0147 - root_mean_squared_error: 0.1212
Epoch 5/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - loss: 0.0146 - root_mean_squared_error: 0.1207 - val_loss: 0.0289 - val_root_mean_squared_error: 0.1700
Epoch 6/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0145 - root_mean_squared_error: 0.1203
Epoch 7/10
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0144 - root_mean_squared_error: 0.12

## Predicción

In [20]:
user_id = 42
item_ids = [1,7,9,122,3425]
predictions = ranking_model.predict({
    "user_id": keras.ops.array([user_id] * len(item_ids)),
    "item_id": keras.ops.array(item_ids),
})

predictions = keras.ops.convert_to_numpy(keras.ops.squeeze(predictions, axis=1))

for id_juego, prediction in zip(item_ids, predictions):
    print(f"{id_2_item_id_mapping[id_juego]}: {prediction *100:,.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
bioshock: 67.51
halo-combat-evolved: 66.90
metroid-prime: 68.72
story-of-seasons-grand-bazaar: 66.89
sesame-street-once-upon-a-monster: 67.15
