In [1]:
import os
# os.environ["KERAS_BACKEND"] = "jax" # TensorFlow
os.environ["KERAS_BACKEND"] = "tensorflow" # TensorFlow


import keras
from keras.saving import register_keras_serializable
import tensorflow as tf
import sqlite3
import pandas as pd
import pickle
import numpy as np

# Datos

In [2]:
conn = sqlite3.connect('sr_gaming/datos/metacritics.db')

df_scores = pd.read_sql("SELECT id_usuario, id_juego, score FROM interacciones WHERE score > 0", conn)
df_scores.head()

Unnamed: 0,id_usuario,id_juego,score
0,attack-of-the-fanboy,baldurs-gate-3,100
1,gamesradar,baldurs-gate-3,100
2,gamingtrend,baldurs-gate-3,100
3,ggrecon,baldurs-gate-3,100
4,gfinity,baldurs-gate-3,100


In [3]:
# diccionarios de traducción
# Para id de usuario se pasa a un num entero incluyendo todas las variables categorícas
# Para que funcionen en Red Neuronal de Tensor Flow
unique_user_id = df_scores["id_usuario"].unique().tolist()
user_id_2_id_mapping = {t: i for i, t in enumerate(unique_user_id)}
id_2_user_user_mapping = {user_id_2_id_mapping[k]: k for k in user_id_2_id_mapping}

unique_item_id = df_scores["id_juego"].unique().tolist()
item_id_2_id_mapping = {t: i for i, t in enumerate(unique_item_id)}
id_2_item_id_mapping = {item_id_2_id_mapping[k]: k for k in item_id_2_id_mapping}

mappings = {
    "user_id_2_id": user_id_2_id_mapping,
    "id_2_user_id": id_2_user_user_mapping,
    "item_id_2_id": item_id_2_id_mapping,
    "id_2_item_id": id_2_item_id_mapping,
}

with open("sr_gaming/datos/mappings_dt.pkl", "wb") as f:
    pickle.dump(mappings, f)


In [4]:
id_2_item_id_mapping

{0: 'baldurs-gate-3',
 1: 'bioshock',
 2: 'goldeneye-007',
 3: 'grand-theft-auto-iii',
 4: 'grand-theft-auto-iv',
 5: 'grand-theft-auto-v',
 6: 'half-life-2',
 7: 'halo-combat-evolved',
 8: 'mass-effect-2',
 9: 'metroid-prime',
 10: 'nfl-2k1',
 11: 'perfect-dark-2000',
 12: 'red-dead-redemption-2',
 13: 'resident-evil-4-2005',
 14: 'soulcalibur',
 15: 'super-mario-galaxy',
 16: 'super-mario-galaxy-2',
 17: 'super-mario-odyssey',
 18: 'tekken-3',
 19: 'the-legend-of-zelda-breath-of-the-wild',
 20: 'the-legend-of-zelda-ocarina-of-time',
 21: 'the-orange-box',
 22: 'tony-hawks-pro-skater-3',
 23: 'uncharted-2-among-thieves',
 24: 'baldurs-gate-ii-shadows-of-amn',
 25: 'elden-ring',
 26: 'gran-turismo',
 27: 'gran-turismo-3-a-spec',
 28: 'grand-theft-auto-double-pack',
 29: 'grand-theft-auto-san-andreas',
 30: 'grand-theft-auto-vice-city',
 31: 'half-life',
 32: 'halo-2',
 33: 'littlebigplanet',
 34: 'madden-nfl-2003',
 35: 'metal-gear-solid-2-sons-of-liberty',
 36: 'portal-companion-colle

In [5]:
df_scores["id_usuario"] = df_scores["id_usuario"].apply(lambda x: user_id_2_id_mapping[x])
df_scores["id_juego"] = df_scores["id_juego"].apply(lambda x: item_id_2_id_mapping[x])

df_scores.head()

Unnamed: 0,id_usuario,id_juego,score
0,0,0,100
1,1,0,100
2,2,0,100
3,3,0,100
4,4,0,100


In [6]:
# Entradas
dataset_inputs = {
    "user_id": df_scores['id_usuario'].to_numpy(dtype="float32"),
    'item_id': df_scores['id_juego'].to_numpy(dtype="float32")
}

# Salidas
#Normalizar para que los datos numéricos para que redes neuoranles funcionen
# y que no haya quilombo con los pesos
dataset_outputs = df_scores['score'].to_numpy()/ 100.0

dataset = tf.data.Dataset.from_tensor_slices((dataset_inputs, dataset_outputs))
dataset = dataset.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

cut = int(df_scores.shape[0] * 0.8)
train_dataset = dataset.take(cut).batch(256).cache()
test_dataset = dataset.skip(df_scores.shape[0] - cut).batch(256).cache()

In [7]:
dataset_outputs 

array([1.  , 1.  , 1.  , ..., 0.15, 0.1 , 0.1 ], shape=(344977,))

## Modelo

In [8]:
class RankingModel(keras.Model):
    """Crea un modelo de ranking

    Args:
      num_users: Number of entries in the user embedding table.
      num_items: Number of entries in the candidate embedding table.
      embedding_dimension: Output dimension for user and movie embedding tables.
    """

    def __init__(
        self,
        num_users,
        num_items,
        embedding_dimension=32,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Embedding de los usuarios
        self.user_embedding = keras.layers.Embedding(num_users, embedding_dimension)

        # Embedding de los items
        self.item_embedding = keras.layers.Embedding(num_items, embedding_dimension)

        # Predicciones
        self.scores = keras.Sequential( #Se puede mejorar la arquitectura
            [
                # Capas densas
                keras.layers.Dense(256, activation="relu"),
                keras.layers.Dense(64, activation="relu"),

                # Predicciones en la capa final
                keras.layers.Dense(1),
            ]
        )

    def call(self, inputs):
        user_id = inputs["user_id"]
        item_id = inputs["item_id"]

        user_embeddings = self.user_embedding(user_id) # Le mete los embeddings de usuario
        item_embeddings = self.item_embedding(item_id) # Le mete los embeddings de item

        #Concatenar horizontalmente como estructura de 2 
        # Sería lo mismo que hacer un filtrado colaborativo
        return self.scores(keras.ops.concatenate([user_embeddings, item_embeddings], axis=1))

In [9]:
@register_keras_serializable()
class RankingModel(keras.Model):
    def __init__(
        self,
        num_users,
        num_items,
        embedding_dimension=32,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # guardar estos parámetros para get_config
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dimension = embedding_dimension

        self.user_embedding = keras.layers.Embedding(
            self.num_users, self.embedding_dimension
        )
        self.item_embedding = keras.layers.Embedding(
            self.num_items, self.embedding_dimension
        )

        self.scores = keras.Sequential(
            [
                keras.layers.Dense(256, activation="relu"),
                keras.layers.Dense(64, activation="relu"),
                keras.layers.Dense(1),
            ]
        )

    def call(self, inputs):
        user_id = inputs["user_id"]
        item_id = inputs["item_id"]

        user_embeddings = self.user_embedding(user_id)
        item_embeddings = self.item_embedding(item_id)

        x = keras.ops.concatenate([user_embeddings, item_embeddings], axis=1)
        return self.scores(x)

    def get_config(self):
        # config base de keras.Model (name, trainable, etc.)
        config = super().get_config()
        # agregamos nuestros parámetros del constructor
        config.update(
            {
                "num_users": self.num_users,
                "num_items": self.num_items,
                "embedding_dimension": self.embedding_dimension,
            }
        )
        return config

    @classmethod
    def from_config(cls, config):
        # esto permite reconstruir correctamente desde el config
        return cls(**config)

## Entrenamiento y evaluación

In [10]:
ranking_model = RankingModel(len(unique_user_id) + 1, len(unique_item_id) + 1)
ranking_model.compile(
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.RootMeanSquaredError()],
    optimizer=keras.optimizers.Adagrad(learning_rate=0.1), #hiperparámetros que se podría optimizar
)

# Detiene el entrenamiento cuando no hay mejora en loss en tres épocas consecutivas
es_callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Graba el modelo cuando cambia val_root_mean_squared_error
#mc_callback = keras.callbacks.ModelCheckpoint(filepath="checkpoint.{val_root_mean_squared_error:.2f}.weights.h5", monitor="val_root_mean_squared_error", mode="min",  save_weights_only=True, save_best_only=True)
mc_callback = keras.callbacks.ModelCheckpoint(filepath="checkpoint.{val_root_mean_squared_error:.2f}.keras", monitor="val_root_mean_squared_error", mode="min",  save_weights_only=False, save_best_only=True)

history = ranking_model.fit(train_dataset,
                    validation_data=test_dataset,
                    validation_freq=1,
                    epochs=100,
                    batch_size=1024,
                    callbacks=[es_callback, mc_callback]
                    )

Epoch 1/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 0.0159 - root_mean_squared_error: 0.1260 - val_loss: 0.0287 - val_root_mean_squared_error: 0.1694
Epoch 2/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 0.0143 - root_mean_squared_error: 0.1197 - val_loss: 0.0276 - val_root_mean_squared_error: 0.1661
Epoch 3/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 0.0141 - root_mean_squared_error: 0.1189 - val_loss: 0.0272 - val_root_mean_squared_error: 0.1649
Epoch 4/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - loss: 0.0140 - root_mean_squared_error: 0.1185 - val_loss: 0.0269 - val_root_mean_squared_error: 0.1639
Epoch 5/100
[1m1079/1079[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 0.0139 - root_mean_squared_error: 0.1180 - val_loss: 0.0265 - val_root_mean_squared_error: 0.1627
Epoch 6/100
[1m1079/10

## Predicción

In [11]:
user_id = 42
item_ids = [241, 141, 131, 122, 2232]
predictions = ranking_model.predict({
    "user_id": keras.ops.array([user_id] * len(item_ids)),
    "item_id": keras.ops.array(item_ids),
})

predictions = keras.ops.convert_to_numpy(keras.ops.squeeze(predictions, axis=1))

for id_juego, prediction in zip(item_ids, predictions):
    print(f"{id_2_item_id_mapping[id_juego]}: {prediction * 100.0:,.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
resident-evil-remake: 92.48
resident-evil-4: 90.36
forza-horizon-4: 91.77
story-of-seasons-grand-bazaar: 81.77
outer-wilds-echoes-of-the-eye: 86.02


Predecir todos los items para un ususario

In [12]:
keras.ops.array([user_id] * len(item_ids))
keras.ops.array(item_ids)

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 241,  141,  131,  122, 2232], dtype=int32)>

In [13]:
user_id = 42

# Obtiene los juegos que el usuario no ha puntuado
df_item_ids = pd.read_sql("SELECT id_juego FROM juegos WHERE id_juego NOT IN (SELECT id_juego FROM interacciones WHERE id_usuario = ?)", conn, params=[user_id])
item_ids = [item_id_2_id_mapping[u] for u in df_item_ids["id_juego"].to_list()]

predictions = ranking_model.predict({
    "user_id": keras.ops.array([user_id] * len(item_ids)),
    "item_id": keras.ops.array(item_ids),
}).squeeze()

predictions

[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


array([0.6045486 , 0.7489085 , 0.6949247 , ..., 0.76386225, 0.69388723,
       0.7850012 ], shape=(13672,), dtype=float32)

In [14]:
item_ids_internal

NameError: name 'item_ids_internal' is not defined

In [None]:
user_id = 42

# Juegos NO vistos por el usuario
df_item_ids = pd.read_sql(
    """
    SELECT id_juego 
    FROM juegos 
    WHERE id_juego NOT IN (
        SELECT id_juego 
        FROM interacciones 
        WHERE id_usuario = ?
    )
    """, 
    conn, 
    params=[user_id]
)

item_ids_raw = df_item_ids["id_juego"].tolist()

item_ids_internal = [item_id_2_id_mapping[j] for j in item_ids_raw]


preds = ranking_model.predict({
    "user_id": keras.ops.array([user_id] * len(item_ids_internal)),
    "item_id": keras.ops.array(item_ids_internal),
}).squeeze()



df_preds = pd.DataFrame({
    "item_id_raw": item_ids_raw,               # ID real del juego
    "item_id_int": item_ids_internal,          # ID interno
    "score": preds                       # score del modelo
})

df_juegos = df_preds.sort_values("score", ascending=False)

id_juegos = [i for i in df_juegos["item_id_raw"]]

id_juegos


[1m428/428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


['the-legend-of-zelda-breath-of-the-wild',
 'grand-theft-auto-iv',
 'the-elder-scrolls-iv-oblivion',
 'the-legend-of-zelda-tears-of-the-kingdom',
 'neverwinter-nights',
 'uncharted-2-among-thieves',
 'the-last-of-us-part-ii',
 'red-dead-redemption',
 'mass-effect-2',
 'grand-theft-auto-v',
 'the-elder-scrolls-v-skyrim',
 'baldurs-gate-3',
 'final-fantasy-vii-remake',
 'elden-ring',
 'uncharted-4-a-thiefs-end',
 'super-mario-galaxy-2',
 'god-of-war-ragnarok',
 'castlevania-dawn-of-sorrow',
 'astro-bot',
 'god-of-war',
 'bioshock',
 'super-smash-bros-ultimate',
 'bloodborne',
 'red-dead-redemption-2',
 'divinity-original-sin-ii',
 'forza-motorsport-4',
 'littlebigplanet',
 'the-sims-2',
 'sid-meiers-civilization-v',
 'project-gotham-racing-2',
 'super-mario-bros-wonder',
 'tom-clancys-splinter-cell-pandora-tomorrow',
 'dark-souls-ii',
 'super-mario-galaxy',
 'madden-nfl-2004',
 'gears-of-war-3',
 'goodbye-volcano-high',
 'othello',
 'diablo-iii',
 'god-of-war-chains-of-olympus',
 'unchar

In [None]:
[(i,j) for i,j in zip(df_juegos["item_id_raw"], df_juegos["score"])]

[('the-legend-of-zelda-breath-of-the-wild', 0.8989982008934021),
 ('grand-theft-auto-iv', 0.8752790689468384),
 ('the-elder-scrolls-iv-oblivion', 0.8674384951591492),
 ('the-legend-of-zelda-tears-of-the-kingdom', 0.8666650056838989),
 ('neverwinter-nights', 0.8645586371421814),
 ('uncharted-2-among-thieves', 0.8611711859703064),
 ('the-last-of-us-part-ii', 0.8600700497627258),
 ('red-dead-redemption', 0.8578047156333923),
 ('mass-effect-2', 0.8545241951942444),
 ('grand-theft-auto-v', 0.852694571018219),
 ('the-elder-scrolls-v-skyrim', 0.8481279611587524),
 ('baldurs-gate-3', 0.8458714485168457),
 ('final-fantasy-vii-remake', 0.8437809944152832),
 ('elden-ring', 0.8429677486419678),
 ('uncharted-4-a-thiefs-end', 0.8423599004745483),
 ('super-mario-galaxy-2', 0.8418518900871277),
 ('god-of-war-ragnarok', 0.8400197625160217),
 ('castlevania-dawn-of-sorrow', 0.8385034799575806),
 ('astro-bot', 0.8378661274909973),
 ('god-of-war', 0.8350633382797241),
 ('bioshock', 0.8346384167671204),
 ('