##CF con deep learning

Predición del score para un usuario

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow as tf
from tensorflow.keras import layers, models


In [2]:
df = pd.read_csv('/content/DataAnimeFinaLTF.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Anime Title,scoreByUser,Action,Adventure,Avant Garde,Award Winning,Boys Love,Comedy,Drama,...,Supernatural,Suspense,Rating_G - All Ages,Rating_PG - Children,Rating_PG-13 - Teens 13 or older,Rating_R - 17+ (violence & profanity),Rating_R+ - Mild Nudity,Rating_Rx - Hentai,new_user_id,new_anime_id
0,23072147,High School DxD Hero,6,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,876736,Steamboy,10,1,1,0,0,0,0,1,...,0,0,0,0,1,0,0,0,1,1
2,4534530,Shinreigari,8,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,2,2
3,24041367,Re:␣Hamatora,8,1,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,3,3
4,1951700,Magi: The Labyrinth of Magic,8,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,4,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 32 columns):
 #   Column                                 Non-Null Count   Dtype 
---  ------                                 --------------   ----- 
 0   Unnamed: 0                             100000 non-null  int64 
 1   Anime Title                            100000 non-null  object
 2   scoreByUser                            100000 non-null  int64 
 3   Action                                 100000 non-null  int64 
 4   Adventure                              100000 non-null  int64 
 5   Avant Garde                            100000 non-null  int64 
 6   Award Winning                          100000 non-null  int64 
 7   Boys Love                              100000 non-null  int64 
 8   Comedy                                 100000 non-null  int64 
 9   Drama                                  100000 non-null  int64 
 10  Ecchi                                  100000 non-null  int64 
 11  E

In [5]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [20]:
anime_features = df.iloc[:, 3:-2].to_numpy()
anime_features

array([[0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [22]:
# Extraer new_user_id y new_anime_id (las dos últimas columnas)
X = df[["new_user_id", "new_anime_id"]]
y = df["scoreByUser"]


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Extraer las IDs para entrenamiento y prueba
user_train = X_train["new_user_id"].to_numpy().astype("int32")
anime_train = X_train["new_anime_id"].to_numpy().astype("int32")
user_test = X_test["new_user_id"].to_numpy().astype("int32")
anime_test = X_test["new_anime_id"].to_numpy().astype("int32")

# Obtener las características de anime para el entrenamiento y prueba
anime_features_train = anime_features[anime_train]
anime_features_test = anime_features[anime_test]

# Convertir los scores a float para entrenamiento y prueba
score_train = y_train.to_numpy().astype("float32")
score_test = y_test.to_numpy().astype("float32")

# Confirmar las dimensiones
print("Dimensiones del dataset:")
print(f"user_train: {user_train.shape}")
print(f"anime_train: {anime_train.shape}")
print(f"anime_features_train: {anime_features_train.shape}")
print(f"score_train: {score_train.shape}")


Dimensiones del dataset:
user_train: (80000,)
anime_train: (80000,)
anime_features_train: (80000, 26)
score_train: (80000,)


In [24]:
def create_model(num_users, num_animes, embedding_dim=32, feature_dim=anime_features.shape[1]):
    # Entrada para los IDs de los usuarios
    user_input = layers.Input(shape=(1,), name="user_input")

    # Entrada para los IDs de los animes
    anime_input = layers.Input(shape=(1,), name="anime_input")

    # Capa de embedding para los usuarios
    user_embedding = layers.Embedding(input_dim=num_users, output_dim=embedding_dim, name="user_embedding")(user_input)
    user_embedding = layers.Flatten()(user_embedding)

    # Capa de embedding para los animes
    anime_embedding = layers.Embedding(input_dim=num_animes, output_dim=embedding_dim, name="anime_embedding")(anime_input)
    anime_embedding = layers.Flatten()(anime_embedding)

    # Concatenamos las representaciones de los usuarios y los animes
    concatenated = layers.Concatenate()([user_embedding, anime_embedding])

    # Añadimos una capa densa para procesar la información combinada
    dense_layer = layers.Dense(128, activation='relu')(concatenated)

    # Añadimos las características del anime (como los géneros) a la red
    features_input = layers.Input(shape=(feature_dim,), name="features_input")
    merged = layers.Concatenate()([dense_layer, features_input])

    # Capa de salida: predecir la calificación (score)
    output = layers.Dense(1, activation='linear')(merged)

    # Crear el modelo
    model = models.Model(inputs=[user_input, anime_input, features_input], outputs=output)

    # Compilamos el modelo
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    return model

In [27]:
num_users = len(X['new_user_id'].unique())
num_animes = len(X['new_anime_id'].unique())

# Crear el modelo
model = create_model(num_users, num_animes, embedding_dim=32, feature_dim=anime_features.shape[1])

# Entrenar el modelo
model.fit(
    [user_train, anime_train, anime_features_train],  # Entradas: IDs de usuarios, IDs de animes, características del anime
    score_train,  # Salida: calificación
    epochs=10,  # Número de épocas
    batch_size=64,  # Tamaño de batch
    validation_data=([user_test, anime_test, anime_features_test], score_test)  # Datos de validación
)

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 24ms/step - loss: 17.3987 - mae: 3.0959 - val_loss: 2.3903 - val_mae: 1.1877
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 24ms/step - loss: 1.5755 - mae: 0.9384 - val_loss: 2.4271 - val_mae: 1.2097
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 24ms/step - loss: 0.9323 - mae: 0.7045 - val_loss: 2.6190 - val_mae: 1.2522
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 24ms/step - loss: 0.7401 - mae: 0.6202 - val_loss: 2.6050 - val_mae: 1.2450
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 24ms/step - loss: 0.6349 - mae: 0.5669 - val_loss: 2.7196 - val_mae: 1.2707
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 24ms/step - loss: 0.5453 - mae: 0.5233 - val_loss: 2.6946 - val_mae: 1.2659
Epoch 7/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x7a2861a62b60>

In [28]:
# Evaluar el modelo en el conjunto de test
loss, mae = model.evaluate([user_test, anime_test, anime_features_test], score_test)
print(f"Loss: {loss}, MAE: {mae}")

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 2.7346 - mae: 1.2659
Loss: 2.761171340942383, MAE: 1.2736656665802002


In [30]:
import numpy as np

# Calcular RMSE
rmse = np.sqrt(loss)
print(f"RMSE: {rmse}")

RMSE: 1.661677267384489


In [31]:
from sklearn.metrics import r2_score

# Calcular R²
r2 = r2_score(score_test, predicted_scores)
print(f"R²: {r2}")

R²: 0.008044540882110596


In [29]:
# Hacer predicciones (por ejemplo, para un conjunto de usuarios y animes específicos)
predicted_scores = model.predict([user_test, anime_test, anime_features_test])

# Mostrar las predicciones
print(predicted_scores)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[[5.7735176]
 [6.864357 ]
 [6.444844 ]
 ...
 [7.6722426]
 [8.335224 ]
 [7.6904616]]
