In [None]:
#!pip install tensorflow-datasets tensorflow

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [None]:
# Fetch the MovieLens 1M ratings through tensorflow-datasets, then turn the
# "train" split into a pandas DataFrame for the tabular work below.
data = tfds.load(name="movielens/1m-ratings")

df = tfds.as_dataframe(ds=data["train"])
df.head()  # head() defaults to the first five rows

In [None]:
# Keep only the identifier and rating columns, ordered chronologically.
# The timestamp is needed for the ordering only and is dropped before the
# remaining three columns are cast to integers.
filtered_data = (
    df[["timestamp", "user_id", "movie_id", "user_rating"]]
    .sort_values(by="timestamp")
    .drop(columns="timestamp")
    .astype(int)
)

# Chronological hold-out: the first 900,000 ratings train the models and every
# later rating is kept for testing.
train, test = filtered_data.iloc[:900_000], filtered_data.iloc[900_000:]

In [None]:
# Display the time-ordered ratings frame (columns: user_id, movie_id, user_rating).
filtered_data

In [None]:
# How many ratings of user 1 ended up in each side of the chronological split
# (train count on the first line, test count on the second).
print(
    *(len(part.query("user_id == 1")) for part in (train, test)),
    sep="\n",
)

In [None]:
# Separate the features (every column except the rating) from the target
# (the rating itself) for both splits.
X_train, y_train = train.drop(columns="user_rating"), train["user_rating"]
X_test, y_test = test.drop(columns="user_rating"), test["user_rating"]

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Tabular baseline: gradient-boosted trees fed the raw integer ids.
# fit() returns the estimator itself, so construction and training can be chained.
hgb = HistGradientBoostingRegressor(random_state=0).fit(X_train, y_train)

# Report the estimator's score() and the mean absolute error on the held-out
# ratings, in that order on one line.
hgb_score = hgb.score(X_test, y_test)
hgb_mae = mean_absolute_error(y_test, hgb.predict(X_test))
print(hgb_score, hgb_mae)

In [None]:
# Vocabularies for the lookup layers: the distinct user and movie ids that
# occur in the training split only.
all_users, all_movies = (
    train[column].unique() for column in ("user_id", "movie_id")
)

# First version

In [None]:
# First version: plain matrix factorisation. Each id is mapped to a 32-d
# vector and the prediction is the (unbounded) dot product of the two vectors.

# User tower: raw id -> vocabulary index -> embedding vector.
user_input = tf.keras.Input(shape=(1,), name="user")
user_as_integer = tf.keras.layers.IntegerLookup(vocabulary=all_users)(user_input)
user_embedding = tf.keras.layers.Embedding(
    # One row more than the vocabulary; presumably for the lookup layer's
    # out-of-vocabulary index -- TODO confirm against the IntegerLookup defaults.
    input_dim=len(all_users) + 1,
    output_dim=32,
)(user_as_integer)

# Movie tower: same recipe, with the movie vocabulary.
movie_input = tf.keras.Input(shape=(1,), name="movie")
movie_as_integer = tf.keras.layers.IntegerLookup(vocabulary=all_movies)(movie_input)
movie_embedding = tf.keras.layers.Embedding(
    input_dim=len(all_movies) + 1,
    output_dim=32,
)(movie_as_integer)

# Contract the embedding axis (axis 2), then flatten the result so the model
# emits one value per (user, movie) pair.
dot = tf.keras.layers.Dot(axes=2)([user_embedding, movie_embedding])
flatten = tf.keras.layers.Flatten()(dot)

model = tf.keras.Model(inputs=[user_input, movie_input], outputs=flatten)
model.compile(loss="mse", metrics=[tf.keras.metrics.MeanAbsoluteError()])

# Second version

In [None]:
# Second version: dot-product matrix factorisation whose raw score is squashed
# into the open interval (1, 5) via 4 * sigmoid(x) + 1.

# User tower: raw id -> vocabulary index -> 32-d embedding.
user_input = tf.keras.layers.Input(shape=(1,), name="user")
user_as_integer = tf.keras.layers.IntegerLookup(vocabulary=all_users)(user_input)
# One row more than the vocabulary; presumably for the lookup layer's
# out-of-vocabulary index -- TODO confirm against the IntegerLookup defaults.
user_embedding = tf.keras.layers.Embedding(input_dim=len(all_users) + 1, output_dim=32)(user_as_integer)

# Movie tower: same recipe, with the movie vocabulary.
movie_input = tf.keras.layers.Input(shape=(1,), name="movie")
movie_as_integer = tf.keras.layers.IntegerLookup(vocabulary=all_movies)(movie_input)
movie_embedding = tf.keras.layers.Embedding(input_dim=len(all_movies) + 1, output_dim=32)(movie_as_integer)

# Contract the embedding axis, then flatten to one score per (user, movie) pair.
dot = tf.keras.layers.Dot(axes=2)([user_embedding, movie_embedding])
flatten = tf.keras.layers.Flatten()(dot)

# 4 * sigmoid(x) + 1 built from stock layers rather than a Lambda around a
# Python lambda: Lambda layers are serialised as Python bytecode, which makes
# a saved model non-portable, whereas Activation and Rescaling round-trip
# through the normal layer config. Rescaling computes inputs * scale + offset.
sigmoid = tf.keras.layers.Activation("sigmoid")(flatten)
squash = tf.keras.layers.Rescaling(scale=4.0, offset=1.0)(sigmoid)

model = tf.keras.Model(inputs=[user_input, movie_input], outputs=squash)

model.compile(loss="mse", metrics=[tf.keras.metrics.MeanAbsoluteError()])

# Final version

In [None]:
# Final version: matrix factorisation with per-user and per-movie bias terms,
# squashed into the open interval (1, 5) via 4 * sigmoid(x) + 1.

# Seed the Python, NumPy and TensorFlow random generators so that the embedding
# initialisation (and any shuffling done by a subsequent fit) is repeatable,
# in the same spirit as random_state=0 on the gradient-boosting baseline.
tf.keras.utils.set_random_seed(0)

# User tower: raw id -> vocabulary index -> 32-d embedding and a scalar bias.
user_input = tf.keras.layers.Input(shape=(1,), name="user")
user_as_integer = tf.keras.layers.IntegerLookup(vocabulary=all_users, name="user_as_integer")(user_input)
# One row more than the vocabulary; presumably for the lookup layer's
# out-of-vocabulary index -- TODO confirm against the IntegerLookup defaults.
user_embedding = tf.keras.layers.Embedding(input_dim=len(all_users) + 1, output_dim=32, name="user_embedding")(user_as_integer)
user_bias = tf.keras.layers.Embedding(input_dim=len(all_users) + 1, output_dim=1, name="user_bias")(user_as_integer)

# Movie tower: same recipe, with the movie vocabulary.
movie_input = tf.keras.layers.Input(shape=(1,), name="movie")
movie_as_integer = tf.keras.layers.IntegerLookup(vocabulary=all_movies, name="movie_as_integer")(movie_input)
movie_embedding = tf.keras.layers.Embedding(input_dim=len(all_movies) + 1, output_dim=32, name="movie_embedding")(movie_as_integer)
movie_bias = tf.keras.layers.Embedding(input_dim=len(all_movies) + 1, output_dim=1, name="movie_bias")(movie_as_integer)

# Interaction term (dot product over the embedding axis) plus both biases,
# flattened to one raw score per (user, movie) pair.
dot = tf.keras.layers.Dot(axes=2, name="dot_product")([user_embedding, movie_embedding])
add = tf.keras.layers.Add(name="sum")([dot, user_bias, movie_bias])
flatten = tf.keras.layers.Flatten(name="flatten")(add)

# 4 * sigmoid(x) + 1 built from stock layers rather than a Lambda around a
# Python lambda: Lambda layers are serialised as Python bytecode, which makes
# a saved model non-portable, whereas Activation and Rescaling round-trip
# through the normal layer config. Rescaling computes inputs * scale + offset.
# The output layer keeps the name "squash".
sigmoid = tf.keras.layers.Activation("sigmoid", name="sigmoid")(flatten)
squash = tf.keras.layers.Rescaling(scale=4.0, offset=1.0, name="squash")(sigmoid)

model = tf.keras.Model(inputs=[user_input, movie_input], outputs=squash)

model.compile(loss="mse", metrics=[tf.keras.metrics.MeanAbsoluteError()])

In [None]:
# Draw the layer graph of `model`, annotating each layer with its tensor shapes.
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Inputs are keyed by the names given to the model's Input layers.
train_inputs = {"user": X_train["user_id"], "movie": X_train["movie_id"]}

# Stop as soon as the monitored validation metric fails to improve for one
# epoch and roll back to the best weights seen, so epochs=100 acts only as an
# upper bound.
stop_early = tf.keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True)

model.fit(
    x=train_inputs,
    y=y_train.to_numpy(),
    batch_size=256,
    epochs=100,
    validation_split=0.1,  # 10% of the training rows are held out for validation
    callbacks=[stop_early],
)

In [None]:
# Score the trained network on the held-out ratings; evaluate() reports the
# compiled loss (MSE) followed by the compiled metric (MAE).
test_inputs = {"user": X_test["user_id"], "movie": X_test["movie_id"]}
model.evaluate(x=test_inputs, y=y_test)

In [None]:
from sklearn.metrics import r2_score

# R^2 of the network on the held-out ratings. predict() output is flattened
# to 1-D so it lines up with the target series.
nn_predictions = model.predict(
    {"user": X_test["user_id"], "movie": X_test["movie_id"]}
).ravel()

r2_score(y_true=y_test, y_pred=nn_predictions)