In [None]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install tensorflow-datasets tensorflow

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [None]:
# Load the MovieLens 1M ratings dataset; the result is indexed by split name.
data = tfds.load(name="movielens/1m-ratings")

# Convert the "train" split to a pandas DataFrame for the preprocessing below.
train_split = data["train"]
df = tfds.as_dataframe(train_split)

# Preview the first rows (DataFrame.head defaults to 5 rows).
df.head()

In [None]:
def _to_text(value):
    """Return `value` as text, decoding UTF-8 when it is a `bytes` object."""
    if isinstance(value, bytes):
        # errors="replace" keeps a stray undecodable byte from aborting the notebook.
        return value.decode("utf-8", errors="replace")
    return str(value)


# Order ratings chronologically, normalise column dtypes, then drop the timestamp
# (it is only needed for the ordering).
#
# String features may arrive as `bytes` (this is how `tfds.as_dataframe` yields
# them); casting bytes with `.astype(str)` would store the repr ("b'...'")
# instead of the text, so they are decoded explicitly before the cast.
filtered_data = (
    df.sort_values("timestamp")
    .assign(
        **{
            column: (lambda frame, column=column: frame[column].map(_to_text))
            for column in ("movie_title", "user_occupation_text", "user_zip_code")
        }
    )
    .astype(
        {
            "bucketized_user_age": int,
            "movie_id": int,
            "movie_title": str,
            "user_gender": int,
            "user_id": int,
            "user_occupation_label": int,
            "user_occupation_text": str,
            "user_rating": int,
            "user_zip_code": str,
        }
    )
    .drop(columns=["timestamp"])
)

In [None]:
# Temporal split: `filtered_data` is sorted by timestamp, so positional slices
# put the oldest ratings in train and the most recent ones in test.
#
# The boundaries are derived from the frame length (80% / 10% / 10%) instead of
# fixed row counts: cut-offs of 80_000 / 90_000 on a dataset of roughly one
# million ratings would leave about 91% of the rows in the test set.
#
# NOTE: the cold-start cell that follows rebinds `train`, `evaluation` and
# `test`; run only the split you actually want to use.
TRAIN_FRACTION = 0.8
EVAL_FRACTION = 0.1

n_rows = len(filtered_data)
train_end = int(n_rows * TRAIN_FRACTION)
eval_end = train_end + int(n_rows * EVAL_FRACTION)

train = filtered_data.iloc[:train_end]
evaluation = filtered_data.iloc[train_end:eval_end]
test = filtered_data.iloc[eval_end:]

In [None]:
# Cold-start split: partition the ratings by user id so that the evaluation and
# test sets contain only users that never appear in the training set.
user_ids = filtered_data["user_id"]

train = filtered_data[user_ids >= 2000]
evaluation = filtered_data[(user_ids >= 1000) & (user_ids < 2000)]
test = filtered_data[user_ids < 1000]

In [None]:
TARGET_COLUMN = "user_rating"


def _features_and_target(frame):
    """Split `frame` into (all columns except the rating, the rating column)."""
    return frame.drop(columns=[TARGET_COLUMN]), frame[TARGET_COLUMN]


X_train, y_train = _features_and_target(train)
X_eval, y_eval = _features_and_target(evaluation)
X_test, y_test = _features_and_target(test)

In [None]:
# Declarative description of the scalar input features (one value per rating):
#   "entity" - which side of the interaction the feature belongs to; used
#              later to decide which embeddings are summed together
#   "dtype"  - dtype of the Keras input; also selects the lookup layer below
features_config = {
    "user_id": {"entity": "user", "dtype": tf.int64},
    "bucketized_user_age": {"entity": "user", "dtype": tf.int64},
    "user_gender": {"entity": "user", "dtype": tf.int64},
    "user_occupation_label": {"entity": "user", "dtype": tf.int64},
    "movie_id": {"entity": "movie", "dtype": tf.int64},
    "user_zip_code": {"entity": "user", "dtype": tf.string},
    "user_occupation_text": {"entity": "user", "dtype": tf.string},
}

# Lookup layer that maps raw values of a given dtype to vocabulary indices.
LOOKUP_LAYER_BY_DTYPE = {
    tf.int64: tf.keras.layers.IntegerLookup,
    tf.string: tf.keras.layers.StringLookup,
}

for name, config in features_config.items():
    encoding_layer_class = LOOKUP_LAYER_BY_DTYPE.get(config["dtype"])
    if encoding_layer_class is None:
        # Fail with an actionable message instead of a bare, message-less Exception.
        raise ValueError(
            f"Unsupported dtype {config['dtype']!r} for feature {name!r}; "
            "expected tf.int64 or tf.string"
        )
    config["encoding_layer_class"] = encoding_layer_class

    # The vocabulary is taken from the training split only.
    config["vocab"] = train[name].unique()


# The four passes below are kept separate (inputs, lookups, embeddings, biases)
# so that layers are created in the same order for every feature group.

# One scalar Keras input per configured feature.
inputs = {}
for feature, spec in features_config.items():
    inputs[feature] = tf.keras.layers.Input(
        shape=(1,), name=feature, dtype=spec["dtype"]
    )

# Raw values -> vocabulary indices, using the lookup layer chosen for the dtype.
inputs_encoded = {}
for feature, spec in features_config.items():
    lookup_layer = spec["encoding_layer_class"](vocabulary=spec["vocab"])
    inputs_encoded[feature] = lookup_layer(inputs[feature])

# 32-dimensional embedding per feature. The table has one row more than the
# vocabulary (presumably for the lookup layer's out-of-vocabulary index).
embeddings = {}
for feature, spec in features_config.items():
    table_size = len(spec["vocab"]) + 1
    embedding_layer = tf.keras.layers.Embedding(input_dim=table_size, output_dim=32)
    embeddings[feature] = embedding_layer(inputs_encoded[feature])

# Scalar bias per feature value, implemented as a 1-dimensional embedding.
biases = {}
for feature, spec in features_config.items():
    table_size = len(spec["vocab"]) + 1
    bias_layer = tf.keras.layers.Embedding(input_dim=table_size, output_dim=1)
    biases[feature] = bias_layer(inputs_encoded[feature])

def _sum_entity_features(tensors_by_feature, entity, extra=()):
    """Sum the per-feature tensors belonging to `entity`, plus any `extra` tensors.

    Args:
        tensors_by_feature: mapping of feature name -> Keras tensor
            (e.g. `embeddings` or `biases`).
        entity: value of the "entity" key in `features_config` to select.
        extra: additional tensors to include in the sum.

    Returns:
        The output of a `tf.keras.layers.Add` layer over the selected tensors.
    """
    selected = [
        tensors_by_feature[name]
        for name, config in features_config.items()
        if config["entity"] == entity
    ]
    return tf.keras.layers.Add()(selected + list(extra))


user_embedding = _sum_entity_features(embeddings, "user")

# movie genres: a variable-length list of genre ids per rating. Each id is
# embedded and the embeddings are averaged into one vector per rating.
all_movie_genres = train["movie_genres"].explode().unique().astype(int)
# Genre ids are integers, so declare the input as int64 rather than relying on
# the default float dtype and a cast back to integers inside the lookup layer.
movie_genres_input = tf.keras.layers.Input(
    shape=(None,), name="movie_genres", dtype=tf.int64
)
movie_genres_as_integer = tf.keras.layers.IntegerLookup(vocabulary=all_movie_genres)(
    movie_genres_input
)
movie_genres_embeddings = tf.keras.layers.Embedding(
    input_dim=len(all_movie_genres) + 1, output_dim=32
)(movie_genres_as_integer)
movie_genres_biases = tf.keras.layers.Embedding(
    input_dim=len(all_movie_genres) + 1, output_dim=1
)(movie_genres_as_integer)
# keepdims=True keeps the pooled axis so the result can be added to the
# per-feature embeddings and biases.
movie_genres_embedding = tf.keras.layers.GlobalAveragePooling1D(keepdims=True)(
    movie_genres_embeddings
)
movie_genres_bias = tf.keras.layers.GlobalAveragePooling1D(keepdims=True)(
    movie_genres_biases
)

movie_embedding = _sum_entity_features(
    embeddings, "movie", extra=[movie_genres_embedding]
)

user_bias = _sum_entity_features(biases, "user")

movie_bias = _sum_entity_features(biases, "movie", extra=[movie_genres_bias])

def _to_rating_scale(scores):
    """Map unbounded scores onto the rating range via 4 * sigmoid(x) + 1."""
    return 4 * tf.nn.sigmoid(scores) + 1


# Predicted score = <user embedding, movie embedding> + user bias + movie bias,
# flattened and then squashed onto the rating scale.
dot = tf.keras.layers.Dot(axes=2)([user_embedding, movie_embedding])
add = tf.keras.layers.Add()([dot, user_bias, movie_bias])
flatten = tf.keras.layers.Flatten()(add)
squash = tf.keras.layers.Lambda(_to_rating_scale)(flatten)

# Scalar feature inputs in `features_config` order, followed by the genres input.
model_inputs = [inputs[name] for name in features_config] + [movie_genres_input]
model = tf.keras.Model(inputs=model_inputs, outputs=squash)

# No optimizer is passed, so Keras' default is used.
model.compile(loss="mse", metrics=[tf.keras.metrics.MeanAbsoluteError()])

In [None]:
def _as_model_inputs(frame):
    """Build the input-name -> values mapping the model expects from `frame`."""
    model_inputs = {name: frame[name].values for name in features_config.keys()}
    # Genre lists are passed as a ragged tensor (presumably because their
    # length varies from row to row).
    model_inputs["movie_genres"] = tf.ragged.constant(frame["movie_genres"].values)
    return model_inputs


# Stop as soon as the validation loss fails to improve for one epoch and roll
# back to the best weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=1, restore_best_weights=True
)

model.fit(
    x=_as_model_inputs(X_train),
    y=y_train.values,
    batch_size=256,
    epochs=100,
    validation_data=(_as_model_inputs(X_eval), y_eval.values),
    callbacks=[early_stopping],
)

In [None]:
# Evaluate on the held-out test split, using the same input layout as training.
test_inputs = {name: X_test[name].values for name in features_config}
test_inputs["movie_genres"] = tf.ragged.constant(X_test["movie_genres"].values)

model.evaluate(
    x=test_inputs,
    y=y_test.values,
    batch_size=1_000_000,
)