In [None]:
!pip install -q tensorflow-recommenders

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m92.2/96.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import pprint
import tempfile
from typing import Dict, List, Text
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [None]:
# Load the "train" split of the MovieLens 100k ratings and movies datasets.
ratings = tfds.load(name="movielens/100k-ratings", split="train")
movies = tfds.load(name="movielens/100k-movies", split="train")

Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-ratings/0.1.1.incompleteVB75CS/movielens-train.tfrecord*...…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.
Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /root/tensorflow_datasets/movielens/100k-movies/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-movies/0.1.1.incomplete1CTPW1/movielens-train.tfrecord*...:…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-movies/0.1.1. Subsequent calls will reuse this data.


In [None]:
# Pretty-print a single raw rating record to see which features it carries.
for rating_example in ratings.take(1).as_numpy_iterator():
    pprint.pprint(rating_example)


{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


In [None]:
# Pretty-print a single raw movie record to see which features it carries.
for movie_example in movies.take(1).as_numpy_iterator():
    pprint.pprint(movie_example)

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


In [None]:
def _keep_model_features(rating):
    """Reduce a rating record to the two features the model consumes."""
    return {
        "movie_title": rating["movie_title"],
        "user_id": rating["user_id"],
    }


# NOTE: both names are rebound to their reduced datasets, so the raw records
# are no longer reachable through `ratings` / `movies` after this cell.
ratings = ratings.map(_keep_model_features)
movies = movies.map(lambda movie: movie["movie_title"])

In [None]:
# Seed TensorFlow and shuffle once with a fixed seed; with
# reshuffle_each_iteration=False the order is the same on every pass, so the
# take/skip split below always selects the same examples.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 80,000 shuffled ratings for training, the following 20,000 for testing.
train = shuffled.take(count=80_000)
test = shuffled.skip(count=80_000).take(count=20_000)


In [None]:
def _unique_values(batched_dataset):
    """Concatenate every batch of a dataset and return its sorted unique values."""
    return np.unique(np.concatenate(list(batched_dataset)))


# Batch the datasets so they can be materialised in a handful of large chunks.
movie_titles = movies.batch(batch_size=1_000)
user_ids = ratings.batch(batch_size=1_000_000).map(lambda rating: rating["user_id"])

# Vocabularies used by the StringLookup layers of the two towers.
unique_movie_titles = _unique_values(movie_titles)
unique_user_ids = _unique_values(user_ids)

In [None]:
# Show the first ten entries of each vocabulary as a sanity check.
for vocabulary in (unique_movie_titles, unique_user_ids):
    pprint.pprint(vocabulary[:10])

array([b"'Til There Was You (1997)", b'1-900 (1994)',
       b'101 Dalmatians (1996)', b'12 Angry Men (1957)', b'187 (1997)',
       b'2 Days in the Valley (1996)',
       b'20,000 Leagues Under the Sea (1954)',
       b'2001: A Space Odyssey (1968)',
       b'3 Ninjas: High Noon At Mega Mountain (1998)',
       b'39 Steps, The (1935)'], dtype=object)
array([b'1', b'10', b'100', b'101', b'102', b'103', b'104', b'105',
       b'106', b'107'], dtype=object)


In [None]:
# Width of the embedding vectors; used as the output size of the Embedding
# layer in both the user tower and the movie tower.
embedding_dimension = 32

In [None]:
# User tower: raw user-id string -> integer index -> embedding vector.
# The embedding table has one row more than the vocabulary; presumably this
# extra row serves StringLookup's out-of-vocabulary index (confirm against the
# StringLookup defaults).
user_model = tf.keras.Sequential(layers=[
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
    tf.keras.layers.Embedding(
        input_dim=len(unique_user_ids) + 1,
        output_dim=embedding_dimension,
    ),
])

In [None]:
# Movie tower: raw movie-title string -> integer index -> embedding vector.
# The embedding table has one row more than the vocabulary; presumably this
# extra row serves StringLookup's out-of-vocabulary index (confirm against the
# StringLookup defaults).
movie_model = tf.keras.Sequential(layers=[
    tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
    tf.keras.layers.Embedding(
        input_dim=len(unique_movie_titles) + 1,
        output_dim=embedding_dimension,
    ),
])

In [None]:
# Top-K metric whose candidate set is every movie title, embedded by the movie
# tower in batches of 128.
metrics = tfrs.metrics.FactorizedTopK(candidates=movies.batch(batch_size=128).map(movie_model))

In [None]:
# Retrieval task that bundles the loss computation with the top-K metric above.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [None]:
class MovieLensModel(tfrs.Model):
    """Two-tower MovieLens retrieval model built on ``tfrs.Model``.

    Holds a user tower, a movie tower and a retrieval task, and defines the
    loss as the task applied to the embeddings produced by the two towers.
    """

    def __init__(self, user_model, movie_model):
        """Store the towers and the retrieval task.

        Args:
            user_model: Model applied to ``features["user_id"]``.
            movie_model: Model applied to ``features["movie_title"]``.
        """
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        # `task` is not a constructor argument: it is resolved from the
        # enclosing (notebook-global) scope and must exist before instantiation.
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False):
        """Return the retrieval task's loss for one batch of features.

        Args:
            features: Mapping that provides the ``"user_id"`` and
                ``"movie_title"`` entries.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The value produced by calling the task on the user embeddings and
            the embeddings of the movies those users rated.
        """
        return self.task(
            self.user_model(features["user_id"]),
            self.movie_model(features["movie_title"]),
        )


In [None]:
class NoBaseClassMovieLensModel(tf.keras.Model):
    """MovieLens retrieval model written directly against ``tf.keras.Model``.

    Implements ``train_step`` and ``test_step`` by hand instead of inheriting
    them from ``tfrs.Model``. Both steps report the current result of every
    tracked metric plus the ``loss``, ``regularization_loss`` and
    ``total_loss`` of the batch.
    """

    def __init__(self, user_model, movie_model):
        """Store the towers and the retrieval task.

        Args:
            user_model: Model applied to ``features["user_id"]``.
            movie_model: Model applied to ``features["movie_title"]``.
        """
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        # `task` is not a constructor argument: it is resolved from the
        # enclosing (notebook-global) scope and must exist before instantiation.
        self.task: tf.keras.layers.Layer = task

    def _compute_losses(self, features: Dict[Text, tf.Tensor]):
        """Run both towers and return (loss, regularization_loss, total_loss)."""
        user_embeddings = self.user_model(features["user_id"])
        positive_movie_embeddings = self.movie_model(features["movie_title"])
        loss = self.task(user_embeddings, positive_movie_embeddings)

        # `sum` of an empty list is the plain int 0, so this also works when
        # no layer registers a regularization loss.
        regularization_loss = sum(self.losses)
        return loss, regularization_loss, loss + regularization_loss

    def _metric_results(self, loss, regularization_loss, total_loss) -> Dict[Text, tf.Tensor]:
        """Collect the current metric results together with the loss values."""
        results = {metric.name: metric.result() for metric in self.metrics}
        results["loss"] = loss
        results["regularization_loss"] = regularization_loss
        results["total_loss"] = total_loss
        return results

    def train_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
        """Run one optimisation step on a batch and return the metric dict.

        Args:
            features: Mapping that provides the ``"user_id"`` and
                ``"movie_title"`` entries.

        Returns:
            Dict of metric name to value, including the three loss entries.
        """
        # The forward pass and the regularization sum both happen under the
        # tape so that gradients of the total loss can be taken.
        with tf.GradientTape() as tape:
            loss, regularization_loss, total_loss = self._compute_losses(features)

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return self._metric_results(loss, regularization_loss, total_loss)

    def test_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
        """Evaluate one batch without updating weights and return the metric dict.

        Args:
            features: Mapping that provides the ``"user_id"`` and
                ``"movie_title"`` entries.

        Returns:
            Dict of metric name to value, including the three loss entries.
        """
        loss, regularization_loss, total_loss = self._compute_losses(features)
        return self._metric_results(loss, regularization_loss, total_loss)



In [None]:
# Assemble the two-tower model and train it with Adagrad at learning rate 0.1.
model = MovieLensModel(user_model=user_model, movie_model=movie_model)
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

In [None]:
# Shuffle, batch and cache the training split; batch and cache the test split.
cached_train = (
    train
    .shuffle(buffer_size=100_000)
    .batch(batch_size=8192)
    .cache()
)
cached_test = test.batch(batch_size=4096).cache()

In [None]:
# Train for three passes over the cached training batches.
model.fit(x=cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7bd1b45651b0>

In [None]:
# Evaluate on the held-out batches; return_dict=True yields a name -> value mapping.
model.evaluate(x=cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0010000000474974513,
 'factorized_top_k/top_5_categorical_accuracy': 0.009050000458955765,
 'factorized_top_k/top_10_categorical_accuracy': 0.020749999210238457,
 'factorized_top_k/top_50_categorical_accuracy': 0.12075000256299973,
 'factorized_top_k/top_100_categorical_accuracy': 0.23520000278949738,
 'loss': 28233.435546875,
 'regularization_loss': 0,
 'total_loss': 28233.435546875}

In [None]:
# Brute-force top-K layer that embeds queries with the trained user tower.
index = tfrs.layers.factorized_top_k.BruteForce(query_model=model.user_model)

# Index (title batch, embedding batch) pairs: the raw titles act as the
# identifiers returned alongside each candidate embedding.
index.index_from_dataset(
    tf.data.Dataset.zip((
        movies.batch(100),
        movies.batch(100).map(model.movie_model),
    ))
)

# Query the index for user "42" and show the titles it returns.
_, titles = index(tf.constant(["42"]))
print('for 42', titles)

for 42 tf.Tensor(
[[b'Rudy (1993)' b'101 Dalmatians (1996)'
  b"Kid in King Arthur's Court, A (1995)"
  b'Father of the Bride Part II (1995)' b'Grumpier Old Men (1995)'
  b'Sabrina (1995)' b'Fried Green Tomatoes (1991)' b'Michael (1996)'
  b'Cool Runnings (1993)' b'Christmas Carol, A (1938)']], shape=(1, 10), dtype=string)


In [None]:
# Round-trip the index through the SavedModel format and query the reloaded
# copy, to confirm that the exported model serves recommendations on its own.
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "model")
    tf.saved_model.save(index, path)
    loaded = tf.saved_model.load(path)

    # Print the titles returned by the *reloaded* model rather than the
    # `titles` variable left over from querying the in-memory index.
    _scores, loaded_titles = loaded(["42"])
    print(f"Recommendations: {loaded_titles[0][:3]}")



Recommendations: [b'Rudy (1993)' b'101 Dalmatians (1996)'
 b"Kid in King Arthur's Court, A (1995)"]


In [None]:
# Same indexing recipe as the brute-force index, but with the ScaNN layer.
# NOTE(review): this layer presumably needs the optional `scann` package to be
# installed in the kernel's environment - confirm before running.
scann_index = tfrs.layers.factorized_top_k.ScaNN(query_model=model.user_model)
scann_index.index_from_dataset(
    tf.data.Dataset.zip((
        movies.batch(100),
        movies.batch(100).map(model.movie_model),
    ))
)