In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs




In [3]:
# Load the 'train' split of two MovieLens 100K datasets, in this order:
#   ratings - one record per rating event
#   movies  - one record per available movie
ratings, movies = (
    tfds.load(dataset_name, split='train')
    for dataset_name in ('movielens/100k-ratings', 'movielens/100k-movies')
)

In [4]:
# Pretty-print a single raw rating record to see which features it carries.
for sample_rating in ratings.take(count=1).as_numpy_iterator():
    pprint.pprint(sample_rating)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7], dtype=int64),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


In [5]:
# Pretty-print a single raw movie record to see which features it carries.
for sample_movie in movies.take(count=1).as_numpy_iterator():
    pprint.pprint(sample_movie)

{'movie_genres': array([4], dtype=int64),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


In [6]:
def _select_retrieval_features(rating):
    """Keep only the movie title and user id of a rating record."""
    return {
        "movie_title": rating["movie_title"],
        "user_id": rating["user_id"],
    }


# Reduce both datasets to the features used by the retrieval model.
# NOTE: both names are rebound in place; after this cell `movies` yields bare
# titles, so reload the datasets before re-running this cell.
ratings = ratings.map(_select_retrieval_features)
movies = movies.map(lambda movie: movie["movie_title"])

In [7]:
# Seed both the global TensorFlow RNG and the shuffle itself, and keep the
# shuffled order fixed across iterations so that take/skip below slice the
# same ordering: first 80,000 ratings for training, next 20,000 for testing.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

train = shuffled.take(count=80_000)
test = shuffled.skip(count=80_000).take(count=20_000)

In [8]:
# Collect the unique user ids and movie titles present in the data.
# The model maps each raw categorical value to an embedding vector, which
# requires a vocabulary that assigns every raw value an integer in a
# contiguous range; that integer is the row looked up in the embedding table.

# Movie-title vocabulary, built from the movies dataset.
movie_titles = movies.batch(1_000)
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))

# User-id vocabulary, built from every rating.
user_ids = ratings.batch(1_000_000).map(lambda rating: rating['user_id'])
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [9]:
# Display the first ten entries of the movie-title vocabulary.
unique_movie_titles[:10]

array([b"'Til There Was You (1997)", b'1-900 (1994)',
       b'101 Dalmatians (1996)', b'12 Angry Men (1957)', b'187 (1997)',
       b'2 Days in the Valley (1996)',
       b'20,000 Leagues Under the Sea (1954)',
       b'2001: A Space Odyssey (1968)',
       b'3 Ninjas: High Noon At Mega Mountain (1998)',
       b'39 Steps, The (1935)'], dtype=object)

# Implementing a model

- Choosing the architecture of our model is a key part of modelling
- Build each tower separately and then combine them in the final model

## The query tower


- Decide on the dimensionality of the query and candidate representations
    - Higher values - More accurate, slower to fit and more prone to overfitting

In [10]:
# Output width of the Embedding layers in both the user and the movie tower.
embedding_dimension = 32

- Define the model itself
    - Use Keras preprocessing layers
        - Convert user ids to integers
        - Convert those to user embeddings via an Embedding layer

In [11]:
# Query tower: raw user id string -> integer index -> embedding vector.
user_model = tf.keras.Sequential()
user_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None)
)
# One additional embedding row is reserved for unknown (out-of-vocabulary) ids.
user_model.add(
    tf.keras.layers.Embedding(
        input_dim=len(unique_user_ids) + 1,
        output_dim=embedding_dimension,
    )
)







## The candidate tower

- Same process

In [12]:
# Candidate tower: raw movie title string -> integer index -> embedding vector.
movie_model = tf.keras.Sequential()
movie_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None)
)
# One additional embedding row is reserved for unknown (out-of-vocabulary) titles.
movie_model.add(
    tf.keras.layers.Embedding(
        input_dim=len(unique_movie_titles) + 1,
        output_dim=embedding_dimension,
    )
)

## Metrics

- Positive (user, movie) pairs

- Compare the affinity score:
    - The model calculates for this pair
    - The scores of all the other possible candidates
        - Score for the positive pair is higher than for all other candidates
        - Model is highly accurate

- `tfrs.metrics.FactorizedTopK` computes metrics across the top K candidates surfaced by a retrieval model

In [13]:
# Embed every movie title, in batches of 128, with the candidate tower; these
# embeddings form the candidate set that the top-K metrics rank against.
candidate_embeddings = movies.batch(128).map(movie_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)







## Loss

- TFRS has several loss layers and tasks to make this easy
- Use Retrieval task object:
    - A convenience wrapper
    - Bundles together the loss function and metric computation

- Task itself is a Keras layer
    - Takes the query and candidate embeddings as arguments
    - Returns the computed loss

In [14]:
# Retrieval task layer configured with the top-K metrics defined above.
task = tfrs.tasks.Retrieval(metrics=metrics)

## The full model

In [15]:
class MovielensModel(tfrs.Model):
    """Two-tower retrieval model combining a user tower and a movie tower.

    The user tower embeds `user_id` features, the movie tower embeds
    `movie_title` features, and the task layer turns the two batches of
    embeddings into a loss (and metrics).
    """

    def __init__(self, user_model, movie_model, retrieval_task=None):
        """Store the two towers and the task layer.

        Args:
            user_model: Keras model mapping raw user ids to embeddings.
            movie_model: Keras model mapping raw movie titles to embeddings.
            retrieval_task: Optional task layer used to compute the loss.
                When omitted, the notebook-level `task` object is used, which
                preserves the original two-argument behaviour.
        """
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        # Prefer an explicitly injected task so the model does not depend on
        # hidden notebook state; fall back to the global for existing callers.
        self.task: tf.keras.layers.Layer = (
            task if retrieval_task is None else retrieval_task
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training = False) -> tf.Tensor:
        """Compute the retrieval loss for one batch of features.

        Args:
            features: Mapping that must contain the keys 'user_id' and
                'movie_title'.
            training: Accepted for interface compatibility; not used here.

        Returns:
            The value returned by the task layer for this batch.
        """
        # Pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features['user_id'])
        # Pick out the movie features and pass them into the movie model,
        # getting the embeddings of the positive (actually rated) movies back.
        positive_movie_embeddings = self.movie_model(features['movie_title'])
        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_movie_embeddings)

- tfrs.Model base class
    - Convenience class
    - Compute both training and test losses using the same method
(It is still a plain Keras model: the same functionality can be achieved by inheriting from `tf.keras.Model`
and overriding the `train_step` and `test_step` functions.)

## Fitting and evaluating

In [16]:
# Instantiate the model from the two towers and compile it with Adagrad.
model = MovielensModel(user_model=user_model, movie_model=movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [17]:
# Training split: shuffle, batch, then cache the resulting batches.
# Test split: batch and cache only.
cached_train = train.shuffle(buffer_size=100_000).batch(batch_size=8192).cache()
cached_test = test.batch(batch_size=4096).cache()

In [18]:
# Train for three passes over the cached training batches.
model.fit(x=cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1552f7ec340>

In [19]:
# Evaluate on the held-out batches; return the metrics as a name -> value dict.
model.evaluate(x=cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.000699999975040555,
 'factorized_top_k/top_5_categorical_accuracy': 0.008449999615550041,
 'factorized_top_k/top_10_categorical_accuracy': 0.020899999886751175,
 'factorized_top_k/top_50_categorical_accuracy': 0.12264999747276306,
 'factorized_top_k/top_100_categorical_accuracy': 0.23399999737739563,
 'loss': 28248.1484375,
 'regularization_loss': 0,
 'total_loss': 28248.1484375}

- Test performance is worse than training performance
    - The model performs better on data that it has seen
    (Overfitting is strong when models have many parameters)
    (Can be mitigated by model regularization and by using user and movie features that help the model generalize better to unseen data)
    - The model is re-recommending some of users' already watched movies
    (Known-positive watches can crowd out test movies out of top K recommendations)
    (Tackled by excluding previously seen movies from test recommendations)

## Making predictions

In [20]:
# Build a brute-force top-K layer whose query side is the trained user tower,
# so it takes raw query features (user ids) as input.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Index the entire movies dataset as (title, embedding) pairs, 100 at a time,
# so recommendations are drawn from every available movie.
title_batches = movies.batch(100)
embedding_batches = title_batches.map(model.movie_model)
index.index_from_dataset(
    tf.data.Dataset.zip((title_batches, embedding_batches))
)

# Get recommendations for one user and show the top three titles.
_, titles = index(tf.constant(['42']))
print(f'Recommendations for user 42: {titles[0, :3]}')

Recommendations for user 42: [b"Kid in King Arthur's Court, A (1995)"
 b'Homeward Bound: The Incredible Journey (1993)' b'101 Dalmatians (1996)']


## Model serving

- Serving has 2 components:
    - A serving query model, taking in features of the query and transforming them into a 
    query embedding, and
    - A serving candidate model.
        - Takes the form of an approximate nearest neighbours (ANN) index
        - Allows fast approximate lookup of candidates in response to a query produced
        by the query model

- Both components can be packaged into a single exportable model
    - Takes the raw user id
    - Returns the titles of top movies for that user
        - Done via exporting the model to a SavedModel format
        - Makes it possible to serve using TensorFlow Serving

In [21]:
# Round-trip the BruteForce layer through the SavedModel format inside a
# temporary directory that is removed when the `with` block exits.
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'model')

    # Write the index to disk.
    tf.saved_model.save(obj=index, export_dir=path)

    # Read it back; TensorFlow Serving can load the same artifact.
    loaded = tf.saved_model.load(export_dir=path)

    # Feed a raw user id in and get the top predicted movie titles back.
    scores, titles = loaded(['42'])

    print(f'Recommendations: {titles[0][:3]}')









INFO:tensorflow:Assets written to: C:\Users\huynn5\AppData\Local\Temp\tmpqnxxxdg9\model\assets


INFO:tensorflow:Assets written to: C:\Users\huynn5\AppData\Local\Temp\tmpqnxxxdg9\model\assets


Recommendations: [b"Kid in King Arthur's Court, A (1995)"
 b'Homeward Bound: The Incredible Journey (1993)' b'101 Dalmatians (1996)']


- Export an approximate retrieval index to speed up predictions
    - Possible to efficiently surface recommendations from sets of tens of millions of candidates
- Use the scann package
    - Optional dependency of TFRS that must be installed separately (for example with `%pip install -q scann`); this notebook contains no install cell, and the ScaNN example below is left commented out

In [28]:
# scann_index = tfrs.layers.factorized_top_k.ScaNN(model.user_model)
# scann_index.index_from_dataset(
#     tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
# )
# # Get recommendations.
# _, titles = scann_index(tf.constant(["42"]))
# print(f"Recommendations for user 42: {titles[0, :3]}")
# # Export the query model.
# with tempfile.TemporaryDirectory() as tmp:
#   path = os.path.join(tmp, "model")

#   # Save the index.
#   tf.saved_model.save(
#       scann_index,
#       path,
#       options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])
#   )

#   # Load it back; can also be done in TensorFlow Serving.
#   loaded = tf.saved_model.load(path)

#   # Pass a user id in, get top predicted movie titles back.
#   scores, titles = loaded(["42"])

#   print(f"Recommendations: {titles[0][:3]}")

- Item-to-Item recommendation
    - Created user-movie model
    - Common to perform item-to-item recommendations
    - Same pattern but different training data
    - Have 2 item towers (for the query and candidate item), Train the model using (query item, candidate item) pairs
        - Constructed from clicks on product detail pages

- Next steps
    - Learning multi-task models: jointly optimizing for ratings and clicks
    - Using movie metadata: build a more complex movie model to alleviate cold-start