In [74]:
import os
import tempfile

%matplotlib inline
import matplotlib.pyplot as plt
import pprint
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

# Apply a white-grid theme to every matplotlib figure drawn in this notebook.
# NOTE(review): recent matplotlib releases renamed the bundled seaborn styles
# (e.g. to 'seaborn-v0_8-whitegrid') — confirm this name against the installed
# matplotlib version.
plt.style.use('seaborn-whitegrid')

In [78]:
# Load the "train" split of the MovieLens 100k ratings and of the movie list.
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

# Pretty-print the first raw record of each dataset (ratings first, then
# movies) to see which features are available.
for dataset in (ratings, movies):
    for example in dataset.take(1).as_numpy_iterator():
        pprint.pprint(example)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}
{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


In [79]:
def _keep_features(*names):
  """Build a `Dataset.map` function that keeps only the named features.

  Args:
    *names: Feature keys to copy from each example, in the order given.

  Returns:
    A function mapping an example dict to a new dict holding just `names`.
  """
  def select(example):
    return {name: example[name] for name in names}
  return select


# Reload the raw datasets here so that re-running this cell always starts from
# the full feature set rather than from an already-projected dataset.
ratings = tfds.load("movielens/100k-ratings", split="train").map(
    _keep_features("movie_title", "user_id", "timestamp", "movie_id"))
movies = tfds.load("movielens/100k-movies", split="train").map(
    _keep_features("movie_title", "movie_id"))

In [80]:
# Sanity-check the projection: show two records from each trimmed dataset,
# ratings first and movies second.
for dataset in (ratings, movies):
    for example in dataset.take(2).as_numpy_iterator():
        pprint.pprint(example)

{'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'timestamp': 879024327,
 'user_id': b'138'}
{'movie_id': b'709',
 'movie_title': b'Strictly Ballroom (1992)',
 'timestamp': 875654590,
 'user_id': b'92'}
{'movie_id': b'1681', 'movie_title': b'You So Crazy (1994)'}
{'movie_id': b'1457', 'movie_title': b'Love Is All There Is (1996)'}


In [81]:
def _collect_feature(dataset, feature, batch_size):
  """Materialise one feature of `dataset` as a single NumPy array.

  Args:
    dataset: A `tf.data.Dataset` whose elements are feature dictionaries.
    feature: Key of the feature to extract.
    batch_size: Number of elements pulled per batch while iterating.

  Returns:
    A NumPy array with the feature value of every element, in dataset order.
  """
  batches = dataset.map(lambda example: example[feature]).batch(batch_size)
  return np.concatenate(list(batches))


# Every rating timestamp, used below both for bucketing and for normalisation.
timestamps = _collect_feature(ratings, "timestamp", 100)

min_timestamp = timestamps.min()
max_timestamp = timestamps.max()

# 1000 evenly spaced boundaries spanning the observed timestamp range.
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

# Vocabularies for the string-lookup layers of the two towers.
unique_movie_titles = np.unique(_collect_feature(movies, "movie_title", 1000))
unique_movie_ids = np.unique(_collect_feature(movies, "movie_id", 1000))
unique_user_ids = np.unique(_collect_feature(ratings, "user_id", 1_000))

### Query model

In [82]:
class UserModel(tf.keras.Model):
  """Turns raw user features into one concatenated feature vector.

  The output concatenates, along axis 1:
    * a 32-dimensional embedding of the user id,
    * a 32-dimensional embedding of the bucketised timestamp,
    * the normalised timestamp as a single extra column.

  Relies on the notebook-level `unique_user_ids`, `timestamp_buckets` and
  `timestamps` arrays being defined before instantiation.
  """

  def __init__(self):
    super().__init__()
    preprocessing = tf.keras.layers.experimental.preprocessing

    # User id string -> vocabulary index -> embedding. The embedding table has
    # one row more than the vocabulary.
    user_lookup = preprocessing.StringLookup(
        vocabulary=unique_user_ids, mask_token=None)
    user_table = tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32)
    self.user_embedding = tf.keras.Sequential([user_lookup, user_table])

    # Timestamp -> bucket index -> embedding. The embedding table has one row
    # more than there are bucket boundaries.
    bucketizer = preprocessing.Discretization(timestamp_buckets.tolist())
    bucket_table = tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32)
    self.timestamp_embedding = tf.keras.Sequential([bucketizer, bucket_table])

    # Continuous view of the timestamp; statistics are learned from all
    # observed timestamps.
    self.normalized_timestamp = preprocessing.Normalization(axis=None)
    self.normalized_timestamp.adapt(timestamps)

  def call(self, inputs):
    """Embed a batch of user features.

    Args:
      inputs: Dictionary with "user_id" and "timestamp" entries.

    Returns:
      The three feature representations concatenated along axis 1.
    """
    raw_timestamp = inputs["timestamp"]

    user_vector = self.user_embedding(inputs["user_id"])
    bucket_vector = self.timestamp_embedding(raw_timestamp)
    # Reshape the normalised timestamp into a column so it can be concatenated
    # with the two embedding matrices.
    scaled_timestamp = tf.reshape(
        self.normalized_timestamp(raw_timestamp), (-1, 1))

    return tf.concat([user_vector, bucket_vector, scaled_timestamp], axis=1)

In [83]:
class QueryModel(tf.keras.Model):
  """Query tower: user feature embeddings followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Build the query tower.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units in the i-th
        dense layer. Every layer but the last uses ReLU, the last is linear.
    """
    super().__init__()

    # Raw user features are first turned into one concatenated vector.
    self.embedding_model = UserModel()

    # Slicing (rather than indexing) keeps an empty `layer_sizes` valid: it
    # simply produces an empty stack.
    hidden_sizes = layer_sizes[:-1]
    output_sizes = layer_sizes[-1:]

    stack = [
        tf.keras.layers.Dense(units, activation="relu")
        for units in hidden_sizes
    ]
    stack += [tf.keras.layers.Dense(units) for units in output_sizes]
    self.dense_layers = tf.keras.Sequential(stack)

  def call(self, inputs):
    """Encode a batch of user features.

    Args:
      inputs: Dictionary of user feature tensors, passed unchanged to the
        embedding model.

    Returns:
      The output of the dense stack applied to the user feature embedding.
    """
    return self.dense_layers(self.embedding_model(inputs))

### Candidate model

In [84]:
class MovieModel(tf.keras.Model):
  """Turns raw movie features into one concatenated feature vector.

  The output concatenates, along axis 1, three 32-dimensional representations:
    * an embedding of the whole title treated as a single token,
    * the average of the embeddings of the title's text tokens,
    * an embedding of the movie id.

  Relies on the notebook-level `unique_movie_titles`, `unique_movie_ids` and
  `movies` objects being defined before instantiation.
  """

  def __init__(self):
    super().__init__()
    preprocessing = tf.keras.layers.experimental.preprocessing

    # Upper bound on the text vocabulary, shared by the vectoriser and by the
    # token embedding table below.
    vocabulary_cap = 10_000

    # Whole title -> vocabulary index -> embedding.
    title_lookup = preprocessing.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None)
    title_table = tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32)
    self.title_embedding = tf.keras.Sequential([title_lookup, title_table])

    # Title text -> token ids -> token embeddings -> average over tokens.
    self.title_vectorizer = preprocessing.TextVectorization(
        max_tokens=vocabulary_cap)
    token_table = tf.keras.layers.Embedding(vocabulary_cap, 32, mask_zero=True)
    token_average = tf.keras.layers.GlobalAveragePooling1D()
    self.title_text_embedding = tf.keras.Sequential(
        [self.title_vectorizer, token_table, token_average])

    # Learn the text vocabulary from every title in the catalogue.
    self.title_vectorizer.adapt(movies.map(lambda x: x['movie_title']))

    # Movie id string -> vocabulary index -> embedding.
    id_lookup = preprocessing.StringLookup(
        vocabulary=unique_movie_ids, mask_token=None)
    id_table = tf.keras.layers.Embedding(len(unique_movie_ids) + 1, 32)
    self.id_embedding = tf.keras.Sequential([id_lookup, id_table])

  def call(self, titles):
    """Embed a batch of movie features.

    Args:
      titles: Dictionary with 'movie_title' and 'movie_id' entries.

    Returns:
      The three feature representations concatenated along axis 1.
    """
    title_text = titles['movie_title']

    title_vector = self.title_embedding(title_text)
    text_vector = self.title_text_embedding(title_text)
    id_vector = self.id_embedding(titles['movie_id'])

    return tf.concat([title_vector, text_vector, id_vector], axis=1)

In [85]:
class CandidateModel(tf.keras.Model):
  """Candidate tower: movie feature embeddings followed by dense layers."""

  def __init__(self, layer_sizes):
    """Build the candidate tower.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units in the i-th
        dense layer. Every layer but the last uses ReLU, the last is linear.
    """
    super().__init__()

    # Raw movie features are first turned into one concatenated vector.
    self.embedding_model = MovieModel()

    # Slicing (rather than indexing) keeps an empty `layer_sizes` valid: it
    # simply produces an empty stack.
    hidden_sizes = layer_sizes[:-1]
    output_sizes = layer_sizes[-1:]

    stack = [
        tf.keras.layers.Dense(units, activation="relu")
        for units in hidden_sizes
    ]
    stack += [tf.keras.layers.Dense(units) for units in output_sizes]
    self.dense_layers = tf.keras.Sequential(stack)

  def call(self, inputs):
    """Encode a batch of movie features.

    Args:
      inputs: Dictionary of movie feature tensors, passed unchanged to the
        embedding model.

    Returns:
      The output of the dense stack applied to the movie feature embedding.
    """
    return self.dense_layers(self.embedding_model(inputs))

### Combined model

In [86]:
# Stand-alone candidate tower, used to inspect the embeddings it produces
# before wiring everything into the full model.
layer_sizes = [32]
candidate_model = CandidateModel(layer_sizes)

# Embed the whole movie catalogue in batches of 128.
candidates = movies.batch(128).map(candidate_model)

# Retrieval task whose top-K metrics rank each query against those candidate
# embeddings.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=candidates))

In [87]:
# Inspect the first two batches of candidate embeddings: print the number of
# rows and the embedding width, then the batch itself.
for batch in candidates.take(2).as_numpy_iterator():
    num_rows, embedding_width = len(batch), len(batch[0])
    print(num_rows, embedding_width)
    print(batch)

128 32
[[-0.03409795 -0.03644416 -0.02098895 ... -0.01194747 -0.06696635
   0.00989223]
 [-0.06053327 -0.01656315 -0.00986068 ... -0.02516921 -0.04961792
  -0.02841071]
 [-0.00669995  0.01118299 -0.02884758 ...  0.00024492 -0.03789013
  -0.02046539]
 ...
 [ 0.03002698 -0.03659357  0.01598025 ...  0.00870787 -0.00054528
  -0.02680092]
 [-0.01308324 -0.02944615 -0.04190268 ... -0.09199509 -0.03737338
  -0.0405663 ]
 [ 0.00802164 -0.02482525 -0.02637029 ... -0.01610369  0.00195497
   0.01651701]]
128 32
[[ 0.04301016  0.02948082  0.00935837 ... -0.00646872  0.03375939
  -0.04576859]
 [ 0.025393    0.01938764  0.0187553  ...  0.02809391  0.00445712
  -0.00931548]
 [ 0.00193058 -0.03970129 -0.01796028 ...  0.04740249 -0.00832203
  -0.00067864]
 ...
 [ 0.01283692  0.01815203 -0.04665133 ... -0.03296039  0.03845068
   0.02637332]
 [ 0.02237841 -0.00752508  0.02284264 ... -0.01389819  0.02680524
  -0.02358806]
 [ 0.02205449  0.03353945 -0.00692357 ...  0.06080664  0.05270093
  -0.00709063]]


In [90]:
class MovielensModel(tfrs.models.Model):
  """Two-tower retrieval model pairing a query tower with a candidate tower."""

  def __init__(self, layer_sizes):
    """Build both towers and the retrieval task.

    Args:
      layer_sizes: List of dense-layer widths, shared by both towers.
    """
    super().__init__()
    self.query_model = QueryModel(layer_sizes)
    self.candidate_model = CandidateModel(layer_sizes)

    # Top-K metrics are computed against the whole movie catalogue, embedded
    # with this model's own candidate tower.
    catalogue_embeddings = movies.batch(128).map(self.candidate_model)
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(candidates=catalogue_embeddings),
    )

  def compute_loss(self, features, training=False):
    """Compute the retrieval loss for one batch of ratings.

    Args:
      features: Dictionary holding "user_id", "timestamp", "movie_id" and
        "movie_title" entries.
      training: Whether this is a training step. The top-K metrics are only
        computed when it is False.

    Returns:
      The value returned by the retrieval task for this batch.
    """
    # Each tower receives only its own features, so training inputs have the
    # same keys as the inputs used at query time. A mismatch in input
    # structure would cause an error when loading a saved tower.
    user_inputs = {
        name: features[name] for name in ("user_id", "timestamp")
    }
    movie_inputs = {
        name: features[name] for name in ("movie_id", "movie_title")
    }

    query_embeddings = self.query_model(user_inputs)
    movie_embeddings = self.candidate_model(movie_inputs)

    return self.task(
        query_embeddings, movie_embeddings, compute_metrics=not training)

In [91]:
tf.random.set_seed(42)

num_train_examples = 80_000
num_test_examples = 20_000

# One fixed shuffle of all ratings; `reshuffle_each_iteration=False` keeps the
# order stable, so the take/skip split below never overlaps.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(num_train_examples)
test = shuffled.skip(num_train_examples).take(num_test_examples)

# Note: despite its name, `cached_train` is not cached; only the test batches
# are.
cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [92]:
# Show the first two training batches to confirm the feature layout.
for batch in cached_train.take(2).as_numpy_iterator():
    print(batch)

{'movie_title': array([b'Godfather, The (1972)', b'Escape to Witch Mountain (1975)',
       b'Fargo (1996)', ..., b'Dangerous Minds (1995)',
       b'Clueless (1995)', b'Scream (1996)'], dtype=object), 'user_id': array([b'424', b'429', b'53', ..., b'766', b'125', b'478'], dtype=object), 'timestamp': array([880859493, 882386848, 879442537, ..., 891310875, 892836551,
       889388862]), 'movie_id': array([b'127', b'1133', b'100', ..., b'366', b'367', b'288'], dtype=object)}
{'movie_title': array([b'Vertigo (1958)', b"Singin' in the Rain (1952)",
       b'Saint, The (1997)', ..., b'Bound (1996)', b'Hoodlum (1997)',
       b'Bio-Dome (1996)'], dtype=object), 'user_id': array([b'326', b'370', b'287', ..., b'663', b'489', b'595'], dtype=object), 'timestamp': array([879875432, 879434666, 875333873, ..., 889492503, 891447522,
       886921977]), 'movie_id': array([b'479', b'705', b'748', ..., b'129', b'299', b'368'], dtype=object)}


In [93]:
num_epochs = 3

# One dense layer of 32 units in each tower.
model = MovielensModel([32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# `validation_freq=2` runs validation only every second epoch, so the history
# holds validation metrics for epoch 2 but not for the final (third) epoch.
one_layer_history = model.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=2,
    epochs=num_epochs,
    verbose=1)

# Report accuracy from an evaluation of the final weights. Reading the last
# validation entry of the history instead would (a) describe the model before
# its last training epoch and (b) raise KeyError whenever
# `num_epochs < validation_freq`, because no validation pass has run yet.
test_metrics = model.evaluate(cached_test, return_dict=True)

accuracy = test_metrics["factorized_top_k/top_100_categorical_accuracy"]
print(f"Top-100 accuracy: {accuracy:.2f}.")

test_metrics

Epoch 1/3












Epoch 2/3




Epoch 3/3
Top-100 accuracy: 0.23.


{'factorized_top_k/top_1_categorical_accuracy': 0.0009500000160187483,
 'factorized_top_k/top_5_categorical_accuracy': 0.008700000122189522,
 'factorized_top_k/top_10_categorical_accuracy': 0.02070000022649765,
 'factorized_top_k/top_50_categorical_accuracy': 0.12139999866485596,
 'factorized_top_k/top_100_categorical_accuracy': 0.23499999940395355,
 'loss': 27970.861328125,
 'regularization_loss': 0,
 'total_loss': 27970.861328125}