In [1]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann

[K     |████████████████████████████████| 89 kB 3.6 MB/s 
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
[K     |████████████████████████████████| 10.4 MB 5.3 MB/s 
[K     |████████████████████████████████| 578.0 MB 14 kB/s 
[K     |████████████████████████████████| 438 kB 53.2 MB/s 
[K     |████████████████████████████████| 5.9 MB 44.1 MB/s 
[K     |████████████████████████████████| 1.7 MB 50.2 MB/s 
[?25h

Import the packages 

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

For this demo, we will use TensorFlow Recommenders (TFRS) to build a simple retrieval model.

In [3]:
import tensorflow_recommenders as tfrs

We will use the MovieLens dataset from TensorFlow Datasets. Loading movielens/100k-ratings yields a tf.data.Dataset object containing the ratings data, and loading movielens/100k-movies yields a tf.data.Dataset object containing only the movies data.

Note that since the MovieLens dataset does not have predefined splits, all data are under the train split.

In [4]:
# Rating records (one entry per user/movie rating).
ratings = tfds.load(name="movielens/100k-ratings", split="train")
# Feature records for every available movie.
movies = tfds.load(name="movielens/100k-movies", split="train")

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-ratings/0.1.1.incomplete0CGKH2/movielens-train.tfrecord*...…

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.[0m
[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /root/tensorflow_datasets/movielens/100k-movies/0.1.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-movies/0.1.1.incomplete37YMLV/movielens-train.tfrecord*...:…

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-movies/0.1.1. Subsequent calls will reuse this data.[0m


The ratings data yields dictionaries containing the movie ID and the corresponding rating, as well as other information that we will not use.

In [5]:
# Show one ratings record, converted to NumPy values for readable printing.
for rating_record in ratings.take(1).as_numpy_iterator():
  pprint.pprint(rating_record)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


The movies data yields the movie ID, movie title, and genres.

In [6]:
# Show one movies record, converted to NumPy values for readable printing.
for movie_record in movies.take(1).as_numpy_iterator():
  pprint.pprint(movie_record)

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


In this demo, we're going to focus on the ratings data.

We keep only the user_id and movie_title fields in the dataset.

In [7]:
# Reduce each rating to the two features used below: the movie title and the
# id of the user who rated it.
ratings = ratings.map(
    lambda record: {
        "movie_title": record["movie_title"],
        "user_id": record["user_id"],
    }
)
# Reduce each movie record to its bare title.
movies = movies.map(lambda record: record["movie_title"])

Splitting the data into 80% training and 20% test

In [8]:
# Fix the seeds so the shuffle (and therefore the split) is repeatable.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000, seed=42, reshuffle_each_iteration=False
)

# First 80,000 shuffled ratings for training, the following 20,000 for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

Let's get the unique user IDs and movie titles present in the data. These vocabularies let us map each categorical value to an integer index, which the model then turns into a numerical vector through a process called embedding.

In [9]:
# Batch the datasets so their contents can be pulled into NumPy in a few chunks.
movie_titles = movies.batch(batch_size=1_000)
user_ids = ratings.batch(batch_size=1_000_000).map(
    lambda rating_batch: rating_batch["user_id"]
)

# Flatten the batches and de-duplicate to obtain the two vocabularies.
unique_movie_titles = np.unique(
    np.concatenate([title_batch for title_batch in movie_titles])
)
unique_user_ids = np.unique(
    np.concatenate([id_batch for id_batch in user_ids])
)

# Display the first few titles as a sanity check.
unique_movie_titles[:10]

array([b"'Til There Was You (1997)", b'1-900 (1994)',
       b'101 Dalmatians (1996)', b'12 Angry Men (1957)', b'187 (1997)',
       b'2 Days in the Valley (1996)',
       b'20,000 Leagues Under the Sea (1954)',
       b'2001: A Space Odyssey (1968)',
       b'3 Ninjas: High Noon At Mega Mountain (1998)',
       b'39 Steps, The (1935)'], dtype=object)

Now let's build our model

In [10]:
# Output width of the Embedding layers in both the user and the movie model.
embedding_dimension = 32

User model

In [11]:
user_model = tf.keras.Sequential(layers=[
  # Look up each raw user-id string in the vocabulary of known user ids.
  tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
  # The table has one row more than the vocabulary to account for unknown
  # tokens.
  tf.keras.layers.Embedding(
      input_dim=len(unique_user_ids) + 1,
      output_dim=embedding_dimension,
  ),
])

Movie model

In [12]:
movie_model = tf.keras.Sequential(layers=[
  # Look up each raw movie-title string in the vocabulary of known titles.
  tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
  # The table has one row more than the vocabulary, mirroring the user model.
  tf.keras.layers.Embedding(
      input_dim=len(unique_movie_titles) + 1,
      output_dim=embedding_dimension,
  ),
])

METRICS - how good is our model?

In [13]:
# Top-K metric whose candidate set is every movie title, embedded by the
# movie model in batches of 128.
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies.batch(batch_size=128).map(map_func=movie_model)
)

We will now use TensorFlow Recommenders to compute our loss 

In [14]:
# Retrieval task wired to the metric object defined in the `metrics` variable.
task = tfrs.tasks.Retrieval(metrics=metrics)

Now let's build the full model

In [15]:
class MovielensModel(tfrs.Model):
  """Retrieval model that pairs a user model with a movie model.

  The retrieval task is read from the notebook-level `task` variable, so that
  variable must be defined before an instance is created.
  """

  def __init__(self, user_model, movie_model):
    """Stores the two sub-models and the retrieval task.

    Args:
      user_model: Model applied to `features["user_id"]`.
      movie_model: Model applied to `features["movie_title"]`.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    # Picked up from the enclosing notebook scope, not from an argument.
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embeds one batch of (user, movie) pairs and returns the task's result.

    Args:
      features: Mapping that provides the "user_id" and "movie_title" entries.
      training: Accepted for interface compatibility; not used in the body.

    Returns:
      Whatever `self.task` returns for the two embedding batches.
    """
    # User side first, then the movies those users rated.
    query_embeddings = self.user_model(features["user_id"])
    candidate_embeddings = self.movie_model(features["movie_title"])

    # The task turns the paired embeddings into the loss (and metrics).
    return self.task(query_embeddings, candidate_embeddings)

In [16]:
class NoBaseClassMovielensModel(tf.keras.Model):
  """Retrieval model with hand-written `train_step` and `test_step`.

  Subclasses `tf.keras.Model` directly and spells out the training and
  evaluation steps itself. The retrieval task is read from the notebook-level
  `task` variable, so that variable must be defined before an instance is
  created.
  """

  def __init__(self, user_model, movie_model):
    """Stores the two sub-models and the retrieval task.

    Args:
      user_model: Model applied to `features["user_id"]`.
      movie_model: Model applied to `features["movie_title"]`.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    # Picked up from the enclosing notebook scope, not from an argument.
    self.task: tf.keras.layers.Layer = task

  def _movielens_losses(self, features: Dict[Text, tf.Tensor]):
    """Computes the losses for one batch; shared by train and test steps.

    Args:
      features: Mapping that provides the "user_id" and "movie_title" entries.

    Returns:
      A `(loss, regularization_loss, total_loss)` tuple.
    """
    user_embeddings = self.user_model(features["user_id"])
    positive_movie_embeddings = self.movie_model(features["movie_title"])
    loss = self.task(user_embeddings, positive_movie_embeddings)

    # Handle regularization losses as well. With no registered losses the
    # built-in sum() yields the plain integer 0.
    regularization_loss = sum(self.losses)

    return loss, regularization_loss, loss + regularization_loss

  def _movielens_metrics(self, loss, regularization_loss, total_loss):
    """Builds the result dict: current metric values plus the three losses."""
    metrics = {metric.name: metric.result() for metric in self.metrics}
    metrics["loss"] = loss
    metrics["regularization_loss"] = regularization_loss
    metrics["total_loss"] = total_loss
    return metrics

  def train_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
    """Runs one optimisation step on a batch.

    Args:
      features: Mapping that provides the "user_id" and "movie_title" entries.

    Returns:
      Dict of metric results plus "loss", "regularization_loss" and
      "total_loss".
    """
    # The forward pass must run inside the tape so gradients can be taken.
    with tf.GradientTape() as tape:
      loss, regularization_loss, total_loss = self._movielens_losses(features)

    gradients = tape.gradient(total_loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    return self._movielens_metrics(loss, regularization_loss, total_loss)

  def test_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
    """Evaluates one batch without updating any weights.

    Args:
      features: Mapping that provides the "user_id" and "movie_title" entries.

    Returns:
      Dict of metric results plus "loss", "regularization_loss" and
      "total_loss".
    """
    loss, regularization_loss, total_loss = self._movielens_losses(features)
    return self._movielens_metrics(loss, regularization_loss, total_loss)

Let us compile the model

In [17]:
# Assemble the full model from the two sub-models and attach the optimizer.
model = MovielensModel(user_model=user_model, movie_model=movie_model)
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1)
)

In [18]:
# Shuffle, batch and cache the training data.
# NOTE(review): cache() comes after shuffle(), so later epochs presumably
# replay the order produced on the first pass — confirm this is intended.
cached_train = (
    train
    .shuffle(buffer_size=100_000)
    .batch(batch_size=8192)
    .cache()
)
# The test data is only batched and cached.
cached_test = test.batch(batch_size=4096).cache()

Model Training

In [19]:
# Train for three passes over the cached training batches.
model.fit(x=cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f52451e0cd0>

Model Evaluation on the test set

In [20]:
# Evaluate on the held-out batches; return_dict=True yields a name -> value dict.
model.evaluate(x=cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0008999999845400453,
 'factorized_top_k/top_5_categorical_accuracy': 0.009200000204145908,
 'factorized_top_k/top_10_categorical_accuracy': 0.02135000005364418,
 'factorized_top_k/top_50_categorical_accuracy': 0.1251000016927719,
 'factorized_top_k/top_100_categorical_accuracy': 0.23669999837875366,
 'loss': 28254.259765625,
 'regularization_loss': 0,
 'total_loss': 28254.259765625}

Making predictions

In [21]:
!pip install -q anvil-uplink

[?25l[K     |███▊                            | 10 kB 22.0 MB/s eta 0:00:01[K     |███████▌                        | 20 kB 7.5 MB/s eta 0:00:01[K     |███████████▏                    | 30 kB 10.3 MB/s eta 0:00:01[K     |███████████████                 | 40 kB 4.6 MB/s eta 0:00:01[K     |██████████████████▋             | 51 kB 4.4 MB/s eta 0:00:01[K     |██████████████████████▍         | 61 kB 5.2 MB/s eta 0:00:01[K     |██████████████████████████      | 71 kB 5.7 MB/s eta 0:00:01[K     |█████████████████████████████▉  | 81 kB 6.4 MB/s eta 0:00:01[K     |████████████████████████████████| 87 kB 3.4 MB/s 
[?25h[?25l[K     |██████▍                         | 10 kB 23.3 MB/s eta 0:00:01[K     |████████████▊                   | 20 kB 28.4 MB/s eta 0:00:01[K     |███████████████████▏            | 30 kB 34.3 MB/s eta 0:00:01[K     |█████████████████████████▌      | 40 kB 37.3 MB/s eta 0:00:01[K     |███████████████████████████████▉| 51 kB 38.4 MB/s eta 0:00:01[K 

In [22]:

# Brute-force retrieval layer that uses the trained user model as its query
# model, so it can be called with raw user ids.
index = tfrs.layers.factorized_top_k.BruteForce(query_model=model.user_model)
# Index the entire movies dataset: each batch of titles is paired with the
# movie-model embeddings of those same titles.
index.index_from_dataset(
  tf.data.Dataset.zip((
      movies.batch(100),
      movies.batch(100).map(model.movie_model),
  ))
)

import getpass
import os

import anvil.server

# The uplink key is a credential and must not be stored in the notebook.
# Read it from the ANVIL_UPLINK_KEY environment variable; if that is unset or
# empty, prompt for it without echoing the input.
# NOTE(review): a key that was previously committed in this cell should be
# treated as leaked and rotated in the Anvil app settings.
anvil_uplink_key = os.environ.get("ANVIL_UPLINK_KEY") or getpass.getpass(
    "Anvil uplink key: "
)
anvil.server.connect(anvil_uplink_key)
@anvil.server.callable
def recommend(usernum):
  """Returns the title of the top recommended movie for one user.

  Registered with Anvil via `@anvil.server.callable` so it can be invoked
  through the uplink connection.

  Args:
    usernum: Id of the user to recommend for. Strings and bytes are used
      as-is; any other value (e.g. an int) is converted with `str()`, because
      the user ids the model was built from are strings.

  Returns:
    The first title returned by `index` for this user, decoded as UTF-8.
  """
  # Normalise the id to text so a numeric id matches the string vocabulary.
  if isinstance(usernum, bytes):
    usernum = usernum.decode("utf-8")
  user_id = str(usernum)

  # `index` returns a (scores, titles) pair; only the titles are needed.
  # Unpacking the pair directly avoids coercing float scores and byte-string
  # titles into one NumPy array.
  _, titles = index(tf.constant([user_id]))

  # Row 0 is the single queried user; column 0 is the first title returned.
  return titles[0, 0].numpy().decode("utf-8")


Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER


In [23]:
# Keep this cell running so the uplink connection stays open for incoming
# `recommend` calls. The recorded output shows it being stopped with a
# KeyboardInterrupt (i.e. by interrupting the kernel).
anvil.server.wait_forever()

KeyboardInterrupt: ignored