# TFRS

This notebook takes the master data that was prepared in the "data_preprocessing" notebook and creates a test holdout set from 20% of the data. The same split is used in the other experiments for consistency.

The model is built with TensorFlow Recommenders (TFRS).

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

from typing import Dict, Text

## Train-test Split

The train-test split takes 20% of the data as the test holdout set. To make sure the train and test data are consistent across all experiments, the same test holdout split is used for each experiment.

It is an important detail that the split is stratified by user, so that each user's ratings are split between the train and test sets as evenly as possible.

In [3]:
# sklearn is only needed for this split, so it is imported next to its single use.
from sklearn import model_selection

# Keep only the three columns the recommender consumes; "timestamp" is dropped
# right after loading because it is not needed for building the recommender.
df = pd.read_csv('data/master_data.zip', compression="zip")[["userId", "movieId", "rating"]]

# The whole frame is what gets split; there is no separate target column.
X = df.copy()
# userId is handed over as the second array only so it can drive the stratification.
y = df["userId"]

# Fixed 80/20 split (random_state=42) that is meant to be the standard split
# for all experiments. train_test_split also returns the two halves of y,
# which are not needed here, so only the first two results are kept.
trainset, testset = model_selection.train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)[:2]

The data is converted into the `tf.data.Dataset` format.

In [4]:
def to_rating_dataset(frame):
  """Convert a ratings DataFrame into a ``tf.data.Dataset`` of feature dicts.

  Args:
    frame: DataFrame holding "userId", "movieId" and "rating" columns.

  Returns:
    A dataset with one element per row; each element is a dict with the
    string features "user_id" and "movie_id" and the float32 "rating".
  """
  # Select the columns by name so the positional indexing in the mapping
  # below does not depend on the column order of the incoming frame.
  rows = tf.data.Dataset.from_tensor_slices(
      frame[["userId", "movieId", "rating"]].values
  )
  return rows.map(lambda x: {
      # IDs are cast to int32 before being rendered as strings, so the
      # string form is the integer ID.
      "user_id": tf.as_string(tf.cast(x[0], tf.int32)),
      "movie_id": tf.as_string(tf.cast(x[1], tf.int32)),
      "rating": tf.cast(x[2], tf.float32),
  })


# NOTE: these assignments rebind trainset/testset from DataFrames to Datasets,
# so the split cell has to be re-run before this cell can be executed again.
trainset = to_rating_dataset(trainset)
testset = to_rating_dataset(testset)

The unique user and movie IDs of the training set are collected; they serve as the vocabularies for the embedding layers.

In [9]:
# Walk the training set in large batches (up to 1,000,000 rows each) so the ID
# columns can be pulled out with only a few dataset elements.
id_batches = trainset.batch(1_000_000)
movie_ids = id_batches.map(lambda batch: batch["movie_id"])
user_ids = id_batches.map(lambda batch: batch["user_id"])

# Materialise every batch, glue them together and keep the distinct IDs.
unique_movie_ids = np.unique(np.concatenate([ids for ids in movie_ids]))
unique_user_ids = np.unique(np.concatenate([ids for ids in user_ids]))

## Model Build

In [10]:
class ModelRanking(tf.keras.Model):
  """Rating predictor: user and movie embeddings fed through a small MLP.

  Each ID is mapped to an integer index by a ``StringLookup`` layer and then
  to a dense vector by an ``Embedding`` layer. The two vectors are
  concatenated and passed through Dense(256, relu) -> Dense(64, relu) ->
  Dense(1) to produce one predicted rating per (user, movie) pair.
  """

  def __init__(self, embedding_dims=32, user_vocabulary=None, movie_vocabulary=None):
    """Build the lookup/embedding towers and the rating head.

    Args:
      embedding_dims: Size of the user and movie embedding vectors.
      user_vocabulary: Known user IDs. Defaults to the notebook-level
        ``unique_user_ids`` when omitted.
      movie_vocabulary: Known movie IDs. Defaults to the notebook-level
        ``unique_movie_ids`` when omitted.
    """
    super().__init__()

    # Fall back to the vocabularies computed earlier in the notebook so that
    # ``ModelRanking()`` keeps working exactly as before.
    if user_vocabulary is None:
      user_vocabulary = unique_user_ids
    if movie_vocabulary is None:
      movie_vocabulary = unique_movie_ids

    # User embeddings.
    # The embedding table has one row more than the vocabulary: with
    # mask_token=None and the default single OOV bucket, StringLookup emits
    # len(vocabulary) + 1 distinct indices.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=user_vocabulary, mask_token=None),
      tf.keras.layers.Embedding(len(user_vocabulary) + 1, embedding_dims)
    ])

    # Movie embeddings (same layout as the user tower).
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=movie_vocabulary, mask_token=None),
      tf.keras.layers.Embedding(len(movie_vocabulary) + 1, embedding_dims)
    ])

    # Predictions
    self.ratings = tf.keras.Sequential([
      # multiple dense layers
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Ratings in output layer (single unit, no activation)
      tf.keras.layers.Dense(1)
    ])

  def call(self, inputs):
    """Predict ratings for a batch of (user_id, movie_id) pairs.

    Args:
      inputs: A pair ``(user_id, movie_id)`` of string ID tensors.
        Assumes both are 1-D batches so the embeddings can be concatenated
        along axis 1 -- TODO confirm for other input ranks.

    Returns:
      The output of the rating head for the concatenated embeddings.
    """
    user_id, movie_id = inputs

    user_embed = self.user_embeddings(user_id)
    movie_embed = self.movie_embeddings(movie_id)

    return self.ratings(tf.concat([user_embed, movie_embed], axis=1))
  
  
  
class ModelMovielens(tfrs.models.Model):
  """TFRS model that trains ``ModelRanking`` with a rating-regression task.

  The task uses mean squared error as the loss and reports the root mean
  squared error as a metric.
  """

  def __init__(self):
    """Create the ranking model and the TFRS ranking task."""
    super().__init__()
    self.ranking_model: tf.keras.Model = ModelRanking()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    """Return predicted ratings for the "user_id"/"movie_id" features."""
    return self.ranking_model(
        (features["user_id"], features["movie_id"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the task loss for one batch of features.

    Args:
      features: Dict holding "user_id", "movie_id" and the "rating" labels.
      training: Accepted for the TFRS interface; not used by this method.

    Returns:
      The value returned by the ranking task for the predictions and labels.
    """
    # Read the labels without pop(): the caller's dict must stay intact,
    # otherwise a second call on the same batch fails with a KeyError.
    labels = features["rating"]
    model_inputs = {
        name: value for name, value in features.items() if name != "rating"
    }
    rating_predictions = self(model_inputs)

    # Compute loss and metric
    return self.task(labels=labels, predictions=rating_predictions)

In [11]:
# Seed TensorFlow's global random generator before any weights are created so
# that a Restart & Run All starts from the same initialisation.
# NOTE(review): some GPU kernels may still be non-deterministic -- confirm if
# bit-exact reproducibility is required.
tf.random.set_seed(42)

model = ModelMovielens()
# Adam with a learning rate of 0.005; loss and metrics come from the model's task.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005))

In [12]:
# Training pipeline: shuffle with a 100,000-element buffer, batch, then cache.
# NOTE(review): cache() comes after shuffle(), so the batches produced on the
# first pass are presumably the ones replayed in later epochs -- confirm that
# this is intended.
cached_train = (
    trainset
    .shuffle(100_000)
    .batch(8192)
    .cache()
)

# Evaluation pipeline: no shuffling, smaller batches, cached.
cached_test = (
    testset
    .batch(4096)
    .cache()
)

In [29]:
# Train for 20 epochs on the cached training batches.
model.fit(x=cached_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3b40a35fd0>

In [None]:
# Persist the trained weights so the model can be reused without retraining.
weights_path = './tfrs_weights'
model.save_weights(weights_path)

# Restore the weights (kept for reference, not executed).
# NOTE(review): this path differs from the one used for saving above --
# confirm which location is the intended one.
#model.load_weights('tfrs_model/tfrs_weights')

# Save / reload the whole model (kept for reference, not executed).
#model.save('./tfrs_model')
#model = tf.keras.models.load_model('tfrs_model/tfrs_model')


## Model Evaluate

Only the RMSE metric is considered here; it gives an overview of the model's accuracy that can be compared with the other recommendation systems.

In [30]:
# Evaluate on the held-out test batches; return_dict=True yields named metrics.
model.evaluate(x=cached_test, return_dict=True)



{'root_mean_squared_error': 0.766929566860199,
 'loss': 0.5473156571388245,
 'regularization_loss': 0,
 'total_loss': 0.5473156571388245}

## Get Predictions

In [15]:
# Load the movieId/title pairs and index them by movieId in one chain.
df_names = (
    pd.read_csv('data/master_data_with_movie_info.zip', compression="zip")[["movieId", "title"]]
    .set_index('movieId')
)

# Plain dict from movieId to title for quick lookups.
movie_dict = df_names['title'].to_dict()

In [16]:
# Score a handful of movies for a single user and list them best-first.
test_movie_ids = ["0", "11", "199"]
user_id_test = "42"

test_ratings = {}
for movie_id in test_movie_ids:
  # movie_dict is looked up with the integer form of the ID. Fall back to the
  # raw ID so that movies without a title entry do not all collapse onto a
  # single None key and overwrite each other.
  movie_name = movie_dict.get(int(movie_id), movie_id)
  # The model expects batched string features, hence the one-element arrays.
  test_ratings[movie_name] = model({
      "user_id": np.array([user_id_test]),
      "movie_id": np.array([movie_id])
  })

print("Recommendations:")
# Sort on an explicit Python float instead of comparing the raw model outputs,
# which only works through the implicit truthiness of a tensor comparison.
ranked_ratings = sorted(
    test_ratings.items(),
    key=lambda item: float(tf.squeeze(item[1])),
    reverse=True,
)
for title, score in ranked_ratings:
  print(f"{title}: {score}")

Recommendations:
Toy Story (1995): [[3.5536563]]
Jefferson in Paris (1995): [[2.3756332]]
Dracula: Dead and Loving It (1995): [[2.265089]]
