In [268]:
# mount Google Drive at /content/drive (Colab only); the CSV files read below live under this mount point
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [269]:
!pip install tensorflow-recommenders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [270]:
# import required packages
# utility
import html
import pprint
import numpy as np
import pandas as pd
from typing import Dict, Text

# tensorflow
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [271]:
# load ratings dataset
# NOTE: the path assumes Google Drive is mounted at /content/drive (Colab)
rating_df = pd.read_csv('/content/drive/MyDrive/ml-latest-small/ratings.csv')

# (rows, columns)
rating_df.shape

(100836, 4)

In [272]:
# preview the first 5 rating rows
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [273]:
# column dtypes, non-null counts and memory usage of the ratings frame
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [274]:
# cast both id columns from int to str so they can be used as lookup-vocabulary tokens
rating_df['userId'] = rating_df['userId'].astype(str)
rating_df['movieId'] = rating_df['movieId'].astype(str)

In [275]:
# count missing values per column
rating_df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [276]:
# load movie metadata
# NOTE: the path assumes Google Drive is mounted at /content/drive (Colab)
movie_df = pd.read_csv('/content/drive/MyDrive/ml-latest-small/movies.csv')

# (rows, columns)
movie_df.shape

(9742, 3)

In [277]:
# preview the first 5 movie rows
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [278]:
# column dtypes, non-null counts and memory usage of the movie frame
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [279]:
# cast movieId from int to str so it matches the str movieId column of rating_df
movie_df['movieId'] = movie_df['movieId'].astype(str)

In [280]:
# look up one movie title (row label 8441) before cleaning
movie_df.loc[8441, 'title']

'Million Dollar Arm (2014)'

In [281]:
# clean movie title column
# 1) decode html entities into regular characters (html.unescape)
# 2) replace every non-ascii character (emojis, accented letters, ...) with a single space
#
# The non-ascii step uses a regex instead of the encode('ascii', 'replace') ->
# decode -> replace('?', ' ') round trip: that round trip turns non-ascii
# characters into '?' first and then blanks out *every* '?', which also
# destroys question marks that genuinely belong to a title.
movie_df['title'] = (
    movie_df['title']
    .map(html.unescape)
    .str.replace(r'[^\x00-\x7F]', ' ', regex=True)
)

In [282]:
# look up the same title (row label 8441) after cleaning
movie_df.loc[8441, 'title']

'Million Dollar Arm (2014)'

In [283]:
# attach each movie's title to its ratings by joining on movieId
# (default join type of merge, i.e. inner)
rating_df = rating_df.merge(movie_df[['movieId', 'title']], on='movieId')

# (rows, columns) after the join
rating_df.shape

(100836, 5)

In [284]:
# preview the first 5 rows of the merged ratings + title frame
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,5,1,4.0,847434962,Toy Story (1995)
2,7,1,4.5,1106635946,Toy Story (1995)
3,15,1,2.5,1510577970,Toy Story (1995)
4,17,1,4.5,1305696483,Toy Story (1995)


In [285]:
# convert rating data from dataframe to tensorflow Dataset
ratings = tf.data.Dataset.from_tensor_slices(dict(rating_df))

# keep only the title and userId features
ratings = ratings.map(lambda x: {
    'title': x['title'],
    'userId': x['userId']
})

# peek at the first 5 elements
for x in ratings.take(5).as_numpy_iterator():
    pprint.pprint(x)

{'title': b'Toy Story (1995)', 'userId': b'1'}
{'title': b'Toy Story (1995)', 'userId': b'5'}
{'title': b'Toy Story (1995)', 'userId': b'7'}
{'title': b'Toy Story (1995)', 'userId': b'15'}
{'title': b'Toy Story (1995)', 'userId': b'17'}


In [286]:
# convert movie metadata from dataframe to tensorflow Dataset
# keep only the movie title
movies = tf.data.Dataset.from_tensor_slices(movie_df['title'])

# peek at the first 5 titles
for x in movies.take(5).as_numpy_iterator():
    pprint.pprint(x)

b'Toy Story (1995)'
b'Jumanji (1995)'
b'Grumpier Old Men (1995)'
b'Waiting to Exhale (1995)'
b'Father of the Bride Part II (1995)'


In [287]:
# shuffle and split data: train (60%), valid (20%), test (remaining rows)
# set seed
tf.random.set_seed(42)

# total data points
N = rating_df.shape[0]

# total train data points
N_train = int(0.6 * N)

# total valid data points
N_valid = int(0.2 * N)

# total test data points
N_test = N - (N_train + N_valid)

# shuffle data once with a buffer covering all N rows;
# reshuffle_each_iteration=False keeps the order fixed across iterations,
# so the take/skip slices below do not overlap
shuffled = ratings.shuffle(N, seed=42, reshuffle_each_iteration=False)

# split data into consecutive slices of the shuffled dataset
train = shuffled.take(N_train)
valid = shuffled.skip(N_train).take(N_valid)
test = shuffled.skip((N_train + N_valid)).take(N_test)

In [288]:
# extract list of unique movie titles
# Dataset.unique() is the supported replacement for the deprecated
# tf.data.experimental.unique() transformation
unique_movie_titles = np.concatenate(list(movies.unique().batch(1000)))

unique_movie_titles[:10]

array([b'Toy Story (1995)', b'Jumanji (1995)', b'Grumpier Old Men (1995)',
       b'Waiting to Exhale (1995)', b'Father of the Bride Part II (1995)',
       b'Heat (1995)', b'Sabrina (1995)', b'Tom and Huck (1995)',
       b'Sudden Death (1995)', b'GoldenEye (1995)'], dtype=object)

In [289]:
# extract list of unique user ids
# the batch size (1_000_000) is larger than the 100836 ratings loaded above
user_ids = ratings.batch(1_000_000).map(lambda x: x['userId'])
# np.unique de-duplicates and sorts the ids (they are byte strings, so '10' sorts before '2')
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_user_ids[:4]

array([b'1', b'10', b'100', b'101'], dtype=object)

## Retrieval Model: Two Tower Architecture
- Query Tower
- Candidate Tower

### Model Architecture

In [290]:
# query and candidate embedding dimension
embedding_dimension = 32
# distribution strategy; the towers, task and model below are created inside strategy.scope()
strategy = tf.distribute.MirroredStrategy()

- ### Query Tower

In [291]:
# query tower: userId string -> vocabulary index -> embedding vector
with strategy.scope():
    user_model = tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),

        # add an additional embedding to account for unknown tokens
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

- ### Candidate Tower

In [292]:
# candidate tower: movie title string -> vocabulary index -> embedding vector
# (the variable is called `anime_model`, but it embeds movie titles)
with strategy.scope():
    anime_model = tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),

        # add an additional embedding to account for unknown tokens
        tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])

- ### Metrics and Loss

In [293]:
# retrieval metrics and loss, created inside the distribution strategy scope
with strategy.scope():
    
    # metrics
    # candidates are the embeddings of all movie titles, computed by the candidate tower
    metrics = tfrs.metrics.FactorizedTopK(
      candidates=movies.batch(1024).map(anime_model)
    )
    
    # loss
    # NOTE: MovieModel.__init__ reads this notebook-level `task` variable
    task = tfrs.tasks.Retrieval(
      metrics=metrics
    )

- ### Retrieval Model

In [294]:
# retrieval model
class MovieModel(tfrs.Model):
    """Two-tower retrieval model pairing a user tower with a movie tower.

    Args:
        user_model: query tower; called on the "userId" feature.
        anime_model: candidate tower; called on the "title" feature
            (the parameter keeps its original name for existing callers).
        retrieval_task: optional task layer that computes the loss and the
            metrics. When omitted, the notebook-level `task` variable is
            used, provided it is a `tfrs.tasks.Retrieval` instance.

    Raises:
        TypeError: if `retrieval_task` is omitted and the notebook-level
            `task` is not a `tfrs.tasks.Retrieval` instance.
    """

    def __init__(self, user_model, anime_model, retrieval_task=None):
        super().__init__()
        self.user_model: tf.keras.Model = user_model
        self.anime_model: tf.keras.Model = anime_model

        if retrieval_task is None:
            # `task` is a kernel global that the ranking section of this
            # notebook rebinds to a tfrs.tasks.Ranking instance; refuse to
            # silently build a retrieval model around the wrong task
            if not isinstance(task, tfrs.tasks.Retrieval):
                raise TypeError(
                    "the notebook-level `task` is not a tfrs.tasks.Retrieval "
                    f"instance (got {type(task).__name__}); re-run the retrieval "
                    "task cell or pass `retrieval_task` explicitly"
                )
            retrieval_task = task

        self.task: tf.keras.layers.Layer = retrieval_task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False):
        """Embed the user and the movie of each example and return the task loss.

        Args:
            features: dict holding the "userId" and "title" features.
            training: whether the call happens during training.

        Returns:
            The value returned by the task for the two embedding batches.
        """
        # pass the user features into the user model
        # returns the embeddings
        user_embeddings = self.user_model(features["userId"])

        # pass the movie features into the movie model
        # returns the embeddings
        positive_anime_embeddings = self.anime_model(features["title"])

        # the task computes the loss and the metrics
        # compute_metrics=not training:
        # - turns metric calculation off while training
        # - speeds up training
        return self.task(user_embeddings, positive_anime_embeddings, compute_metrics=not training)

In [295]:
# build and compile the retrieval model inside the distribution strategy scope
with strategy.scope():
    # initialize model
    model = MovieModel(user_model, anime_model)

    # compile model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))

In [296]:
# prepare train and validation sets for training
# shuffle (train only), batch, then cache the batched elements
cached_train = train.shuffle(N_train).batch(8192).cache()
cached_valid = valid.batch(4096).cache()

### Train and Evaluate Model

In [297]:
# train the retrieval model for 15 epochs on the cached training batches
history = model.fit(cached_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [298]:
# evaluate the retrieval model on the validation split
# return_dict=True returns the metrics as a name -> value dict
model.evaluate(cached_valid, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 4.958595673087984e-05,
 'factorized_top_k/top_5_categorical_accuracy': 0.0005950314807705581,
 'factorized_top_k/top_10_categorical_accuracy': 0.00203302432782948,
 'factorized_top_k/top_50_categorical_accuracy': 0.02553676813840866,
 'factorized_top_k/top_100_categorical_accuracy': 0.05603213235735893,
 'loss': 32478.599609375,
 'regularization_loss': 0,
 'total_loss': 32478.599609375}

### Making Predictions

In [299]:
# use the factorized_top_k.BruteForce layer to make predictions
# it takes raw query features (a user id), embeds them with the user model
# and recommends movies out of the entire dataset

# create the single layer model
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# index the candidates: (titles, title embeddings) pairs built from every movie title
index.index_from_dataset(
  tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.anime_model)))
)

# get recommendation for specific user
# user_id: 42
# the first returned value is discarded; `titles` holds the recommended titles
_, titles = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")

Recommendations for user 42: [b'Toxic Avenger Part III: The Last Temptation of Toxie, The (1989)'
 b'Being Human (1993)' b'Toxic Avenger, Part II, The (1989)']


## Ranking Model

### Prepare data for Ranking Model:
- Add the rating column to the data used for Retrieval Model

In [300]:
# convert rating data from dataframe to tensorflow Dataset
# NOTE: this rebinds `ratings`, replacing the two-feature dataset used by the retrieval model
ratings = tf.data.Dataset.from_tensor_slices(dict(rating_df))

# keep only the title, userId and rating features
ratings = ratings.map(lambda x: {
    'title': x['title'],
    'userId': x['userId'],
    'rating': x['rating']
})

# peek at the first 5 elements
for x in ratings.take(5).as_numpy_iterator():
    pprint.pprint(x)

{'rating': 4.0, 'title': b'Toy Story (1995)', 'userId': b'1'}
{'rating': 4.0, 'title': b'Toy Story (1995)', 'userId': b'5'}
{'rating': 4.5, 'title': b'Toy Story (1995)', 'userId': b'7'}
{'rating': 2.5, 'title': b'Toy Story (1995)', 'userId': b'15'}
{'rating': 4.5, 'title': b'Toy Story (1995)', 'userId': b'17'}


In [301]:
# shuffle and split data: train (60%), valid (20%), test (remaining rows)
# NOTE: this rebinds N, N_train, N_valid, N_test, shuffled, train, valid and test
# from the retrieval section, now built from the dataset that includes 'rating'
# total data points
N = rating_df.shape[0]

# total train data points
N_train = int(0.6 * N)

# total valid data points
N_valid = int(0.2 * N)

# total test data points
N_test = N - (N_train + N_valid)

# shuffle data once with a buffer covering all N rows;
# reshuffle_each_iteration=False keeps the order fixed across iterations,
# so the take/skip slices below do not overlap
shuffled = ratings.shuffle(N, seed=42, reshuffle_each_iteration=False)

# split data into consecutive slices of the shuffled dataset
train = shuffled.take(N_train)
valid = shuffled.skip(N_train).take(N_valid)
test = shuffled.skip((N_train + N_valid)).take(N_test)

### Model Architecture

In [302]:
class RankingModel(tf.keras.Model):
    """Predicts a rating for a (user id, movie title) pair.

    Both inputs are embedded separately, the two embeddings are concatenated
    and passed through a small stack of dense layers ending in one unit.

    Args:
        embedding_dimension: size of the user and the movie embedding vectors.
            Defaults to 32, the value that used to be hard-coded here.
    """

    def __init__(self, embedding_dimension=32):
        super().__init__()

        # user model
        # computes user embeddings
        # (+ 1 embedding row to account for unknown tokens)
        self.user_embeddings = tf.keras.Sequential([
          tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
          tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])

        # movie model
        # computes movie embeddings
        # (+ 1 embedding row to account for unknown tokens)
        self.movie_embeddings = tf.keras.Sequential([
          tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
          tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
        ])

        # ratings model
        # predicts ratings
        self.ratings = tf.keras.Sequential([
          tf.keras.layers.Dense(256, activation="relu"),
          tf.keras.layers.Dense(64, activation="relu"),
          tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        """Return the predicted ratings for a `(userId, title)` input tuple."""
        userId, title = inputs

        # generate embeddings for the user id
        user_embedding = self.user_embeddings(userId)

        # generate embeddings for the movie title
        movie_embedding = self.movie_embeddings(title)

        # predict and return the ratings for user id and movie title pair
        return self.ratings(tf.concat([user_embedding, movie_embedding], axis=1))

- ### Loss and Metrics

In [303]:
# ranking task: mean squared error loss, RMSE metric
# NOTE(review): MovieRankingModel builds its own tfrs.tasks.Ranking in __init__,
# so this object does not appear to be used by the ranking model — confirm.
# NOTE(review): this rebinds the notebook-level `task` that MovieModel.__init__
# reads, and it uses the default loss reduction whereas MovieRankingModel uses
# Reduction.SUM — confirm which one is intended.
with strategy.scope():
    task = tfrs.tasks.Ranking(
        loss = tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

- ### Full Model

In [304]:
class MovieRankingModel(tfrs.models.Model):
    """Wraps RankingModel with a ranking task (squared-error loss, RMSE metric)."""

    def __init__(self):
        super().__init__()

        self.ranking_model: tf.keras.Model = RankingModel()

        # NOTE(review): Reduction.SUM sums the per-example losses of a batch,
        # so the reported loss depends on the batch size — confirm this is intended.
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
          loss = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM),
          metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features: Dict[str, tf.Tensor]):
        """Return predicted ratings for the "userId" / "title" pairs in `features`."""
        return self.ranking_model((features["userId"], features["title"]))

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False):
        """Compute the ranking loss for one batch.

        Args:
            features: dict holding the "userId", "title" and "rating" features.
            training: whether the call happens during training.

        Returns:
            The value returned by the ranking task for the labels and predictions.
        """
        # read the label instead of pop()-ing it, so the caller's feature
        # dict is not mutated as a side effect of computing the loss
        labels = features["rating"]

        # forward only the two model inputs, exactly what the model received
        # when the label used to be removed from the dict first
        model_inputs = {
            "userId": features["userId"],
            "title": features["title"],
        }
        rating_predictions = self(model_inputs)

        # The task computes the loss and the metrics.
        return self.task(labels=labels, predictions=rating_predictions)

In [305]:
# NOTE(review): this creates a second MirroredStrategy and rebinds `strategy`,
# which was already created in the retrieval section — confirm whether the
# existing strategy should be reused instead.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # initialize model
    # NOTE: rebinds `model`, replacing the retrieval model trained above
    model = MovieRankingModel()
    
    # compile model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))

In [306]:
# prepare train and validation sets for training
# shuffle (train only), batch, then cache the batched elements
cached_train = train.shuffle(N_train).batch(8192).cache()
cached_valid = valid.batch(4096).cache()

### Train and Evaluate Model

In [307]:
# train the ranking model for 5 epochs on the cached training batches
history = model.fit(cached_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [308]:
# evaluate the ranking model on the validation split
# return_dict=True returns the metrics as a name -> value dict
model.evaluate(cached_valid, return_dict=True)



{'root_mean_squared_error': 0.8988497853279114,
 'loss': 0.8190810084342957,
 'regularization_loss': 0,
 'total_loss': 0.8190810084342957}

### Making Predictions

In [309]:
# show 5 examples from the test split (features and true ratings);
# no model prediction is computed in this cell
for x in test.take(5).as_numpy_iterator():
    pprint.pprint(x)

{'rating': 2.5, 'title': b'Client, The (1994)', 'userId': b'448'}
{'rating': 3.5,
 'title': b'Scenes From a Marriage (Scener ur ett  ktenskap) (1973)',
 'userId': b'105'}
{'rating': 3.5,
 'title': b'Tokyo Drifter (T ky  nagaremono) (1966)',
 'userId': b'599'}
{'rating': 4.0, 'title': b'Goldfinger (1964)', 'userId': b'603'}
{'rating': 0.5, 'title': b'Julie & Julia (2009)', 'userId': b'10'}
