## TensorFlow Multitask Recommender

TensorFlow Recommenders (TFRS) is a library for building recommender system models.

It does this by performing two tasks:
- Retrieval: Finding candidates that are likely to be relevant to the user given history or context features
- Ranking: Sorting the retrieved items in order of relevance to the user, using an assigned score or explicit feedback.

For more information, please go through [TensorFlow Recommenders: Quickstart](https://www.tensorflow.org/recommenders).

In [None]:
# importing packages

# Silence TensorFlow's native (C++) logging. This variable has to be in the
# environment before TensorFlow is imported, otherwise the messages emitted
# while the library loads are not suppressed.
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import datetime
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow import keras

# Only let errors through TensorFlow's Python-side logger.
logger = tf.get_logger()
logger.setLevel(logging.ERROR)

In [None]:
# load the cleaned dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/processed/clean_data.csv")

In [None]:
# reporting the number of distinct values found in each feature
for col, n_unique in df.nunique().items():
    print(f"{col:} has {int(n_unique):,} unique values")

In [None]:
# mapping every column name to a numpy array holding that column's values
df_dict = {}
for column, values in df.items():
    df_dict[column] = np.array(values)

# slicing the column arrays row by row into a tf.data dataset of feature dicts
data = tf.data.Dataset.from_tensor_slices(df_dict)

In [None]:
# unique values of every feature except 'Feedback', keyed by feature name
vocabularies = {
    feature: np.unique(values)
    for feature, values in df_dict.items()
    if feature != 'Feedback'
}

In [None]:
# tf.data dataset with one element per unique anime name
anime_names = tf.data.Dataset.from_tensor_slices(tensors=vocabularies['name'])

In [None]:
# shuffling once (fixed order) and carving out consecutive train / validation / test slices

TRAIN_SIZE = 240_000
VALIDATION_SIZE = 55_000
TEST_SIZE = 10_358

tf.random.set_seed(42)

# reshuffle_each_iteration=False keeps the shuffled order identical on every
# pass, so the three slices below never overlap
shuffled = data.shuffle(500_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(TRAIN_SIZE)
validation = shuffled.skip(TRAIN_SIZE).take(VALIDATION_SIZE)
test = shuffled.skip(TRAIN_SIZE + VALIDATION_SIZE).take(TEST_SIZE)

### Model building

Three classes will be constructed to build a multitask hybrid recommender: a user model, an anime model, and the combined TFRS model.

In [None]:
# responsible for building the user model

class UserModel(tf.keras.Model):
    """Embeds the user-side features and joins them into a single tensor.

    The model reads the 'genre', 'type' and 'Audience' entries of its input
    and returns their embeddings concatenated along axis 1.
    """

    def __init__(self):
        super().__init__()

        vocab_cap = 10_000

        # Genre -> text split on whitespace, each token embedded in 32
        # dimensions, token embeddings averaged into one vector
        self.genre_vectorizer = keras.layers.TextVectorization(
            max_tokens=vocab_cap,
            split="whitespace",
        )
        self.genre_vectorizer.adapt(vocabularies['genre'])
        self.genre_text_embedding = keras.Sequential([
            self.genre_vectorizer,
            keras.layers.Embedding(vocab_cap, 32, mask_zero=True),
            keras.layers.GlobalAveragePooling1D(),
        ])

        # Type and Audience -> categorical strings: integer lookup followed
        # by a 32 dimensional embedding
        self.type_embedding = self._lookup_embedding(vocabularies['type'])
        self.audience_embedding = self._lookup_embedding(vocabularies['Audience'])

    @staticmethod
    def _lookup_embedding(vocabulary):
        """Return a StringLookup -> Embedding(32) stack for ``vocabulary``."""
        return keras.Sequential([
            keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
            # one extra row on top of the vocabulary size
            keras.layers.Embedding(len(vocabulary) + 1, 32),
        ])

    def call(self, inputs):
        """Embed each user feature and concatenate the results along axis 1."""
        genre_vec = self.genre_text_embedding(inputs['genre'])
        type_vec = self.type_embedding(inputs['type'])
        audience_vec = self.audience_embedding(inputs['Audience'])
        return tf.concat([genre_vec, type_vec, audience_vec], axis=1)

In [None]:
# responsible for building the anime model

class AnimeModel(keras.Model):
    """Embeds an anime name in two ways and concatenates both views.

    The name is embedded once as a whole string (lookup embedding) and once
    as text (tokenised, embedded, averaged).
    """

    def __init__(self):
        super().__init__()

        vocab_cap = 10_000

        # Anime name as text -> default tokenisation, 32 dimensional token
        # embeddings, averaged over the tokens of the name
        self.anime_vectorizer = keras.layers.TextVectorization(max_tokens=vocab_cap)
        self.anime_vectorizer.adapt(anime_names)
        self.anime_text_embedding = keras.Sequential([
            self.anime_vectorizer,
            keras.layers.Embedding(vocab_cap, 32, mask_zero=True),
            keras.layers.GlobalAveragePooling1D(),
        ])

        # Anime name as a category -> integer lookup, then a 32 dimensional
        # embedding with one extra row on top of the vocabulary size
        known_names = vocabularies['name']
        self.anime_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=known_names, mask_token=None),
            tf.keras.layers.Embedding(len(known_names) + 1, 32),
        ])

    def call(self, inputs):
        """Return the lookup and text embeddings of ``inputs``, joined on axis 1."""
        lookup_vec = self.anime_embedding(inputs)
        text_vec = self.anime_text_embedding(inputs)
        return tf.concat([lookup_vec, text_vec], axis=1)


In [None]:
# fixing the NumPy and TensorFlow seeds so runs are repeatable
np.random.seed(7)
tf.random.set_seed(7)

# tensorflow recommenders model
class TFRSModel(tfrs.models.Model):
    """Multitask recommender trained jointly on retrieval and rating prediction.

    User-side and anime-side features are embedded by ``UserModel`` and
    ``AnimeModel`` (each followed by a 32-unit projection). The two
    embeddings feed the retrieval task directly; for the rating task they
    are concatenated, passed through a cross layer and a stack of dense
    layers, and reduced to a single predicted score.

    Args:
        rating_weight: Multiplier applied to the rating (ranking) loss.
        retrieval_weight: Multiplier applied to the retrieval loss.
    """

    def __init__(self, rating_weight=0.5, retrieval_weight=0.5):
        super().__init__()

        # how much each task contributes to the combined loss
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

        # UserModel, projected to 32 dimensions
        self.user_model = keras.Sequential([
            UserModel(),
            keras.layers.Dense(32)
        ])

        # AnimeModel, projected to 32 dimensions so it matches the user side
        self.anime_model = keras.Sequential([
            AnimeModel(),
            keras.layers.Dense(32)
        ])

        # Deep & Cross layer
        self._cross_layer = tfrs.layers.dcn.Cross(projection_dim=None, kernel_initializer='he_normal')

        # Dense stack with l2 regularisation, batch normalisation and dropout
        # to limit overfitting.
        self._deep_layers = [
            keras.layers.Dense(512, activation='relu', kernel_regularizer='l2'),
            keras.layers.Dense(256, activation='relu', kernel_regularizer='l2'),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(128, activation='relu', kernel_regularizer='l2'),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(64, activation='relu', kernel_regularizer='l2'),
            keras.layers.Dense(32, activation='relu', kernel_regularizer='l2'),
        ]

        # output layer: a single predicted rating per example
        self._logit_layer = keras.layers.Dense(1)

        # Multi-task Retrieval & Ranking
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                # every unique anime name, embedded, is a retrieval candidate
                candidates=anime_names.batch(128).map(self.anime_model)
            )
        )

    def call(self, features, training=False) -> tuple:
        """Compute both embeddings and the predicted rating.

        Args:
            features: Mapping with at least the 'genre', 'type', 'Audience'
                and 'name' entries.
            training: Whether the layers should run in training mode. It is
                forwarded explicitly so that Dropout and BatchNormalization
                behave correctly when this method is invoked directly.

        Returns:
            A tuple ``(user_embeddings, anime_embeddings, rating_predictions)``.
        """
        user_embeddings = self.user_model({
            'genre': features['genre'],
            'type': features["type"],
            'Audience': features["Audience"]
        }, training=training)

        anime_embeddings = self.anime_model(
            features['name'],
            training=training
        )

        x = self._cross_layer(tf.concat([
                user_embeddings,
                anime_embeddings], axis=1))

        for layer in self._deep_layers:
            x = layer(x, training=training)

        return (
            user_embeddings,
            anime_embeddings,
            self._logit_layer(x)
        )

    def compute_loss(self, features, training=False):
        """Return the weighted sum of the rating and retrieval losses.

        Args:
            features: Batch of features; 'Feedback' is used as the rating label.
            training: Training-phase flag, passed through to ``call``.
        """
        # The flag must be handed on: calling ``self.call`` bypasses
        # ``__call__``, so nothing else would tell the layers the phase.
        user_embeddings, anime_embeddings, rating_predictions = self.call(
            features, training=training
        )
        # Retrieval loss
        retrieval_loss = self.retrieval_task(user_embeddings, anime_embeddings)
        # Rating loss
        rating_loss = self.rating_task(
            labels=features['Feedback'],
            predictions=rating_predictions
        )

        # Combine two losses with hyper-parameters (to be tuned)
        return (self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss)

#### Model Training

In [None]:
# batching and caching our datasets to improve performance

def _shuffle_batch_cache(dataset, buffer_size, batch_size=2000):
    """Shuffle ``dataset``, group it into batches and cache the batches."""
    return dataset.shuffle(buffer_size).batch(batch_size).cache()


cached_train = _shuffle_batch_cache(train, 400_000)
cached_validation = _shuffle_batch_cache(validation, 100_000)
cached_test = _shuffle_batch_cache(test, 50_000)

In [None]:
# function for building the model

def create_model(learning_rate=0.01, seed=42):
    """Instantiate and compile a fresh ``TFRSModel``.

    The Keras session is cleared and the TensorFlow / NumPy seeds are reset
    first, so repeated calls start from the same state.

    Args:
        learning_rate: Learning rate handed to the Adam optimizer.
        seed: Seed applied to both TensorFlow and NumPy.

    Returns:
        The compiled model.
    """
    keras.backend.clear_session()
    tf.random.set_seed(seed)
    np.random.seed(seed)

    # instantiating the model
    model = TFRSModel()
    model.compile(optimizer=keras.optimizers.Adam(learning_rate))

    return model

In [None]:
# building the model

model = create_model()

# keras callbacks
# Both accuracy-driven callbacks watch the same validation metric. Higher is
# better, so the direction is stated explicitly instead of being inferred
# from the metric name.
monitored_metric = "val_factorized_top_k/top_10_categorical_accuracy"

# reduces the learning rate if the monitored metric doesn't improve for 'patience' epochs
lr_scheduler = keras.callbacks.ReduceLROnPlateau(monitor=monitored_metric, mode="max", patience=5)
# saves model weights after each epoch
ModelCheckpoint_cb = keras.callbacks.ModelCheckpoint(filepath="../model/checkpoints/TFRS_MultiTaskModel {epoch:02d}", save_weights_only=True)
# stops training if the monitored metric doesn't improve, then restores the best weights
EarlyStopping_cb = keras.callbacks.EarlyStopping(monitor=monitored_metric, mode="max", restore_best_weights=True, patience=10)

# one TensorBoard run directory per launch; no ':' in the timestamp because
# colons are not allowed in Windows paths
log_dir = "../model/logs/fit/" + datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
tensorboar_cb = keras.callbacks.TensorBoard(log_dir=log_dir)

In [None]:
# training for up to 20 epochs, validating after each one, without console output
history_fit = model.fit(
    cached_train,
    epochs=20,
    validation_data=cached_validation,
    callbacks=[ModelCheckpoint_cb, EarlyStopping_cb, lr_scheduler, tensorboar_cb],
    verbose=0,
)

In [None]:
# writing the trained weights to disk in the TensorFlow checkpoint format
filepath = "../model/model_weights/"

model.save_weights(filepath, save_format="tf")

In [None]:
# restoring the saved weights into a freshly built model

# a new, compiled model
model = create_model()

# location the weights were saved to
filepath = "../model/model_weights/"

# load the weights back to the new model
model.load_weights(filepath=filepath)

In [None]:
# viewing training and validation logs in Tensorboard

# %load_ext tensorboard
# %tensorboard --logdir="../model/logs/fit/"

An easier option is to simply plot the training and validation learning curves.

In [None]:
# learning curves: the metric values recorded on the History object returned by model.fit
learning_curve_data = history_fit.history

In [None]:
# visualizing model learning curves

fig, ax = plt.subplots(1, 2, figsize=(20,6))

# one panel per metric: (axis, key in the history, y-axis label)
panels = [
    (ax[0], 'root_mean_squared_error', 'root_mean_squared_error'),
    (ax[1], 'factorized_top_k/top_5_categorical_accuracy', 'top_5_categorical_accuracy'),
]

for axis, metric, ylabel in panels:
    axis.plot(learning_curve_data[metric], label='training')
    # validation metrics are stored under the same key with a 'val_' prefix
    axis.plot(learning_curve_data[f'val_{metric}'], label='validation')
    # the history holds one value per epoch, so the x axis counts epochs
    axis.set_xlabel('epoch', fontsize=12)
    axis.set_ylabel(ylabel, fontsize=12)
    axis.legend()

fig.text(x=0.5, y=0.95, s="Model learning curves", fontsize=15, weight='bold', ha='center', va='center')

plt.show()

## Model Evaluation

In [None]:
# scoring the model on the test batches; metrics come back as a name -> value dict
scores = model.evaluate(cached_test, verbose=0, return_dict=True)
scores

`top_k_categorical_accuracy` represents the percentage of times the user's actual (correct) item appears among the top *k* recommended items.

i.e. a `top_5_categorical_accuracy` of 0.9963 means that in 99.63% of cases, the preferred / correct item appeared in the top 5 recommended items.

## Model Inference

More experimentation could be done to improve model performance, but a top-5 accuracy of 99.63% is great for the purpose of this project, so it will be left as is.

See `scripts` and `app` for how to deploy the model.