# Imports

In [1]:
import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd
import sampling

import tensorflow_recommenders as tfrs

import pickle

import datetime

from tensorflow.keras.layers import Flatten   
from tensorflow.keras.layers import Dense     

import TensorflowRichFeatures as tfrs_rich

# Constants

## Load

In [2]:
RATINGS_BASE = "../Data/base/ratings_base.parquet"
RECIPES_BASE = "../Data/base/recipes_base.parquet"

INGREDIENTS_CLEAN = '../Data/cleaned_files/ingredients_clean_without_common_words.obj'

# Load data

In [46]:
recipes_small = pd.read_parquet(RECIPES_BASE)
ratings_small = pd.read_parquet(RATINGS_BASE)

with open(INGREDIENTS_CLEAN, "rb") as input_file:
    ingredients_clean = pickle.load(input_file)

## Ratings

In [47]:
ratings_sample = sampling.get_ratings_with_min_number_list(ratings_small, [20,20])
recipe_ids_in_sample = list(set(ratings_sample.RecipeId))

## Recipes

In [48]:
recipes_subset = recipes_small[["RecipeId"]].merge(ingredients_clean, on="RecipeId", how="inner")
recipes_subset["Ingredients"] = recipes_subset["Ingredients"].map(lambda x: " ".join(x))
recipes_subset = recipes_subset[recipes_subset.RecipeId.isin(recipe_ids_in_sample)]

In [49]:
merged_dataset = ratings_sample.merge(recipes_subset, on="RecipeId", how="inner")

In [50]:
merged_dataset.drop(columns=["DateSubmitted"], inplace=True)

In [51]:
merged_dataset

Unnamed: 0,RecipeId,AuthorId,Rating,Ingredients
0,4807,2695,2,butter turkey
1,4807,74652,5,butter turkey
2,4807,73272,5,butter turkey
3,4807,111526,5,butter turkey
4,4807,128803,5,butter turkey
...,...,...,...,...
310017,58901,582561,5,bay beef broth beef meat celery rib corn garli...
310018,519642,182312,4,brown sugar butter cinnamon egg honey milk pow...
310019,533699,109030,5,italian sausage mozzarella cheese parmesan che...
310020,500502,53859,4,butter coconut oil egg flour milk soda sugar v...


# Prepare dataset

## Recipes

In [52]:
recipes_subset.RecipeId = recipes_subset.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))

recipes_dict = recipes_subset[['RecipeId','Ingredients']]
recipes_dict = {name: np.array(value) for name, value in recipes_dict.items()}
recipes = tf.data.Dataset.from_tensor_slices(recipes_dict)


recipes = recipes.map(lambda x: {'RecipeId' : x['RecipeId'],
                                 'Ingredients' : x['Ingredients']})

In [53]:
for x in recipes.take(1).as_numpy_iterator():
    pprint.pprint(x)

{'Ingredients': b'black pepper butter button mushroom celery chicken flour mil'
                b'k parsley pepper pimiento worcestershire sauce',
 'RecipeId': b'44.0'}


## Ratings

In [54]:
ratings_sample.AuthorId = ratings_sample.AuthorId.map(lambda x: bytes(str(x), 'utf-8'))
ratings_sample.RecipeId = ratings_sample.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))

ratings_dict = ratings_sample.groupby(['AuthorId', 'RecipeId'])['Rating'].sum().reset_index()
ratings_dict = {name: np.array(value) for name, value in ratings_dict.items()}
ratings = tf.data.Dataset.from_tensor_slices(ratings_dict)


ratings = ratings.map(lambda x: {'AuthorId' : x['AuthorId'], 
                                 'RecipeId' : x['RecipeId'],
                                 'Rating' : x['Rating']})

In [55]:
for x in ratings.take(1).as_numpy_iterator():
    pprint.pprint(x)

{'AuthorId': b'100026', 'Rating': 5, 'RecipeId': b'120914'}


## Merged dataset

In [56]:
merged_dataset.AuthorId = merged_dataset.AuthorId.map(lambda x: bytes(str(x), 'utf-8'))
merged_dataset.RecipeId = merged_dataset.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))

merged_dict = merged_dataset[['AuthorId', 'RecipeId', 'Rating', 'Ingredients']]
merged_dict = {name: np.array(value) for name, value in merged_dict.items()}
merged_dataset = tf.data.Dataset.from_tensor_slices(merged_dict)


merged_dataset = merged_dataset.map(lambda x: {'AuthorId' : x['AuthorId'], 
                                 'RecipeId' : x['RecipeId'],
                                 'Ingredients' : x['Ingredients'],
                                    'Rating': x['Rating']})

## Timestamps

In [57]:
# timestamps = np.concatenate(list(ratings.map(lambda x: x["Timestamp"]).batch(100)))

# max_timestamp = timestamps.max()
# min_timestamp = timestamps.min()

# timestamp_buckets = np.linspace(
#     min_timestamp, max_timestamp, num=1000,
# )


## Train, test, val datasets

In [62]:
size = ratings_sample.shape[0]
train_size = int(0.7 * size)
val_size = int(0.15 * size)
test_size = size - train_size - val_size

tf.random.set_seed(42)
shuffled = merged_dataset.shuffle(size, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_size)
val = shuffled.take(train_size).take(val_size)
test = shuffled.take(train_size).take(val_size).take(test_size)

print(f"Train size: {train_size}")
print(f"Test size: {test_size}")
print(f"Val size: {val_size}") 

Train size: 217015
Test size: 46504
Val size: 46503


In [63]:
recipe_ids = merged_dataset.batch(1_000_000).map(lambda x: x['RecipeId'])
user_ids = merged_dataset.batch(1_000_000).map(lambda x: x["AuthorId"])

unique_recipe_ids = np.unique(np.concatenate(list(recipe_ids)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# Model

## Query tower

In [67]:
class UserModel(tfrs.models.Model):
    
    def __init__(self, 
                 unique_user_ids,
                 verbose=False):
        
        super().__init__()
        self._verbose = verbose
        if(self._verbose):
            print("USER MODEL INIT")
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32)
        ])
        
#         self.timestamp_embedding = tf.keras.Sequential([
#             tf.keras.layers.Discretization(timestamp_buckets.tolist()),
#             tf.keras.layers.Embedding(len(timestamp_buckets)+1, 32),
#         ])
        
#         self.normalized_timestamp = tf.keras.layers.Normalization(axis=None)
#         self.normalized_timestamp.adapt(timestamps)
        
    def call(self, inputs):
        if(self._verbose):
            print("User model call")
            print("INPUTS: ", inputs)
            print("AuthorId": inputs["AuthorId"])
        return tf.concat([
            self.user_embedding(inputs["AuthorId"]),
#             self.timestamp_embedding(inputs["Timestamp"]),
#             tf.reshape(self.normalized_timestamp(inputs["Timestamp"]), (-1,1)),
        ], axis=1)
        

SyntaxError: invalid syntax (Temp/ipykernel_1568/1127978980.py, line 28)

In [68]:
class QueryModel(tf.keras.Model):
    """Model for encoding user queries."""
    def __init__(self, 
                 layer_sizes,
                 unique_user_ids,
                 verbose=False):
        """Model for encoding user queries.
        Args:
            layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
        """
        
        super().__init__()

        if(verbose):
            print("Query model init")
            
        self._verbose = verbose
        # We first use the user model for generating embeddings.
        self.embedding_model = UserModel(unique_user_ids, verbose)

        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential()

        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))
            
    def call(self, inputs):
        if(self._verbose):
            print("Query model call")
            print("Input: ", inputs)
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)

In [69]:
user_model = UserModel(unique_user_ids)

for row in ratings.batch(1).take(1):
    print(f"Representation: {user_model(row)[0, :3]}")

Representation: [-0.01153687  0.02530286  0.04176519]


## Candidate model

In [70]:
class RecipeModel(tfrs.models.Model):
    
    def __init__(self, 
                 unique_recipe_ids,
                 recipes_dataset,
                 verbose=False):
        super().__init__()
        max_tokens = 10_000
        embedding_dim=32
        
        self._verbose = verbose
        if(verbose):
            print("RECIPE MODEL INIT")
        self.recipe_id_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_recipe_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_recipe_ids)+1, 32)
        ])
        
        self.ingredients_vectorizer = tf.keras.layers.TextVectorization(max_tokens = max_tokens)
        
        self.ingredients_text_embedding = tf.keras.Sequential([
            self.ingredients_vectorizer,
            tf.keras.layers.Embedding(input_dim=max_tokens, output_dim=embedding_dim),
            tf.keras.layers.GlobalAveragePooling1D()
        ])
        
        self.ingredients_vectorizer.adapt(recipes_dataset.map(lambda x: x['Ingredients']))
        
    def call(self, inputs):
        if(self._verbose):
            print("Recipe model call")
            print("INPUTS: ", inputs)
        return tf.concat([
            self.recipe_id_embedding(inputs["RecipeId"]),
            self.ingredients_text_embedding(inputs["Ingredients"])
        ], axis=1)

In [71]:
class CandidateModel(tf.keras.Model):
    """Model for encoding recipes."""
    
    def __init__(self, 
                 layer_sizes, 
                 unique_recipe_ids,
                 recipes_dataset,
                 verbose=False):

        super().__init__()
        if(verbose):
            print("Candidate model init")
        self.embedding_model = RecipeModel(unique_recipe_ids,
                                           recipes_dataset,
                                           verbose)

        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential()

        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))
            
        self._verbose = verbose
    
    def call(self, inputs):
        if(self._verbose):
            print("Candidate model call")
            print("Inputs: ", inputs)
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)

## Ranking model

In [75]:
class RankingModel(tf.keras.Model):
    
    def __init__(self, layer_sizes,
                 unique_user_ids, 
                 unique_recipe_ids, 
                 embedding_dimension, 
                 recipes_dataset,
                 verbose=False):
        
        super().__init__()
        if(verbose):
            print('Ranking model INIT')
        
        self.embedding_dimension = embedding_dimension
        self.verbose = verbose
        
        self.query_model = QueryModel(layer_sizes,
                                          unique_user_ids,
                                          verbose=verbose)
        
        self.candidate_model = CandidateModel(layer_sizes, 
                                                unique_recipe_ids,
                                                recipes_dataset,
                                                verbose=verbose)
        
        # Compute predictions
        self.ratings = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(1)
        ])
        
    def call(self, inputs):
        if self.verbose:
            print('Ranking model CALL')
            print('INPUTS', inputs)
            
        user_id, recipe_id, ingredients = inputs
        
        user_embedding = self.query_model({
            "AuthorId": user_id
        })
        
        recipe_embedding = self.candidate_model({
            "RecipeId": recipe_id,
            "Ingredients": ingredients
        })
        
        return self.ratings(tf.concat([user_embedding, recipe_embedding], axis=1))

In [76]:
class RecipeRankingModel(tfrs.models.Model):
    
    def __init__(self, layer_sizes,
                 unique_user_ids, 
                 unique_recipe_ids, 
                 embedding_dimension, 
                 recipes_dataset,
                 verbose=False):
        super().__init__()
        self._verbose = verbose
        self.ranking_model: tf.keras.Model = RankingModel(layer_sizes, unique_user_ids, unique_recipe_ids, embedding_dimension, recipes_dataset, verbose)
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss = tf.keras.losses.MeanSquaredError(),
            metrics = [tf.keras.metrics.RootMeanSquaredError()])
            
        
    def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
        if self._verbose:
            print('RECIPE RANKING MODEL CALL')
            
        return self.ranking_model((features["AuthorId"], features["RecipeId"], features["Ingredients"]))
    
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        if self._verbose:
             print('COMPUTE LOSS ', features)
        labels = features.pop("Rating")
        rating_predictions = self(features)
        
        return self.task(labels=labels, predictions=rating_predictions)

In [77]:
model_1 = RecipeRankingModel([32], unique_user_ids=unique_user_ids,
                            unique_recipe_ids=unique_recipe_ids,
                            embedding_dimension=32,
                            recipes_dataset=recipes,
                            verbose=True)
model_1.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

Ranking model INIT
Query model init
USER MODEL INIT
Candidate model init
RECIPE MODEL INIT


In [78]:
cached_train = train.shuffle(250_000).batch(16384).cache()
cached_test = test.batch(4096).cache()
cached_val = val.batch(4096).cache()

In [79]:
cached_train

<CacheDataset shapes: {AuthorId: (None,), RecipeId: (None,), Ingredients: (None,), Rating: (None,)}, types: {AuthorId: tf.string, RecipeId: tf.string, Ingredients: tf.string, Rating: tf.int32}>

In [80]:
%%time
model_1_history = model_1.fit(cached_train,
                              epochs=5, 
                              verbose=1,
                              validation_data=cached_val,
                              validation_freq=5)

Epoch 1/5
COMPUTE LOSS  {'AuthorId': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'RecipeId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'Ingredients': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>, 'Rating': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int32>}
RECIPE RANKING MODEL CALL
Ranking model CALL
INPUTS (<tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>)
Query model call
Input:  {'AuthorId': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>}
User model call
INPUTS:  {'AuthorId': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>}
Candidate model call
Inputs:  {'RecipeId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'Ingredients': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>}
Recipe model call
INPUTS:  {'RecipeId': <tf.Tensor 'IteratorGetNext:3' shape=(No