# Imports

In [1]:
import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd
import sampling

import tensorflow_recommenders as tfrs

import pickle

import datetime

from tensorflow.keras.layers import Flatten   # to flatten the input data
from tensorflow.keras.layers import Dense     # for the hidden layer

# Constants

In [2]:
# Parquet tables produced by the EDA step (paths are relative to the notebook).
RATINGS_SMALL = '../Data/EDA_files/ratings_small.parquet'
RECIPES_SMALL = '../Data/EDA_files/recipes_small.parquet'

# Pickled, pre-cleaned recipe features.
ING_CLEAN_NO_COMMON = "../Data/cleaned_files/ingredients_clean_without_common_words.obj"
KEYWORDS_CLEAN = "../Data/cleaned_files/keywords_cleaned.obj"
CATEGORIES_CLEAN = "../Data/cleaned_files/categories_cleaned.obj"
NAMES_CLEAN = "../Data/cleaned_files/names_cleaned.obj"



# Load data

In [3]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Tabular inputs.
recipes_small = pd.read_parquet(RECIPES_SMALL)
ratings_small = pd.read_parquet(RATINGS_SMALL)

# Pre-cleaned per-recipe features.
ingredients = _load_pickle(ING_CLEAN_NO_COMMON)
categories = _load_pickle(CATEGORIES_CLEAN)
names = _load_pickle(NAMES_CLEAN)

In [4]:
# Print column dtypes, non-null counts and memory usage of the loaded ratings table.
ratings_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype              
---  ------         --------------    -----              
 0   RecipeId       1401982 non-null  int32              
 1   AuthorId       1401982 non-null  int32              
 2   Rating         1401982 non-null  int32              
 3   Review         1401982 non-null  object             
 4   DateSubmitted  1401982 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int32(3), object(1)
memory usage: 37.4+ MB


## Ratings

In [5]:
# Replace the submission date with an integer Unix timestamp (whole seconds)
# and drop the columns that are not used further down.
#
# The guard makes the cell safe to re-run: once "DateSubmitted" has been
# dropped there is nothing left to convert, and running the conversion again
# would raise. A new frame is bound to the name instead of mutating in place.
if "DateSubmitted" in ratings_small.columns:
    ratings_small = (
        ratings_small
        .assign(
            Timestamp=ratings_small["DateSubmitted"].map(
                lambda submitted: int(submitted.timestamp())
            )
        )
        .drop(columns=["Rating", "Review", "DateSubmitted"])
    )

In [6]:
# Thresholds handed to sampling.get_rating_with_min_number (presumably the
# minimum number of ratings per id — confirm against the helper).
min_ratings_per_author = 20
# NOTE(review): the result is named *_min_20 but the threshold is 10 — confirm.
min_ratings_per_recipe = 10

author_min_20 = sampling.get_rating_with_min_number(
    ratings_small, min_ratings_per_author, col_name='AuthorId'
)
recipe_min_20 = sampling.get_rating_with_min_number(
    ratings_small, min_ratings_per_recipe, col_name='RecipeId'
)

# No `on=` is given, so pandas joins on every column the two frames share.
ratings_min_20 = author_min_20.merge(recipe_min_20, how='inner')

In [7]:
# Work on an independent (deep) copy so later edits do not touch ratings_min_20.
ratings_sample = ratings_min_20.copy(deep=True)

In [8]:
# Display the RecipeId column of the sampled ratings.
ratings_sample.RecipeId

0            780
1           4366
2           4807
3            810
4           5466
           ...  
441660     49088
441661     43023
441662     73866
441663     26370
441664    339905
Name: RecipeId, Length: 441665, dtype: int32

## Recipes

In [9]:
# Attach the cleaned ingredients and categories to each recipe id.
# Both joins are inner joins on "RecipeId", so only recipes present in all
# three tables are kept.
recipes_subset = (
    recipes_small[["RecipeId"]]
    .merge(ingredients, on="RecipeId", how="inner")
    .merge(categories, on="RecipeId", how="inner")
)

In [10]:
# Collapse each recipe's ingredient tokens into one space-separated string.
# Values that are already strings are left alone: str.join on a string would
# put a space between every character, which is what used to happen when this
# cell was executed a second time.
recipes_subset["Ingredients"] = recipes_subset["Ingredients"].map(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)

In [12]:
# Keep only the ratings whose recipe also has cleaned features (inner join).
merged_dataset = pd.merge(ratings_sample, recipes_subset, on="RecipeId", how="inner")

# Prepare dataset

In [13]:
def _encode_id(value):
    """Return ``value`` as UTF-8 bytes, leaving already-encoded ids untouched."""
    if isinstance(value, bytes):
        return value
    return bytes(str(value), 'utf-8')


# Encode the ids as byte strings. The isinstance guard in _encode_id keeps
# this cell re-runnable: without it a second run would encode the *repr* of
# the bytes object (e.g. b"b'780'").
merged_dataset.AuthorId = merged_dataset.AuthorId.map(_encode_id)
merged_dataset.RecipeId = merged_dataset.RecipeId.map(_encode_id)

# One NumPy array per feature column; from_tensor_slices slices them row-wise
# into a dataset whose elements are feature dicts.
ratings_dict = {
    name: np.array(column)
    for name, column in merged_dataset[
        ['AuthorId', 'RecipeId', 'Timestamp', "Ingredients", "RecipeCategory"]
    ].items()
}

# No further `map` is needed: every element already is a dict with exactly
# these five keys.
ratings = tf.data.Dataset.from_tensor_slices(ratings_dict)

In [14]:
# Pretty-print the first element of the dataset as plain NumPy / Python values.
first_examples = ratings.take(1).as_numpy_iterator()
for example in first_examples:
    pprint.pprint(example)

{'AuthorId': b'2312',
 'Ingredients': b'cayenne pepper chicken breast cumin garlic ginger lemon lemo'
                b'n juice nutmeg paprika turmeric water',
 'RecipeCategory': b'chicken breast',
 'RecipeId': b'780',
 'Timestamp': 968798976}


# Featurization

## Creating dictionaries

In [15]:
# recipe_ids_lookup = tf.keras.layers.StringLookup()
# recipe_ids_lookup.adapt(ratings.map(lambda x: x["RecipeId"]))

In [16]:
# print(f"Vocabulary: {recipe_ids_lookup.get_vocabulary()[:3]}")

## Embeddings 

### Recipe id

In [17]:
# recipe_id_embedding = tf.keras.layers.Embedding(
#                         input_dim=recipe_ids_lookup.vocabulary_size(),
#                         output_dim=32
# )

In [18]:
# recipe_id_model = tf.keras.Sequential([recipe_ids_lookup, recipe_id_embedding])

### User id

In [19]:
# user_id_lookup = tf.keras.layers.StringLookup()
# user_id_lookup.adapt(ratings.map(lambda x: x["AuthorId"]))

# user_id_embedding = tf.keras.layers.Embedding(user_id_lookup.vocab_size(), 32)
# user_id_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])

## Normalizing timestamp

In [20]:
# for x in ratings.take(3).as_numpy_iterator():
#     print(f"Timestamp: {x['Timestamp']}")

In [21]:
# timestamp_normalization = tf.keras.layers.Normalization(axis=None)

# timestamp_normalization.adapt(ratings.map(lambda x: x['Timestamp']).batch(1024))

# for x in ratings.take(3).as_numpy_iterator():
#     print(f"Normalized timestamp: {timestamp_normalization(x['Timestamp'])}")

## Discretizing the timestamp

In [22]:
# Pull every timestamp out of the dataset (in batches of 100) into one array.
timestamp_batches = ratings.map(lambda example: example["Timestamp"]).batch(100)
timestamps = np.concatenate(list(timestamp_batches))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

# 1000 evenly spaced boundaries spanning the observed timestamp range.
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)


In [23]:
# print(f"Buckets: {timestamp_buckets[:3]}")

In [24]:
# timestamp_embedding_model = tf.keras.Sequential([
#     tf.keras.layers.Discretization(timestamp_buckets.tolist()),
#     tf.keras.layers.Embedding(len(timestamp_buckets)+1, 32)
# ])

# for timestamp in ratings.take(1).map(lambda x: x["Timestamp"]).batch(1).as_numpy_iterator():
#     print(f"Timestamp embedding: {timestamp_embedding_model(timestamp)}")

## Processing text features

In [25]:
# title_text = tf.keras.layers.TextVectorization()
# title_text.adapt(recipes.map(lambda x: x['Name']).batch(1024))

In [26]:
# for row in recipes.take(1).map(lambda x: x['Name']).batch(1).as_numpy_iterator():
#     print(title_text(row))

In [27]:
# title_text.get_vocabulary()[705] + " " + title_text.get_vocabulary()[2] + " " + title_text.get_vocabulary()[60] + " " + title_text.get_vocabulary()[433] + " " + title_text.get_vocabulary()[831]

# Models

In [28]:
# Collect the AuthorId of every rating (1000 at a time) and de-duplicate.
author_id_batches = ratings.batch(1_000).map(lambda batch: batch["AuthorId"])
unique_user_ids = np.unique(np.concatenate(list(author_id_batches)))

In [29]:
# NOTE(review): `recipes` is not defined by any earlier cell in this notebook,
# so this line raises NameError on a fresh kernel. It expects a tf.data dataset
# whose elements have a "Name" feature; the `ratings` dataset built above has
# no "Name" either. TODO: build `recipes` before this cell.
unique_recipe_names = np.unique(np.concatenate(list(recipes.batch(1_000).map(lambda x: x["Name"]))))

NameError: name 'recipes' is not defined

In [None]:
# De-duplicated recipe ids, later used as the StringLookup vocabulary of the
# recipe-id embedding.
# NOTE(review): depends on `recipes`, which no earlier cell in this notebook
# defines — TODO: build that dataset first.
unique_recipe_ids = np.unique(np.concatenate(list(recipes.batch(1_000).map(lambda x: x["RecipeId"]))))

## User model

In [None]:
class UserModel(tfrs.models.Model):
    """Embeds the query-side features: author id and rating timestamp.

    For every example the output concatenates
      * a 32-d embedding of ``AuthorId``,
      * a 32-d embedding of the bucketized ``Timestamp``,
      * the normalized ``Timestamp`` as one extra column.

    Reads the notebook-level ``unique_user_ids``, ``timestamp_buckets`` and
    ``timestamps`` arrays, which must exist before the model is instantiated.
    """

    def __init__(self, verbose=False):
        """Build the lookup/embedding layers.

        Args:
            verbose: when true, print a message on construction and on every
                (traced) call.
        """
        super().__init__()
        self._verbose = verbose
        if self._verbose:
            print("USER MODEL INIT")

        # Author id -> vocabulary index -> dense vector. The "+ 1" leaves room
        # for the lookup's out-of-vocabulary index.
        author_lookup = tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None)
        author_vectors = tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32)
        self.user_embedding = tf.keras.Sequential([author_lookup, author_vectors])

        # Timestamp -> bucket index -> dense vector.
        bucketizer = tf.keras.layers.Discretization(timestamp_buckets.tolist())
        bucket_vectors = tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32)
        self.timestamp_embedding = tf.keras.Sequential([bucketizer, bucket_vectors])

        # Normalization statistics are learned from all observed timestamps.
        self.normalized_timestamp = tf.keras.layers.Normalization(axis=None)
        self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        """Return the concatenated user features for a batch.

        Args:
            inputs: mapping that provides the "AuthorId" and "Timestamp" features.
        """
        if self._verbose:
            print("User model call")
            print("INPUTS: ", inputs)

        author_part = self.user_embedding(inputs["AuthorId"])
        bucket_part = self.timestamp_embedding(inputs["Timestamp"])
        # Reshape the normalized values into a column so they can be
        # concatenated next to the two embeddings.
        scaled_part = tf.reshape(self.normalized_timestamp(inputs["Timestamp"]), (-1, 1))
        return tf.concat([author_part, bucket_part, scaled_part], axis=1)
        

In [None]:
# Stand-alone instance used below to sanity-check the user representation.
user_model = UserModel()

In [None]:
# Run one single-example batch through the user model and show the first
# three components of its representation.
for single_batch in ratings.batch(1).take(1):
    representation = user_model(single_batch)
    print(f"Representation: {representation[0, :3]}")

## Recipe model

In [None]:
class RecipeModel(tfrs.models.Model):
    """Embeds the candidate-side features: recipe id and recipe name.

    For every example the output concatenates a 32-d embedding of
    ``RecipeId`` with a 32-d averaged token embedding of ``Name``.

    Reads the notebook-level ``unique_recipe_ids`` array and ``recipes``
    dataset, which must exist before the model is instantiated.
    """

    def __init__(self, verbose=False):
        """Build the id embedding and the name text embedding.

        Args:
            verbose: when true, print a message on construction and on every
                (traced) call.
        """
        super().__init__()

        # Vocabulary cap for the name vectorizer; also the embedding table size.
        max_tokens = 10_000
        self._verbose = verbose
        if verbose:
            print("RECIPE MODEL INIT")

        # Recipe id -> vocabulary index -> dense vector. The "+ 1" leaves room
        # for the lookup's out-of-vocabulary index.
        id_lookup = tf.keras.layers.StringLookup(vocabulary=unique_recipe_ids, mask_token=None)
        id_vectors = tf.keras.layers.Embedding(len(unique_recipe_ids) + 1, 32)
        self.recipe_id_embedding = tf.keras.Sequential([id_lookup, id_vectors])

        self.name_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)

        # Name -> token ids -> token vectors (index 0 masked) -> mean over tokens.
        token_vectors = tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True)
        token_average = tf.keras.layers.GlobalAveragePooling1D()
        self.name_text_embedding = tf.keras.Sequential(
            [self.name_vectorizer, token_vectors, token_average]
        )

        # Learn the name vocabulary from the recipe names.
        self.name_vectorizer.adapt(recipes.map(lambda recipe: recipe['Name']))

    def call(self, inputs):
        """Return the concatenated recipe features for a batch.

        Args:
            inputs: mapping that provides the "RecipeId" and "Name" features.
        """
        if self._verbose:
            print("Recipe model call")
            print("INPUTS: ", inputs)

        id_part = self.recipe_id_embedding(inputs["RecipeId"])
        name_part = self.name_text_embedding(inputs["Name"])
        return tf.concat([id_part, name_part], axis=1)

In [None]:
# Stand-alone instance of the recipe feature model.
recipe_model = RecipeModel()

In [None]:
# for x in recipes.take(1).as_numpy_iterator():
# #     print(x)
#     print(recipe_model(x))

## Query model

In [None]:
class QueryModel(tf.keras.Model):
    """Query tower: ``UserModel`` features followed by a stack of dense layers."""

    def __init__(self, layer_sizes, verbose=False):
        """Build the query tower.

        Args:
            layer_sizes: list of integers; the i-th entry is the number of
                units of the i-th dense layer.
            verbose: when true, print a message on construction and on every
                (traced) call. The nested ``UserModel`` is always built with
                its default ``verbose=False``.
        """
        super().__init__()

        if verbose:
            print("Query model init")
        self._verbose = verbose

        # Raw features are embedded first ...
        self.embedding_model = UserModel()

        # ... and then passed through the dense stack: ReLU on every layer
        # except the last one, which has no activation.
        self.dense_layers = tf.keras.Sequential()
        last_index = len(layer_sizes) - 1
        for index, units in enumerate(layer_sizes):
            activation = "relu" if index < last_index else None
            self.dense_layers.add(tf.keras.layers.Dense(units, activation=activation))

    def call(self, inputs):
        """Embed ``inputs`` with the user model and apply the dense stack."""
        if self._verbose:
            print("Query model call")
            print("Input: ", inputs)
        return self.dense_layers(self.embedding_model(inputs))

## Candidate model

In [None]:
class CandidateModel(tf.keras.Model):
    """Candidate tower: ``RecipeModel`` features followed by a stack of dense layers."""

    def __init__(self, layer_sizes, verbose=False):
        """Build the candidate (recipe) tower.

        Args:
            layer_sizes: list of integers; the i-th entry is the number of
                units of the i-th dense layer.
            verbose: when true, print a message on construction and on every
                (traced) call. The nested ``RecipeModel`` is always built with
                its default ``verbose=False``.
        """
        super().__init__()
        if verbose:
            print("Candidate model init")
        self._verbose = verbose

        # Raw features are embedded first ...
        self.embedding_model = RecipeModel()

        # ... and then passed through the dense stack: ReLU on every layer
        # except the last one, which has no activation.
        self.dense_layers = tf.keras.Sequential()
        last_index = len(layer_sizes) - 1
        for index, units in enumerate(layer_sizes):
            activation = "relu" if index < last_index else None
            self.dense_layers.add(tf.keras.layers.Dense(units, activation=activation))

    def call(self, inputs):
        """Embed ``inputs`` with the recipe model and apply the dense stack."""
        if self._verbose:
            print("Candidate model call")
            print("Inputs: ", inputs)
        return self.dense_layers(self.embedding_model(inputs))

## Combined model

In [None]:
class CombinedModel(tfrs.models.Model):
    """Two-tower retrieval model: query tower, candidate tower and retrieval task.

    Reads the notebook-level ``recipes`` dataset to build the candidate set
    for the top-K metrics, so it must exist before instantiation.
    """

    def __init__(self, layer_sizes, verbose=False):
        """Build both towers and the retrieval task.

        Args:
            layer_sizes: dense layer sizes, shared by both towers.
            verbose: when true, print a message on construction and whenever
                the loss is (re)traced. Not forwarded to the towers.
        """
        super().__init__()
        if verbose:
            print("Init combined model")
        self._verbose = verbose

        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = CandidateModel(layer_sizes)

        # Top-K metrics are computed against the embeddings of all recipes.
        candidate_embeddings = recipes.batch(128).map(self.candidate_model)
        top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch.

        Args:
            features: mapping with "AuthorId", "Timestamp", "RecipeId" and
                "Name" features.
            training: metrics are computed only when this is false.
        """
        if self._verbose:
            print("Combined model compute loss")
            print("Features: ", features)

        query_inputs = {key: features[key] for key in ("AuthorId", "Timestamp")}
        query_embeddings = self.query_model(query_inputs)

        candidate_inputs = {key: features[key] for key in ("RecipeId", "Name")}
        recipe_embeddings = self.candidate_model(candidate_inputs)

        return self.task(
            query_embeddings, recipe_embeddings, compute_metrics=not training
        )

In [None]:
# 80/20 train/test split of the interaction dataset.

# Count the elements of the dataset that is actually being split. The frame it
# was built from went through an additional inner merge, so the row count of
# an upstream frame is not guaranteed to match.
size = int(ratings.cardinality().numpy())
train_size = int(0.8 * size)
test_size = size - train_size

tf.random.set_seed(42)
# reshuffle_each_iteration=False freezes the order, so take()/skip() below
# select the same, disjoint examples on every pass over the data.
shuffled = ratings.shuffle(size, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_size)
# skip() past the training examples; take(train_size).take(test_size) would
# return a subset of the training set and leak it into validation.
test = shuffled.skip(train_size).take(test_size)

cached_train = train.shuffle(1_000_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [None]:
# Number of training epochs, shared by every model trained below.
num_epochs = 300

# One 32-unit dense layer (no activation) on top of each tower.
model = CombinedModel([32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))


In [None]:
# List the dense layers stacked on top of the query tower.
model.query_model.dense_layers.layers

In [None]:
%%time
# Train the one-layer model; validation runs on every 5th epoch only.
one_layer_history = model.fit(
    x=cached_train,
    epochs=num_epochs,
    validation_data=cached_test,
    validation_freq=5,
    verbose=1,
)



In [None]:
# Validation top-100 accuracy from the last validation run.
val_top_100 = one_layer_history.history["val_factorized_top_k/top_100_categorical_accuracy"]
accuracy = val_top_100[-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

In [None]:
# "total_loss" recorded for the final epoch of the one-layer model.
one_layer_history.history["total_loss"][-1]

In [None]:
# Persist the trained one-layer weights under ./checkpoints_one_layer/.
model.save_weights('./checkpoints_one_layer/my_checkpoint')

In [None]:
# Two dense layers per tower: 64 units with ReLU, then 32 units without activation.
model_two_layer = CombinedModel([64, 32])
model_two_layer.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Same schedule as the one-layer run: validate on every 5th epoch.
two_layer_history = model_two_layer.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0)

# Validation top-100 accuracy from the last validation run.
accuracy = two_layer_history.history["val_factorized_top_k/top_100_categorical_accuracy"][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

In [None]:
# Persist the trained two-layer weights under ./checkpoints_two_layer/.
model_two_layer.save_weights('./checkpoints_two_layer/my_checkpoint')

In [None]:
# "total_loss" recorded for the final epoch of the two-layer model.
two_layer_history.history["total_loss"][-1]

In [None]:
import matplotlib.pyplot as plt

metric_key = "val_factorized_top_k/top_100_categorical_accuracy"

# Validation ran on every 5th epoch, so the i-th validation result belongs to
# epoch 5 * (i + 1).
num_validation_runs = len(one_layer_history.history[metric_key])
epochs = [5 * run for run in range(1, num_validation_runs + 1)]

for label, history in (("1 layer", one_layer_history), ("2 layers", two_layer_history)):
    plt.plot(epochs, history.history[metric_key], label=label)
plt.title("Accuracy vs epoch")
plt.xlabel("epoch")
plt.ylabel("Top-100 accuracy")
plt.legend()

In [None]:
# Show which class `fit` returned (exploratory check).
type(one_layer_history)

In [None]:
# Store only the plain `history` mapping of metric values, not the History
# object itself. `pickle` is already imported in the imports cell at the top
# of the notebook, so it is not re-imported here.
with open('../one_layer_history.obj', 'wb') as pickle_file:
    pickle.dump(one_layer_history.history, pickle_file)

In [None]:
# Three dense layers per tower: 128 and 64 units with ReLU, then 32 units
# without activation. The model class defined in this notebook is
# CombinedModel.
model_three_layer = CombinedModel([128, 64, 32])
model_three_layer.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Same schedule as the other runs: validate on every 5th epoch.
three_layer_history = model_three_layer.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0)

# Validation top-100 accuracy from the last validation run.
accuracy = three_layer_history.history["val_factorized_top_k/top_100_categorical_accuracy"][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

In [None]:
# Store the two-layer run's metric history (the plain mapping, not the History object).
two_layer_metrics = two_layer_history.history
with open('../two_layer_history.obj', 'wb') as history_file:
    pickle.dump(two_layer_metrics, history_file)

In [None]:
# Epoch indices recorded by the one-layer training run.
one_layer_history.epoch

In [None]:
# Validation top-100 accuracy of all three models over the validated epochs.
accuracy_key = "val_factorized_top_k/top_100_categorical_accuracy"
labelled_histories = (
    ("1 layer", one_layer_history),
    ("2 layers", two_layer_history),
    ("3 layers", three_layer_history),
)
for label, history in labelled_histories:
    plt.plot(epochs, history.history[accuracy_key], label=label)
plt.title("Accuracy vs epoch")
plt.xlabel("epoch")
plt.ylabel("Top-100 accuracy")
plt.legend()