# Imports

In [1]:
import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd
import sampling

import tensorflow_recommenders as tfrs

import pickle

import datetime

# Constants

In [2]:
RATINGS_SMALL = "../EDA_files/ratings_small.parquet"
RECIPES_SMALL = "../EDA_files/recipes_small.parquet"
INDEX_TO_RECIPE_OBJ = "../EDA_files/index_to_recipe.obj"
RECIPE_TO_INDEX_OBJ = "../EDA_files/recipe_to_index.obj"

ING_CLEAN_NO_COMMON = '../cleaned_files/ingredients_clean_without_common_words.obj'
KEYWORDS_CLEAN = '../cleaned_files/keywords_cleaned.obj'
CATEGORIES_CLEAN = '../cleaned_files/categories_cleaned.obj'
NAMES_CLEAN = '../cleaned_files/names_cleaned.obj'

RECIPES_DATA = "../dataset/recipes.parquet"

# Load data

In [3]:
recipes_small = pd.read_parquet(RECIPES_SMALL)
ratings_small = pd.read_parquet(RATINGS_SMALL)

with open(RECIPE_TO_INDEX_OBJ, "rb") as input_file:
    recipe_to_index = pickle.load(input_file)

## Ratings

In [4]:
ratings_small["Timestamp"] = ratings_small.DateSubmitted.map(lambda x: int(x.timestamp()))

In [5]:
author_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='AuthorId')
recipe_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='RecipeId')

ratings_min_20 = author_min_20.merge(recipe_min_20, how='inner')

In [6]:
ratings_sample = ratings_min_20.copy()

## Recipes

In [7]:
recipes_sample = recipes_small[recipes_small.RecipeId.isin(list(ratings_min_20.RecipeId))].copy()

# Prepare dataset

## Ratings

In [8]:
ratings_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 310022 entries, 0 to 310021
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   RecipeId       310022 non-null  int32              
 1   AuthorId       310022 non-null  int32              
 2   Rating         310022 non-null  int32              
 3   Review         310022 non-null  object             
 4   DateSubmitted  310022 non-null  datetime64[ns, UTC]
 5   Timestamp      310022 non-null  int64              
dtypes: datetime64[ns, UTC](1), int32(3), int64(1), object(1)
memory usage: 13.0+ MB


In [9]:
ratings_sample.AuthorId = ratings_sample.AuthorId.map(lambda x: bytes(str(x), 'utf-8'))
ratings_sample.RecipeId = ratings_sample.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))

ratings_dict = ratings_sample[['AuthorId', 'RecipeId', 'Timestamp']]
ratings_dict = {name: np.array(value) for name, value in ratings_dict.items()}
ratings = tf.data.Dataset.from_tensor_slices(ratings_dict)


ratings = ratings.map(lambda x: {'AuthorId' : x['AuthorId'], 
                                 'RecipeId' : x['RecipeId'],
                                 'Timestamp' : x['Timestamp']})

In [10]:
for x in ratings.take(10).as_numpy_iterator():
    pprint.pprint(x)

{'AuthorId': b'2695', 'RecipeId': b'4807', 'Timestamp': 977924870}
{'AuthorId': b'2312', 'RecipeId': b'810', 'Timestamp': 978452126}
{'AuthorId': b'2695', 'RecipeId': b'12134', 'Timestamp': 979922414}
{'AuthorId': b'5523', 'RecipeId': b'2713', 'Timestamp': 981052633}
{'AuthorId': b'2312', 'RecipeId': b'8600', 'Timestamp': 984513096}
{'AuthorId': b'6702', 'RecipeId': b'536', 'Timestamp': 986232934}
{'AuthorId': b'2312', 'RecipeId': b'2886', 'Timestamp': 987584007}
{'AuthorId': b'7802', 'RecipeId': b'8782', 'Timestamp': 990008786}
{'AuthorId': b'10033', 'RecipeId': b'3748', 'Timestamp': 1027020496}
{'AuthorId': b'2178', 'RecipeId': b'5478', 'Timestamp': 990740280}


# Recipes

In [11]:
recipes_sample.RecipeId = recipes_sample.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))
recipes_dict = recipes_sample[['RecipeId', 'Name']]
recipes_dict = {name: np.array(value) for name, value in recipes_dict.items()}
recipes = tf.data.Dataset.from_tensor_slices(recipes_dict)

In [12]:
# recipe_names = recipes.map(lambda x: x["Name"])
# recipes = recipes.map(lambda x: x["RecipeId"])

In [14]:
for x in recipes.take(10).as_numpy_iterator():
    pprint.pprint(x)

{'Name': b'Warm Chicken A La King', 'RecipeId': b'44'}
{'Name': b'Buttermilk Pie', 'RecipeId': b'56'}
{'Name': b'Black Bean, Corn, and Tomato Salad', 'RecipeId': b'62'}
{'Name': b'Alfredo Sauce', 'RecipeId': b'76'}
{'Name': b'Cheesy Scalloped Potato Side Dish', 'RecipeId': b'102'}
{'Name': b'Blueberry Scones', 'RecipeId': b'116'}
{'Name': b'Champagne Punch', 'RecipeId': b'129'}
{'Name': b'Almond Fudge Banana Cake', 'RecipeId': b'142'}
{'Name': b'Amish Friendship Bread and Starter', 'RecipeId': b'153'}
{'Name': b'Light Cucumber Soup', 'RecipeId': b'155'}


In [15]:
# recipes_combined = tf.data.Dataset.from_tensor_slices(recipes_dict)

In [16]:
# for x in recipes_combined.take(10).as_numpy_iterator():
#     pprint.pprint(x)

In [99]:
ratings_merges = ratings_sample.merge(recipes_sample[["RecipeId", "Name"]], on="RecipeId", how="left")

In [101]:
ratings_merged_dict = ratings_merges[['AuthorId', 'RecipeId', 'Timestamp', 'Name']]
ratings_merged_dict = {name: np.array(value) for name, value in ratings_merged_dict.items()}
ratings_merged = tf.data.Dataset.from_tensor_slices(ratings_merged_dict)


In [102]:
ratings_merged = ratings_merged.map(lambda x: {'AuthorId' : x['AuthorId'], 
                                 'RecipeId' : x['RecipeId'],
                                 'Timestamp' : x['Timestamp'],
                                 'Name' : x['Name']})

# Featurization

## Creating dictionaries

In [17]:
recipe_ids_lookup = tf.keras.layers.StringLookup()
recipe_ids_lookup.adapt(ratings.map(lambda x: x["RecipeId"]))

In [18]:
print(f"Vocabulary: {recipe_ids_lookup.get_vocabulary()[:3]}")

Vocabulary: ['[UNK]', '45809', '27208']


## Embeddings 

### Recipe id

In [19]:
recipe_id_embedding = tf.keras.layers.Embedding(
                        input_dim=recipe_ids_lookup.vocabulary_size(),
                        output_dim=32
)

In [20]:
recipe_id_model = tf.keras.Sequential([recipe_ids_lookup, recipe_id_embedding])

### User id

In [21]:
user_id_lookup = tf.keras.layers.StringLookup()
user_id_lookup.adapt(ratings.map(lambda x: x["AuthorId"]))

user_id_embedding = tf.keras.layers.Embedding(user_id_lookup.vocab_size(), 32)
user_id_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])



## Normalizing timestamp

In [22]:
for x in ratings.take(3).as_numpy_iterator():
    print(f"Timestamp: {x['Timestamp']}")

Timestamp: 977924870
Timestamp: 978452126
Timestamp: 979922414


In [23]:
timestamp_normalization = tf.keras.layers.Normalization(axis=None)

timestamp_normalization.adapt(ratings.map(lambda x: x['Timestamp']).batch(1024))

for x in ratings.take(3).as_numpy_iterator():
    print(f"Normalized timestamp: {timestamp_normalization(x['Timestamp'])}")

Normalized timestamp: [-2.5186186]
Normalized timestamp: [-2.5129895]
Normalized timestamp: [-2.4972913]


## Discretization timestamp

In [24]:
timestamps = np.concatenate(list(ratings.map(lambda x: x["Timestamp"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)


In [25]:
print(f"Buckets: {timestamp_buckets[:3]}")

Buckets: [9.77924870e+08 9.78556604e+08 9.79188338e+08]


In [26]:
timestamp_embedding_model = tf.keras.Sequential([
    tf.keras.layers.Discretization(timestamp_buckets.tolist()),
    tf.keras.layers.Embedding(len(timestamp_buckets)+1, 32)
])

for timestamp in ratings.take(1).map(lambda x: x["Timestamp"]).batch(1).as_numpy_iterator():
    print(f"Timestamp embedding: {timestamp_embedding_model(timestamp)}")

Timestamp embedding: [[-0.01477839 -0.02660116 -0.04763311  0.00071366 -0.02035424  0.0490985
  -0.04404924 -0.03259118 -0.00255934 -0.04514308  0.03547132 -0.01090539
   0.04388595 -0.00482281 -0.01667156  0.03670093 -0.01225809 -0.00892891
  -0.00916835 -0.03770336  0.00340996  0.01761632 -0.04229828  0.04998362
  -0.00917307 -0.03694183  0.0129084   0.02816662  0.01990917  0.01294592
   0.00850926 -0.0246276 ]]


## Processing text features

In [28]:
title_text = tf.keras.layers.TextVectorization()
title_text.adapt(recipes.map(lambda x: x['Name']).batch(1024))

In [31]:
for row in recipes.take(1).map(lambda x: x['Name']).batch(1).as_numpy_iterator():
    print(title_text(row))

tf.Tensor([[705   2  60 433 831]], shape=(1, 5), dtype=int64)


In [32]:
title_text.get_vocabulary()[705] + " " + title_text.get_vocabulary()[2] + " " + title_text.get_vocabulary()[60] + " " + title_text.get_vocabulary()[433] + " " + title_text.get_vocabulary()[831]

'warm chicken a la king'

# Models

In [33]:
unique_user_ids = np.unique(np.concatenate(list(ratings.batch(1_000).map(lambda x: x["AuthorId"]))))

In [34]:
unique_recipe_names = np.unique(np.concatenate(list(recipes.batch(1_000).map(lambda x: x["Name"]))))

In [35]:
unique_recipe_ids = np.unique(np.concatenate(list(recipes.batch(1_000).map(lambda x: x["RecipeId"]))))

## User model

In [76]:
class UserModel(tfrs.models.Model):
    
    def __init__(self):
        super().__init__()
        
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32)
        ])
        
        self.timestamp_embedding = tf.keras.Sequential([
            tf.keras.layers.Discretization(timestamp_buckets.tolist()),
            tf.keras.layers.Embedding(len(timestamp_buckets)+1, 32),
        ])
        
        self.normalized_timestamp = tf.keras.layers.Normalization(axis=None)
        self.normalized_timestamp.adapt(timestamps)
        
    def call(self, inputs):
        return tf.concat([
            self.user_embedding(inputs["AuthorId"]),
            self.timestamp_embedding(inputs["Timestamp"]),
            tf.reshape(self.normalized_timestamp(inputs["Timestamp"]), (-1,1)),
        ], axis=1)
        

In [37]:
user_model = UserModel()

In [38]:
for row in ratings.batch(1).take(1):
    print(f"Representation: {user_model(row)[0, :3]}")

Representation: [ 0.04997486 -0.02538269 -0.0289157 ]


## Recipe model

In [77]:
class RecipeModel(tfrs.models.Model):
    
    def __init__(self):
        super().__init__()
        
        max_tokens = 10_000
        
        self.recipe_id_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_recipe_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_recipe_ids)+1, 32)
        ])
        
        self.name_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)
        
        self.name_text_embedding = tf.keras.Sequential([
            self.name_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D()
        ])
        
        self.name_vectorizer.adapt(recipes.map(lambda x: x['Name']))
        
    def call(self, inputs):
#         print(inputs)
        return tf.concat([
            self.recipe_id_embedding(inputs["RecipeId"])
        ], axis=1)

In [66]:
recipe_model = RecipeModel()

In [67]:
for x in recipes.take(1).as_numpy_iterator():
#     print(x)
    print(recipe_model(x))

Consider rewriting this model with the Functional API.
tf.Tensor(
[ 0.04185012  0.03064827  0.00648462  0.0319382   0.0093194  -0.00619936
  0.01752664  0.0411532   0.04666496 -0.04103626  0.01227326 -0.03131478
  0.01060254  0.03605009 -0.04726265  0.04982701  0.04543876  0.04340408
  0.00903907  0.04870263  0.04487706  0.04395784 -0.04400523 -0.00641304
 -0.04760621 -0.02025449 -0.02897121 -0.04928046 -0.03056639  0.0095516
 -0.01649749  0.03808099], shape=(32,), dtype=float32)


## Query model

In [82]:
class QueryModel(tfrs.models.Model):
    def __init__(self, layer_sizes):
        super().__init__()
        self.embedding_model = UserModel()
        self.dense_layers = tf.keras.Sequential()
        
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
            
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))
            
    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)

## Candidate model

In [83]:
class CandidateModel(tfrs.models.Model):
    def __init__(self, layer_sizes):
        super().__init__()
        self.embedding_model = RecipeModel()
        
        self.dense_layers = tf.keras.Sequential()
        
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
            
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))
            
    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)

## Combined model

In [107]:
class CombinedModel(tfrs.models.Model):
    def __init__(self, layer_sizes):
        super().__init__()
        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = CandidateModel(layer_sizes)
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=recipes.batch(128).map(self.candidate_model),
            ),
        )
        
    def compute_loss(self, features, training=False):
        print(features)
        query_embeddings = self.query_model({
            "AuthorId": features["AuthorId"],
            "Timestamp": features["Timestamp"],
        })
        
        recipe_embeddings = self.candidate_model({
            "RecipeId": features["RecipeId"],
            "Name": features["Name"]
        })
        
        return self.task(
            query_embeddings, recipe_embeddings, compute_metrics=not training)

In [108]:
size = ratings_min_20.shape[0]
train_size = int(0.8 * size)
test_size = size - train_size

tf.random.set_seed(42)
shuffled = ratings_merged.shuffle(size, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_size)
test = shuffled.take(train_size).take(test_size)

cached_train = train.shuffle(1_000_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [109]:
num_epochs = 300

model = CombinedModel([32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

one_layer_history = model.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0)

accuracy = one_layer_history.history["val_factorized_top_k/top_100_categorical_accuracy"][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

{'AuthorId': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'RecipeId': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=string>, 'Timestamp': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=int64>, 'Name': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>}


ValueError: in user code:

    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "C:\anaconda\envs\RecSys\lib\site-packages\tensorflow_recommenders\models\base.py", line 75, in train_step
        gradients = tape.gradient(total_loss, self.trainable_variables)
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\base_layer.py", line 2308, in trainable_variables
        return self.trainable_weights
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\training.py", line 2104, in trainable_weights
        trainable_variables += trackable_obj.trainable_variables
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\base_layer.py", line 2308, in trainable_variables
        return self.trainable_weights
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\training.py", line 2104, in trainable_weights
        trainable_variables += trackable_obj.trainable_variables
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\base_layer.py", line 2308, in trainable_variables
        return self.trainable_weights
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\training.py", line 2104, in trainable_weights
        trainable_variables += trackable_obj.trainable_variables
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\base_layer.py", line 2308, in trainable_variables
        return self.trainable_weights
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\training.py", line 2099, in trainable_weights
        self._assert_weights_created()
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\sequential.py", line 471, in _assert_weights_created
        super(functional.Functional, self)._assert_weights_created()  # pylint: disable=bad-super-call
    File "C:\anaconda\envs\RecSys\lib\site-packages\keras\engine\training.py", line 2736, in _assert_weights_created
        raise ValueError(f'Weights for model {self.name} have not yet been '

    ValueError: Weights for model sequential_52 have not yet been created. Weights are created when the Model is first called on inputs or `build()` is called with an `input_shape`.
