# Imports

In [2]:
import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd
import sampling

import tensorflow_recommenders as tfrs

import pickle

# Constants

In [3]:
RATINGS_SMALL = "../EDA_files/ratings_small.parquet"
RECIPES_SMALL = "../EDA_files/recipes_small.parquet"
INDEX_TO_RECIPE_OBJ = "../EDA_files/index_to_recipe.obj"
RECIPE_TO_INDEX_OBJ = "../EDA_files/recipe_to_index.obj"

ING_CLEAN_NO_COMMON = '../cleaned_files/ingredients_clean_without_common_words.obj'
KEYWORDS_CLEAN = '../cleaned_files/keywords_cleaned.obj'
CATEGORIES_CLEAN = '../cleaned_files/categories_cleaned.obj'
NAMES_CLEAN = '../cleaned_files/names_cleaned.obj'

RECIPES_DATA = "../dataset/recipes.parquet"

# Load data

In [4]:
recipes_small = pd.read_parquet(RECIPES_SMALL)
ratings_small = pd.read_parquet(RATINGS_SMALL)

with open(RECIPE_TO_INDEX_OBJ, "rb") as input_file:
    recipe_to_index = pickle.load(input_file)

## Ratings

In [5]:
author_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='AuthorId')
recipe_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='RecipeId')

ratings_min_20 = author_min_20.merge(recipe_min_20, how='inner')

In [6]:
ratings_sample = ratings_min_20.copy()

## Recipes

In [7]:
recipes_sample = recipes_small[recipes_small.RecipeId.isin(list(ratings_min_20.RecipeId))].copy()

# Prepare dataset

## Ratings

In [8]:
ratings_sample.AuthorId = ratings_sample.AuthorId.map(lambda x: bytes(str(x), 'utf-8'))
ratings_sample.RecipeId = ratings_sample.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))

ratings_dict = ratings_sample.groupby(['AuthorId', 'RecipeId'])['Rating'].sum().reset_index()

ratings_dict = {name: np.array(value) for name, value in ratings_dict.items()}
ratings = tf.data.Dataset.from_tensor_slices(ratings_dict)


ratings = ratings.map(lambda x: {'AuthorId' : x['AuthorId'], 
                                 'RecipeId' : x['RecipeId'],
                                'Rating' : x['Rating']})

In [9]:
for x in ratings.take(10).as_numpy_iterator():
    pprint.pprint(x)

{'AuthorId': b'100026', 'Rating': 5, 'RecipeId': b'120914'}
{'AuthorId': b'100026', 'Rating': 5, 'RecipeId': b'143736'}
{'AuthorId': b'100026', 'Rating': 4, 'RecipeId': b'161324'}
{'AuthorId': b'100026', 'Rating': 5, 'RecipeId': b'161335'}
{'AuthorId': b'100026', 'Rating': 4, 'RecipeId': b'161381'}
{'AuthorId': b'100026', 'Rating': 5, 'RecipeId': b'172588'}
{'AuthorId': b'100026', 'Rating': 5, 'RecipeId': b'195437'}
{'AuthorId': b'100026', 'Rating': 5, 'RecipeId': b'33201'}
{'AuthorId': b'100026', 'Rating': 5, 'RecipeId': b'39165'}
{'AuthorId': b'100026', 'Rating': 5, 'RecipeId': b'8739'}


## Recipes

In [10]:
recipes_sample.RecipeId = recipes_sample.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))
recipes_dict = recipes_sample[['RecipeId']]
recipes_dict = {name: np.array(value) for name, value in recipes_dict.items()}
recipes = tf.data.Dataset.from_tensor_slices(recipes_dict)

In [11]:
recipes = recipes.map(lambda x: x['RecipeId'])

In [12]:
for x in recipes.take(10).as_numpy_iterator():
    pprint.pprint(x)

b'44'
b'56'
b'62'
b'76'
b'102'
b'116'
b'129'
b'142'
b'153'
b'155'


# Split

In [13]:
size = ratings_min_20.shape[0]
train_size = int(0.8 * size)
test_size = size - train_size

tf.random.set_seed(42)
shuffled = ratings.shuffle(size, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_size)
test = shuffled.take(train_size).take(test_size)

In [14]:
recipe_ids = ratings.batch(1_000_000).map(lambda x: x['RecipeId'])
user_ids = ratings.batch(1_000_000).map(lambda x: x["AuthorId"])

unique_recipe_ids = np.unique(np.concatenate(list(recipe_ids)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# Model

In [24]:
class RankingModel(tf.keras.Model):
    
    def __init__(self):
        super().__init__()
        embedding_dimension = 32
        
        self.user_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids)+1, embedding_dimension)
        ])
        
        self.recipe_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_recipe_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_recipe_ids)+1, embedding_dimension)
        ])
        
        # Compute predictions
        self.ratings = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(1)
        ])
        
    def call(self, inputs):
        
        user_id, recipe_id = inputs
        
        user_embedding = self.user_embeddings(user_id)
        recipe_embedding = self.recipe_embeddings(recipe_id)
        
        return self.ratings(tf.concat([user_embedding, recipe_embedding], axis=1))

In [20]:
ratings_sample.head(1)

Unnamed: 0,RecipeId,AuthorId,Rating,Review,DateSubmitted
0,b'4807',b'2695',2,"I'm sorry, but I tried this method for my Chri...",2000-12-27 13:47:50+00:00


In [25]:
RankingModel()(([b'4807'], [b'44']))

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.02070185]], dtype=float32)>

In [26]:
task = tfrs.tasks.Ranking(
    loss =tf.keras.losses.MeanSquaredError(),
    metrics = [tf.keras.metrics.RootMeanSquaredError()])

In [27]:
class RecipeModel(tfrs.models.Model):
    
    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel()
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss = tf.keras.losses.MeanSquaredError(),
            metrics = [tf.keras.metrics.RootMeanSquaredError()])
            
        
    def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
        return self.ranking_model((features["AuthorId"], features["RecipeId"]))
    
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor: 
        labels = features.pop("Rating")
        rating_predictions = self(features)
        
        return self.task(labels=labels, predictions=rating_predictions)

In [29]:
model = RecipeModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [30]:
cached_train = train.shuffle(1_000_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [31]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2894bcf6cd0>

In [32]:
model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 0.8940971493721008,
 'loss': 0.7058691382408142,
 'regularization_loss': 0,
 'total_loss': 0.7058691382408142}