In [1]:
# !pip install -q tensorflow-recommenders
# !pip install -q --upgrade tensorflow-datasets
# !pip install -q scann

In [2]:
import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd
import sampling

In [3]:
import tensorflow_recommenders as tfrs

In [4]:
import pickle

# Constants 

In [5]:
RATINGS_SMALL = "../EDA_files/ratings_small.parquet"
RECIPES_SMALL = "../EDA_files/recipes_small.parquet"
INDEX_TO_RECIPE_OBJ = "../EDA_files/index_to_recipe.obj"
RECIPE_TO_INDEX_OBJ = "../EDA_files/recipe_to_index.obj"

ING_CLEAN_NO_COMMON = '../cleaned_files/ingredients_clean_without_common_words.obj'
KEYWORDS_CLEAN = '../cleaned_files/keywords_cleaned.obj'
CATEGORIES_CLEAN = '../cleaned_files/categories_cleaned.obj'
NAMES_CLEAN = '../cleaned_files/names_cleaned.obj'

RECIPES_DATA = "../dataset/recipes.parquet"

# Load data 

In [6]:
recipes_small = pd.read_parquet(RECIPES_SMALL)
ratings_small = pd.read_parquet(RATINGS_SMALL)

In [7]:
recipes = pd.read_parquet(RECIPES_DATA)

In [8]:
with open(ING_CLEAN_NO_COMMON, "rb") as input_file:
    ingredients_no_common_words = pickle.load(input_file)

with open(KEYWORDS_CLEAN, "rb") as input_file:
    keywords_clean = pickle.load(input_file)

with open(CATEGORIES_CLEAN, "rb") as input_file:
    categories_clean = pickle.load(input_file)
    
with open(NAMES_CLEAN, "rb") as input_file:
    names_clean = pickle.load(input_file)

In [9]:
ingredients_no_common_words.Ingredients = ingredients_no_common_words.Ingredients.map(lambda x: ' '.join(x))

In [10]:
keywords_clean.Keywords = keywords_clean.Keywords.map(lambda x: ' '.join(x)) 

In [11]:
recipes_clean = recipes_small.copy()
recipes_clean.drop(columns=['Ingredients', 'Keywords', 'RecipeCategory', 'Nutritions'], axis=1, inplace=True)

In [12]:
recipes_clean = recipes_clean.merge(ingredients_no_common_words, on='RecipeId')

In [13]:
recipes_clean = recipes_clean.merge(keywords_clean, on='RecipeId')
recipes_clean = recipes_clean.merge(categories_clean, on='RecipeId')

In [14]:
sample = recipes[['RecipeId', 'Calories', 'FatContent', 'SaturatedFatContent',
                                            'CholesterolContent', 'SodiumContent', 'CarbohydrateContent',
                                            'FiberContent', 'SugarContent', 'ProteinContent']]

In [15]:
recipes_clean = recipes_clean.merge(sample, on='RecipeId')

In [16]:
del(recipes)
del(sample)

In [17]:
author_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='AuthorId')
recipe_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='RecipeId')

ratings_min_20 = author_min_20.merge(recipe_min_20, how='inner')

# Prepare dataset

## Ratings

In [18]:
ratings_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype              
---  ------         --------------    -----              
 0   RecipeId       1401982 non-null  int32              
 1   AuthorId       1401982 non-null  int32              
 2   Rating         1401982 non-null  int32              
 3   Review         1401982 non-null  object             
 4   DateSubmitted  1401982 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int32(3), object(1)
memory usage: 37.4+ MB


In [19]:
ratings_min_20.AuthorId = ratings_min_20.AuthorId.map(lambda x: bytes(str(x), 'utf-8'))
ratings_min_20.RecipeId = ratings_min_20.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))

In [20]:
ratings_dict = ratings_min_20.groupby(['AuthorId', 'RecipeId'])['Rating'].sum().reset_index()

In [21]:
ratings_dict = {name: np.array(value) for name, value in ratings_dict.items()}
ratings = tf.data.Dataset.from_tensor_slices(ratings_dict)


In [22]:
ratings = ratings.map(lambda x: {'AuthorId' : x['AuthorId'], 
                                 'RecipeId' : x['RecipeId'], 
                                 'Rating' : float(x['Rating']),})

In [23]:
for x in ratings.take(10).as_numpy_iterator():
    pprint.pprint(x)

{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'120914'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'143736'}
{'AuthorId': b'100026', 'Rating': 4.0, 'RecipeId': b'161324'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'161335'}
{'AuthorId': b'100026', 'Rating': 4.0, 'RecipeId': b'161381'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'172588'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'195437'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'33201'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'39165'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'8739'}


## Recipes

In [24]:
recipes_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 522517 entries, 0 to 522516
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype              
---  ------               --------------   -----              
 0   RecipeId             522517 non-null  int64              
 1   Name                 522517 non-null  object             
 2   AuthorId             522517 non-null  int32              
 3   CookTimeInMinutes    522517 non-null  float64            
 4   PrepTimeInMinutes    522517 non-null  float64            
 5   TotalTimeInMinutes   522517 non-null  float64            
 6   DatePublished        522517 non-null  datetime64[ns, UTC]
 7   Description          522512 non-null  object             
 8   RecipeServings       339606 non-null  float64            
 9   RecipeInstructions   522517 non-null  object             
 10  Ingredients          522517 non-null  object             
 11  Keywords             522517 non-null  object             
 12  Re

In [43]:
recipes_clean = recipes_clean[recipes_clean.RecipeId.isin(list(ratings_min_20.RecipeId))]

In [45]:
features = ["RecipeId", "Name", "Keywords", "Ingredients", "FatContent", "SaturatedFatContent", "CholesterolContent", "SodiumContent",
           "CarbohydrateContent", "FiberContent","SugarContent", "ProteinContent"]

In [46]:
# items_dict = recipes_clean.groupby(['RecipeId'])["Name", "Keywords", "Ingredients", "FatContent", "SaturatedFatContent", "CholesterolContent", "SodiumContent",
#            "CarbohydrateContent", "FiberContent","SugarContent", "ProteinContent"].sum().reset_index()

recipes_clean.RecipeId = recipes_clean.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))
items_dict = recipes_clean[features]
items_dict = {name: np.array(value) for name, value in items_dict.items()}
items = tf.data.Dataset.from_tensor_slices(items_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [47]:
items = items.map(lambda x: x['RecipeId'])

## Basic version - just ids

In [48]:
size = ratings_min_20.shape[0]
train_size = int(0.8 * size)
test_size = size - train_size

tf.random.set_seed(42)
shuffled = ratings.shuffle(size, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_size)
test = shuffled.take(train_size).take(test_size)

In [49]:
recipe_ids = items.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["AuthorId"])

unique_recipe_ids = np.unique(np.concatenate(list(recipe_ids)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [50]:
# unique_user_ids = [bytes(str(x), 'utf-8') for x in unique_user_ids]

In [51]:
# unique_recipe_ids = [bytes(str(x), 'utf-8') for x in unique_recipe_ids]

# Implementing model

## Query tower

In [52]:
embedding_dimension = 64

In [53]:
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

## Candidate tower

In [54]:
recipe_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_recipe_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_recipe_ids) + 1, embedding_dimension)
])

## Model

In [55]:
class RecipeModel(tfrs.Model):
    def __init__(self, user_model, recipe_model):
        super().__init__()
        self.recipe_model: tf.keras.Model = recipe_model
        self.user_model: tf.keras.Model = user_model
            
        metrics = tfrs.metrics.FactorizedTopK(candidates=items.batch(128).map(recipe_model))
        task = tfrs.tasks.Retrieval(metrics=metrics)
        self.task: tf.keras.layers.Layer = task
            
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
#         print(features)
        user_embeddings = self.user_model(features["AuthorId"])
        positive_recipe_embeddings = self.recipe_model(features["RecipeId"])
        
        return self.task(user_embeddings, positive_recipe_embeddings)

In [56]:
model = RecipeModel(user_model, recipe_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [57]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [59]:
model.fit(cached_train, epochs=3)

Epoch 1/3
{'AuthorId': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'RecipeId': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=string>, 'Rating': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=float32>}
{'AuthorId': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'RecipeId': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=string>, 'Rating': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=float32>}
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1e30ee31250>

In [None]:
model.evaluate(cached_test, return_dict=True)

{'AuthorId': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'RecipeId': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=string>, 'Rating': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=float32>}
