In [114]:
# !pip install -q tensorflow-recommenders
# !pip install -q --upgrade tensorflow-datasets
# !pip install -q scann

In [115]:
import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd
import sampling

In [116]:
import tensorflow_recommenders as tfrs

In [117]:
import pickle

# Constants 

In [118]:
RATINGS_SMALL = "../EDA_files/ratings_small.parquet"
RECIPES_SMALL = "../EDA_files/recipes_small.parquet"
INDEX_TO_RECIPE_OBJ = "../EDA_files/index_to_recipe.obj"
RECIPE_TO_INDEX_OBJ = "../EDA_files/recipe_to_index.obj"

ING_CLEAN_NO_COMMON = '../cleaned_files/ingredients_clean_without_common_words.obj'
KEYWORDS_CLEAN = '../cleaned_files/keywords_cleaned.obj'
CATEGORIES_CLEAN = '../cleaned_files/categories_cleaned.obj'
NAMES_CLEAN = '../cleaned_files/names_cleaned.obj'

RECIPES_DATA = "../dataset/recipes.parquet"

# Load data 

In [119]:
recipes_small = pd.read_parquet(RECIPES_SMALL)
ratings_small = pd.read_parquet(RATINGS_SMALL)

In [120]:
recipes = pd.read_parquet(RECIPES_DATA)

In [121]:
with open(ING_CLEAN_NO_COMMON, "rb") as input_file:
    ingredients_no_common_words = pickle.load(input_file)

with open(KEYWORDS_CLEAN, "rb") as input_file:
    keywords_clean = pickle.load(input_file)

with open(CATEGORIES_CLEAN, "rb") as input_file:
    categories_clean = pickle.load(input_file)
    
with open(NAMES_CLEAN, "rb") as input_file:
    names_clean = pickle.load(input_file)

In [122]:
ingredients_no_common_words.Ingredients = ingredients_no_common_words.Ingredients.map(lambda x: ' '.join(x))

In [123]:
keywords_clean.Keywords = keywords_clean.Keywords.map(lambda x: ' '.join(x)) 

In [124]:
recipes_clean = recipes_small.copy()
recipes_clean.drop(columns=['Ingredients', 'Keywords', 'RecipeCategory', 'Nutritions'], axis=1, inplace=True)

In [125]:
recipes_clean = recipes_clean.merge(ingredients_no_common_words, on='RecipeId')

In [126]:
recipes_clean = recipes_clean.merge(keywords_clean, on='RecipeId')
recipes_clean = recipes_clean.merge(categories_clean, on='RecipeId')

In [127]:
sample = recipes[['RecipeId', 'Calories', 'FatContent', 'SaturatedFatContent',
                                            'CholesterolContent', 'SodiumContent', 'CarbohydrateContent',
                                            'FiberContent', 'SugarContent', 'ProteinContent']]

In [128]:
recipes_clean = recipes_clean.merge(sample, on='RecipeId')

In [129]:
del(recipes)
del(sample)

In [130]:
author_min_20 = sampling.get_rating_with_min_number(ratings_small, 20, col_name='AuthorId')

# Prepare dataset

## Ratings

In [131]:
ratings_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype              
---  ------         --------------    -----              
 0   RecipeId       1401982 non-null  int32              
 1   AuthorId       1401982 non-null  int32              
 2   Rating         1401982 non-null  int32              
 3   Review         1401982 non-null  object             
 4   DateSubmitted  1401982 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int32(3), object(1)
memory usage: 37.4+ MB


In [132]:
author_min_20.AuthorId = author_min_20.AuthorId.map(lambda x: bytes(str(x), 'utf-8'))
author_min_20.RecipeId = author_min_20.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))

In [133]:
ratings_dict = author_min_20.groupby(['AuthorId', 'RecipeId'])['Rating'].sum().reset_index()

In [134]:
ratings_dict = {name: np.array(value) for name, value in ratings_dict.items()}
ratings = tf.data.Dataset.from_tensor_slices(ratings_dict)


In [135]:
ratings = ratings.map(lambda x: {'AuthorId' : x['AuthorId'], 
                                 'RecipeId' : x['RecipeId'], 
                                 'Rating' : float(x['Rating']),})

In [136]:
for x in ratings.take(10).as_numpy_iterator():
    pprint.pprint(x)

{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'109470'}
{'AuthorId': b'100026', 'Rating': 4.0, 'RecipeId': b'120462'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'120553'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'120914'}
{'AuthorId': b'100026', 'Rating': 4.0, 'RecipeId': b'121855'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'12187'}
{'AuthorId': b'100026', 'Rating': 4.0, 'RecipeId': b'127106'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'128092'}
{'AuthorId': b'100026', 'Rating': 4.0, 'RecipeId': b'129908'}
{'AuthorId': b'100026', 'Rating': 5.0, 'RecipeId': b'130320'}


## Recipes

In [137]:
recipes_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 522517 entries, 0 to 522516
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype              
---  ------               --------------   -----              
 0   RecipeId             522517 non-null  int64              
 1   Name                 522517 non-null  object             
 2   AuthorId             522517 non-null  int32              
 3   CookTimeInMinutes    522517 non-null  float64            
 4   PrepTimeInMinutes    522517 non-null  float64            
 5   TotalTimeInMinutes   522517 non-null  float64            
 6   DatePublished        522517 non-null  datetime64[ns, UTC]
 7   Description          522512 non-null  object             
 8   RecipeServings       339606 non-null  float64            
 9   RecipeInstructions   522517 non-null  object             
 10  Ingredients          522517 non-null  object             
 11  Keywords             522517 non-null  object             
 12  Re

In [138]:
features = ["RecipeId", "Name", "Keywords", "Ingredients", "FatContent", "SaturatedFatContent", "CholesterolContent", "SodiumContent",
           "CarbohydrateContent", "FiberContent","SugarContent", "ProteinContent"]

In [155]:
# items_dict = recipes_clean.groupby(['RecipeId'])["Name", "Keywords", "Ingredients", "FatContent", "SaturatedFatContent", "CholesterolContent", "SodiumContent",
#            "CarbohydrateContent", "FiberContent","SugarContent", "ProteinContent"].sum().reset_index()

recipes_clean.RecipeId = recipes_clean.RecipeId.map(lambda x: bytes(str(x), 'utf-8'))
items_dict = recipes_clean[features]
items_dict = {name: np.array(value) for name, value in items_dict.items()}
items = tf.data.Dataset.from_tensor_slices(items_dict)

In [156]:
items = items.map(lambda x: x['RecipeId'])

## Basic version - just ids

In [162]:
size = author_min_20.shape[0]
train_size = int(0.8 * size)
test_size = size - train_size

tf.random.set_seed(42)
shuffled = ratings.shuffle(size, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_size)
test = shuffled.take(train_size).take(test_size)

In [163]:
recipe_ids = items.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["AuthorId"])

unique_recipe_ids = np.unique(np.concatenate(list(recipe_ids)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [159]:
# unique_user_ids = [bytes(str(x), 'utf-8') for x in unique_user_ids]

In [160]:
# unique_recipe_ids = [bytes(str(x), 'utf-8') for x in unique_recipe_ids]

# Implementing model

## Query tower

In [145]:
embedding_dimension = 64

In [165]:
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

## Candidate tower

In [166]:
recipe_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_recipe_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_recipe_ids) + 1, embedding_dimension)
])

## Model

In [167]:
class RecipeModel(tfrs.Model):
    def __init__(self, user_model, recipe_model):
        super().__init__()
        self.recipe_model: tf.keras.Model = recipe_model
        self.user_model: tf.keras.Model = user_model
            
        metrics = tfrs.metrics.FactorizedTopK(candidates=items.batch(128).map(recipe_model))
        task = tfrs.tasks.Retrieval(metrics=metrics)
        self.task: tf.keras.layers.Layer = task
            
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        print(features)
        user_embeddings = self.user_model(features["AuthorId"])
        positive_recipe_embeddings = self.recipe_model(features["RecipeId"])
        
        return self.task(user_embeddings, positive_recipe_embeddings)

In [168]:
model = RecipeModel(user_model, recipe_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [169]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [170]:
model.fit(cached_train, epochs=3)

Epoch 1/3
{'AuthorId': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'RecipeId': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=string>, 'Rating': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=float32>}
{'AuthorId': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'RecipeId': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=string>, 'Rating': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=float32>}
 5/89 [>.............................] - ETA: 6:44:04 - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 2.4414e-05 - factorized_top_k/top_10_categorical_accuracy: 7.3242e-05 - factorized_top_k/top_50_categorical_accuracy: 3.4180e-04 - factorized_top_k/top_100_categorical_accuracy: 5.3711e-04 - loss: 73818.2719 - regularization_loss: 0.0000e+00 - total_loss: 73818.2719

KeyboardInterrupt: 