In [None]:
from typing import Dict, Text, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [2]:
import os
# Set the TF_ENABLE_ONEDNN_OPTS environment variable to '0' for this process.
# NOTE(review): TensorFlow is already imported in the first cell; this flag is
# presumably read when TensorFlow is loaded, so it may need to be set before
# `import tensorflow` to take effect — confirm.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

Let's pretend we have 3 users:
1. User 1 is a vegetarian who is lactose intolerant. 🥝🍅
2. User 2 is an athlete who prefers meat dishes to vegetable dishes. 🍖🥩
3. User 3 has no special preferences and eats mostly everything. 🍲🍠

In [3]:
# Example user profiles. 'prefers' and 'hates' are always lists of tag strings
# (empty when the user has no preference), so every profile can be processed
# the same way; a bare string would be iterated character by character and
# would match substrings with `in`.
user1 = {
    'id': 'clr',
    'prefers': ['vegetables', 'vegan', 'nomeat', 'vegetarian'],
    'hates': ['meat', 'lactose'],
}
user2 = {'id': 'mrt', 'prefers': ['meat'], 'hates': ['vegetables']}
user3 = {'id': 'stl', 'prefers': [], 'hates': []}

We will additionally enter the factors Calories, Protein, Fat, Carbohydrates for our requirements. **(The numbers were taken at random and do not reflect proportions or recommendations - the numbers are just an example)**
* User1 - 2000/100/60/120
* User2 - 2600/150/100/328
* User3 - 2200/80/50/100

In [4]:
def set_requirements(user, calories, proteins, fats, carbs):
  """Store the nutrition targets on a user profile, in place.

  Writes the 'calories', 'proteins', 'fats' and 'carbs' keys of `user`
  (overwriting existing values) and returns None.
  """
  targets = {
      'calories': calories,
      'proteins': proteins,
      'fats': fats,
      'carbs': carbs,
  }
  for key, value in targets.items():
    user[key] = value
# Nutrition targets per example user (illustrative numbers only).
set_requirements(user1, calories=2000, proteins=100, fats=60, carbs=120)
set_requirements(user2, calories=2600, proteins=150, fats=100, carbs=328)
set_requirements(user3, calories=2200, proteins=80, fats=50, carbs=100)


# Data processing 📊

Load the recipes and the user interactions. Nutrition values are listed as: calories (#), total fat (PDV), sugar (PDV), sodium (PDV), protein (PDV), saturated fat (PDV), and carbohydrates (PDV).

In [None]:
# Read the user-recipe interaction table and the recipe table from local CSV files.
raw_interactions = pd.read_csv('./sample_data/RAW_interactions.csv')
raw_recipes = pd.read_csv('./sample_data/RAW_recipes.csv')
# Preview the first recipe rows.
raw_recipes.head()

Merge the interaction data with the recipe information

In [None]:
# Attach each recipe's name and tags to the interactions that reference it.
# A left join keeps every interaction, even when its recipe_id has no match in
# raw_recipes (those rows get NaN in the recipe columns). Only the recipe
# columns that survive the final selection are merged in, and the rename
# returns a new frame instead of mutating one in place.
interactions_with_recipe_info = (
    raw_interactions
    .merge(
        raw_recipes[['id', 'name', 'tags']],
        left_on='recipe_id',
        right_on='id',
        how='left',
    )
    .loc[:, ['user_id', 'rating', 'name', 'tags']]
    .rename(columns={'name': 'recipe_name'})
)
interactions_with_recipe_info.head()

Formatting data for Datasets

In [7]:
# Cast user ids and recipe names to strings and ratings to float32 in one call.
interactions_with_recipe_info = interactions_with_recipe_info.astype({
    'user_id': 'str',
    'rating': np.float32,
    'recipe_name': 'str',
})

Creating datasets

In [9]:

# Interaction dataset: one {"user_id", "recipe_name"} pair of strings per
# row of interactions_with_recipe_info.
ratings = tf.data.Dataset.from_tensor_slices({
    "user_id": tf.cast(interactions_with_recipe_info['user_id'].values, tf.string),
    "recipe_name": tf.cast(interactions_with_recipe_info['recipe_name'].values, tf.string),
})

In [None]:
# Peek at a single interaction record to check the element structure.
for sample in ratings.take(1).as_numpy_iterator():
    print(sample)

In [11]:
# Cast the recipe id and the text columns to strings in one call.
raw_recipes = raw_recipes.astype({
    'id': 'str',
    'name': 'str',
    'tags': 'str',
    'ingredients': 'str',
})

In [12]:
# Candidate dataset built from raw_recipes: one {'recipe_id', 'recipe_name'}
# pair of strings per recipe. The ingredients column is not included.
recipes = tf.data.Dataset.from_tensor_slices({
    'recipe_id': tf.cast(raw_recipes['id'].values, tf.string),
    "recipe_name": tf.cast(raw_recipes['name'].values, tf.string),
})

In [None]:
# Peek at a single recipe record to check the element structure.
for sample in recipes.take(1).as_numpy_iterator():
  print(sample)

In [14]:
# Recipe names as a dataset of shape-(1,) string tensors, one per recipe.
# The former `.map(lambda x: x)` was an identity stage and has been dropped;
# the elements are unchanged.
recipe_names = tf.data.Dataset.from_tensor_slices(
    tf.cast(raw_recipes['name'].values.reshape(-1, 1), tf.string)
)

# Towers 🗼

For both towers, we set the dimensionality of the query and candidate representations to **32**. Higher values will correspond to models that may be more accurate, but will also be slower to fit and more prone to overfitting.

In [16]:
# Size of each embedding vector; passed to the Embedding layer of both the
# user-id model and the recipe-name model.
embedding_dimension = 32

## User tower 👷

Let's start with the user tower. The user embedding is computed from the user ID.

### User ID model

In [None]:
# Vocabulary for the user-id lookup layer: every distinct user id.
unique_user_ids = pd.unique(interactions_with_recipe_info["user_id"])
unique_user_ids[:10]

In [None]:
# User tower: map a raw user-id string to an integer index, then to an embedding.
user_id_model = tf.keras.Sequential()
user_id_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None)
)
# The table has one more row than the vocabulary
# (presumably for the lookup layer's out-of-vocabulary index).
user_id_model.add(
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
)

### User Prefers models

## Recipe tower 🌭

### Recipe name model

In [19]:
# Vocabulary for the recipe-name lookup layer: every distinct recipe name.
unique_recipe_names = pd.unique(raw_recipes["name"])

In [22]:
# Recipe tower: map a recipe-name string to an integer index, then to an embedding.
recipe_name_model = tf.keras.Sequential()
recipe_name_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_recipe_names, mask_token=None)
)
# The table has one more row than the vocabulary
# (presumably for the lookup layer's out-of-vocabulary index).
recipe_name_model.add(
    tf.keras.layers.Embedding(len(unique_recipe_names) + 1, embedding_dimension)
)

## Combine models

In [26]:
class RecipeAndUserModel(tfrs.Model):
    """Two-tower retrieval model pairing a user-id tower with a recipe-name tower.

    Each tower maps its input feature to an embedding. The loss comes from a
    `tfrs.tasks.Retrieval` task over the two embeddings, whose
    `FactorizedTopK` metrics are computed against every recipe in the
    module-level `recipes` dataset.
    """

    def __init__(self, recipe_name_model, user_id_model):
        """Store both towers and build the retrieval task.

        Args:
            recipe_name_model: Model mapping a batch of recipe names to embeddings.
            user_id_model: Model mapping a batch of user ids to embeddings.
        """
        super().__init__()

        self.user_model = user_id_model
        self.recipe_name_model = recipe_name_model
        # Candidate embeddings for the top-K metrics: every recipe name in the
        # module-level `recipes` dataset, embedded by the recipe tower.
        self.candidates = recipes.batch(128).map(
            lambda x: recipe_name_model(x['recipe_name'])
        )
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=self.candidates)
        )

    def _reduce_mean_if_needed(self, embedding):
        """Average over axis 1 when `embedding` has rank 3 or more; otherwise return it unchanged."""
        if len(embedding.shape) >= 3:
            return tf.reduce_mean(embedding, axis=1)
        return embedding

    def call(self, features: Dict[Text, tf.Tensor]) -> Tuple[tf.Tensor, tf.Tensor]:
        """Embed one batch of interactions.

        Args:
            features: Mapping that contains the 'user_id' and 'recipe_name' tensors.

        Returns:
            A `(user_embedding, recipe_embeddings)` tuple.
        """
        user_embedding = self.user_model(features['user_id'])
        recipe_embeddings = self.recipe_name_model(features['recipe_name'])

        return user_embedding, recipe_embeddings

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch.

        Args:
            features: Mapping that contains the 'user_id' and 'recipe_name' tensors.
            training: True when called from a training step. The top-K metrics
                are then skipped and are only computed during evaluation.

        Returns:
            The loss tensor produced by the retrieval task.
        """
        user_embeddings, recipe_embeddings = self(features)
        # FactorizedTopK scores each query against the full candidate set, so
        # computing it on every training batch dominates training time.
        return self.task(
            user_embeddings,
            recipe_embeddings,
            compute_metrics=not training,
        )


In [32]:
# Randomly shuffle data and split between train and test.
tf.random.set_seed(42)

train_size = 200_000
test_size = 100_000

# Use a shuffle buffer that covers the whole dataset. A smaller buffer only
# mixes rows inside a sliding window, so the first train_size + test_size
# shuffled elements would all come from the beginning of the interaction table.
# reshuffle_each_iteration=False keeps the order fixed across iterations, so
# the take/skip split below never lets train and test overlap.
shuffled = ratings.shuffle(len(ratings), seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_size)
test = shuffled.skip(train_size).take(test_size)

# Batch once and cache the batches so later epochs reuse them.
cached_train = train.batch(4_000).cache()
cached_test = test.batch(15_000).cache()

In [None]:
model = RecipeAndUserModel(recipe_name_model, user_id_model)
# Stop once the training loss has failed to improve by at least 0.001 for
# 5 consecutive epochs.
early_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, min_delta=0.001)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
# The callback only takes effect when it is handed to fit().
model.fit(cached_train, epochs=10, callbacks=[early_callback])

In [None]:
# Evaluate on the cached test batches; return_dict=True returns the results
# as a dict keyed by metric name.
model.evaluate(cached_test, return_dict=True)

Save model

In [None]:
# Build a brute-force retrieval index that takes raw user ids as queries
# (embedded by the trained user tower) and scores them against every recipe.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# Index (recipe_id, recipe embedding) pairs in a single pass over the recipes,
# instead of zipping two separately batched copies of the same dataset.
index.index_from_dataset(
  recipes.batch(100).map(
    lambda x: (x['recipe_id'], model.recipe_name_model(x['recipe_name']))
  )
)
# Get recommendations; the identifiers returned are the indexed recipe ids.
_, titles2 = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles2[0, :3]}")

In [None]:
# Export the retrieval index.
# NOTE(review): tempfile is no longer used by this cell (the temporary
# directory was created but never written to); the import is kept in case
# other cells rely on it.
import tempfile

# The index is saved to, and reloaded from, a directory next to the notebook.
export_path = "./model"

# Save the index.
tf.saved_model.save(index, export_path)

# Load it back; can also be done in TensorFlow Serving.
loaded = tf.saved_model.load(export_path)

# Pass a user id in, get the top predicted recipe ids back.
scores, titles = loaded(["1293707"])
print(f"Recommendations: {titles[0][:3]}")

### Making predictions

In [None]:
# Get recommendations for one user. The printed label is built from the
# queried id so the two cannot disagree.
query_user_id = "1293707"
_, titles = index(tf.constant([query_user_id]))
print(f"Recommendations for user {query_user_id}: {titles[0, :3]}")