In [1]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import tensorflow_recommenders as tfrs




In [2]:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

Let's pretend we have 3 users:
1. User 1 is a vegetarian who is lactose intolerant. 🥝🍅
2. Unser 2 is an athlete who prefers meat dishes to vegetable dishes. 🍖🥩
3. Unser 3 - has no special preferences, eats mostly everything. 🍲🍠

In [147]:
user1 = {'id':'1293707', 'prefers': ['30-minutes-or-less',
  'time-to-make',
  'course',
  'preparation',
  'occasion',
  'for-large-groups',
  'low-protein',
  'healthy',
  '5-ingredients-or-less',
  'breads',
  'lunch'], 'hates': ['meat', 'lactose']}
user2 = {'id':'8937', 'prefers': ['meat', 'beaf'], 'hates': ['vegetables']}
user3 = {'id':'57222', 'prefers': [], 'hates': []}

We will additionally enter the factors Calories, Protein, Fat, Carbohydrates for our requirements. **(The numbers were taken at random and do not reflect proportions or recommendations - the numbers are just an example)**
* User1 - 2000/100/60/120
* User2, 2600/150/100/328.
* User3 - 2200/80/50/100

In [148]:
def set_requirements(user, calories, proteins, fats, carbs):
  user['cal'] = calories
  user['proteins'] = proteins
  user['fat'] = fats
  user['carbs'] = carbs
set_requirements(user1, 2600, 120, 70, 120)
set_requirements(user2, 2600, 150, 100, 328)
set_requirements(user3, 2200, 80, 50, 100)


# Data processing 📊

Loading recipes and user_interactions. Nutritions showed like:calories (#), total fat (PDV), sugar (PDV) , sodium (PDV) , protein (PDV) , saturated fat (PDV) , and carbohydrates (PDV)

In [2]:
raw_recipes = pd.read_csv('./sample_data/RAW_recipes.csv')
raw_interactions = pd.read_csv('./sample_data/RAW_interactions.csv')
raw_recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


Merge data interactions with recipe infos

In [3]:
interactions_with_recipe_info = pd.merge(raw_interactions, raw_recipes[['name', 'tags', 'ingredients', 'id']], left_on='recipe_id', right_on='id', how='left')
interactions_with_recipe_info = interactions_with_recipe_info[['user_id', 'rating', 'name', 'tags']]
interactions_with_recipe_info.rename(columns={'name': 'recipe_name'}, inplace=True)
interactions_with_recipe_info.head()

Unnamed: 0,user_id,rating,recipe_name,tags
0,38094,4,white bean green chile pepper soup,"['weeknight', 'time-to-make', 'course', 'main-..."
1,1293707,5,white bean green chile pepper soup,"['weeknight', 'time-to-make', 'course', 'main-..."
2,8937,4,devilicious cookie cake delights,"['30-minutes-or-less', 'time-to-make', 'course..."
3,126440,5,baked potato toppings,"['15-minutes-or-less', 'time-to-make', 'course..."
4,57222,5,baked potato toppings,"['15-minutes-or-less', 'time-to-make', 'course..."


Formatting data for Datasets

In [4]:
interactions_with_recipe_info['user_id'] = interactions_with_recipe_info.user_id.astype("str")
interactions_with_recipe_info['rating'] = interactions_with_recipe_info.rating.astype(np.float32)
interactions_with_recipe_info['recipe_name'] = interactions_with_recipe_info.recipe_name.astype("str")

Creating datasets

In [5]:

# Шаг 2: Применение слоя в конвейере обработки данных
ratings = tf.data.Dataset.from_tensor_slices((
    tf.cast(interactions_with_recipe_info['user_id'].values, tf.string),
    tf.cast(interactions_with_recipe_info['recipe_name'].values, tf.string),
)).map(lambda x, x1: {
    "user_id": x,
    "recipe_name": x1,
})

In [6]:
for data in ratings.take(1).as_numpy_iterator():
    print(data)

{'user_id': b'38094', 'recipe_name': b'white bean   green chile pepper soup'}


In [7]:
raw_recipes['id'] = raw_recipes.id.astype("str")
raw_recipes['name'] = raw_recipes.name.astype("str")
raw_recipes['tags'] = raw_recipes.tags.astype("str")
raw_recipes['ingredients'] = raw_recipes.ingredients.astype("str")

In [8]:
# prompt: Создай на основе raw_recipes датасет, но учти что ingredients_str это массив

recipes = tf.data.Dataset.from_tensor_slices((tf.cast(raw_recipes['id'].values, tf.string),
                                              tf.cast(raw_recipes['name'].values, tf.string),
                                            #   tf.cast(raw_recipes['ingredients_str'].values.reshape(-1, 1), tf.string),
                                              )
                                             ).map(lambda x, x1: {
                                                 'recipe_id': x,
                                                  "recipe_name": x1,

                                              })

In [9]:
for data in recipes.take(1).as_numpy_iterator():
  print(data)

{'recipe_id': b'137739', 'recipe_name': b'arriba   baked winter squash mexican style'}


In [10]:
recipe_names = tf.data.Dataset.from_tensor_slices((tf.cast(raw_recipes['name'].values.reshape(-1, 1), tf.string))).map(lambda x: x)

# Towers 🗼

For our towers set dimensionality of the query and candidate representations: **32**. Higher values will correspond to models that may be more accurate, but will also be slower to fit and more prone to overfitting.

In [14]:
embedding_dimension = 32

## User tower 👷

Lets start creating our towers with User towers. We will compute by User id

### User ID model

In [15]:
unique_user_ids = interactions_with_recipe_info["user_id"].unique()
unique_user_ids[:10]

array(['38094', '1293707', '8937', '126440', '57222', '52282', '124416',
       '2000192946', '76535', '273745'], dtype=object)

In [16]:
user_id_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token='new_user'),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])




### User Prefers models

## Recipe tower 🌭

### Recipe name model

In [17]:
unique_recipe_names = raw_recipes["name"].unique()

In [18]:
recipe_name_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_recipe_names, mask_token=None),
            tf.keras.layers.Embedding(len(unique_recipe_names) + 1, embedding_dimension)
        ])

## Combine models

In [19]:
class RecipeAndUserModel(tfrs.Model):

    def _reduce_mean_if_needed(self, embedding):

        if len(embedding.shape) >= 3:
            return tf.reduce_mean(embedding, axis=1)
        return embedding

    def __init__(self, recipe_name_model, user_id_model):
        super().__init__()

        self.user_model = user_id_model
        self.recipe_name_model = recipe_name_model
        self.candidates = recipes.batch(128).map( lambda x: recipe_name_model(x['recipe_name']))
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=self.candidates
            )
        )
        
    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        
        user_embedding = self.user_model(features['user_id'])
        recipe_embeddings = self.recipe_name_model(features['recipe_name'])
        
        return user_embedding, recipe_embeddings

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        user_embeddings, recipe_embeddings = self(features)
        return self.task(user_embeddings, recipe_embeddings)


In [20]:
# Randomly shuffle data and split between train and test.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(200_000)
test = shuffled.skip(len(train)).take(100_000)

cached_train =  train.batch(4_000).cache()
cached_test = test.batch(15_000).cache()

In [21]:
model = RecipeAndUserModel(recipe_name_model, user_id_model)
early_callback = tf.keras.callbacks.EarlyStopping(monitor='loss',  patience=5, min_delta=0.001)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model.fit(cached_train, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x12e1921f850>

In [26]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 2.9999999242136255e-05,
 'factorized_top_k/top_5_categorical_accuracy': 0.00046999999904073775,
 'factorized_top_k/top_10_categorical_accuracy': 0.001019999966956675,
 'factorized_top_k/top_50_categorical_accuracy': 0.004110000096261501,
 'factorized_top_k/top_100_categorical_accuracy': 0.007029999978840351,
 'loss': 102549.1171875,
 'regularization_loss': 0,
 'total_loss': 102549.1171875}

Save model

In [27]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((recipes.batch(100).map(lambda x: x['recipe_id']), recipes.batch(100).map(lambda x: x['recipe_name']).map(model.recipe_name_model)))
)
# Get recommendations.
_, titles2 = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles2[0, :3]}")

Cause: could not parse the source code of <function <lambda> at 0x0000012E70F1B130>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x: x['recipe_id']

Match 1:
lambda x: x['recipe_name']



Cause: could not parse the source code of <function <lambda> at 0x0000012E70F1B130>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x: x['recipe_id']

Match 1:
lambda x: x['recipe_name']



Cause: could not parse the source code of <function <lambda> at 0x0000012E70F1B130>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x: x['recipe_id']

Match 1:
lambda x: x['recipe_name']

Cause: could not parse the source code of <function <lambda> at 0x0000012E735E11B0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x: x['recipe_id']

Match 1:
lambda x: x['recipe_name']



Cause: could not parse the source code of <function <lambda> at 0x0000012E735E11B0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x: x['recipe_id']

Match 1:
lambda x: x['recipe_name']



Cause: could not parse the source code of <function <lambda> at 0x0000012E735E11B0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x: x['recipe_id']

Match 1:
lambda x: x['recipe_name']

Recommendations for user 42: [b'146320' b'95220' b'120444']


In [29]:
tf.saved_model.save(index, "./model2")









INFO:tensorflow:Assets written to: ./model2\assets


INFO:tensorflow:Assets written to: ./model2\assets


Recommendations: [b'39446' b'204257' b'134316']


In [70]:
 # Load it back; can also be done in TensorFlow Serving.
loaded = tf.saved_model.load("./model2")
# Pass a user id in, get top predicted movie titles back.
scores, titles = loaded(["1293707"])
print(f"Recommendations: ")

Recommendations: 


### Making predictions

In [71]:
# Get recommendations.
_, titles = loaded([user1['id']])
print(f"Recommendations for user 42: {titles[0, :3]}")

Recommendations for user 42: [b'39446' b'204257' b'134316']


### Processing results 🏁

In [66]:
loaded

<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject at 0x2211b236cb0>

In [56]:
import ast

def convert_to_list(data_str):
  try:
    return ast.literal_eval(data_str)
  except (SyntaxError, ValueError):
    return []

In [91]:
selected_recipes = raw_recipes[raw_recipes['id'].isin(titles.numpy().flatten().astype('str'))][['id','name','tags', 'nutrition']]
selected_recipes

Unnamed: 0,id,name,tags,nutrition
23754,510874,biscuits gravy breakfast casserole,"['60-minutes-or-less', 'time-to-make', 'course...","[613.7, 61.0, 17.0, 85.0, 49.0, 72.0, 12.0]"
111026,132397,indian lentil soup dal shorva,"['60-minutes-or-less', 'time-to-make', 'course...","[498.9, 25.0, 24.0, 22.0, 58.0, 13.0, 20.0]"
117239,156331,kfc coleslaw copycat recipe by todd wilbur,"['15-minutes-or-less', 'time-to-make', 'course...","[81.4, 0.0, 59.0, 9.0, 3.0, 1.0, 6.0]"
118767,39446,koshari,"['60-minutes-or-less', 'time-to-make', 'course...","[359.9, 9.0, 20.0, 0.0, 28.0, 4.0, 20.0]"
161923,204257,polenta lasagna with feta and kale,"['60-minutes-or-less', 'time-to-make', 'course...","[193.7, 20.0, 23.0, 24.0, 13.0, 28.0, 4.0]"
171095,2941,raspberry sherbet punch,"['15-minutes-or-less', 'time-to-make', 'course...","[103.2, 1.0, 79.0, 1.0, 1.0, 2.0, 7.0]"
171258,52491,ratatouille,"['weeknight', 'time-to-make', 'course', 'main-...","[113.8, 1.0, 53.0, 7.0, 10.0, 1.0, 8.0]"
218616,134316,turnip and carrot mash,"['60-minutes-or-less', 'time-to-make', 'course...","[78.3, 0.0, 38.0, 6.0, 4.0, 0.0, 6.0]"
228148,204013,workday borscht vegetarian crock pot,"['course', 'main-ingredient', 'cuisine', 'prep...","[98.9, 0.0, 18.0, 2.0, 6.0, 0.0, 7.0]"
230850,25509,zucchini pasta with fresh tomato sauce,"['30-minutes-or-less', 'time-to-make', 'course...","[160.6, 17.0, 33.0, 14.0, 8.0, 8.0, 4.0]"


In [92]:
selected_recipes['tags'] = selected_recipes['tags'].apply(convert_to_list)
selected_recipes['nutrition'] = selected_recipes['nutrition'].apply(convert_to_list)

Layers

In [59]:
unique_tags = set()
for tags in raw_recipes['tags'].apply(convert_to_list):
  if isinstance(tags, list):
    for tag in tags:
      unique_tags.add(tag)
unique_tags = np.unique(list(unique_tags))


In [60]:
MAX_TAG_LENGTH = len(selected_recipes['tags'].max())
PADDING_VALUE = ""

selected_recipes['tags'] = [
    (tags + [PADDING_VALUE] * (MAX_TAG_LENGTH - len(tags)))[:MAX_TAG_LENGTH] 
    for tags in selected_recipes['tags']
]

In [61]:
from tensorflow.keras.layers import TextVectorization
vectorizer = TextVectorization(output_mode='int')
vectorizer.adapt(unique_tags)
embedding_layer = tf.keras.layers.Embedding(input_dim=len(vectorizer.get_vocabulary()), output_dim=16)





In [94]:
import math
def compute_percent_personality(user_tags, recipe_tags):
    user_tags_vectorized = vectorizer(user_tags)
    recipe_tags_vectorized = vectorizer(recipe_tags)
    user_tags_embedded = embedding_layer(user_tags_vectorized)
    recipe_tags_embedded = embedding_layer(recipe_tags_vectorized)
    user_vector = tf.reduce_mean(user_tags_embedded, axis=0)  # [embedding_dim]
    recipe_vector = tf.reduce_mean(recipe_tags_embedded, axis=0)  # [embedding_dim]
    
    cosine_similarity = tf.keras.losses.cosine_similarity(user_vector, recipe_vector, axis=-1)
    similarity_percentage = (1 - cosine_similarity) /2 * 100
    return  similarity_percentage.numpy()


In [95]:
selected_recipes['similarity_tags'] = selected_recipes['tags'].apply(lambda x: compute_percent_personality(user1['prefers'], x))
selected_recipes = selected_recipes.sort_values(by=['similarity_tags'], ascending=False)

In [96]:
selected_recipes[:3]

Unnamed: 0,id,name,tags,nutrition,similarity_tags
23754,510874,biscuits gravy breakfast casserole,"[60-minutes-or-less, time-to-make, course, mai...","[613.7, 61.0, 17.0, 85.0, 49.0, 72.0, 12.0]",[81.44639]
171095,2941,raspberry sherbet punch,"[15-minutes-or-less, time-to-make, course, mai...","[103.2, 1.0, 79.0, 1.0, 1.0, 2.0, 7.0]",[79.40008]
117239,156331,kfc coleslaw copycat recipe by todd wilbur,"[15-minutes-or-less, time-to-make, course, mai...","[81.4, 0.0, 59.0, 9.0, 3.0, 1.0, 6.0]",[76.19014]


Processing portions

In [135]:
# Импортируем оболочку OR-Tools для линейного программирования
from ortools.linear_solver import pywraplp

# Создаем решатель с помощью бэкенда GLOP
solver = pywraplp.Solver('Find optimal weights', pywraplp.Solver.GLOP_LINEAR_PROGRAMMING)

So we deciding, that every our dish can be from 100 gr to 500 gr.

In [136]:
# Создаем переменные для оптимизации
w1 = solver.IntVar(50, 500, 'weight1')
w2 = solver.IntVar(50, 500, 'weight2')
w3 = solver.IntVar(50, 500, 'weight3')

In [137]:
nutrition_info = selected_recipes[:3]['nutrition'].to_numpy()
nutrition_info

array([list([613.7, 61.0, 17.0, 85.0, 49.0, 72.0, 12.0]),
       list([103.2, 1.0, 79.0, 1.0, 1.0, 2.0, 7.0]),
       list([81.4, 0.0, 59.0, 9.0, 3.0, 1.0, 6.0])], dtype=object)

In [138]:
first_dish = {'cal':nutrition_info[0][0]/100, 'fat':nutrition_info[0][1]/100, 'carbs': nutrition_info[0][2]/100, 'proteins':nutrition_info[0][4]/100}
second_dish = {'cal':nutrition_info[1][0]/100, 'fat':nutrition_info[1][1]/100, 'carbs': nutrition_info[1][2]/100, 'proteins':nutrition_info[1][4]/100}
third_dish = {'cal':nutrition_info[2][0]/100, 'fat':nutrition_info[2][1]/100, 'carbs': nutrition_info[2][2]/100, 'proteins':nutrition_info[2][4]/100}

In [149]:
user1

{'id': '1293707',
 'prefers': ['30-minutes-or-less',
  'time-to-make',
  'course',
  'preparation',
  'occasion',
  'for-large-groups',
  'low-protein',
  'healthy',
  '5-ingredients-or-less',
  'breads',
  'lunch'],
 'hates': ['meat', 'lactose'],
 'cal': 2600,
 'proteins': 120,
 'fat': 70,
 'carbs': 120}

In [150]:
solver.Add(first_dish["proteins"] * w1 + second_dish["proteins"] * w2 + third_dish["proteins"] * w3 <= user1['proteins'])
solver.Add(first_dish["carbs"] * w1 + second_dish["carbs"] * w2 + third_dish["carbs"] * w3 <= user1['carbs'])
solver.Add(first_dish["fat"] * w1 + second_dish["fat"] * w2 + third_dish["fat"] * w3 <= user1['fat'])
solver.Add(first_dish["cal"] * w1 + second_dish["cal"] * w2 + third_dish["cal"] * w3 <= user1['cal'])


<ortools.linear_solver.pywraplp.Constraint; proxy of <Swig Object of type 'operations_research::MPConstraint *' at 0x000002213CAE9140> >

In [151]:
solver.Maximize(first_dish["cal"] * w1 + second_dish["cal"] * w2 + third_dish["cal"] * w3)

In [152]:
status = solver.Solve()

# Если оптимальное решение найдено, вывести результаты
if status == pywraplp.Solver.OPTIMAL:
  print('================= Solution =================')
  print(f'Solved in {solver.wall_time():.2f} milliseconds in {solver.iterations()} iterations')
  print()
  print(f'Optimal power = {solver.Objective().Value()} 💪power')
  print('Army:')
  print(f' - Weight 1 = {w1.solution_value()}')
  print(f' - Weight 2 = {w2.solution_value()}')
  print(f' - Weight 3 = {w3.solution_value()}')
else:
  print('The solver could not find an optimal solution.')

Solved in 114476.00 milliseconds in 0 iterations

Optimal power = 835.155834954154 💪power
Army:
 - Weight 1 = 113.9344262295082
 - Weight 2 = 50.0
 - Weight 3 = 103.61211447624339
