In [1]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import tensorflow_recommenders as tfrs




In [2]:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

Let's pretend we have 3 users:
1. Clarissa is a vegetarian who is lactose intolerant. 🥝🍅
2. Martin is an athlete who prefers meat dishes to vegetable dishes. 🍖🥩
3. Stella - has no special preferences, eats mostly everything. 🍲🍠

In [3]:
clarissa = {'id':'clr', 'prefers': ['vegetables', 'vegan', 'nomeat', 'vegetarian'], 'hates': ['meat', 'lactose']}
martin = {'id':'mrt', 'prefers': 'meat', 'hates': 'vegetables'}
stella = {'id':'stl', 'prefers': '', 'hates': ''}

We will additionally enter the factors Calories, Protein, Fat, Carbohydrates for our requirements. **(The numbers were taken at random and do not reflect proportions or recommendations - the numbers are just an example)**
* Clarissa - 2000/100/60/120
* Martin, 2600/150/100/328.
* Stella - 2200/80/50/100

In [4]:
def set_requirements(user, calories, proteins, fats, carbs):
  user['calories'] = calories
  user['proteins'] = proteins
  user['fats'] = fats
  user['carbs'] = carbs
set_requirements(clarissa, 2000, 100, 60, 120)
set_requirements(martin, 2600, 150, 100, 328)
set_requirements(martin, 2200, 80, 50, 100)


# Data processing 📊

Loading recipes and user_interactions. Nutritions showed like:calories (#), total fat (PDV), sugar (PDV) , sodium (PDV) , protein (PDV) , saturated fat (PDV) , and carbohydrates (PDV)

In [5]:
raw_recipes = pd.read_csv('./sample_data/RAW_recipes.csv')
raw_interactions = pd.read_csv('./sample_data/RAW_interactions.csv')
raw_recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


Merge data interactions with recipe infos

In [6]:
interactions_with_recipe_info = pd.merge(raw_interactions, raw_recipes[['name', 'tags', 'ingredients', 'id']], left_on='recipe_id', right_on='id', how='left')
interactions_with_recipe_info = interactions_with_recipe_info[['user_id', 'recipe_id', 'rating', 'name', 'tags', 'ingredients']]
interactions_with_recipe_info.rename(columns={'name': 'recipe_name'}, inplace=True)
interactions_with_recipe_info.head()

Unnamed: 0,user_id,recipe_id,rating,recipe_name,tags,ingredients
0,38094,40893,4,white bean green chile pepper soup,"['weeknight', 'time-to-make', 'course', 'main-...","['great northern beans', 'yellow onion', 'dice..."
1,1293707,40893,5,white bean green chile pepper soup,"['weeknight', 'time-to-make', 'course', 'main-...","['great northern beans', 'yellow onion', 'dice..."
2,8937,44394,4,devilicious cookie cake delights,"['30-minutes-or-less', 'time-to-make', 'course...","[""devil's food cake mix"", 'vegetable oil', 'eg..."
3,126440,85009,5,baked potato toppings,"['15-minutes-or-less', 'time-to-make', 'course...","['mayonnaise', 'salsa', 'cheddar cheese', 'ref..."
4,57222,85009,5,baked potato toppings,"['15-minutes-or-less', 'time-to-make', 'course...","['mayonnaise', 'salsa', 'cheddar cheese', 'ref..."


In [7]:
import ast

def convert_to_list(data_str):
  try:
    return ast.literal_eval(data_str)
  except (SyntaxError, ValueError):
    return []

In [8]:
interactions_with_recipe_info['ingredients_str'] = interactions_with_recipe_info['ingredients'].apply(convert_to_list)
interactions_with_recipe_info['tags_str'] = interactions_with_recipe_info['tags'].apply(convert_to_list)
raw_recipes['ingredients_str'] = raw_recipes['ingredients'].apply(convert_to_list)
raw_recipes['tags_str'] = raw_recipes['tags'].apply(convert_to_list)

Formatting data for Datasets

In [9]:
interactions_with_recipe_info['user_id'] = interactions_with_recipe_info.user_id.astype("str")
interactions_with_recipe_info['recipe_id'] = interactions_with_recipe_info.recipe_id.astype("str")
interactions_with_recipe_info['rating'] = interactions_with_recipe_info.rating.astype(np.float32)
interactions_with_recipe_info['recipe_name'] = interactions_with_recipe_info.recipe_name.astype("str")
interactions_with_recipe_info['tags'] = interactions_with_recipe_info.tags.astype("str")
interactions_with_recipe_info['ingredients'] = interactions_with_recipe_info.ingredients.astype("str")


Creating train preferences for users

In [10]:
user_preferences = interactions_with_recipe_info[['user_id', 'rating', 'tags_str']]
user_preferences.head()

Unnamed: 0,user_id,rating,tags_str
0,38094,4.0,"[weeknight, time-to-make, course, main-ingredi..."
1,1293707,5.0,"[weeknight, time-to-make, course, main-ingredi..."
2,8937,4.0,"[30-minutes-or-less, time-to-make, course, mai..."
3,126440,5.0,"[15-minutes-or-less, time-to-make, course, mai..."
4,57222,5.0,"[15-minutes-or-less, time-to-make, course, mai..."


In [11]:
user_liked_tags = user_preferences[user_preferences['rating'] >= 3].groupby('user_id')['tags_str'].apply(list).reset_index()
user_unliked_tags = user_preferences[user_preferences['rating'] < 3].groupby('user_id')['tags_str'].apply(list).reset_index()
users_ds = pd.DataFrame(interactions_with_recipe_info['user_id'].unique(), columns=['user_id'])


In [12]:
users_ds = users_ds.merge(user_liked_tags, on='user_id', how='left')
users_ds = users_ds.rename(columns={'tags_str': 'liked_tags'})
users_ds = users_ds.merge(user_unliked_tags, on='user_id', how='left')
users_ds = users_ds.rename(columns={'tags_str': 'unliked_tags'})

In [13]:
users_ds.head()

Unnamed: 0,user_id,liked_tags,unliked_tags
0,38094,"[[weeknight, time-to-make, course, main-ingred...",
1,1293707,"[[weeknight, time-to-make, course, main-ingred...","[[60-minutes-or-less, time-to-make, course, ma..."
2,8937,"[[30-minutes-or-less, time-to-make, course, ma...",
3,126440,"[[15-minutes-or-less, time-to-make, course, ma...","[[60-minutes-or-less, time-to-make, course, pr..."
4,57222,"[[15-minutes-or-less, time-to-make, course, ma...","[[60-minutes-or-less, time-to-make, course, ma..."


Looking that some of users don't have unliked_tags, so replace all Nan with empty arrays

In [14]:
def unique_tags_in_list(tags_list):
  if isinstance(tags_list, list):
    unique_tags = []
    for sublist in tags_list:
      if isinstance(sublist, list):
        for tag in sublist:
          if tag not in unique_tags:
            unique_tags.append(tag)
    return unique_tags
  else:
    return []

users_ds['liked_tags'] = users_ds['liked_tags'].apply(unique_tags_in_list)
users_ds['unliked_tags'] = users_ds['unliked_tags'].apply(unique_tags_in_list)

In [15]:
def remove_duplicate_tags(row):
  liked_tags = set(row['liked_tags']) if isinstance(row['liked_tags'], list) else set()
  unliked_tags = set(row['unliked_tags']) if isinstance(row['unliked_tags'], list) else set()
  duplicate_tags = liked_tags.intersection(unliked_tags)
  row['liked_tags'] = [tag for tag in row['liked_tags'] if tag not in duplicate_tags] if isinstance(row['liked_tags'], list) else []
  row['unliked_tags'] = [tag for tag in row['unliked_tags'] if tag not in duplicate_tags] if isinstance(row['unliked_tags'], list) else []
  return row

users_ds = users_ds.apply(remove_duplicate_tags, axis=1)


In [16]:
users_ds.head()

Unnamed: 0,user_id,liked_tags,unliked_tags
0,38094,"[weeknight, time-to-make, course, main-ingredi...",[]
1,1293707,"[weeknight, soups-stews, beans, crock-pot-slow...","[cupcakes, finger-food, cakes, english, cake-f..."
2,8937,"[30-minutes-or-less, time-to-make, course, mai...",[]
3,126440,"[15-minutes-or-less, 3-steps-or-less, jewish-s...",[]
4,57222,"[condiments-etc, salads, beans, grains, south-...","[kwanzaa, dairy-free]"


In [17]:
interactions_with_recipe_info = pd.merge(interactions_with_recipe_info, users_ds, on='user_id', how='left')

Creating datasets

In [18]:
# Параметры
MAX_TAG_LENGTH = 30
PADDING_VALUE = "empty"

interactions_with_recipe_info['liked_tags'] = [
    (tags + [PADDING_VALUE] * (MAX_TAG_LENGTH - len(tags)))[:MAX_TAG_LENGTH] 
    for tags in interactions_with_recipe_info['liked_tags']
]
interactions_with_recipe_info['unliked_tags'] = [
    (tags + [PADDING_VALUE] * (MAX_TAG_LENGTH - len(tags)))[:MAX_TAG_LENGTH] 
    for tags in interactions_with_recipe_info['unliked_tags']
]
interactions_with_recipe_info['ingredients_str'] =  [
    (tags + [PADDING_VALUE] * (MAX_TAG_LENGTH - len(tags)))[:MAX_TAG_LENGTH] 
    for tags in interactions_with_recipe_info['ingredients_str'] ]

interactions_with_recipe_info['tags_str'] =  [
    (tags + [PADDING_VALUE] * (MAX_TAG_LENGTH - len(tags)))[:MAX_TAG_LENGTH] 
    for tags in interactions_with_recipe_info['tags_str'] ]

raw_recipes['tags_str'] = [
    (tags + [PADDING_VALUE] * (MAX_TAG_LENGTH - len(tags)))[:MAX_TAG_LENGTH] 
    for tags in raw_recipes['tags_str'] ]

raw_recipes['ingredients_str'] = [
    (tags + [PADDING_VALUE] * (MAX_TAG_LENGTH - len(tags)))[:MAX_TAG_LENGTH] 
    for tags in raw_recipes['ingredients_str'] ]

In [19]:
interactions_with_recipe_info['liked_tags_str'] = interactions_with_recipe_info['liked_tags'].apply(lambda x: ','.join(x))
interactions_with_recipe_info['unliked_tags_str'] = interactions_with_recipe_info['unliked_tags'].apply(lambda x: ','.join(x))
interactions_with_recipe_info['tags_str'] = interactions_with_recipe_info['tags_str'].apply(lambda x: ','.join(x))

In [20]:
def split_tags(tags_string):
    return tf.strings.split(tf.strings.strip(tags_string), ',')

In [21]:
tf.compat.v1.enable_eager_execution()




In [22]:
def squeeze_tags(data):
    return {
        "user_id": data["user_id"],
        "recipe_id": data["recipe_id"],
        "rating": data["rating"],
        "recipe_name": data["recipe_name"],
        "tags": tf.squeeze(data["tags"], axis=0),  # Укажите ось, которую нужно удалить
        "liked_tags": tf.squeeze(data["liked_tags"], axis=0),  # Укажите ось, которую нужно удалить
        "unliked_tags": tf.squeeze(data["unliked_tags"], axis=0)  # Укажите ось, которую нужно удалить
    }

ratings = tf.data.Dataset.from_tensor_slices((tf.cast(interactions_with_recipe_info['user_id'].values, tf.string),
                                              tf.cast(interactions_with_recipe_info['recipe_id'].values, tf.string),
                                              tf.cast(interactions_with_recipe_info['rating'].values, tf.float32),
                                              tf.cast(interactions_with_recipe_info['recipe_name'].values, tf.string),
                                              tf.cast(interactions_with_recipe_info['tags_str'].values.reshape(-1,1), tf.string),
                                              tf.cast(interactions_with_recipe_info['liked_tags_str'].values.reshape(-1,1), tf.string),
                                              tf.cast(interactions_with_recipe_info['unliked_tags_str'].values.reshape(-1,1), tf.string)
                                              )).map(lambda x, x1, x2, x3, x4, x5, x6: {
                                                  "user_id": x,
                                                  "recipe_id": x1,
                                                  "rating": x2,
                                                  "recipe_name": x3,
                                                  "tags": split_tags(x4),
                                                  "liked_tags": split_tags(x5),
                                                  "unliked_tags": split_tags(x6)
                                              }).map(squeeze_tags)


In [23]:
for data in ratings.take(1).as_numpy_iterator():
    print(data)

{'user_id': b'38094', 'recipe_id': b'40893', 'rating': 4.0, 'recipe_name': b'white bean   green chile pepper soup', 'tags': array([b'weeknight', b'time-to-make', b'course', b'main-ingredient',
       b'preparation', b'occasion', b'soups-stews', b'beans',
       b'vegetables', b'easy', b'crock-pot-slow-cooker', b'dietary',
       b'equipment', b'empty', b'empty', b'empty', b'empty', b'empty',
       b'empty', b'empty', b'empty', b'empty', b'empty', b'empty',
       b'empty', b'empty', b'empty', b'empty', b'empty', b'empty'],
      dtype=object), 'liked_tags': array([b'weeknight', b'time-to-make', b'course', b'main-ingredient',
       b'preparation', b'occasion', b'soups-stews', b'beans',
       b'vegetables', b'easy', b'crock-pot-slow-cooker', b'dietary',
       b'equipment', b'60-minutes-or-less', b'main-dish', b'fruit',
       b'vegan', b'vegetarian', b'stove-top', b'black-beans',
       b'30-minutes-or-less', b'low-protein', b'salads', b'dinner-party',
       b'low-cholesterol', b'lo

In [24]:
raw_recipes['ingredients_str'] = raw_recipes['ingredients_str'].apply(lambda x: ','.join(x))
raw_recipes['tags_str'] = raw_recipes['tags_str'].apply(lambda x: ','.join(x))

In [25]:
raw_recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,ingredients_str,tags_str
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,"winter squash,mexican seasoning,mixed spice,ho...","60-minutes-or-less,time-to-make,course,main-in..."
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,"prepared pizza crust,sausage patty,eggs,milk,s...","30-minutes-or-less,time-to-make,course,main-in..."
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,"ground beef,yellow onions,diced tomatoes,tomat...","time-to-make,course,preparation,main-dish,chil..."
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,"spreadable cheese with garlic and herbs,new po...","60-minutes-or-less,time-to-make,course,main-in..."
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,"tomato juice,apple cider vinegar,sugar,salt,pe...","weeknight,time-to-make,course,main-ingredient,..."


In [26]:
raw_recipes['id'] = raw_recipes.id.astype("str")
raw_recipes['name'] = raw_recipes.name.astype("str")
raw_recipes['tags'] = raw_recipes.tags.astype("str")
raw_recipes['ingredients'] = raw_recipes.ingredients.astype("str")

In [27]:
raw_recipes['name'][:1]

0    arriba   baked winter squash mexican style
Name: name, dtype: object

In [28]:
# prompt: Создай на основе raw_recipes датасет, но учти что ingredients_str это массив

recipes = tf.data.Dataset.from_tensor_slices((tf.cast(raw_recipes['id'].values, tf.string),
                                              tf.cast(raw_recipes['name'].values, tf.string),
                                              tf.cast(raw_recipes['tags_str'].values.reshape(-1, 1), tf.string),
                                              tf.cast(raw_recipes['ingredients_str'].values.reshape(-1, 1), tf.string),)).map(lambda x, x1, x2, x3: {
                                                  "recipe_id": x,
                                                  "recipe_name": x1,
                                                  "tags": split_tags(x2),
                                                  "ingredients": split_tags(x3)
                                              }).map(lambda data: {
        **data,
        "tags": tf.squeeze(data["tags"], axis=0),  # Удаляем дополнительные измерения
        "ingredients": tf.squeeze(data["ingredients"], axis=0)  # Удаляем дополнительные измерения
    })

for data in recipes.take(1).as_numpy_iterator():
  print(data)


{'recipe_id': b'137739', 'recipe_name': b'arriba   baked winter squash mexican style', 'tags': array([b'60-minutes-or-less', b'time-to-make', b'course',
       b'main-ingredient', b'cuisine', b'preparation', b'occasion',
       b'north-american', b'side-dishes', b'vegetables', b'mexican',
       b'easy', b'fall', b'holiday-event', b'vegetarian', b'winter',
       b'dietary', b'christmas', b'seasonal', b'squash', b'empty',
       b'empty', b'empty', b'empty', b'empty', b'empty', b'empty',
       b'empty', b'empty', b'empty'], dtype=object), 'ingredients': array([b'winter squash', b'mexican seasoning', b'mixed spice', b'honey',
       b'butter', b'olive oil', b'salt', b'empty', b'empty', b'empty',
       b'empty', b'empty', b'empty', b'empty', b'empty', b'empty',
       b'empty', b'empty', b'empty', b'empty', b'empty', b'empty',
       b'empty', b'empty', b'empty', b'empty', b'empty', b'empty',
       b'empty', b'empty'], dtype=object)}


In [29]:
recipe_names = tf.data.Dataset.from_tensor_slices((tf.cast(raw_recipes['name'].values.reshape(-1, 1), tf.string))).map(lambda x: x)

# Towers 🗼

For our towers set dimensionality of the query and candidate representations: **32**. Higher values will correspond to models that may be more accurate, but will also be slower to fit and more prone to overfitting.

In [30]:
embedding_dimension = 32

## User tower 👷

Lets start creating our towers with User towers. We will compute by:
1. User id
2. Prefers and hates

### User ID model

In [31]:
unique_user_ids = users_ds["user_id"].unique()
unique_user_ids[:10]

array(['38094', '1293707', '8937', '126440', '57222', '52282', '124416',
       '2000192946', '76535', '273745'], dtype=object)

In [32]:
user_id_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids)+1, embedding_dimension)
])




### User Prefers models

In [33]:
# prompt: модель для liked_tags которым стоит отдать предпочтение, но учти что liked_tags в датасете это массив

unique_liked_tags = set()
for tags in users_ds['liked_tags']:
  if isinstance(tags, list):
    for tag in tags:
      unique_liked_tags.add(tag)
unique_liked_tags = list(unique_liked_tags)

liked_tags_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_liked_tags, mask_token=None),
  tf.keras.layers.Embedding(len(unique_liked_tags) + 1, embedding_dimension)
])


In [34]:
unique_liked_tags[:10]

['',
 'toddler-friendly',
 'chick-peas-garbanzos',
 'eggs-dairy',
 'peaches',
 'ragu-recipe-contest',
 'savory-sauces',
 'green-yellow-beans',
 'belgian',
 'marinades-and-rubs']

In [35]:

unique_unliked_tags = set()
for tags in users_ds['unliked_tags']:
  if isinstance(tags, list):
    for tag in tags:
      unique_unliked_tags.add(tag)
unique_unliked_tags = list(unique_unliked_tags)

unliked_tags_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_unliked_tags, mask_token=None),
  tf.keras.layers.Embedding(len(unique_unliked_tags) + 1, embedding_dimension)
])
unique_unliked_tags[:10]

['',
 'toddler-friendly',
 'chick-peas-garbanzos',
 'eggs-dairy',
 'peaches',
 'ragu-recipe-contest',
 'savory-sauces',
 'green-yellow-beans',
 'belgian',
 'marinades-and-rubs']

## Recipe tower 🌭

In [36]:
unique_recipe_names = np.unique(list(raw_recipes["name"].unique()))
recipe_name_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_recipe_names, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_recipe_names) + 1, embedding_dimension)
])
unique_recipe_names[:10]

array(['0 carb   0 cal gummy worms', '0 fat chunky watermelon salsa',
       '0 point ice cream  only 1 ingredient', '0 point soup   ww',
       '0 point soup  crock pot', '007  martini', '007 cocktail',
       '1  2  3  swiss meringue buttercream', '1 00 tangy chicken recipe',
       '1 000 artichoke hearts'], dtype='<U85')

In [37]:
# prompt: напиши вызов recipe_name_model

recipe_name_model(['some recipe name'])





<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 0.00053295,  0.0327567 ,  0.02697826,  0.01809064,  0.01589452,
         0.0016561 , -0.03789175, -0.00153773,  0.00528934, -0.04600277,
         0.01358695,  0.01810402,  0.03867164, -0.03290868,  0.04922197,
        -0.03007996, -0.01582808,  0.0045259 , -0.00811077, -0.01791621,
        -0.02288712,  0.04904668,  0.00751042, -0.01267761, -0.00802997,
         0.02425978, -0.01806009,  0.01782855,  0.00202962,  0.03536825,
         0.0405396 ,  0.01242997]], dtype=float32)>

### Recipe ingredient&tag model

In [38]:
unique_tags = set()
for tags in raw_recipes['tags'].apply(convert_to_list):
  if isinstance(tags, list):
    for tag in tags:
      unique_tags.add(tag)
unique_tags = np.unique(list(unique_tags))

recipe_tags_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_tags, mask_token=None),
  tf.keras.layers.Embedding(len(unique_tags) + 1, embedding_dimension)
])
unique_tags[:10]

array(['', '1-day-or-more', '15-minutes-or-less', '3-steps-or-less',
       '30-minutes-or-less', '4-hours-or-less', '5-ingredients-or-less',
       '60-minutes-or-less',
       'Throw the ultimate fiesta with this sopaipillas recipe from Food.com.',
       'a1-sauce'], dtype='<U69')

## Combine models

In [39]:
class RecipeModel(tf.keras.Model):
    def __init__(self, unique_recipe_names, unique_tags, embedding_dimension):
        super().__init__()

        self.recipe_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_recipe_names, mask_token=None),
            tf.keras.layers.Embedding(len(unique_recipe_names) + 1, embedding_dimension)
        ])
    
    # Tags embeddings
        self.tags_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_tags, mask_token=None),
            tf.keras.layers.Embedding(len(unique_tags) + 1, embedding_dimension)
        ])
        
    def call(self, features):
        # Inputs should contain "recipe_name" and "tags"
        recipe_name = features["recipe_name"]
        tags = features["tags"]

        # Embed recipe name
        recipe_embeddings = self.recipe_model(recipe_name)

        tags_embeddings = self.tags_model(tags)
        if(len(recipe_embeddings.shape.as_list())>=3): 
        #   recipe_embeddings = tf.reshape(recipe_embeddings, [1,32])
           recipe_embeddings = tf.reshape(recipe_embeddings, [1,embedding_dimension])
        if(len(tags_embeddings.shape.as_list())>=3):
            tags_embeddings = tf.reshape(tags_embeddings, [1,embedding_dimension])
        #   tags_embeddings = tf.reshape(tags_embeddings, [1,32])
        recipe_embeddings = tf.concat([recipe_embeddings, tags_embeddings], axis=1)
        return recipe_embeddings

In [40]:
class UserModel(tf.keras.Model):
    def __init__(self, unique_user_ids, unique_liked_tags, unique_unliked_tags, embedding_dimension):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        # User ID embeddings
        self.user_id_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])
        
        # Liked tags embeddings
        self.liked_tags_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_liked_tags, mask_token=None),
            tf.keras.layers.Embedding(len(unique_liked_tags) + 1, embedding_dimension)
        ])

        # Unliked tags embeddings
        self.unliked_tags_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_unliked_tags, mask_token=None),
            tf.keras.layers.Embedding(len(unique_unliked_tags) + 1, embedding_dimension)
        ])

        # Penalty weight for unliked tags
        self.penalty_weight = -0.5

    def call(self, features):
        user_id = features["user_id"]
        liked_tags = features["liked_tags"]
        unliked_tags = features["unliked_tags"]
        
        user_id_embedding = self.user_id_model(user_id)
        if user_id_embedding.shape.as_list() == [1, 1, 32]:
            user_id_embedding = tf.reduce_mean(user_id_embedding, axis=1)
      # Например, ограничимся 50 тегами
        liked_tags = liked_tags[:self.embedding_dimension]
        unliked_tags = unliked_tags[:self.embedding_dimension]
    
        # Embed liked and unliked tags
        liked_tags_embedding = self.liked_tags_model(liked_tags)
        unliked_tags_embedding = self.unliked_tags_model(unliked_tags)
    
        # Вычисление средневзвешенного значения (пример)
        weights = tf.ones_like(liked_tags_embedding)  # Инициализация весов
        weights_unl = tf.ones_like(unliked_tags_embedding)
        
        # Здесь можно рассчитать более сложные веса, например, на основе частоты встречаемости тегов
        weighted_sum = tf.reduce_sum(liked_tags_embedding * weights, axis=1)
        weights_unl_sum = tf.reduce_sum(unliked_tags_embedding * weights_unl, axis=1)
        
        sum_of_weights = tf.reduce_sum(weights, axis=1)
        sum_of_weights_unl = tf.reduce_sum(weights_unl, axis=1)
        
        liked_tags_embedding = weighted_sum / sum_of_weights
        unliked_tags_embedding = weights_unl_sum / sum_of_weights_unl
    
        # Reduce mean to handle ragged tensors (different lengths of liked/unliked tags)
        liked_tags_embedding = tf.expand_dims(liked_tags_embedding, axis=0)
        unliked_tags_embedding = tf.expand_dims(unliked_tags_embedding, axis=0)
        user_id_embedding = tf.expand_dims(user_id_embedding, axis=0)
        print("USer: ",user_id_embedding.shape,liked_tags_embedding.shape,unliked_tags_embedding.shape   )
        # Concatenate user embeddings with liked and unliked tags
        user_embedding = tf.concat([user_id_embedding, liked_tags_embedding, unliked_tags_embedding], axis=1)
    
        return user_embedding


In [41]:
class UserModel(tf.keras.Model):
    def __init__(self, unique_user_ids, unique_liked_tags, unique_unliked_tags, embedding_dimension):
        super().__init__()
        
        # User ID embeddings
        self.user_id_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])
        
        # Liked tags embeddings
        self.liked_tags_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_liked_tags, mask_token=None),
            tf.keras.layers.Embedding(len(unique_liked_tags) + 1, embedding_dimension)
        ])

        # Unliked tags embeddings
        self.unliked_tags_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_unliked_tags, mask_token=None),
            tf.keras.layers.Embedding(len(unique_unliked_tags) + 1, embedding_dimension)
        ])

        # Penalty weight for unliked tags
        self.penalty_weight = -0.5

    def call(self, features):
        user_id = features["user_id"]
        liked_tags = features["liked_tags"]
        unliked_tags = features["unliked_tags"]

        # Embed user ID
        user_id_embedding = self.user_id_model(user_id)
        #user_id_embedding = tf.reduce_mean(user_id_embedding, axis=1)

        # Embed liked and unliked tags
        liked_tags_embedding = self.liked_tags_model(liked_tags)
        unliked_tags_embedding = self.unliked_tags_model(unliked_tags)

        # Apply penalty to unliked tags
        unliked_tags_embedding = unliked_tags_embedding * self.penalty_weight

        # Aggregate embeddings
        liked_tags_embedding = tf.reduce_mean(liked_tags_embedding, axis=1, keepdims=True)  # (1, 32)
        unliked_tags_embedding = tf.reduce_mean(unliked_tags_embedding, axis=1, keepdims=True)  # (1, 32)

        # Используйте weighted суммирование для учета штрафов
        user_embedding = tf.add(user_id_embedding, liked_tags_embedding)
        user_embedding = tf.add(user_embedding, unliked_tags_embedding)

        return user_embedding



In [42]:
# Извлечение данных пользователя
test_user = {}
for data in ratings.take(1).as_numpy_iterator():
    test_user = data
    
user_model = UserModel(unique_user_ids, unique_liked_tags, unique_unliked_tags, embedding_dimension)


user_model(test_user)

<tf.Tensor: shape=(30, 32), dtype=float32, numpy=
array([[ 1.29763968e-04,  2.42709070e-02, -3.05240881e-03,
        -3.89042832e-02, -4.43574637e-02, -1.72759015e-02,
         6.94114715e-06,  2.82437727e-02, -2.92675458e-02,
        -9.50353965e-03,  2.69355550e-02, -2.09017713e-02,
         6.48100767e-03,  9.60340723e-03, -3.22529264e-02,
         4.67817336e-02,  1.57317240e-03,  5.06354719e-02,
        -2.40916926e-02, -3.36208083e-02, -3.69999744e-02,
         3.72381881e-02,  1.13132149e-02,  3.31809074e-02,
        -9.76277515e-05, -1.95095744e-02, -1.07586570e-02,
         3.53069697e-03, -2.15469096e-02, -3.28410827e-02,
         1.14859380e-02,  5.00914156e-02],
       [-7.44607439e-03,  1.66950691e-02, -1.06282476e-02,
        -4.64801192e-02, -5.19332998e-02, -2.48517394e-02,
        -7.56889721e-03,  2.06679348e-02, -3.68433818e-02,
        -1.70793775e-02,  1.93597171e-02, -2.84776092e-02,
        -1.09483022e-03,  2.02757027e-03, -3.98287624e-02,
         3.92058976e-0

In [43]:
class RecipeAndUserModel(tfrs.Model):

    def _reduce_mean_if_needed(self, embedding):
        """Сокращает размерность эмбеддинга до среднего, если он многомерный."""
        if len(embedding.shape) >= 3:
            return tf.reduce_mean(embedding, axis=1)
        return embedding

    def __init__(self, unique_recipe_names, unique_tags, embedding_dimension):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        # Модель пользователя
        self.user_model = UserModel(unique_user_ids, unique_liked_tags, unique_unliked_tags, 64)

        # Recipe embeddings
        self.recipe_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_recipe_names, mask_token=None),
            tf.keras.layers.Embedding(len(unique_recipe_names) + 1, embedding_dimension)
        ])

        # Tags embeddings
        self.tags_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_tags, mask_token=None),
            tf.keras.layers.Embedding(len(unique_tags) + 1, embedding_dimension)
        ])

        self.dense = tf.keras.layers.Dense(embedding_dimension)  # Сжимаем до размерности 32

        # Подготовка кандидатов, учитывающих и теги
        self.candidates = recipes.batch(128).map(self._get_candidate_embeddings)

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=self.candidates
            )
        )
        
    def _get_candidate_embeddings(self, recipe_name):
        # Получаем эмбеддинги для имен и тегов
        recipe_embedding = self.recipe_model(recipe_name['recipe_name'])
        print(recipe_name['tags'])
        tags_embedding = self.tags_model(recipe_name['tags'])  # Здесь предполагается, что recipe_name может содержать теги
        recipe_embedding = self._reduce_mean_if_needed(recipe_embedding)
        tags_embedding = self._reduce_mean_if_needed(tags_embedding)
        # Объединяем эмбеддинги имен и тегов
        combined_embedding = tf.concat([recipe_embedding, tags_embedding], axis=1)
        return combined_embedding

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # User embedding
        user_embedding = self.user_model(features)

        # Recipe name embeddings
        recipe_name_embeddings = self.recipe_model(features["recipe_name"])
        recipe_name_embeddings = self._reduce_mean_if_needed(recipe_name_embeddings)
        # Tags embeddings
        tags = features["tags"]
        tags_embeddings = self.tags_model(tags)
        tags_embeddings = self._reduce_mean_if_needed(tags_embeddings)
        # # Вычисление средневзвешенного значения для тегов (можно модифицировать веса)
        # tags_embeddings_weighted_sum = tf.reduce_sum(tags_embeddings, axis=1)
        # sum_of_weights = tf.reduce_sum(tf.ones_like(tags_embeddings), axis=1)
        # tags_embeddings = tags_embeddings_weighted_sum / sum_of_weights
        
        # recipe_name_embeddings = self._reduce_mean_if_needed(recipe_name_embeddings)
        # tags_embeddings = tf.expand_dims(tags_embeddings, axis=0)
        # Конкатенация эмбеддингов рецепта и тегов
        recipe_embeddings = tf.concat([recipe_name_embeddings, tags_embeddings], axis=1)

        return user_embedding, recipe_embeddings

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        user_embeddings, recipe_embeddings = self(features)
        print("Computed: ", user_embeddings.shape, recipe_embeddings.shape)
        # Используем задачу факторизованного топ-K для оптимизации
        user_embeddings = self._reduce_mean_if_needed(user_embeddings)
        recipe_embeddings = self._reduce_mean_if_needed(recipe_embeddings)
        
        # user_embeddings = tf.keras.layers.Dense(64)(user_embeddings)
        # recipe_embeddings = tf.keras.layers.Dense(64)(recipe_embeddings)
        print("Computed: ", user_embeddings.shape, recipe_embeddings.shape)
        return self.task(user_embeddings, recipe_embeddings)


In [44]:
# Randomly shuffle data and split between train and test.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(40_000)
test = shuffled.skip(80_000).take(20_000)

cached_train =  train.batch(10_000).cache()
cached_test = test.batch(4096).cache()

In [45]:
def print_shapes(features):
    print({k: v.shape for k, v in features.items()})
    return features

# Проверьте формы данных перед batch
shuffled = shuffled.map(print_shapes)

{'user_id': TensorShape([]), 'recipe_id': TensorShape([]), 'rating': TensorShape([]), 'recipe_name': TensorShape([]), 'tags': TensorShape([None]), 'liked_tags': TensorShape([None]), 'unliked_tags': TensorShape([None])}


In [46]:
model = RecipeAndUserModel(unique_recipe_names, unique_tags, embedding_dimension)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model.fit(cached_train, epochs=1)

Tensor("args_3:0", shape=(None, None), dtype=string)
Tensor("args_3:0", shape=(None, None), dtype=string)
Computed:  (None, None, 64) (None, 64)
Computed:  (None, 64) (None, 64)
Computed:  (None, None, 64) (None, 64)
Computed:  (None, 64) (None, 64)


InvalidArgumentError: Graph execution error:

Detected at node retrieval/streaming/ReduceDataset defined at (most recent call last):
  File "f:\P310\lib\runpy.py", line 196, in _run_module_as_main

  File "f:\P310\lib\runpy.py", line 86, in _run_code

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\traitlets\config\application.py", line 1046, in launch_instance

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelapp.py", line 736, in start

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\tornado\platform\asyncio.py", line 195, in start

  File "f:\P310\lib\asyncio\base_events.py", line 595, in run_forever

  File "f:\P310\lib\asyncio\base_events.py", line 1881, in _run_once

  File "f:\P310\lib\asyncio\events.py", line 80, in _run

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 516, in dispatch_queue

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 505, in process_one

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 412, in dispatch_shell

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\ipykernel\kernelbase.py", line 740, in execute_request

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\ipykernel\ipkernel.py", line 422, in do_execute

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\ipykernel\zmqshell.py", line 546, in run_cell

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3024, in run_cell

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3079, in _run_cell

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3284, in run_cell_async

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3466, in run_ast_nodes

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code

  File "C:\Users\Maincharter\AppData\Local\Temp\ipykernel_17380\561549084.py", line 3, in <module>

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\engine\training.py", line 1807, in fit

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\engine\training.py", line 1401, in train_function

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\engine\training.py", line 1384, in step_function

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\engine\training.py", line 1373, in run_step

  File "f:\P310\lib\site-packages\tensorflow_recommenders\models\base.py", line 68, in train_step

  File "C:\Users\Maincharter\AppData\Local\Temp\ipykernel_17380\573055804.py", line 84, in compute_loss

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "f:\P310\lib\site-packages\tensorflow_recommenders\tasks\retrieval.py", line 195, in call

  File "f:\P310\lib\site-packages\tensorflow_recommenders\tasks\retrieval.py", line 196, in call

  File "f:\P310\lib\site-packages\tensorflow_recommenders\tasks\retrieval.py", line 197, in call

  File "f:\P310\lib\site-packages\tensorflow_recommenders\metrics\factorized_top_k.py", line 136, in update_state

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\engine\training.py", line 590, in __call__

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\engine\base_layer.py", line 1149, in __call__

  File "C:\Users\Maincharter\AppData\Roaming\Python\Python310\site-packages\keras\src\utils\traceback_utils.py", line 96, in error_handler

  File "f:\P310\lib\site-packages\tensorflow_recommenders\layers\factorized_top_k.py", line 487, in call

Cannot add tensor to the batch: number of elements does not match. Shapes are: [tensor]: [31], [batch]: [30]
	 [[{{node retrieval/streaming/ReduceDataset}}]] [Op:__inference_train_function_2490]

In [None]:
# prompt: Напиши код для теста model для пользователя 42
model2 = RecipeAndUserModel()
model2.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
test_user = {}
for data in test.take(1).as_numpy_iterator():
  print(model2(data))

In [None]:
model.evaluate(cached_test, return_dict=True)

### Making predictions

In [None]:
# Извлечение данных пользователя
test_user = {}
for data in test.take(1).as_numpy_iterator():
    test_user = data

# Создание набора данных для тестового пользователя
user_id_to_test = "42"  # Замените на нужный ID пользователя
test_user_dataset = {
    "user_id": np.array(test_user["user_id"]),
    "liked_tags": np.array([['30-minutes-or-less']]),
    "unliked_tags": np.array([['60-minutes-or-less'], ['lactose']])
}

brute_force = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
def _reduce_mean_if_needed(embedding):
    """Сокращает размерность эмбеддинга до среднего, если он многомерный."""
    if len(embedding.shape) >= 3:
        return tf.reduce_mean(embedding, axis=1)
    return embedding
dense = tf.keras.layers.Dense(96)
# Создание набора данных для рецептов и тегов
def get_recipe_and_tag_embeddings(x):
    recipe_embedding = model.recipe_model(x['recipe_name'])  # (None, 32)
    tag_embedding = model.tags_model(x['tags'])  # (None, 32)
    recipe_embedding =_reduce_mean_if_needed(recipe_embedding)
    tag_embedding = _reduce_mean_if_needed(tag_embedding)
    
    combined_embeddings = tf.concat([_reduce_mean_if_needed(recipe_embedding),  _reduce_mean_if_needed(tag_embedding)], axis=1)  # (None, 64)
    
    return dense(combined_embeddings)

# Используем map для обработки всех данных сразу
recipes_embeddings = recipes.batch(100).map(get_recipe_and_tag_embeddings)


# Создание набора имен рецептов
recipes_names = recipes.batch(100).map(lambda x: x["recipe_id"])  # Убедитесь, что это (None,)

# Объединяем эмбеддинги с именами
index = brute_force.index_from_dataset(
    tf.data.Dataset.zip((recipes_names, recipes_embeddings))
)

# Получаем рекомендации
_, titles = index(test_user_dataset, k=15)

# Вывод тегов для рекомендуемых рецептов
recommended_tags = raw_recipes[raw_recipes['id'].isin(titles.numpy().flatten().astype('str'))]


In [None]:
recommended_tags