# Imports & funcs

In [1]:
# !pip install fuzzywuzzy transformers

In [2]:
import os
import re

import pandas as pd
import numpy as np
import torch
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel



In [3]:
# Do not truncate long cell values when DataFrames/Series are displayed
# (None removes the column-width limit), so full recipe texts stay readable.
pd.set_option('display.max_colwidth', None)

In [4]:
# Colab-only setup: mount Google Drive at /content/drive; the recipe files
# read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
def read_text_file(file_path, encoding='utf-8'):
    """Return the entire contents of a text file as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII recipe text (Cyrillic, emoji markers) is decoded
            the same way regardless of the platform's locale encoding.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

In [6]:
def extract_recipes_names(recipes):
    """Return the title of every recipe text, in input order.

    The title is the part of the text that precedes the first occurrence of
    the marker literal used below, with surrounding whitespace removed. A text
    that does not contain the marker is returned stripped in full.
    """
    names = []
    for recipe_text in recipes:
        title, _, _ = recipe_text.partition('üìñ')
        names.append(title.strip())
    return names

In [7]:
def extract_ingridients(recipes):
    """Extract the ingredient names of every recipe as one space-joined string.

    For each recipe text, the span between the ingredients header and the
    cooking-section marker (both matched by the regex below) is split on '-'.
    Pieces that start with an uppercase letter are treated as ingredient
    names; the remaining pieces (quantities such as "2 ...") are dropped.

    Args:
        recipes: Iterable of recipe texts.

    Returns:
        A list with exactly one string per input recipe, in input order.
        A recipe whose ingredient section cannot be located yields '' so that
        positions in the result keep matching positions in ``recipes``
        (callers map result indices back to recipe rows).
    """
    ingredients_lists = []

    for recipe in recipes:
        ingredients_part = re.search(r'–ò–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã:(.*?)üßëüèª‚Äçüç≥', recipe, re.DOTALL)
        if not ingredients_part:
            # Keep a placeholder instead of skipping the recipe: dropping it
            # would shift every later index relative to the input list.
            ingredients_lists.append('')
            continue

        # Splitting on '-' already separates names from quantities, so no
        # further dash clean-up is needed on the individual pieces.
        pieces = (piece.strip() for piece in ingredients_part.group(1).split('-'))
        names = [piece for piece in pieces if piece and piece[0].isupper()]

        ingredients_lists.append(' '.join(names))

    return ingredients_lists

In [8]:
def encode_text(text):
    """Embed ``text`` as the mean of the model's last-layer hidden states.

    Uses the notebook-level ``tokenizer`` and ``model``. The input is
    tokenized with special tokens and truncated to at most 512 tokens; the
    forward pass runs without gradient tracking.

    Returns:
        ``last_hidden_state`` averaged over dim 1 (presumably a
        ``(1, hidden_size)`` tensor for a single text - callers treat it as
        2-D; TODO confirm).
    """
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_tensors='pt',
    )
    with torch.no_grad():
        last_hidden = model(token_ids).last_hidden_state
    return last_hidden.mean(dim=1)

In [9]:
def get_list_of_recipes(dists_values, threshold):
    """Pick the indices whose score is strictly above ``threshold``.

    Args:
        dists_values: 1-D NumPy array of scores (higher is better).
        threshold: Exclusive lower bound a score must exceed to be kept.

    Returns:
        ``(indices, True)`` where ``indices`` is a NumPy array ordered by
        descending score, or ``([], False)`` when no score passes.
    """
    passing = np.nonzero(dists_values > threshold)[0]
    if len(passing) == 0:
        return [], False

    # argsort of the negated scores yields a best-first ordering.
    best_first = np.argsort(-dists_values[passing])
    return passing[best_first], True

In [10]:
# Tokenizer and encoder come from the same checkpoint; encode_text() reads
# both as notebook-level globals.
MODEL_NAME = 'cointegrated/rubert-tiny2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Data analysis

In [11]:
# Names of all entries in the recipe folder, in sorted (lexicographic) order.
all_recipe_files = sorted(os.listdir("/content/drive/MyDrive/text/recipe/"))

In [12]:
# Pair each file's stem (the name up to its first dot) with the file's text.
recipes = [
    (file.split('.')[0], read_text_file("/content/drive/MyDrive/text/recipe/" + file))
    for file in all_recipe_files
]

In [13]:
# Build the recipe table directly from the (id, text) pairs, then convert the
# ids from file-name strings to integers.
data = pd.DataFrame(recipes, columns=["recipe_id", "text"])
data["recipe_id"] = data["recipe_id"].astype(int)

In [14]:
# Order rows by recipe id and renumber the index from 0 so that row labels
# coincide with row positions.
data = data.sort_values("recipe_id").reset_index(drop=True)

# Search

По ингредиентам и рецептам

In [15]:
def get_ans_search(query, dist_threshold=85, sim_threshold=0.7, action_type="recipes"):
    """Search the recipes in ``data`` by title or by ingredient names.

    The search runs in two stages:

    1. Fuzzy matching: ``fuzz.WRatio`` scores ``query`` against every
       candidate text. If any score exceeds ``dist_threshold``, those hits
       are returned immediately.
    2. Embedding fallback: the query and every candidate are encoded with
       ``encode_text`` and compared by cosine similarity; candidates above
       ``sim_threshold`` are returned.

    Args:
        query: Search string.
        dist_threshold: Exclusive lower bound for the stage-1 fuzzy score.
        sim_threshold: Exclusive lower bound for the stage-2 cosine similarity.
        action_type: ``"recipes"`` matches against recipe names; any other
            value matches against the extracted ingredient strings.

    Returns:
        Indices into the candidate list, best match first: a NumPy array when
        something matched, otherwise an empty list.
    """
    if action_type == "recipes":
        extractor = extract_recipes_names
    else:
        extractor = extract_ingridients
    candidates = extractor(data.text.tolist())

    # Stage 1: fuzzy string matching.
    fuzzy_scores = np.array([fuzz.WRatio(query, candidate) for candidate in candidates])
    fuzzy_hits, has_fuzzy_hits = get_list_of_recipes(fuzzy_scores, dist_threshold)
    if has_fuzzy_hits:
        return fuzzy_hits

    # Stage 2: embedding similarity, used only when stage 1 found nothing.
    query_vector = encode_text(query)
    candidate_vectors = torch.stack([encode_text(candidate) for candidate in candidates])
    # Drop the middle axis of the stacked embeddings to get one row per candidate.
    candidate_matrix = candidate_vectors.reshape(
        candidate_vectors.shape[0], candidate_vectors.shape[2]
    )
    similarities = cosine_similarity(query_vector, candidate_matrix)

    semantic_hits, _ = get_list_of_recipes(similarities[0], sim_threshold)
    return semantic_hits

Тестирование поиска по рецептам

In [16]:
# Demo: search by recipe title (default action_type="recipes").
get_ans_search("–±–∞–Ω–∞–Ω–æ–≤—ã–π –∫–µ–∫—Å")

array([251,   3,   7])

In [17]:
# Show the full texts of the three recipes returned by the title search above.
data.text.loc[251], data.text.loc[3], data.text.loc[7]

('–ë–∞–Ω–∞–Ω–æ–≤—ã–π –∫–µ–∫—Å –≤ –º–∏–∫—Ä–æ–≤–æ–ª–Ω–æ–≤–∫–µ üìñ –ò–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã: - –ë–∞–Ω–∞–Ω - 2 —à—Ç. - –Ø–π—Ü–æ - 2 —à—Ç. - –°–ª–∏–≤–æ—á–Ω–æ–µ –º–∞—Å–ª–æ - 100 –≥. - –°–∞—Ö–∞—Ä - 100 –≥. - –ú—É–∫–∞ - 100 –≥. - –†–∞–∑—Ä—ã—Ö–ª–∏—Ç–µ–ª—å - 1 —á. –ª. - –°–æ–ª—å - 1 —â–µ–ø–æ—Ç–∫–∞ - –ö–æ—Ä–∏—Ü–∞ - 2 —á. –ª. - –ú–æ–ª–æ–∫–æ - 4 —Å—Ç. –ª. üßëüèª\u200düç≥ –ü—Ä–∏–≥–æ—Ç–æ–≤–ª–µ–Ω–∏–µ: 1. –ë–∞–Ω–∞–Ω—ã –æ—á–∏—Å—Ç–∏—Ç—å –æ—Ç –∫–æ–∂—É—Ä—ã, –Ω–∞—Ä–µ–∑–∞—Ç—å –Ω–∞ –Ω–µ–±–æ–ª—å—à–∏–µ –∫—É—Å–æ—á–∫–∏ –∏ —Ä–∞–∑–º—è—Ç—å –≤–∏–ª–∫–æ–π –∏–ª–∏ —Ç–æ–ª–∫—É—à–∫–æ–π –≤ –ø—é—Ä–µ. 2. –°–º–µ—à–∞—Ç—å –±–∞–Ω–∞–Ω–æ–≤–æ–µ –ø—é—Ä–µ —Å —è–π—Ü–∞–º–∏, —Ä–∞—Å—Ç–æ–ø–ª–µ–Ω–Ω—ã–º —Å–ª–∏–≤–æ—á–Ω—ã–º –º–∞—Å–ª–æ–º, —Å–∞—Ö–∞—Ä–æ–º –∏ –ø–µ—Ä–µ–º–µ—à–∞—Ç—å. 3. –ú—É–∫—É, —Ä–∞–∑—Ä—ã—Ö–ª–∏—Ç–µ–ª—å, —â–µ–ø–æ—Ç–∫—É —Å–æ–ª–∏, –∫–æ—Ä–∏—Ü—É –∏ –º–æ–ª–æ–∫–æ –¥–æ–±–∞–≤–∏—Ç—å –≤ –±–∞–Ω–∞–Ω–æ–≤—É—é —Å–º–µ—Å—å. –ü–µ—Ä–µ–º–µ—à–∞—Ç—å –¥–æ –æ–¥–Ω–æ—Ä–æ–¥–Ω–æ—Å—Ç–∏. 4. –†–∞—Å–ø—Ä–µ–¥–µ–ª–∏—Ç—å —Å–º–µ—Å—å –ø–æ —Ñ–æ—Ä–º–µ –ø—Ä–∏–≥–æ–¥–Ω–æ–π 

Тестирование поиска по ингредиентам

In [18]:
# Demo: search by ingredient names; only the three best hits are shown.
get_ans_search("–º–∞—Å–ª–æ —Å–ª–∏–≤–æ—á–Ω–æ–µ", action_type="ingridients")[:3]

array([  2,   3, 255])

In [19]:
# Show the full texts of the three recipes returned by the ingredient search above.
data.text.loc[2], data.text.loc[3], data.text.loc[255]

('–®–æ–∫–æ–ª–∞–¥–Ω—ã–π —Ñ–æ–Ω–¥–∞–Ω üìñ –ò–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã:‚†Ä - –®–æ–∫–æ–ª–∞–¥ —Ç–µ–º–Ω—ã–π - 100 –≥. - –ú–∞—Å–ª–æ —Å–ª–∏–≤–æ—á–Ω–æ–µ - 60 –≥. - –Ø–π—Ü–æ- 2 —à—Ç. - –°–∞—Ö–∞—Ä - 3 —Å—Ç.–ª. ‚†Ä - –ú—É–∫–∞ - 2 —Å—Ç.–ª. ‚†Ä - –ö–∞–∫–∞–æ - 2 —á.–ª ‚†Ä - –†–∞–∑—Ä—ã—Ö–ª–∏—Ç–µ–ª—å - 1 —á.–ª ‚†Ä - –°–æ–ª—å - 1/4 —á.–ª ‚†Ä ‚†Ä üßëüèª\u200düç≥ –ü—Ä–∏–≥–æ—Ç–æ–≤–ª–µ–Ω–∏–µ:‚†Ä 1. –®–æ–∫–æ–ª–∞–¥ –ª–æ–º–∞–µ–º –Ω–∞ –∫—É—Å–æ—á–∫–∏ –∏ —Ä–∞—Å—Ç–∞–ø–ª–∏–≤–∞–µ–º –Ω–∞ –≤–æ–¥—è–Ω–æ–π –±–∞–Ω–µ –∏–ª–∏ –º–∏–∫—Ä–æ–≤–æ–ª–Ω–æ–≤–æ–π –ø–µ—á–∏ –∫–æ—Ä–æ—Ç–∫–∏–º–∏ –∏–º–ø—É–ª—å—Å–∞–º–∏. –†–∞—Å—Ç–∞–ø–ª–∏–≤–∞–µ–º —Å–ª–∏–≤–æ—á–Ω–æ–µ –º–∞—Å–ª–æ –≤ –º–∏–∫—Ä–æ–≤–æ–ª–Ω–æ–≤–æ–π –ø–µ—á–∏ –∏ —Å–æ–µ–¥–∏–Ω—è–µ–º —Å —à–æ–∫–æ–ª–∞–¥–æ–º. 2. –í –æ—Ç–¥–µ–ª—å–Ω–æ–π –µ–º–∫–æ—Å—Ç–∏ —Å–æ–µ–¥–∏–Ω—è–µ–º—ã—Ö —è–π—Ü–∞ –∏ —Å–∞—Ö–∞—Ä, –ø–µ—Ä–µ–º–µ—à–∏–≤–∞–µ–º. 3. –í–ª–∏–≤–∞–µ–º –∫ —è–π—Ü–∞–º —Å —Å–∞—Ö–∞—Ä–æ–º —à–æ–∫–æ–ª–∞–¥–Ω–æ-–º–∞—Å–ª—è–Ω—É—é —Å–º–µ—Å—å, –¥–æ–±–∞–≤–ª—è–µ–º –º—É–∫—É, —Å–æ–ª—å, —Ä–∞–∑—Ä—ã—Ö–ª–∏—Ç–µ–ª—å –∏ –ø–µ—Ä–µ–º–µ—à–

# RecSys

In [20]:
def get_recommendation_by_fav(recipe_id, dist_threshold=0.5, sim_threshold=0.5):
    """Recommend recipes whose titles resemble a favourite recipe's title.

    Every other recipe's title is compared with the favourite's title in two
    independent ways: fuzzy string score (``fuzz.WRatio``) and cosine
    similarity of ``encode_text`` embeddings. The top 50 candidates of each
    ranking are intersected, keeping the embedding ranking's order.

    Args:
        recipe_id: ``recipe_id`` value of the favourite recipe in ``data``.
        dist_threshold: Exclusive lower bound for the fuzzy score.
        sim_threshold: Exclusive lower bound for the cosine similarity.

    Returns:
        A list of at most 9 indices ordered by descending cosine similarity.
        NOTE: these are positions within the recipes that EXCLUDE the
        favourite (``data[data["recipe_id"] != recipe_id]``), not
        ``recipe_id`` values and not row labels of ``data``; select rows with
        ``.iloc`` on that filtered frame.

    Raises:
        IndexError: If no row of ``data`` has the given ``recipe_id``.
    """
    recipes = data[data["recipe_id"] != recipe_id].text.tolist()

    anchor_recipe_name = extract_recipes_names(data[data["recipe_id"] == recipe_id].text.tolist())[0]
    recipes_names = extract_recipes_names(recipes)

    # Ranking 1: fuzzy title similarity.
    lev_dists = np.array([fuzz.WRatio(anchor_recipe_name, name) for name in recipes_names])
    fuzzy_ranked, _ = get_list_of_recipes(lev_dists, dist_threshold)
    # Build the membership set once; it is loop-invariant for the filter below.
    first_step = set(fuzzy_ranked[:50])

    # Ranking 2: embedding similarity of the titles.
    anchor_vector = encode_text(anchor_recipe_name)
    recipe_vectors = torch.stack([encode_text(name) for name in recipes_names])
    cosine_similarities = cosine_similarity(
        anchor_vector,
        recipe_vectors.reshape(recipe_vectors.shape[0], recipe_vectors.shape[2]),
    )
    semantic_ranked, _ = get_list_of_recipes(cosine_similarities[0], sim_threshold)
    second_step = semantic_ranked[:50]

    # Keep the embedding order, restricted to candidates both rankings agree on.
    return [item for item in second_step if item in first_step][:9]

In [21]:
# Demo: recommendations for the favourite recipe whose recipe_id is 50.
predict_recipes = get_recommendation_by_fav(50)

In [22]:
# get_recommendation_by_fav returns positions within the frame that excludes
# the favourite recipe, so rows must be selected positionally with .iloc.
# Plain [] is label-based: it picks the wrong row for every position after
# the excluded recipe and raises KeyError for the excluded label itself.
favourite_text = data[data['recipe_id'] == 50].text.values[0]
recommended_texts = data[data['recipe_id'] != 50].text.iloc[predict_recipes].values
print(f"–†–µ—Ü–µ–ø—Ç –∏–∑ –∏–∑–±—Ä–∞–Ω–Ω–æ–≥–æ:\n{favourite_text}\n\n–†–µ–∫–æ–º–µ–Ω–¥—É–µ–º—ã–µ —Ä–µ—Ü–µ–ø—Ç—ã:\n{recommended_texts}")

–†–µ—Ü–µ–ø—Ç –∏–∑ –∏–∑–±—Ä–∞–Ω–Ω–æ–≥–æ:
–°–º–µ—Ç–∞–Ω–Ω—ã–π —Ç–æ—Ä—Ç üìñ –ò–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã: - –°–º–µ—Ç–∞–Ω–∞ - 2 —Å—Ç–∞–∫–∞–Ω–∞ - –ú—É–∫–∞ - 3 —Å—Ç–∞–∫–∞–Ω–∞ - –°–∞—Ö–∞—Ä - 3/4 —Å—Ç–∞–∫–∞–Ω–∞ (–ø–ª—é—Å 1/2 —Å—Ç–∞–∫–∞–Ω–∞ –≤ –∫—Ä–µ–º) - –°–æ–¥–∞ - 1/4 —á. –ª. - –°–æ–ª—å - ‚Äî 1/4 —á. –ª. - –í–∞–Ω–∏–ª—å–Ω—ã–π —Å–∞—Ö–∞—Ä ‚Äî 1/3 —á. –ª. üßëüèª‚Äçüç≥ –ü—Ä–∏–≥–æ—Ç–æ–≤–ª–µ–Ω–∏–µ: 1. –†–∞–∑–æ–≥—Ä–µ—Ç—å –¥—É—Ö–æ–≤–∫—É –¥–æ 230-240 –≥—Ä–∞–¥—É—Å–æ–≤. –°–º–∞–∑–∞—Ç—å –ø—Ä–æ—Ç–∏–≤–µ–Ω—å —Ä–∞—Å—Ç–∏—Ç–µ–ª—å–Ω—ã–º –º–∞—Å–ª–æ–º. 2. –ü–µ—Ä–µ–º–µ—à–∞—Ç—å –≤ –±–æ–ª—å—à–æ–π –º–∏—Å–∫–µ —Å–º–µ—Ç–∞–Ω—É, —Å–∞—Ö–∞—Ä –∏ —Å–æ–ª—å. –í—Å—ã–ø–∞—Ç—å –º—É–∫—É –∏ —Å–æ–¥—É, –∑–∞–º–µ—Å–∏—Ç—å —Ç–µ—Å—Ç–æ. 3. –†–∞–∑–¥–µ–ª–∏—Ç—å –≥–æ—Ç–æ–≤–æ–µ —Ç–µ—Å—Ç–æ –Ω–∞ 4 —Ä–∞–≤–Ω—ã–µ —á–∞—Å—Ç–∏. –ò–∑ –∫–∞–∂–¥–æ–π —á–∞—Å—Ç–∏ —Å–∫–∞–ª–∫–æ–π —Ä–∞—Å–∫–∞—Ç–∞—Ç—å –∫—Ä—É–≥. –í—ã–ª–æ–∂–∏—Ç—å –∫–æ—Ä–∂–∏ –Ω–∞ –ø–æ–¥–≥–æ—Ç–æ–≤–ª–µ–Ω–Ω—ã–π –ø—Ä–æ—Ç–∏–≤–µ–Ω—å. –í—ã–ø–µ–∫–∞—Ç—å –≤ —Ç–µ—á–µ–Ω–∏–µ 10-15 –º–∏–Ω—É—Ç. 4. –¢–µ–º –≤—Ä–µ–º–µ–Ω–µ–º 