# Imports & funcs

In [1]:
# !pip install fuzzywuzzy transformers

In [None]:
import os
import re

import pandas as pd
import numpy as np
import torch
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [3]:
# Show full cell contents when displaying DataFrames (no column-width truncation).
pd.options.display.max_colwidth = None

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [5]:
def read_text_file(file_path):
    """Return the full contents of a text file decoded as UTF-8.

    The encoding is given explicitly so that non-ASCII content (Cyrillic
    text, emoji) is decoded the same way regardless of the platform's
    default locale encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a single string.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

In [6]:
def extract_recipes_names(recipes):
    """Return the name part of each recipe text.

    The name is everything before the first '📖' marker, with surrounding
    whitespace removed. A text without the marker is returned stripped
    in full.
    """
    names = []
    for recipe_text in recipes:
        title, _, _ = recipe_text.partition('📖')
        names.append(title.strip())
    return names

In [7]:
def extract_ingridients(recipes):
    """Extract the capitalised ingredient names from each recipe text.

    For every recipe, the text between the 'Ингредиенты:' marker and the
    '🧑🏻‍🍳' marker is split on '-'. Each fragment is stripped, empty
    fragments are dropped, and only fragments whose first character is
    upper-case are kept (this discards lower-case / numeric fragments such
    as quantities).

    Args:
        recipes: Iterable of recipe texts.

    Returns:
        A list with exactly one entry per input recipe, in input order.
        Each entry is a list of ingredient strings; it is empty when the
        recipe has no ingredients section. Keeping one entry per recipe
        means positions in the result can be used as recipe indices.
    """
    # Compiled once instead of on every loop iteration.
    section_pattern = re.compile(r'Ингредиенты:(.*?)🧑🏻‍🍳', re.DOTALL)

    ingredients_lists = []
    for recipe in recipes:
        ingredients_part = section_pattern.search(recipe)
        if not ingredients_part:
            # Keep a placeholder so the result stays aligned with `recipes`.
            ingredients_lists.append([])
            continue

        # After splitting on '-', no fragment can contain '-', so no further
        # dash-based clean-up is needed.
        fragments = (
            fragment.strip()
            for fragment in ingredients_part.group(1).split('-')
        )
        ingredients_lists.append(
            [fragment for fragment in fragments if fragment and fragment[0].isupper()]
        )

    return ingredients_lists

In [8]:
def encode_text(text):
    """Embed a text with the module-level `tokenizer` and `model`.

    The text is tokenized (special tokens added, truncated to at most 512
    tokens), passed through the model without gradient tracking, and the
    last hidden state is averaged over dimension 1.

    Returns:
        A torch tensor; presumably of shape (1, hidden_size) for a single
        input text -- TODO confirm against the model output.
    """
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    with torch.no_grad():
        hidden_states = model(token_ids).last_hidden_state
    return hidden_states.mean(dim=1)

In [9]:
def get_list_of_recipes(dists_values, threshold):
    """Select indices whose score is strictly above `threshold`.

    Args:
        dists_values: NumPy array of scores.
        threshold: Exclusive lower bound a score must exceed.

    Returns:
        A pair `(indices, found)`. When at least one score passes,
        `indices` is a NumPy array of positions ordered by descending
        score and `found` is True; otherwise `([], False)`.
    """
    passing = np.where(dists_values > threshold)[0]
    if len(passing) == 0:
        return [], False

    # Negate the scores so that an ascending argsort yields descending order.
    order = np.argsort(-dists_values[passing])
    return passing[order], True

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
_MODEL_NAME = 'cointegrated/rubert-tiny2'
tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
model = AutoModel.from_pretrained(_MODEL_NAME)

# Data analysis

In [11]:
# File names in the recipe directory, in lexicographic order.
all_recipe_files = os.listdir("/content/drive/MyDrive/text/recipe/")
all_recipe_files.sort()

In [12]:
# One (id, text) pair per file; the id is the file name up to its first '.'.
recipes = [
    (
        file_name.partition('.')[0],
        read_text_file("/content/drive/MyDrive/text/recipe/" + file_name),
    )
    for file_name in all_recipe_files
]

In [13]:
# Tabulate the (id, text) pairs and make the id column numeric.
data = pd.DataFrame(
    {
        "recipe_id": [pair[0] for pair in recipes],
        "text": [pair[1] for pair in recipes],
    }
)
data["recipe_id"] = data["recipe_id"].astype(int)

In [14]:
# Order rows by numeric recipe id and renumber the index from 0.
data = data.sort_values("recipe_id").reset_index(drop=True)

# Search

По ингредиентам и рецептам

In [15]:
def _search_fuzzy_then_semantic(query, fuzzy_texts, semantic_texts, dist_threshold, sim_threshold):
    """Rank candidates by fuzzy score, falling back to embedding similarity.

    `fuzzy_texts` and `semantic_texts` are parallel lists (same length, same
    order); the returned indices are positions in those lists.
    """
    if not fuzzy_texts:
        # Nothing to search; also avoids concatenating an empty tensor list.
        return []

    # 1 step: fuzzy string matching.
    lev_dists = np.array([fuzz.WRatio(query, text) for text in fuzzy_texts])
    ans_recipes, flag = get_list_of_recipes(lev_dists, dist_threshold)
    if flag:
        return ans_recipes

    # 2 step: cosine similarity between the query and candidate embeddings.
    search_vector = encode_text(query)
    # Each embedding has a leading dimension of 1 (the original code stacked
    # them and dropped that axis); concatenating along dim 0 gives the same
    # 2-D matrix directly.
    candidate_vectors = torch.cat([encode_text(text) for text in semantic_texts], dim=0)
    cosine_similarities = cosine_similarity(search_vector, candidate_vectors)

    ans_recipes, _ = get_list_of_recipes(cosine_similarities[0], sim_threshold)
    # get_list_of_recipes already yields [] when nothing passes the threshold.
    return ans_recipes


def get_ans_search(query, dist_threshold=90, sim_threshold=0.65, action_type="recipes"):
    """Search the recipes in `data` by recipe name or by ingredients.

    The search runs in two stages: fuzzy matching with `fuzz.WRatio` first,
    and, only if no candidate scores above `dist_threshold`, cosine
    similarity of text embeddings against `sim_threshold`.

    Args:
        query: Search string.
        dist_threshold: Fuzzy score a candidate must strictly exceed.
        sim_threshold: Cosine similarity a candidate must strictly exceed.
        action_type: "recipes" to match recipe names, "ingridients" to match
            the space-joined ingredient list of each recipe.

    Returns:
        Positions (into `data.text`) of matching recipes ordered by
        descending score, as a NumPy array; an empty list when nothing
        matches, when there are no recipes, or when `action_type` is not
        recognised.
    """
    recipes = data.text.tolist()

    if action_type == "recipes":
        recipes_names = extract_recipes_names(recipes)
        return _search_fuzzy_then_semantic(
            query, recipes_names, recipes_names, dist_threshold, sim_threshold
        )

    if action_type == "ingridients":
        joined = [' '.join(items) for items in extract_ingridients(recipes)]
        # Fuzzy matching uses the lower-cased text; embeddings use it as is.
        lowered = [text.lower() for text in joined]
        return _search_fuzzy_then_semantic(
            query, lowered, joined, dist_threshold, sim_threshold
        )

    return []

# RecSys

In [16]:
def get_recommendation_by_fav(recipe_id, top_n=9):
    """Recommend the recipes most similar to a favourite recipe.

    The favourite's text embedding is compared by cosine similarity with
    the embedding of every other recipe in `data`.

    Args:
        recipe_id: Value of `recipe_id` identifying the favourite recipe.
        top_n: Maximum number of recommendations to return (default 9).

    Returns:
        A pair `(texts, similarities)`: a list of the recommended recipe
        texts ordered by descending similarity, and a NumPy array with the
        matching similarity scores. Both are empty when `data` contains no
        other recipes.

    Raises:
        IndexError: If `recipe_id` is not present in `data`.
    """
    favourite_texts = data[data["recipe_id"] == recipe_id].text.tolist()
    if not favourite_texts:
        raise IndexError(f"recipe_id {recipe_id!r} not found in data")

    recipes = data[data["recipe_id"] != recipe_id].text.tolist()
    if not recipes:
        return [], np.array([])

    search_vector = encode_text(favourite_texts[0])
    # Each embedding has a leading dimension of 1; concatenating along dim 0
    # yields the 2-D matrix cosine_similarity expects.
    recipe_vectors = torch.cat([encode_text(recipe) for recipe in recipes], dim=0)

    cosine_similarities = cosine_similarity(search_vector, recipe_vectors)[0]

    # Negate so that an ascending argsort gives descending similarity.
    top_indices = np.argsort(-cosine_similarities)[:top_n]

    # `recipes` is a plain list, so it must be indexed element by element;
    # indexing it with a NumPy index array raises TypeError.
    return [recipes[i] for i in top_indices], cosine_similarities[top_indices]