In [139]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', None)

# Виводити всі стовпці без обмежень
pd.set_option('display.max_columns', None)

# Не обрізати текст у клітинках
pd.set_option('display.max_colwidth', None)

In [164]:
dataset = pd.read_csv("../../all_recipes.csv")

In [165]:
print(dataset.iloc[0,0])

No-Bake Nut Cookies

Ingredients:
- 1 c. firmly packed brown sugar
- 1/2 c. evaporated milk
- 1/2 tsp. vanilla
- 1/2 c. broken nuts (pecans)
- 2 Tbsp. butter or margarine
- 3 1/2 c. bite size shredded rice biscuits

Directions:
- In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.
- Stir over medium heat until mixture bubbles all over top.
- Boil and stir 5 minutes more. Take off heat.
- Stir in vanilla and cereal; mix well.
- Using 2 teaspoons, drop and shape into 30 clusters on wax paper.
- Let stand until firm, about 30 minutes.


In [166]:
def split_func(text):
    new_data = text.split("\n\n")
    title = new_data[0]
    ingredients = new_data[1].replace("Ingredients:\n", "").replace("- ", "").strip().split("\n")
    directions = new_data[2].replace("Directions:\n", "").replace("- ", "").strip().split("\n")
    return pd.Series([title, ingredients, directions])

In [None]:
df = pd.DataFrame()
df[["title", "ingredients", "directions"]] = dataset["input"].head(10).apply(lambda text: split_func(text))

In [168]:
df.head(2)

Unnamed: 0,title,ingredients,directions
0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evaporated milk, 1/2 tsp. vanilla, 1/2 c. broken nuts (pecans), 2 Tbsp. butter or margarine, 3 1/2 c. bite size shredded rice biscuits]","[In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine., Stir over medium heat until mixture bubbles all over top., Boil and stir 5 minutes more. Take off heat., Stir in vanilla and cereal; mix well., Using 2 teaspoons, drop and shape into 30 clusters on wax paper., Let stand until firm, about 30 minutes.]"
1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chicken breasts, 1 can cream of mushroom soup, 1 carton sour cream]","[Place chipped beef on bottom of baking dish., Place chicken on top of beef., Mix soup and cream together; pour over chicken. Bake, uncovered, at 275° for 3 hours.]"


In [169]:
len((dataset.drop_duplicates()))

2147248

In [None]:
import re
from fractions import Fraction

def parse_ingredient(ingredient_line):
    match = re.match(r'^(\d+\s\d+/\d+|\d+/\d+|\d+)(\s)?([a-zA-Z\.]+)?\s*(.*)', ingredient_line)
    if not match:
        return None, None, ingredient_line

    raw_amount = match.group(1)
    unit = match.group(3)
    ingredient = match.group(4)

    if ' ' in raw_amount:
        whole, frac = raw_amount.split()
        amount = float(whole) + float(Fraction(frac))
    else:
        amount = float(Fraction(raw_amount))

    unit = unit.replace('.', '').lower() if unit else None
    return amount, unit, ingredient.strip()

In [171]:
df["parsed_ingredients"] = df["ingredients"].apply(
    lambda lst: [parse_ingredient(ing) for ing in lst]
)

from decimal import Decimal

def clean_tuple(tup):
    return tuple(
        float(val) if isinstance(val, Decimal)
        else val.replace('c', 'cup') if isinstance(val, str) and val == 'c'
        else val
        for val in tup
    )

df["parsed_ingredients"] = df["parsed_ingredients"].apply(
    lambda lst: [clean_tuple(tup) for tup in lst]
)

df["parsed_ingredients"]

0                                                                                                                                                                         [(1.0, cup, firmly packed brown sugar), (0.5, cup, evaporated milk), (0.5, tsp, vanilla), (0.5, cup, broken nuts (pecans)), (2.0, tbsp, butter or margarine), (3.5, cup, bite size shredded rice biscuits)]
1                                                                                                                                                                                                                                              [(1.0, small, jar chipped beef, cut up), (4.0, boned, chicken breasts), (1.0, can, cream of mushroom soup), (1.0, carton, sour cream)]
2                                                                                                                                                                                 [(2.0, None, (16 oz.) pkg. frozen corn), (1.0, None, (8 oz.) pkg. cream ch

In [172]:
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..')))

from backend.api.app import create_app, db

app = create_app()

with app.app_context():
    nutrient_df = pd.read_sql("SELECT * FROM nutrient_info", db.engine)


In [173]:
import pandas as pd
import re
import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_md")
tqdm.pandas()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

def get_vector(text):
    return nlp(text).vector

def preprocess_dataframe(df, text_col):
    df["cleaned"] = df[text_col].apply(clean_text)
    df["lemmatized"] = df["cleaned"].progress_apply(lemmatize_text)
    df["vector"] = df["lemmatized"].progress_apply(get_vector)
    df["category"] = df["description"].apply(lambda x: x.split(",")[0].strip())
    df["category"] = df["category"].apply(lemmatize_text)
    return df

nutrient_df = preprocess_dataframe(nutrient_df, "description")

100%|██████████| 85149/85149 [06:46<00:00, 209.70it/s]
100%|██████████| 85149/85149 [07:25<00:00, 191.08it/s]


In [174]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_most_similar(text, df, top_k=1):
    cleaned = clean_text(text)
    lemmatized = lemmatize_text(cleaned)
    vec = get_vector(lemmatized).reshape(1, -1)

    # Порівняння з векторами в таблиці
    vectors = np.vstack(df["vector"].values)
    similarities = cosine_similarity(vec, vectors)[0]

    # Вибір топ-K найближчих
    top_k_indices = similarities.argsort()[::-1][:top_k]
    results = df.iloc[top_k_indices].copy()
    results["similarity"] = similarities[top_k_indices]
    return results

In [175]:
import csv
nutrient_df.to_csv("nutrient_table_vectorized.csv", index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)

In [176]:
resultTFIDF = pd.DataFrame()
for ingredient in df["parsed_ingredients"].to_list():
    for tup in ingredient:
        ingredient_name = tup[2]
        resultTFIDF = pd.concat([resultTFIDF, find_most_similar("rice brown", nutrient_df)["description"]], axis=0)
resultTFIDF

Unnamed: 0,description
69777,"Rice flour, brown"
69777,"Rice flour, brown"
69777,"Rice flour, brown"
69777,"Rice flour, brown"
69777,"Rice flour, brown"
69777,"Rice flour, brown"
69777,"Rice flour, brown"
69777,"Rice flour, brown"
69777,"Rice flour, brown"
69777,"Rice flour, brown"


In [177]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = nutrient_df["cleaned"].tolist()

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

def filter_and_prioritize_by_category(df, query):
    query_words = set(clean_text(query).split())

    def get_overlap_score(category):
        if pd.isna(category):
            return 0, float('inf')
        cat_words = set(clean_text(category).split())
        return len(query_words & cat_words), len(cat_words)

    df = df.copy()
    df[["overlap_score", "category_length"]] = df["category"].apply(get_overlap_score).apply(pd.Series)

    df = df.sort_values(["overlap_score", "category_length"], ascending=[False, True])
    return df

def tfidf_search(query, measure, top_k=1):
    query_clean = clean_text(query)

    candidates = nutrient_df[nutrient_df["measure"] == measure].copy()

    #
    candidates_category = filter_and_prioritize_by_category(candidates, query)
    if not candidates_category.empty:
        candidates = candidates_category.copy()

    corpus = candidates["cleaned"].tolist()
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    query_vec = vectorizer.transform([query_clean])
    sims = cosine_similarity(query_vec, tfidf_matrix)[0]

    top_k_idx = sims.argsort()[::-1][:top_k]
    results = candidates.iloc[top_k_idx].copy()
    results["similarity"] = sims[top_k_idx]
    return results

result = pd.DataFrame()
for ingredient in df["parsed_ingredients"].to_list():
    for tup in ingredient:
        ingredient_name = tup[2]
        result = pd.concat([result, tfidf_search(ingredient_name, tup[1])], axis=0)

ValueError: Columns must be same length as key

In [178]:
result[["category", "description"]]

Unnamed: 0,category,description
78766,sweetener,"Sweeteners, sugar substitute, granulated, brown"
52892,milk,"Milk, canned, evaporated, with added vitamin A"
82014,vanilla extract,Vanilla extract
55927,nut,"Nuts, pecans, oil roasted, with salt added"
23640,butter,"Butter, without salt"
25309,candy,"Candies, MARS SNACKFOOD US, SKITTLES Original Bite Size Candies"
1053,apple,"Apples, raw, fuji, with skin (Includes foods for USDA's Food Distribution Program)"


In [None]:
for ingredient in df["parsed_ingredients"].to_list():
    for tup in ingredient:
        print(tup[2])

firmly packed brown sugar
evaporated milk
vanilla
broken nuts (pecans)
butter or margarine
bite size shredded rice biscuits


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = nutrient_df["cleaned"].tolist()

vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(corpus)

def countvec_search(query, measure, top_k=5):
    query_clean = clean_text(query)
    query_vec = vectorizer.transform([query_clean])
    sims = cosine_similarity(query_vec, count_matrix)[0]

    filtered_df = nutrient_df[nutrient_df["measure"] == measure]
    filtered_matrix = count_matrix[filtered_df.index]

    sims_filtered = cosine_similarity(query_vec, filtered_matrix)[0]

    top_k_idx = sims_filtered.argsort()[::-1][:top_k]
    results = filtered_df.iloc[top_k_idx].copy()
    results["similarity"] = sims_filtered[top_k_idx]
    return results

resultCount = pd.DataFrame()
for ingredient in df["parsed_ingredients"].to_list():
    for tup in ingredient:
        ingredient_name = tup[2]
        resultCount = pd.concat([resultCount, tfidf_search(ingredient_name, tup[1])], axis=0)
resultCount["description"]

78766                    Sweeteners, sugar substitute, granulated, brown
52892                     Milk, canned, evaporated, with added vitamin A
82014                                                    Vanilla extract
55927                         Nuts, pecans, oil roasted, with salt added
23640                                               Butter, without salt
25309    Candies, MARS SNACKFOOD US, SKITTLES Original Bite Size Candies
Name: description, dtype: object