In [None]:
# === Imports ===
import re
import spacy
import pandas as pd
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# === Load Data ===
# Load the recipe dataset into a DataFrame. The functions below read its
# 'ingredients' column and split each value on commas.
# NOTE(review): the absolute '/content/...' path is environment-specific —
# adjust it when running outside the environment this was written in.
recipe_df = pd.read_csv('/content/70000_recipes_nutrients_cleaned_final.csv')

# === Step 1: Extract All Known Ingredients from Dataset ===
def extract_all_ingredients(df):
    """Collect the unique ingredient names found in ``df['ingredients']``.

    Each row is expected to be a comma-separated string of ingredients.
    Every piece is stripped and lower-cased before de-duplication.

    Args:
        df: Any object whose ``['ingredients']`` item is an iterable of rows
            (for example a pandas DataFrame).

    Returns:
        A sorted list of unique, non-empty ingredient strings.
    """
    all_ingredients = set()
    for row in df['ingredients']:
        # Missing CSV cells arrive as float NaN (or None); they have no
        # .split() and carry no ingredients, so skip them instead of crashing.
        if not isinstance(row, str):
            continue
        for part in row.split(','):
            name = part.strip().lower()
            if name:  # ignore blanks produced by trailing or doubled commas
                all_ingredients.add(name)
    # Sort so the result is reproducible; raw set order can differ per run.
    return sorted(all_ingredients)

# Unique raw (not yet cleaned) ingredient strings from the dataset.
# NOTE(review): not referenced again in the visible code — confirm it is
# still needed.
known_ingredients = extract_all_ingredients(recipe_df)

# === Step 2: Clean Text Utility Function ===
# A number (integer, decimal or simple fraction) followed by a unit of
# measure, e.g. "1/4 cup", "2.5 kg", "2 packages".
_QUANTITY_UNIT_RE = re.compile(
    r'\b(\d+\/\d+|\d+\.\d+|\d+)\s*'
    r'(cups?|tbsp|tablespoons?|tsp|teaspoons?|grams?|kg|g|ml|l|pounds?|oz|'
    r'cloves?|slices?|dash|pinch|packages?|packets?)\b'
)
# Parenthesised asides such as "(about 2 cups)".
_PARENTHETICAL_RE = re.compile(r'\(.*?\)')
# Characters that separate words and must become a space, not vanish.
_WORD_SEPARATOR_RE = re.compile(r'[\s/-]+')
# Preparation words that do not identify the ingredient itself.
_DESCRIPTOR_RE = re.compile(
    r'\b(chopped|minced|diced|fresh|thinly sliced|grated|sliced|peeled|'
    r'optional|divided|rinsed|drained|beaten|ground)\b'
)
_NON_LETTER_RE = re.compile(r'[^a-zA-Z ]')
_MULTI_SPACE_RE = re.compile(r' +')


def clean_ingredient_text(text):
    """Reduce a raw ingredient phrase to its bare ingredient name.

    The text is lower-cased, then quantity+unit pairs, parenthesised asides
    and preparation descriptors are removed. Anything that is not an ASCII
    letter or a space is dropped and runs of spaces are collapsed.

    Args:
        text: Raw ingredient phrase, e.g. ``"1/4 cup chopped cucumber"``.

    Returns:
        The cleaned name (``"cucumber"``), or ``""`` when nothing remains or
        when ``text`` is not a string (e.g. ``None`` or a float NaN).
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Quantities go first, while "1/4" is still intact as a fraction.
    text = _QUANTITY_UNIT_RE.sub('', text)
    text = _PARENTHETICAL_RE.sub('', text)
    # Turn hyphens, slashes and any whitespace into single spaces *before*
    # stripping non-letters; otherwise "extra-virgin" collapses into
    # "extravirgin" and "thinly-sliced" escapes the descriptor filter.
    text = _WORD_SEPARATOR_RE.sub(' ', text)
    text = _DESCRIPTOR_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    return _MULTI_SPACE_RE.sub(' ', text).strip()

# === Step 3: Cleaned Ingredient Mapping for Better Matching ===
def create_cleaned_ingredient_mapping(df):
    """Map each cleaned ingredient name to the raw strings that produce it.

    Every row of ``df['ingredients']`` is split on commas; each piece is
    cleaned with ``clean_ingredient_text`` and, when the cleaned form is
    non-empty, the stripped lower-cased raw piece is recorded under it.

    Args:
        df: Any object whose ``['ingredients']`` item is an iterable of
            comma-separated strings (for example a pandas DataFrame).

    Returns:
        A ``defaultdict(list)`` of ``cleaned name -> [raw variants]``. Each
        raw variant is listed once, in first-seen order.
    """
    mapping = defaultdict(list)
    seen_pairs = set()
    for row in df['ingredients']:
        # Empty CSV cells come back as float NaN; there is nothing to split.
        if not isinstance(row, str):
            continue
        for ing in row.split(','):
            ing_clean = clean_ingredient_text(ing)
            if not ing_clean:
                continue
            raw = ing.strip().lower()
            # Record each (cleaned, raw) pair only once: without this, a
            # common ingredient is appended again for every recipe using it.
            pair = (ing_clean, raw)
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            mapping[ing_clean].append(raw)
    return mapping

# Map each cleaned ingredient name to the raw dataset strings behind it.
ingredient_mapping = create_cleaned_ingredient_mapping(recipe_df)
# The cleaned names (the mapping's keys) are the candidates that user input
# is matched against further down.
cleaned_ingredients = list(ingredient_mapping.keys())

# === Step 4: Extract Ingredients from User Input ===
def extract_ingredients_from_text(recipe_text):
    """Turn free-form recipe text into a list of cleaned ingredient names.

    The text is processed one line at a time. Each line goes through
    ``clean_ingredient_text`` — the same routine used for the dataset — so
    user input and dataset entries are normalised identically (same unit
    list, same descriptor list, same whitespace handling).

    Args:
        recipe_text: Multi-line string with one ingredient per line.

    Returns:
        The cleaned, non-empty lines in their original order.
    """
    extracted = []
    for line in recipe_text.splitlines():
        cleaned = clean_ingredient_text(line)
        if cleaned:  # blank lines and quantity-only lines clean to ''
            extracted.append(cleaned)
    return extracted

# === Step 5: Match Ingredients to Cleaned Dataset using Cosine Similarity ===
def match_ingredients_to_cleaned_dataset(user_ings, cleaned_ings, threshold=0.6):
    """Match each user ingredient to its most similar dataset ingredient.

    Both collections are embedded with one TF-IDF vectorizer fitted on their
    union, and every user ingredient is compared to every dataset ingredient
    by cosine similarity.

    Args:
        user_ings: Iterable of cleaned user ingredient strings.
        cleaned_ings: Iterable of cleaned dataset ingredient strings.
        threshold: Minimum cosine similarity for a match to be accepted.

    Returns:
        A list of ``(user_ingredient, matched_ingredient_or_None)`` tuples in
        the order of ``user_ings``. The second item is ``None`` when the best
        similarity is below ``threshold`` or there are no candidates.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer`` when none of the
            strings contains a usable token (empty vocabulary).
    """
    # Materialise once: the inputs are concatenated and indexed below, which
    # generators, tuples and dict views would not all support.
    user_ings = list(user_ings)
    cleaned_ings = list(cleaned_ings)

    if not user_ings:
        return []
    if not cleaned_ings:
        # No candidates: nothing can match, and argmax over an empty row
        # would raise.
        return [(user_ing, None) for user_ing in user_ings]

    vectorizer = TfidfVectorizer().fit(cleaned_ings + user_ings)
    cleaned_vecs = vectorizer.transform(cleaned_ings)
    user_vecs = vectorizer.transform(user_ings)

    # One batched call instead of one transform + similarity per ingredient.
    # The result is a dense (len(user_ings), len(cleaned_ings)) array.
    sims = cosine_similarity(user_vecs, cleaned_vecs)
    best_indices = sims.argmax(axis=1)

    matches = []
    for row, (user_ing, best) in enumerate(zip(user_ings, best_indices)):
        if sims[row, best] >= threshold:
            matches.append((user_ing, cleaned_ings[best]))
        else:
            matches.append((user_ing, None))
    return matches

# === MAIN EXECUTION ===

# Sample recipe text, one ingredient per line
user_recipe = """
1 cup quinoa
2 tablespoons lemon juice
1/4 cup chopped cucumber
1/4 cup diced tomatoes
2 tablespoons chopped parsley
1 tablespoon olive oil
Salt and pepper to taste

"""

# Reduce the recipe text to bare ingredient names
extracted = extract_ingredients_from_text(user_recipe)

# Pair every extracted name with its closest cleaned dataset ingredient
matches = match_ingredients_to_cleaned_dataset(extracted, cleaned_ingredients)

# Report each pairing; unmatched entries are shown as 'No match'
print("User Input ↔ Matched Ingredient")
for user_ing, matched in matches:
    print(f"{user_ing} ↔ {matched or 'No match'}")


First prompt: I need to perform NER-based ingredient matching with cosine similarity.

Last prompt: It should match against the dataset after execution, using a 0.6 threshold.

**COSINE SIMILARITY**

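The third section focuses on matching ingredients using cosine similarity. It begins by compiling all known ingredients from the dataset and cleaning them using regex to remove quantities and descriptors. It then takes a user-input recipe, extracts and cleans the ingredients similarly, and represents both user and known ingredients as vectors using TF-IDF. The cosine similarity between each user ingredient and the dataset's ingredients is calculated, and the closest match at or above a given threshold (0.6 by default) is selected; ingredients with no sufficiently close candidate are reported as "No match". This technique allows for fuzzy matching based on shared words, accommodating variations in wording (e.g., "olive oil" vs "extra virgin olive oil") and improving alignment between user-provided ingredients and dataset entries. Note that the default word-level TF-IDF treats inflected forms such as "tomato" and "tomatoes" as different tokens, so they match only when other words overlap; handling them reliably would require stemming/lemmatization or character n-grams.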