<a href="https://colab.research.google.com/github/Manav716/Booking.com/blob/main/CookGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# %% [markdown]
# # CookGPT (Pantry → Recipes) - Setup

# %%capture
!pip -q install sentence-transformers faiss-cpu transformers accelerate datasets unidecode


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# %% [markdown]
# ## Download dataset

import os, urllib.request

DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

PRIMARY_URL = "https://raw.githubusercontent.com/jakevdp/open-recipe-data/main/recipeitems.json.gz"  # Source: jakevdp/open-recipe-data
PRIMARY_PATH = os.path.join(DATA_DIR, "recipeitems.json.gz")

try:
    if not os.path.exists(PRIMARY_PATH):
        print("Downloading primary dataset...")
        urllib.request.urlretrieve(PRIMARY_URL, PRIMARY_PATH)
    else:
        print("Primary dataset already present.")
    print("Done.")
except Exception as e:
    print("Primary download failed:", e)


Downloading primary dataset...
Done.


In [3]:
# %% [markdown]
# ## Load & normalize the recipes (JSON Lines)

import gzip, json, re
from unidecode import unidecode

RAW_RECIPES = []

def safe_strip(x):
    if isinstance(x, str):
        return unidecode(x).strip()
    return x

def normalize_ingredients(ing):
    if isinstance(ing, list):
        return [safe_strip(i).lower() for i in ing if isinstance(i, str)]
    if isinstance(ing, str):
        # split by commas/semicolons if a string
        parts = re.split(r"[;,]", ing)
        return [safe_strip(p).lower() for p in parts if safe_strip(p)]
    return []

loaded = 0
max_items = 50000   # for quick demo; increase to, say, 200_000 if your Colab GPU/CPU RAM allows
try:
    with gzip.open(PRIMARY_PATH, "rt", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
                title = safe_strip(obj.get("title") or obj.get("name") or "")
                if not title:
                    continue
                ingredients = normalize_ingredients(obj.get("ingredients") or obj.get("ingredient") or obj.get("ingredients_list") or [])
                instructions = obj.get("instructions") or obj.get("directions") or obj.get("steps") or ""
                if isinstance(instructions, list):
                    steps = [safe_strip(s) for s in instructions if isinstance(s, str) and s.strip()]
                else:
                    # split into sentences/steps
                    steps = [s.strip() for s in re.split(r"(?<=[.!?])\s+", str(instructions)) if s.strip()]

                recipe = {
                    "title": title,
                    "ingredients": ingredients,
                    "steps": steps[:12],  # cap for brevity
                    "tags": [t.lower() for t in (obj.get("categories") or obj.get("tags") or []) if isinstance(t, str)],
                    "time_minutes": obj.get("time") or obj.get("total_time") or obj.get("cook_time") or obj.get("prep_time") or None
                }
                RAW_RECIPES.append(recipe)
                loaded += 1
                if loaded >= max_items:
                    break
            except Exception:
                continue
    print(f"Loaded recipes: {len(RAW_RECIPES)}")
except Exception as e:
    print("Failed to read primary dataset:", e)

# Fallback tiny dataset if nothing loaded
if len(RAW_RECIPES) < 50:
    RAW_RECIPES = [
        {
            "title": "Masala Omelette",
            "ingredients": ["eggs","onion","tomato","green chilli","cilantro","salt","pepper","oil"],
            "steps": ["Beat eggs with salt and pepper.","Saute onion, tomato, chilli.","Add eggs and cook.","Garnish with cilantro."],
            "tags": ["indian","breakfast","vegetarian"],
            "time_minutes": 10
        },
        {
            "title": "Simple Aglio e Olio",
            "ingredients": ["spaghetti","garlic","olive oil","chilli flakes","parsley","salt"],
            "steps": ["Boil pasta.","Saute garlic in oil.","Toss with pasta and chilli flakes.","Finish with parsley and salt."],
            "tags": ["italian","vegetarian","quick"],
            "time_minutes": 15
        },
        {
            "title": "Chana Masala (Quick)",
            "ingredients": ["chickpeas","onion","tomato","garlic","ginger","garam masala","cumin","oil","salt"],
            "steps": ["Saute onion, garlic, ginger.","Add spices.","Add tomato and chickpeas.","Simmer till thick."],
            "tags": ["indian","vegan","main"],
            "time_minutes": 25
        }
    ]
    print("Using tiny fallback dataset.")


Loaded recipes: 50000


In [4]:
# %% [markdown]
# ## Heuristic dietary tags

NON_VEG = {"chicken","beef","pork","fish","mutton","lamb","shrimp","prawn","bacon","turkey","egg","eggs"}
ANIMAL_NON_DAIRY = {"chicken","beef","pork","fish","mutton","lamb","shrimp","prawn","bacon","turkey"}
GLUTEN_SOURCES = {"wheat","maida","all-purpose flour","ap flour","bread flour","spaghetti","pasta","noodles","soy sauce","atta","barley","rye"}
ROOTS = {"potato","onion","garlic","carrot","beet","radish"}  # basic for jain heuristic demo

def infer_tags(recipe):
    ing = set(recipe["ingredients"])
    tags = set([t.lower() for t in (recipe.get("tags") or [])])

    # vegetarian (no animal meat/fish, eggs allowed)
    is_veg = not (ing & ANIMAL_NON_DAIRY)
    if is_veg: tags.add("vegetarian")
    # vegan (no animal at all incl. eggs and dairy)
    is_vegan = is_veg and "egg" not in ing and "eggs" not in ing and not any(x in " ".join(ing) for x in ["milk","cheese","butter","ghee","yogurt","curd","paneer"])
    if is_vegan: tags.add("vegan")
    # jain (simplified: vegetarian + avoid common root vegetables)
    is_jain = is_veg and not any(r in ing for r in ROOTS)
    if is_jain: tags.add("jain")
    # gluten-free (very rough)
    gluten_free = not any(g in " ".join(ing) for g in GLUTEN_SOURCES)
    if gluten_free: tags.add("gluten-free")

    recipe["tags"] = sorted(tags)
    return recipe

RECIPES = [infer_tags(r) for r in RAW_RECIPES]
len(RECIPES), RECIPES[0]


(50000,
 {'title': 'Drop Biscuits and Sausage Gravy',
  'ingredients': ['biscuits\n3 cups all-purpose flour\n2 tablespoons baking powder\n1/2 teaspoon salt\n1-1/2 stick (3/4 cup) cold butter',
   'cut into pieces\n1-1/4 cup butermilk\n sausage gravy\n1 pound breakfast sausage',
   'hot or mild\n1/3 cup all-purpose flour\n4 cups whole milk\n1/2 teaspoon seasoned salt\n2 teaspoons black pepper',
   'more to taste'],
  'steps': [],
  'tags': ['jain', 'vegetarian'],
  'time_minutes': None})

In [5]:
# %% [markdown]
# ## Embeddings + FAISS index

import numpy as np, faiss
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # ~80MB, fast and good enough

def recipe_text(r):
    return f"{r['title']} :: ingredients: {', '.join(r['ingredients'])} :: tags: {', '.join(r.get('tags',[]))} :: time:{r.get('time_minutes')}"

texts = [recipe_text(r) for r in RECIPES]
X = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True).astype("float32")

faiss.normalize_L2(X)
index = faiss.IndexFlatIP(X.shape[1])
index.add(X)

print("Index size:", index.ntotal)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

Index size: 50000


In [6]:
# %% [markdown]
# ## Retrieval with constraints

from typing import List, Dict, Any

def matches_constraints(recipe: Dict, diet: List[str], exclude: List[str], max_time: int|None):
    ing = set(recipe["ingredients"])
    if exclude and any(x.lower() in ing for x in exclude):
        return False
    if max_time is not None and isinstance(recipe.get("time_minutes"), (int,float)) and recipe["time_minutes"] > max_time:
        return False
    if diet:
        tags = set([t.lower() for t in recipe.get("tags", [])])
        if not set([d.lower() for d in diet]).issubset(tags):
            return False
    return True

def retrieve(pantry: List[str], diet: List[str], exclude: List[str], max_time: int|None, k: int = 5):
    query_text = f"ingredients: {', '.join([p.lower().strip() for p in pantry])}; diet: {', '.join([d.lower() for d in diet])}; no: {', '.join([e.lower() for e in exclude])};"
    qv = embedder.encode([query_text], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(qv)
    D, I = index.search(qv, 50)  # search wider then filter
    results = []
    for i in I[0]:
        r = RECIPES[int(i)]
        if matches_constraints(r, diet, exclude, max_time):
            results.append(r)
        if len(results) >= k:
            break
    return results

# Quick smoke test
retrieve(["eggs","spinach","onion","garlic","milk"], ["vegetarian"], ["mushroom"], 20, k=3)


[{'title': 'BBQ Spinach, Mushroom, Pineapple and Onion Pizza',
  'ingredients': ['2 whole 2 whole\n1 cup 1 cup\n1- 1/2 cup 1- 1/2 cup\n8 whole 8 whole\n1 cup 1 cup\n1 whole 1 whole'],
  'steps': [],
  'tags': ['gluten-free', 'jain', 'vegan', 'vegetarian'],
  'time_minutes': None},
 {'title': 'Spinach Stuffed Mushrooms',
  'ingredients': ['2  2 \n2 teaspoons 2 teaspoons\n10 ounces',
   'weight 10 ounces',
   'weight\n1 clove 1 clove\n2 packages 2 packages\n2 ounces',
   'weight 2 ounces',
   'weight'],
  'steps': [],
  'tags': ['gluten-free', 'jain', 'vegan', 'vegetarian'],
  'time_minutes': None},
 {'title': 'Easy Crispy Spinach Mushroom and Onion Appetizer',
  'ingredients': ['1 whole 1 whole\n1 whole 1 whole\n5 ounces',
   'weight 5 ounces',
   'weight\n2 cloves 2 cloves\n 1/2 cups  1/2 cups\n1 cup 1 cup\n1 package 1 package'],
  'steps': [],
  'tags': ['gluten-free', 'jain', 'vegan', 'vegetarian'],
  'time_minutes': None}]

In [7]:
# %% [markdown]
# ## LLM for formatting/generation (FLAN-T5)

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

LLM_NAME = "google/flan-t5-base"   # switch to "google/flan-t5-small" if RAM is tight
tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(LLM_NAME)

def cookgpt_format(pantry: List[str], diet: List[str], exclude: List[str], max_time: int|None, hits: List[Dict[str,Any]], want_generated: bool=True):
    ctx = ""
    for h in hits[:2]:
        ctx += f"- {h['title']} ({h.get('time_minutes','?')} min)\nIngredients: {', '.join(h['ingredients'])}\nSteps: " + " | ".join(h['steps'][:6]) + "\n"
    user = (
        f"Pantry: {', '.join(pantry)}. Diet: {', '.join(diet)}. Exclude: {', '.join(exclude)}. Max time: {max_time}.\n"
        "Task: Suggest 2 best matches (short steps <= 6 each), list clear substitutions for missing items. "
        "If needed, add 1 generated fallback recipe marked [Generated] with steps <= 6."
    )
    prompt = f"You are CookGPT. Use retrieved recipes first and respect constraints.\nRetrieved:\n{ctx}\n\n{user}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    out = model.generate(**inputs, max_new_tokens=350, temperature=0.7, top_p=0.95)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# Quick test on a query
hits = retrieve(["eggs","spinach","onion","garlic","milk"], ["vegetarian"], ["mushroom"], 20, k=5)
print(cookgpt_format(["eggs","spinach","onion","garlic","milk"], ["vegetarian"], ["mushroom"], 20, hits))


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[Generated]


In [8]:
# %% [markdown]
# ## End-to-end function

def cookgpt(pantry: List[str],
            diet: List[str] = None,
            exclude: List[str] = None,
            max_time: int|None = None,
            k: int = 5):
    diet = diet or []
    exclude = exclude or []
    hits = retrieve(pantry, diet, exclude, max_time, k=k)
    response = cookgpt_format(pantry, diet, exclude, max_time, hits)
    return hits, response

# Demo
hits, response = cookgpt(
    pantry=["tomato","onion","garlic","chickpeas","garam masala","oil","salt"],
    diet=["vegan"],
    exclude=[],
    max_time=30,
    k=6
)
print("Top hits (titles):", [h["title"] for h in hits[:3]])
print("\n--- CookGPT ---\n", response)


Top hits (titles): ['Pepper, tomato and basil pasta', 'Vegetable and Chickpea Ragout', 'Garlic, chilli and broccoli stir-fry']

--- CookGPT ---
 Ingredients: 1 can (14.5 ounces) diced tomatoes 1 cup 1/2 cup 1/2 teaspoon salt 1 teaspoon dried oregano 1/4 teaspoon black pepper 1/8 teaspoon red pepper flakes 4 artichoke hearts in water, drained and quartered 1/2 cup frozen peas 1/4 cup black olives 1/2 cup whole-wheat penne, cooked 1/4 cup chopped fresh basil


In [9]:
# %% [markdown]
# ## Mini evaluation (Retrieval@k + constraint adherence)

test_queries = [
    {"pantry":["chickpeas","onion","tomato"], "diet":["vegan"], "exclude":[], "max_time":40},
    {"pantry":["spaghetti","garlic","olive oil"], "diet":["vegetarian"], "exclude":[], "max_time":25},
    {"pantry":["eggs","onion","tomato"], "diet":["vegetarian"], "exclude":["mushroom"], "max_time":20},
]

def pantry_subset_ok(recipe, pantry):
    ing = set(recipe["ingredients"])
    return all(p.lower() in " ".join(ing) for p in pantry)

def eval_retrieval_k(queries, k=5):
    ok = 0
    for q in queries:
        hits = retrieve(q["pantry"], q["diet"], q["exclude"], q["max_time"], k=k)
        if any(pantry_subset_ok(h, q["pantry"]) for h in hits):
            ok += 1
    return ok, len(queries)

def eval_constraints(queries, k=10):
    ok = 0
    for q in queries:
        hits = retrieve(q["pantry"], q["diet"], q["exclude"], q["max_time"], k=k)
        if all(matches_constraints(h, q["diet"], q["exclude"], q["max_time"]) for h in hits):
            ok += 1
    return ok, len(queries)

r_ok, r_tot = eval_retrieval_k(test_queries, k=5)
c_ok, c_tot = eval_constraints(test_queries, k=10)
print(f"Retrieval@5 coverage: {r_ok}/{r_tot}")
print(f"Constraint adherence (top10): {c_ok}/{c_tot}")


Retrieval@5 coverage: 1/3
Constraint adherence (top10): 3/3


In [10]:
# %% [markdown]
# ## Simple substitution table (optional)

SUBS = {
    "butter": ["ghee","olive oil"],
    "yogurt": ["curd","coconut yogurt"],
    "cream": ["milk + butter","cashew cream"],
    "garam masala": ["curry powder","coriander+cumin+clove+cardamom (pinches)"],
    "soy sauce": ["tamari","coconut aminos"],
    "spaghetti": ["penne","rice noodles"],
    "egg": ["flax egg","silken tofu (binding)"],
}

def suggest_subs(missing: List[str]):
    out = {}
    for m in missing:
        m = m.lower()
        if m in SUBS:
            out[m] = SUBS[m]
    return out

print(suggest_subs(["butter","cream","spaghetti"]))


{'butter': ['ghee', 'olive oil'], 'cream': ['milk + butter', 'cashew cream'], 'spaghetti': ['penne', 'rice noodles']}


In [11]:
# %% [markdown]
# ## Save a few suggestions to JSON (optional)

import json, time

def save_run(pantry, diet, exclude, max_time, k=5, path="cookgpt_run.json"):
    hits, resp = cookgpt(pantry, diet, exclude, max_time, k=k)
    bundle = {
        "timestamp": time.time(),
        "query": {"pantry":pantry,"diet":diet,"exclude":exclude,"max_time":max_time},
        "top_hits": hits,
        "llm_response": resp
    }
    with open(path, "w") as f:
        json.dump(bundle, f, indent=2)
    return path

save_run(["eggs","spinach","onion"], ["vegetarian"], ["mushroom"], 20, k=6)


'cookgpt_run.json'

In [12]:
# %%capture
# --- Install deps (quiet) ---
!pip -q install sentence-transformers faiss-cpu gradio unidecode

# =========================
# Minimal Pantry → Recipes App (Gradio)
# =========================
import os, urllib.request, gzip, json, re
from unidecode import unidecode
from typing import List, Dict, Any

import numpy as np, faiss, gradio as gr
from sentence_transformers import SentenceTransformer

# --------- 1) Data download ---------
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)
PRIMARY_URL = "https://raw.githubusercontent.com/jakevdp/open-recipe-data/main/recipeitems.json.gz"  # Open recipe dataset (CC-BY)
PRIMARY_PATH = os.path.join(DATA_DIR, "recipeitems.json.gz")

if not os.path.exists(PRIMARY_PATH):
    try:
        print("Downloading recipes...")
        urllib.request.urlretrieve(PRIMARY_URL, PRIMARY_PATH)
        print("Downloaded.")
    except Exception as e:
        print("Download failed:", e)

# --------- 2) Load + normalize (light cleaning) ---------
def safe_strip(x):
    if isinstance(x, str):
        return unidecode(x).strip()
    return x

def normalize_ingredients(ing):
    if isinstance(ing, list):
        return [safe_strip(i).lower() for i in ing if isinstance(i, str)]
    if isinstance(ing, str):
        parts = re.split(r"[;,]", ing)
        return [safe_strip(p).lower() for p in parts if safe_strip(p)]
    return []

RAW_RECIPES = []
loaded = 0
max_items = 30000   # keep this modest for Colab RAM; increase if you want

try:
    with gzip.open(PRIMARY_PATH, "rt", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
                title = safe_strip(obj.get("title") or obj.get("name") or "")
                if not title:
                    continue
                ingredients = normalize_ingredients(obj.get("ingredients") or obj.get("ingredient") or obj.get("ingredients_list") or [])
                instructions = obj.get("instructions") or obj.get("directions") or obj.get("steps") or ""
                if isinstance(instructions, list):
                    steps = [safe_strip(s) for s in instructions if isinstance(s, str) and s.strip()]
                else:
                    steps = [s.strip() for s in re.split(r"(?<=[.!?])\s+", str(instructions)) if s.strip()]
                recipe = {
                    "title": title,
                    "ingredients": ingredients,
                    "steps": steps[:10],  # keep short
                    "tags": [t.lower() for t in (obj.get("categories") or obj.get("tags") or []) if isinstance(t, str)],
                    "time_minutes": obj.get("time") or obj.get("total_time") or obj.get("cook_time") or obj.get("prep_time") or None
                }
                RAW_RECIPES.append(recipe)
                loaded += 1
                if loaded >= max_items:
                    break
            except Exception:
                continue
    print(f"Loaded recipes: {len(RAW_RECIPES)}")
except Exception as e:
    print("Reading dataset failed:", e)

# Fallback tiny set if download/parse failed
if len(RAW_RECIPES) < 30:
    RAW_RECIPES = [
        {
            "title": "Masala Omelette",
            "ingredients": ["eggs","onion","tomato","green chilli","cilantro","salt","pepper","oil"],
            "steps": ["Beat eggs with salt and pepper.","Saute onion, tomato, chilli.","Add eggs and cook.","Garnish with cilantro."],
            "tags": ["indian","breakfast","vegetarian"],
            "time_minutes": 10
        },
        {
            "title": "Simple Aglio e Olio",
            "ingredients": ["spaghetti","garlic","olive oil","chilli flakes","parsley","salt"],
            "steps": ["Boil pasta.","Saute garlic in oil.","Toss with pasta and chilli flakes.","Finish with parsley and salt."],
            "tags": ["italian","vegetarian","quick"],
            "time_minutes": 15
        },
        {
            "title": "Chana Masala (Quick)",
            "ingredients": ["chickpeas","onion","tomato","garlic","ginger","garam masala","cumin","oil","salt"],
            "steps": ["Saute onion, garlic, ginger.","Add spices.","Add tomato and chickpeas.","Simmer till thick."],
            "tags": ["indian","vegan","main"],
            "time_minutes": 25
        }
    ]
    print("Using tiny fallback dataset.")

# --------- 3) Heuristic dietary tags ---------
NON_DAIRY_MEAT = {"chicken","beef","pork","fish","mutton","lamb","shrimp","prawn","bacon","turkey"}
GLUTEN_SOURCES = {"wheat","maida","all-purpose flour","ap flour","bread flour","spaghetti","pasta","noodles","soy sauce","atta","barley","rye"}
ROOTS = {"potato","onion","garlic","carrot","beet","radish"}  # simple jain heuristic

def infer_tags(recipe):
    ing_text = " ".join(recipe["ingredients"])
    ing_set = set(recipe["ingredients"])
    tags = set([t.lower() for t in (recipe.get("tags") or [])])

    is_veg = not (ing_set & NON_DAIRY_MEAT)
    if is_veg: tags.add("vegetarian")

    is_vegan = is_veg and ("egg" not in ing_set and "eggs" not in ing_set) \
               and not any(x in ing_text for x in ["milk","cheese","butter","ghee","yogurt","curd","paneer"])
    if is_vegan: tags.add("vegan")

    is_jain = is_veg and not any(r in ing_set for r in ROOTS)
    if is_jain: tags.add("jain")

    gluten_free = not any(g in ing_text for g in GLUTEN_SOURCES)
    if gluten_free: tags.add("gluten-free")

    recipe["tags"] = sorted(tags)
    return recipe

RECIPES = [infer_tags(r) for r in RAW_RECIPES]

# --------- 4) Embeddings + FAISS index ---------
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def recipe_text(r):
    return f"{r['title']} :: ingredients: {', '.join(r['ingredients'])} :: tags: {', '.join(r.get('tags',[]))} :: time:{r.get('time_minutes')}"

texts = [recipe_text(r) for r in RECIPES]
X = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False).astype("float32")
faiss.normalize_L2(X)
index = faiss.IndexFlatIP(X.shape[1])
index.add(X)

# --------- 5) Retrieval helpers ---------
def matches_constraints(recipe: Dict, diet: List[str], exclude: List[str], max_time: int|None):
    ing = set(recipe["ingredients"])
    if exclude and any(x.lower().strip() in ing for x in exclude):
        return False
    if max_time is not None and isinstance(recipe.get("time_minutes"), (int,float)) and recipe["time_minutes"] is not None:
        try:
            if float(recipe["time_minutes"]) > float(max_time):
                return False
        except Exception:
            pass
    if diet:
        tags = set([t.lower() for t in recipe.get("tags", [])])
        if not set([d.lower() for d in diet]).issubset(tags):
            return False
    return True

def retrieve(pantry: List[str], diet: List[str], exclude: List[str], max_time: int|None, k: int = 5):
    pantry = [p.lower().strip() for p in pantry if p.strip()]
    diet = [d.lower().strip() for d in diet if d.strip()]
    exclude = [e.lower().strip() for e in exclude if e.strip()]

    query_text = f"ingredients: {', '.join(pantry)}; diet: {', '.join(diet)}; no: {', '.join(exclude)};"
    qv = embedder.encode([query_text], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(qv)
    D, I = index.search(qv, 60)  # search wider; we'll filter
    results = []
    for i in I[0]:
        r = RECIPES[int(i)]
        if matches_constraints(r, diet, exclude, max_time):
            results.append(r)
        if len(results) >= k:
            break
    return results

def format_results(recipes: List[Dict[str,Any]]) -> str:
    if not recipes:
        return "No good matches found. Try removing exclusions or increasing time."
    blocks = []
    for r in recipes:
        title = r["title"]
        tmin = r.get("time_minutes", "?")
        tags = ", ".join(r.get("tags", [])) or "-"
        ing = ", ".join(r["ingredients"])
        steps = "\n".join([f"{i+1}. {s}" for i, s in enumerate(r["steps"][:6])]) if r["steps"] else "-"
        block = f"**{title}**  \n_Time:_ {tmin} min  •  _Tags:_ {tags}\n\n**Ingredients:** {ing}\n\n**Steps:**\n{steps}"
        blocks.append(block)
    return "\n\n---\n\n".join(blocks)

# --------- 6) Gradio UI (very basic) ---------
with gr.Blocks(title="CookGPT (Basic)") as demo:
    gr.Markdown("# CookGPT (Basic)\nType ingredients and get matching recipes below. Not fancy, just works.")
    with gr.Row():
        ingredients_in = gr.Textbox(label="Ingredients (comma-separated)", placeholder="eggs, spinach, onion, garlic, milk")
    with gr.Row():
        diet_in = gr.CheckboxGroup(choices=["vegetarian","vegan","gluten-free","jain"], label="Diet (optional)")
        exclude_in = gr.Textbox(label="Exclude (comma-separated, optional)", placeholder="mushroom, peanut")
        time_in = gr.Number(label="Max time in minutes (optional)", value=None)
        k_in = gr.Slider(1, 10, value=5, step=1, label="Top-K results")
    find_btn = gr.Button("Find Recipes")
    out = gr.Markdown()

    def on_find(ingredients, diet, exclude, max_time, k):
        pantry = [x.strip() for x in (ingredients or "").split(",") if x.strip()]
        exclude_list = [x.strip() for x in (exclude or "").split(",") if x.strip()]
        try:
            max_time = int(max_time) if max_time not in (None, "", "None") else None
        except Exception:
            max_time = None
        hits = retrieve(pantry, diet or [], exclude_list, max_time, k=int(k))
        return format_results(hits)

    find_btn.click(on_find, inputs=[ingredients_in, diet_in, exclude_in, time_in, k_in], outputs=out)

demo.launch(share=True)


Loaded recipes: 30000
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9864a5a3e8ae34f805.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


