In [1]:
!pip install -q transformers accelerate bitsandbytes sentence-transformers faiss-cpu


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import re
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


In [4]:
recipes_df = pd.read_csv("/content/70000_recipes_nutrients.csv")
profile_df = pd.read_csv("/content/health_age_data_70000_synthetic.csv")


In [5]:
def clean_ingredients(text):
    if pd.isna(text): return ""
    text = re.sub(r"[^a-zA-Z0-9, ]", "", text)
    return ', '.join([w.strip().lower() for w in text.split(',') if w.strip()])


In [6]:
def get_similar_dishes(dish_name, top_k=3):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    names = recipes_df["recipe_name"].astype(str).tolist()
    embeddings = model.encode(names)
    target_emb = model.encode([dish_name])
    scores = cosine_similarity(target_emb, embeddings)[0]
    top_indices = scores.argsort()[::-1][:top_k]
    return [names[i] for i in top_indices]


In [7]:
def get_user_nutrient_goals(age):
    return {
        "Daily Calorie Target": 2500 if age < 50 else 2200,
        "Protein": 200,
        "Carbohydrates": 250,
        "Fat": 70,
        "Fiber": 30
    }


In [8]:
def build_health_rules():
    return {
        "diabetes": {"avoid": ["sugar", "ghee", "salt"], "recommend": ["cinnamon", "olive oil", "spinach"]},
        "heart disease": {"avoid": ["butter", "cream", "red meat"], "recommend": ["garlic", "spinach", "flaxseeds"]},
        "obesity": {"avoid": ["soda", "fries", "white bread"], "recommend": ["vegetables", "oats", "lean protein"]},
        "hypertension": {"avoid": ["salt", "processed meat"], "recommend": ["leafy greens", "olive oil"]},
        "cholesterol": {"avoid": ["egg yolk", "red meat", "saturated fat"], "recommend": ["chia seeds", "walnuts", "whole grains"]}
    }

health_rules = build_health_rules()

approved_additions = [
    "turmeric", "olive oil", "cinnamon", "spinach", "broccoli", "kale",
    "bell peppers", "mushrooms", "zucchini", "flaxseeds", "chia seeds", "whole grain pasta"
]


In [9]:
def build_prompt_zephyr(dish_name, age, disease, ingredients, avoid, recommend, targets):
    cleaned = clean_ingredients(ingredients)
    cleaned_list = [i.strip() for i in cleaned.split(',') if i.strip()]
    bullet_ing = "\n".join(f"- {item}" for item in cleaned_list)

    prompt = f"""You are a nutrition expert.

A {age}-year-old patient with {disease} wants to eat a dish called "{dish_name}".

Here are the ingredients:
{bullet_ing}

Nutrition Targets:
- Calories: {targets['Daily Calorie Target']}
- Protein: {targets['Protein']}g
- Carbs: {targets['Carbohydrates']}g
- Fat: {targets['Fat']}g
- Fiber: {targets['Fiber']}g

Avoid ingredients: {', '.join(avoid)}.
You may only add from the approved healthy list: {', '.join(approved_additions)}.

Now write a short paragraph:
- Say what to remove and why.
- Suggest what to add and why.
- Explain how it improves the dish for someone with {disease}.
"""
    return prompt.strip()


In [10]:
model_id = "HuggingFaceH4/zephyr-7b-beta"

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.float16,
# )

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=bnb_config)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



In [11]:
def zephyr_llm(prompt, max_tokens=256):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    with torch.no_grad():
        output = model.generate(input_ids, max_new_tokens=max_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True).strip()


In [12]:
def generate_output_zephyr(dish_name, age, disease):
    print(f"🔍 Finding best match for: {dish_name}")
    match = get_similar_dishes(dish_name)[0]
    row = recipes_df[recipes_df["recipe_name"].str.lower() == match.lower()]

    if row.empty:
        return f"❌ Recipe '{match}' not found"

    raw_ing = row.iloc[0]["ingredients"]
    targets = get_user_nutrient_goals(age)
    disease_lower = disease.strip().lower()
    matched_key = next((key for key in health_rules if key in disease_lower), None)
    if matched_key is None:
        return f"❌ No health rules found for any condition in: '{disease}'"

    avoid = health_rules[matched_key]["avoid"]
    recommend = health_rules[matched_key]["recommend"]

    prompt = build_prompt_zephyr(
        dish_name=match,
        age=age,
        disease=disease,
        ingredients=raw_ing,
        avoid=avoid,
        recommend=recommend,
        targets=targets
    )

    print("\n📨 Prompt Sent:\n", prompt)
    result = zephyr_llm(prompt)
    print("\n🤖 Zephyr Response:\n", result)
    return result


generate_output_zephyr("cake", 45, "diabetes, hypertension, heart disease")


🔍 Finding best match for: cake


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



📨 Prompt Sent:
 You are a nutrition expert.

A 45-year-old patient with diabetes, hypertension, heart disease wants to eat a dish called "king cake".

Here are the ingredients:
- active dry yeast
- white sugar
- warm milk 110 degrees f45 degrees c
- butter
- egg yolks
- allpurpose flour
- salt
- ground nutmeg
- grated lemon zest
- cream cheese
- confectioners sugar
- confectioners sugar
- lemon juice
- milk
- multicolored candy sprinkles

Nutrition Targets:
- Calories: 2500
- Protein: 200g
- Carbs: 250g
- Fat: 70g
- Fiber: 30g

Avoid ingredients: sugar, ghee, salt.
You may only add from the approved healthy list: turmeric, olive oil, cinnamon, spinach, broccoli, kale, bell peppers, mushrooms, zucchini, flaxseeds, chia seeds, whole grain pasta.

Now write a short paragraph:
- Say what to remove and why.
- Suggest what to add and why.
- Explain how it improves the dish for someone with diabetes, hypertension, heart disease.

🤖 Zephyr Response:
 You are a nutrition expert.

A 45-year-old

'You are a nutrition expert.\n\nA 45-year-old patient with diabetes, hypertension, heart disease wants to eat a dish called "king cake".\n\nHere are the ingredients:\n- active dry yeast\n- white sugar\n- warm milk 110 degrees f45 degrees c\n- butter\n- egg yolks\n- allpurpose flour\n- salt\n- ground nutmeg\n- grated lemon zest\n- cream cheese\n- confectioners sugar\n- confectioners sugar\n- lemon juice\n- milk\n- multicolored candy sprinkles\n\nNutrition Targets:\n- Calories: 2500\n- Protein: 200g\n- Carbs: 250g\n- Fat: 70g\n- Fiber: 30g\n\nAvoid ingredients: sugar, ghee, salt.\nYou may only add from the approved healthy list: turmeric, olive oil, cinnamon, spinach, broccoli, kale, bell peppers, mushrooms, zucchini, flaxseeds, chia seeds, whole grain pasta.\n\nNow write a short paragraph:\n- Say what to remove and why.\n- Suggest what to add and why.\n- Explain how it improves the dish for someone with diabetes, hypertension, heart disease.\n\nExample:\n"To make this dish healthier for