In [None]:
# ✅ STEP 1: Install Required Packages
# NOTE: the leading "!" is IPython/Jupyter shell-escape syntax, not Python, so
# this line only works when the file is run as a notebook cell. "-q" asks pip
# for quiet output.
# NOTE(review): torch is imported below but is not installed here; presumably
# the notebook runtime already provides it -- confirm.
!pip install -q transformers pandas tqdm

# ✅ STEP 2: Import Required Libraries
import pandas as pd
import re
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ STEP 3: Load Your Datasets
# Recipe table: later steps read its "recipe_name" and "ingredients" columns.
recipes_df = pd.read_csv(
    filepath_or_buffer="/content/70000_recipes_nutrients.csv",
)
# User-profile table: later steps read its "Ages" and "Disease" columns.
health_df = pd.read_csv(
    filepath_or_buffer="/content/health_age_data_70000_synthetic.csv",
)

# ✅ STEP 4: Clean Ingredients
def clean_ingredients(raw_ingredients):
    """Normalise a comma-separated ingredient string.

    A missing value (as judged by ``pd.isna``) yields ``""``. Otherwise the
    input is converted to ``str`` and split on commas; each piece has every
    character other than ASCII letters and spaces removed, is trimmed and
    lower-cased. Pieces of length one or less (or consisting only of digits)
    are discarded. The unique survivors are returned in sorted order, joined
    by ``", "``.
    """
    if pd.isna(raw_ingredients):
        return ""

    unique_names = set()
    for piece in str(raw_ingredients).split(','):
        name = re.sub(r"[^a-zA-Z ]", "", piece).strip().lower()
        if len(name) <= 1 or name.isdigit():
            continue
        unique_names.add(name)

    return ', '.join(sorted(unique_names))

# Store a normalised copy of every raw "ingredients" cell in a new column;
# missing cells are passed to clean_ingredients as-is and come back as "".
raw_ingredient_column = recipes_df["ingredients"]
recipes_df["cleaned_ingredients"] = raw_ingredient_column.map(clean_ingredients)

# ✅ STEP 5: Load Bloom Model (560M) and Tokenizer
model_name = "bigscience/bloom-560m"

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the causal-LM weights and move them to the chosen device in one step.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# ✅ STEP 6: Define Prompt Template
# Ingredients the prompt asks the model to leave out, keyed by condition name.
avoid_map = {
    "Diabetes": "sugar, white rice, honey",
    "Heart Disease": "red meat, salt, fried foods"
}
# Ingredients the prompt asks the model to favour, keyed by condition name.
include_map = {
    "Diabetes": "spinach, cinnamon, olive oil",
    "Heart Disease": "oats, berries, nuts"
}


def create_prompt(dish_name, ingredients, user_profile):
    """Build the text prompt asking for a healthier version of a dish.

    Args:
        dish_name: Name of the dish, inserted verbatim into the prompt.
        ingredients: Ingredient text, inserted verbatim into the prompt.
        user_profile: Mapping-like object (dict or DataFrame row) providing
            'Ages' and 'Disease'. 'Disease' may list several comma-separated
            conditions; only the first one is used.

    Returns:
        The prompt string. When 'Disease' is not a string (for example the
        float NaN pandas produces for an empty CSV cell) or its first entry
        is blank, the condition is reported as "None" and the default
        avoid/include suggestions ("sugar" / "spinach") are used.
    """
    age = user_profile['Ages']
    condition = user_profile['Disease']

    # str.split never returns an empty list, so the fallback has to be keyed
    # on a blank first entry (or a non-string value) rather than on an empty
    # list.
    primary_condition = ""
    if isinstance(condition, str):
        primary_condition = condition.split(',')[0].strip()
    if not primary_condition:
        primary_condition = "None"

    avoid = avoid_map.get(primary_condition, "sugar")
    include = include_map.get(primary_condition, "spinach")

    return (
        f"Suggest a healthier version of the dish '{dish_name}' for a {age}-year-old with {primary_condition}. "
        f"It currently contains: {ingredients}. "
        f"Avoid: {avoid}. Try to include: {include}. "
        f"Explain the ingredient changes briefly."
    )

# ✅ STEP 7: Generate Output using Bloom
def generate_recipe(prompt):
    """Sample a continuation of ``prompt`` from the loaded causal LM.

    The prompt is tokenised (truncated to at most 512 tokens), up to 100 new
    tokens are sampled at temperature 0.7, and only the newly generated part
    is decoded and returned.

    Args:
        prompt: Prompt text to continue.

    Returns:
        The generated continuation as a string, without the prompt text and
        with special tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)

    # For a decoder-only model the generated sequence starts with the prompt
    # tokens. Decoding the whole sequence would echo the prompt (including its
    # "Avoid: ..." words) back into the response, so slice the prompt off.
    prompt_token_count = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# ✅ STEP 8: Local Interpretation Function (No API)
def interpret_bloom_output(prompt, response, original_ingredients, avoid_list, recommend_list):
    """Score a generated response against the avoid/recommend word lists.

    All comparisons are made on individual lower-case words (runs of a-z), so
    a multi-word entry such as "white rice" is matched word by word.

    Args:
        prompt: The prompt that produced ``response``. If ``response`` begins
            with this exact text, that leading echo is removed before
            scoring; otherwise the prompt's own "Avoid: ..." words would
            always be counted as violations.
        response: Model output text to evaluate.
        original_ingredients: Ingredient text of the original recipe.
        avoid_list: Text listing ingredients that should not appear.
        recommend_list: Text listing ingredients that are encouraged.

    Any of the text arguments that is not a string (e.g. a NaN cell) is
    treated as containing no words.

    Returns:
        ``(trust_score, explanation)``: 0.3 if any avoid word appears in the
        response, else 0.6 if the response contains words found in neither
        the original ingredients nor the recommend list, else 0.9. The
        explanation lists the offending/helpful words in sorted order, joined
        by " | ".
    """
    def extract_ingredients(text):
        # Non-string input (such as float NaN from a missing cell) has no
        # words to extract.
        if not isinstance(text, str):
            return set()
        return set(re.findall(r'\b[a-z]+\b', text.lower()))

    # Drop an echoed prompt so that only the generated text is judged.
    if isinstance(prompt, str) and isinstance(response, str) and response.startswith(prompt):
        response = response[len(prompt):]

    original_ingredients = extract_ingredients(original_ingredients)
    response_ingredients = extract_ingredients(response)
    avoid_set = extract_ingredients(avoid_list)
    recommend_set = extract_ingredients(recommend_list)

    hallucinated = response_ingredients - original_ingredients - recommend_set
    violations = response_ingredients & avoid_set
    healthy_additions = response_ingredients & recommend_set

    if violations:
        trust_score = 0.3
    elif hallucinated:
        trust_score = 0.6
    else:
        trust_score = 0.9

    # Sets have no stable iteration order for strings across interpreter
    # runs; sort so the same input always yields the same explanation.
    explanation = []
    if hallucinated:
        explanation.append(f"⚠️ Hallucinated: {', '.join(sorted(hallucinated))}")
    if violations:
        explanation.append(f"❌ Violates: {', '.join(sorted(violations))}")
    if healthy_additions:
        explanation.append(f"✅ Helpful: {', '.join(sorted(healthy_additions))}")
    if not explanation:
        explanation.append("✅ Clean and compliant output.")

    return trust_score, " | ".join(explanation)

# ✅ STEP 9: Batch Execution (Test with 100 rows first)
results = []
# Trial run of at most 100 rows. Recipes and user profiles are paired by row
# position, so never index past the end of the shorter table.
# For the full dataset use: N = min(len(recipes_df), len(health_df))
N = min(100, len(recipes_df), len(health_df))

for i in tqdm(range(N)):
    recipe_row = recipes_df.iloc[i]
    user_row = health_df.iloc[i]

    # Build prompt
    prompt = create_prompt(
        dish_name=recipe_row["recipe_name"],
        ingredients=recipe_row["cleaned_ingredients"],
        user_profile=user_row
    )

    # Generate Bloom output
    response = generate_recipe(prompt)

    # Interpretation: look up the avoid/recommend lists for the first listed
    # condition. A missing or blank "Disease" cell is labelled "None", which
    # is not a key of either map, so the defaults below apply.
    raw_disease = user_row["Disease"]
    disease = raw_disease.split(",")[0].strip() if isinstance(raw_disease, str) else ""
    if not disease:
        disease = "None"
    avoid = avoid_map.get(disease, "sugar")
    recommend = include_map.get(disease, "spinach")

    # A missing "ingredients" cell is NaN rather than text; score it as an
    # empty ingredient list instead of failing on it.
    raw_ingredients = recipe_row["ingredients"]
    original_text = "" if pd.isna(raw_ingredients) else str(raw_ingredients)

    trust_score, explanation = interpret_bloom_output(
        prompt=prompt,
        response=response,
        original_ingredients=original_text,
        avoid_list=avoid,
        recommend_list=recommend
    )

    results.append({
        "Index": i,
        "Recipe Name": recipe_row["recipe_name"],
        "Age": user_row["Ages"],
        "Disease": disease,
        "Prompt": prompt,
        "Bloom Output": response,
        "Trust Score": trust_score,
        "Explanation": explanation
    })

# ✅ STEP 10: Save Results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("bloom_batch_interpretation_results.csv", index=False)

print("✅ DONE! Results saved to 'bloom_batch_interpretation_results.csv'")



 27%|██▋       | 27/100 [11:33<34:22, 28.26s/it]