## Final Project - NLP-Driven Ingredient Health and Dietary Restriction Analysis

*Name: Laura Obermaier*

*Stevens ID: 20027358*

#### Imports

In [1]:
import re
import csv
import requests
import spacy
import pubchempy as pcp
from rapidfuzz import process, fuzz
from collections import defaultdict
import json
import time
from spacy.lang.en import English
import tiktoken
from googleapiclient.discovery import build
from dotenv import load_dotenv
import os
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import torch
from nltk.tokenize import sent_tokenize
import nltk
import urllib.parse
import math
from rouge_score import rouge_scorer
import numpy as np
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer, util

#### Environment and Global Setup

In [2]:
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")

csv.field_size_limit(2**20)

nlp = spacy.load("en_core_web_sm")
nlp_sentencizer = English()
nlp_sentencizer.add_pipe("sentencizer")

global_alias_set = set()
search_cache = {}
alias_frequency = defaultdict(int)

CACHE_FILE = "search_cache.json"
ALIAS_FREQ_FILE = "alias_frequency.json"
ALIAS_CACHE_FILE = "alias_cache.json"

try:
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        search_cache = json.load(f)
except FileNotFoundError:
    search_cache = {}

model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
    trust_remote_code=True
)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    use_safetensors=True,
    device_map="auto",
    trust_remote_code=True,
    revision="main"
)

keyword_config = {
    "benefits": [
        "benefit", "supports", "improves", "boosts", "enhances", "aids", "reduces", "prevents",
        "protects", "promotes", "stimulates", "strengthens", "aiding", "improving", "healing",
        "facilitates", "enhancing", "balances", "restores", "ameliorates", "treats", "alleviates",
        "relieves", "contributes to", "beneficial", "positive", "advantageous", "favorable",
        "healthy", "wellness", "nutrient", "nutritional", "immune", "well-being", "absorption",
        "energy", "fitness", "cognitive", "focus", "clarity", "relief", "anti-inflammatory",
        "cramps", "gut", "probiotic", "enzyme", "alkaline", "acidic", "bloating", "constipation"
    ],
    "concerns": [
        "risk", "toxic", "harm", "adverse", "cause", "increased", "linked to", "danger",
        "poisonous", "unsafe", "negatively", "exacerbates", "side effect",
        "carcinogenic", "neurotoxic", "hepatotoxic", "irritation", "may lead to", "trigger",
        "overdoes", "reaction", "symptom", "pain", "toxins"
    ],
    "restrictions": [
        "allergy", "intolerance", "sensitivity", "restricted", "avoid", "not suitable",
        "contraindicated", "dietary restriction", "religious restriction", "vegan", "vegetarian",
        "halal", "haram", "gluten", "lactose", "kosher", "FODMAP", "contains", "may contain",
        "cross-contamination", "not recommended", "not advised", "not suitable for",
        "not safe for"
    ],
    "neutral impact": [
        "kidney", "liver", "cholesterol", "diabetes", "inflammation", "cancer", "cardiovascular",
        "digestion", "therapy", "treatment", "metabolism", "metabolic", "calories"
    ],
    "health": [
        "health", "wellness", "nutrition", "nutritional", "dietary", "diet", "supplement", "vitamin", "mineral",
        "well-being", "immune", "absorption", "energy", "fitness", "cognitive", "focus",
        "clarity", "relief", "anti-inflammatory", "cramps", "gut", "probiotic", "enzyme", 
    ],
}

# Compose superset for health relevance filtering
keyword_config["all"] = list(set(
    keyword_config["benefits"] +
    keyword_config["concerns"] +
    keyword_config["restrictions"] +
    keyword_config["neutral impact"] +
    keyword_config["health"]
))

INFO - The layer lm_head is not quantized.


#### Cache Utilities

In [3]:
def save_search_cache():
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(search_cache, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"[!] Failed to save search cache: {e}")

def save_alias_frequency(path=ALIAS_FREQ_FILE):
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(alias_frequency, f, ensure_ascii=False, indent=2)
        print(f"[✓] Alias frequency saved to {path}")
    except Exception as e:
        print(f"[!] Error saving alias frequency: {e}")

def load_alias_frequency(path=ALIAS_FREQ_FILE):
    global alias_frequency
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
            alias_frequency = defaultdict(int, {k: int(v) for k, v in data.items()})
        print(f"[✓] Loaded {len(alias_frequency)} alias frequencies from cache.")
        return True
    except FileNotFoundError:
        print(f"[ ] Alias frequency cache not found at {path}. Will regenerate.")
        return False
    except Exception as e:
        print(f"[!] Error loading alias frequency: {e}")
        return False
    
def save_alias_cache(path=ALIAS_CACHE_FILE):
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(sorted(global_alias_set), f, ensure_ascii=False, indent=2)
        print(f"[✓] Alias cache saved to {path}")
    except Exception as e:
        print(f"[!] Error saving alias cache: {e}")

def load_alias_cache(path=ALIAS_CACHE_FILE):
    global global_alias_set
    try:
        with open(path, "r", encoding="utf-8") as f:
            global_alias_set = set(json.load(f))
        print(f"[✓] Loaded {len(global_alias_set)} aliases from cache.")
        return True
    except FileNotFoundError:
        print(f"[ ] Alias cache not found at {path}. Will seed from source...")
        return False
    except Exception as e:
        print(f"[!] Error loading alias cache: {e}")
        return False

#### Alias + Name Handling

In [4]:
def is_relevant_alias(alias):
    alias_clean = alias.strip().lower()
    if len(alias_clean.split()) > 4:
        return False
    if re.search(r'\d{3,}|\d+%|[^\w\s\-]', alias_clean):  # long numeric sequence or symbols
        if alias_frequency[alias_clean] < 5:  # require higher frequency to keep
            return False
    if re.search(r'^\d{2,5}-\d{2,5}-\d$', alias_clean): 
        return False
    if re.search(r"\b(usp|grade|reagent|testing|microg|ml|specification|mixture|vetec|methanol|water|preparation)\b", alias_clean):
        return False
    if len(alias_clean) > 40:
        return False
    if alias_clean.count(',') > 0 or alias_clean.count('(') > 1:
        return False
    if any(keyword in alias_clean for keyword in ['acs', 'usp', 'grade', 'reference', 'powder', 'solution', 'mist']):
        return False
    return True

def get_pubchem_aliases(ingredient_name):
    try:
        compounds = pcp.get_compounds(ingredient_name, 'name')
        if not compounds:
            return []
        
        all_synonyms = set()
        for compound in compounds[:3]:  # try top 3 matches
            if hasattr(compound, "synonyms"):
                all_synonyms.update(s.lower() for s in compound.synonyms if is_relevant_alias(s))
        return list(all_synonyms)
    except Exception as e:
        print(f"[PubChem error for '{ingredient_name}']: {e}")
        return []

def update_alias_cache(aliases):
    for a in aliases:
        if a:
            global_alias_set.add(a.lower().strip())

def fuzzy_match_alias(name, threshold=90):
    if not global_alias_set:
        print("[Warning] Alias set is empty — did you run seed_aliases_from_open_food_facts?")
        return None
    result = process.extractOne(name, global_alias_set, scorer=fuzz.token_sort_ratio)
    if result is None:
        return None
    match, score, _ = result
    return match if score >= threshold else None
"""
def is_valid_alias(alias, reference, threshold=70):
    return fuzz.ratio(alias.lower(), reference.lower()) >= threshold"""

def is_phonetically_valid(word):
    word = word.lower()
    if len(word) < 3:
        return False
    vowels = sum(1 for c in word if c in "aeiou")
    consonants = sum(1 for c in word if c.isalpha() and c not in "aeiou")
    if consonants == 0:
        return False
    ratio = vowels / (consonants + vowels)
    return 0.2 <= ratio <= 0.8  # extremely low/high = junk

def standardize_ingredient_name(name, max_aliases=5):
    name = name.lower().strip()
    aliases = get_pubchem_aliases(name)

    if aliases:
        update_alias_cache(aliases)
        # Apply filters
        filtered_aliases = [
            a for a in aliases
            #if is_relevant_alias(a) and is_valid_alias(a, name)
            if is_relevant_alias(a) and name not in a
        ]

        # Improved sort: prioritize exact match, then similarity, then length
        ranked = sorted(
            filtered_aliases,
            key=lambda x: (
                0 if x == name else 1,               # exact match first
                -fuzz.token_sort_ratio(name, x),     # highest similarity
                len(x)                               # shorter is better
            )
        )
        top_aliases = ranked[:max_aliases]
        if not top_aliases:
            print(f"[Alias Fallback] No valid aliases for '{name}', reverting to original.")
            return name, [name]
        print(f"[Query Alias] {name} → Filtered aliases: {top_aliases}")#WORK ON: NOT ENOUGH ALIASES
        return name, top_aliases

    fuzzy = fuzzy_match_alias(name)
    if fuzzy:
        print(f"[Correction] '{name}' autocorrected to alias: '{fuzzy}'")
        return fuzzy, [fuzzy]

    return name, [name]

#### Open Food Facts Seeding

In [5]:
def seed_aliases_from_open_food_facts(limit=10000):
    url = "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
    response = requests.get(url, stream=True)
    response.encoding = 'utf-8'

    alias_dict = defaultdict(set)
    lines = (line.decode('utf-8') for line in response.iter_lines())
    reader = csv.DictReader(lines, delimiter='\t')

    langs = ['fr', 'de', 'es', 'it']
    count = 0

    for row in reader:
        if count >= limit:
            break
        count += 1

        ingredients_text = row.get("ingredients_text", "")
        if not ingredients_text.strip():
            continue

        for ing in ingredients_text.split(','):
            ing = ing.strip().lower()
            if not ing:
                continue
            alias_dict[ing].add(ing)
            alias_frequency[ing] += 1

            for lang in langs:
                key = f"ingredients_text_{lang}"
                alt = row.get(key)
                if alt:
                    for alt_ing in alt.split(','):
                        alt_ing = alt_ing.strip().lower()
                        if alt_ing:
                            alias_dict[ing].add(alt_ing)
                            alias_dict[alt_ing].add(ing)

    for aliases in alias_dict.values():
        update_alias_cache(list(aliases))
    print(f"[✓] Seeded {len(global_alias_set)} unique aliases from Open Food Facts.")

loaded_alias = load_alias_cache()
loaded_freqs = load_alias_frequency()

if not (loaded_alias and loaded_freqs):
    seed_aliases_from_open_food_facts(limit=5000)
    save_alias_cache()
    save_alias_frequency()

[✓] Loaded 6151 aliases from cache.
[✓] Loaded 6397 alias frequencies from cache.


#### External API Queries

In [6]:
def query_pubmed(ingredient, aliases=None, max_results=100):
    try:
        alias_query = " OR ".join([f'"{a}"' for a in aliases])
        keywords_query = " OR ".join(keyword_config["all"])
        final_query = f"({alias_query}) AND ({keywords_query})"

        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {"db": "pubmed", "term": final_query, "retmode": "json", "retmax": max_results}
        ids = requests.get(base_url, params=params).json()["esearchresult"].get("idlist", [])
        
        summaries = []
        for pmid in ids:
            summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
            r = requests.get(summary_url, params={"db": "pubmed", "id": pmid, "retmode": "json"}).json()
            result = r["result"].get(pmid)
            if result:
                summaries.append({"title": result.get("title"), "source": result.get("source"), "pubdate": result.get("pubdate")})
        return summaries
    except:
        return []
    
def is_fda_entry_relevant(text, aliases):
    irrelevant_keywords = [
        "recall", "undeclared", "labeling", "distribution",
        "pasteurization", "packaging", "incorrect", "contain"
    ]
    text_lower = text.lower()
    if any(kw in text_lower for kw in irrelevant_keywords):
        return False
    return any(alias.lower() in text_lower for alias in aliases)

def query_openfda(ingredient, aliases=None, max_results=100):
    aliases = aliases or [ingredient]
    try:
        query = " OR ".join([f'"{a}"' for a in aliases])
        base_url = "https://api.fda.gov/food/enforcement.json"
        params = {"search": f"product_description:({query})", "limit": max_results}
        r = requests.get(base_url, params=params).json()
        results = r.get("results", [])

        filtered = []
        for rec in results:
            reason = rec.get("reason_for_recall", "")
            if is_fda_entry_relevant(reason, aliases):
                filtered.append(reason)

        return list(set(filtered))  # deduplicate
    except Exception as e:
        print(f"[OpenFDA error for aliases {aliases}]: {e}")
        return []

def query_rxnorm(ingredient, aliases=None, max_results=100):
    aliases = aliases or [ingredient]
    try:
        url = "https://rxnav.nlm.nih.gov/REST/rxcui.json"
        all_ids = set()

        for alias in aliases[:max_results]:
            rxcui = requests.get(url, params={"name": alias}).json()
            ids = rxcui.get("idGroup", {}).get("rxnormId", [])
            all_ids.update(ids)

        return list(all_ids)
    except Exception as e:
        print(f"[RxNorm error for aliases {aliases}]: {e}")
        return []
    
def query_academic_health_docs(ingredient, aliases=None, max_results=100):
    def query_europe_pmc(aliases=None, max_results=100):
        try:
            base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
            
            if aliases is None:
                aliases = [ingredient]

            alias_query = " OR ".join([f'"{a}"' for a in aliases])
            keywords_query = " OR ".join(keyword_config["all"])
            query = f"({alias_query}) AND ({keywords_query})"

            params = {
                "query": query,
                "format": "json",
                "resultType": "core",
                "sort": "P_PDATE_D",
                "pageSize": max_results
            }
            r = requests.get(base_url, params=params).json()
            results = []
            for record in r.get("resultList", {}).get("result", []):
                abstract = record.get("abstractText")
                if not abstract or len(abstract.strip()) < 100:
                    continue
                results.append({
                    "title": record.get("title"),
                    "source": record.get("journalTitle"),
                    "pubdate": record.get("firstPublicationDate", record.get("pubYear")),
                    "url": f"https://europepmc.org/article/{record.get('source')}/{record.get('id')}",
                    "full_text": abstract.strip()
                })
            return results
        except Exception as e:
            print(f"[EuropePMC JSON API error for '{ingredient}']: {e}")
            return []

    def query_pubmed_central(aliases=None, max_results=100):
        try:
            search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            
            if aliases is None:
                aliases = [ingredient]

            alias_query = " OR ".join([f'"{a}"' for a in aliases])
            keywords_query = " OR ".join(keyword_config["all"])
            query = f"({alias_query}) AND ({keywords_query})"

            params = {
                "db": "pmc",
                "term": query,
                "retmode": "json",
                "retmax": max_results,
                "sort": "pub+date"
            }
            r = requests.get(search_url, params=params).json()
            ids = r.get("esearchresult", {}).get("idlist", [])
            summaries = []
            for pmid in ids:
                try:
                    summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
                    summary_resp = requests.get(summary_url, params={"db": "pmc", "id": pmid, "retmode": "json"}).json()
                    result = summary_resp.get("result", {}).get(pmid)
                    if not result:
                        continue
                    title = result.get("title", "")
                    url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmid}/"
                    summaries.append({"title": title, "url": url, "full_text": title})
                except:
                    continue
            return summaries
        except Exception as e:
            print(f"[PubMedCentral error for '{ingredient}']: {e}")
            return []

    # Combine and deduplicate by title
    pmc_results = query_pubmed_central(aliases)
    europepmc_results = query_europe_pmc(aliases)
    combined = pmc_results + europepmc_results
    seen_titles = set()
    unique_results = []
    for r in combined:
        if r["title"] and r["title"] not in seen_titles:
            unique_results.append(r)
            seen_titles.add(r["title"])
    return unique_results[:max_results]

#### NER + Semantic Extraction

In [7]:
def extract_entities(text, aliases=None, health_keywords=None):
    doc = nlp(text)

    invalid_labels = {
        "CARDINAL", "DATE", "ORDINAL", "PERCENT", "LANGUAGE", "TIME", "QUANTITY", "MONEY", "NORP", "EVENT"
    }
    forbidden_words = {
        "recall", "product", "distribution", "ingredient", "label", "cookie", "brownie", "package",
        "expiration", "sell", "pasteurization"
    }

    spans = []
    for ent in doc.ents:
        span_text = ent.text.strip()
        span_clean = span_text.lower()
        label = ent.label_

        # Re-tag known aliases incorrectly labeled as PERSON
        if label == "PERSON" and aliases and span_clean in aliases:
            print(f"[NER Correction] '{span_text}' was labeled as PERSON, relabeling as INGREDIENT")
            label = "INGREDIENT"

        # Skip irrelevant
        if label in invalid_labels:
            continue
        if len(span_text) < 3:
            continue
        if any(word in span_clean for word in forbidden_words):
            continue

        # Contextual scoring for filtering or ranking
        context_window = text[max(0, ent.start_char - 50):ent.end_char + 50].lower()
        context_score = sum(1 for kw in (health_keywords or []) if kw in context_window)

        spans.append((ent.start_char, ent.end_char, label, span_text, context_score))

    # Deduplicate overlapping spans (keep most relevant)
    merged = []
    spans.sort(key=lambda x: (x[0], -(x[1]-x[0])))
    for start, end, label, text_span, score in spans:
        if merged and start < merged[-1][1]:
            prev = merged[-1]
            if score > prev[4]:  # keep if more relevant
                merged[-1] = (start, end, label, text_span, score)
        else:
            merged.append((start, end, label, text_span, score))

    return [(text[start:end], label) for start, end, label, _, _ in merged]

#### Web Scraping and Google CSE

In [8]:
def search_web_snippets(ingredient, aliases=None, num_results=100, api_key=None, cse_id=None):
    if not api_key or not cse_id:
        raise ValueError("Google API key and CSE ID are required.")

    if aliases is None:
        aliases = [ingredient]

    alias_query = " OR ".join([f'"{a}"' for a in aliases])
    keywords = keyword_config["all"]
    keyword_groups = [keywords[i:i+20] for i in range(0, len(keywords), 20)]

    all_snippets = []

    for group in keyword_groups:
        keyword_query = " OR ".join([f'"{kw}"' for kw in group])
        query = f"({alias_query}) AND ({keyword_query})"

        try:
            service = build("customsearch", "v1", developerKey=api_key)
            res = service.cse().list(
                q=query,
                cx=cse_id,
                num=min(num_results, 10),  # API limit: max 10 per call
                sort="date"
            ).execute()
            items = res.get("items", [])
            snippets = [item.get("snippet", "") for item in items if item.get("snippet")]
            print(f"[Google CSE] Retrieved {len(snippets)} snippets for keyword batch: {group[:3]}...")
            all_snippets.extend(snippets)
        except Exception as e:
            print(f"[Google CSE Error for query]: {query}\n{e}")
            continue

    return list(set(all_snippets))  # Deduplicate

def semantic_scrape_summary(ingredient, api_key=None, cse_id=None):
    fallback_attempts = 0
    all_snippets = search_web_snippets(ingredient, aliases=[ingredient], api_key=api_key, cse_id=cse_id)

    if not all_snippets:
        fallback_attempts += 1
        fallback_term = re.sub(r'[^\w\s]', '', ingredient)
        print(f"[Fallback] Trying sanitized alias: '{fallback_term}'")
        all_snippets = search_web_snippets(fallback_term, aliases=[fallback_term], api_key=api_key, cse_id=cse_id)

    if not all_snippets:
        print(f"[Fallback] Google CSE returned no snippets even after fallback. Skipping.")
        return []

    all_ents = []
    for i, snippet in enumerate(all_snippets):
        if not snippet.strip():
            continue

        #print(f"\n[Google Snippet #{i+1} for '{ingredient}']:\n{snippet}")
        ents = extract_entities(snippet, aliases=global_alias_set, health_keywords=keyword_config["all"])
        #if ents:
            #print("  → Extracted Entities:")
            #for text, label in ents:
                #print(f"     - {text} ({label})")
        #else:
            #print("  → No entities found.")

        all_ents.extend(ents)

    return list(set(all_ents))  # Deduplicate final entity list

#### Main Processing Flow

In [9]:
def get_all_health_info(ingredient, aliases=None):
    aliases = aliases or [ingredient]
    return {
        "PubMed": query_pubmed(ingredient, aliases),
        "OpenFDA": query_openfda(ingredient, aliases),
        "RxNorm": query_rxnorm(ingredient, aliases),
        "Academic_Articles": query_academic_health_docs(ingredient, aliases)
    }

def preprocess_ingredient_list_with_health(text):
    raw_ingredients = re.split(r'[\,\n;/••]+', text)
    processed = []
    seen_terms = set()

    for raw in raw_ingredients:
        raw = raw.strip()
        if not raw:
            continue

        if not is_phonetically_valid(raw):
            print(f"[!] Skipping '{raw}' — unlikely to be a valid ingredient (too short or invalid vowel/consonant pattern)")
            continue

        standard, aliases = standardize_ingredient_name(raw)
        aliases = list(set([standard] + aliases))
        filtered_aliases = [a for a in aliases if is_relevant_alias(a)]
        
        # Sort and limit to top 5 by similarity
        ranked_aliases = sorted(
            filtered_aliases,
            key=lambda x: fuzz.token_sort_ratio(raw.lower(), x.lower()),
            reverse=True
        )
        selected_aliases = ranked_aliases[:5]

        combined_health_info = {
            "PubMed": [],
            "OpenFDA": [],
            "RxNorm": [],
            "NER_Snippets": [],
            "Academic_Articles": []
        }

        found_any_data = False

        for term in selected_aliases:
            if term.lower() in seen_terms:
                continue
            seen_terms.add(term.lower())

            api_info = get_all_health_info(term, selected_aliases)
            ner_info = semantic_scrape_summary(term, api_key=GOOGLE_API_KEY, cse_id=GOOGLE_CSE_ID)

            if any(api_info[k] for k in api_info) or ner_info:
                found_any_data = True

            for k in combined_health_info:
                if k == "NER_Snippets":
                    combined_health_info[k].extend([i for i in ner_info if i not in combined_health_info[k]])
                else:
                    combined_health_info[k].extend([i for i in api_info[k] if i not in combined_health_info[k]])

        if not found_any_data:
            print(f"⚠️ No data found for '{raw}'. Did you spell this correctly or is it too obscure?")

        processed.append({
            "standard": standard,
            "aliases": aliases,
            "health_info": combined_health_info
        })

    return processed

#### Tokenization and Summarization

In [10]:
def num_tokens(text, model="gpt-3.5-turbo"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def split_into_token_chunks(text, max_tokens=2000, model="gpt-3.5-turbo"):
    doc = nlp_sentencizer(text)
    chunks = []
    current_chunk = ""
    for sent in doc.sents:
        if num_tokens(current_chunk + sent.text, model) > max_tokens:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sent.text
        else:
            current_chunk += " " + sent.text

    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    encoding = tiktoken.encoding_for_model(model)
    tokens_per_message = 4  # each message key structure (role, content, etc.)
    tokens = 0
    for message in messages:
        tokens += tokens_per_message
        for key, value in message.items():
            tokens += len(encoding.encode(value))
    return tokens + 2  # every reply is primed with <|start|>assistant

def classify_chunk(chunk, model="your-local-llm"):
    # Placeholder: use your own model here
    # This could be a call to a HuggingFace pipeline or custom model
    return "Category: [Benefits]\nReason: Simulated classification for now."

def summarize_by_category(classified_chunks, model="your-local-llm"):
    # Placeholder: simulate summary for now
    summaries = {}
    for cat, texts in defaultdict(list).items():
        summaries[cat] = "Simulated summary."
    return summaries

def enrich_with_health_summaries(results, model="gpt-3.5-turbo"):
    for entry in results:
        all_chunks = []

        for article in entry["health_info"].get("Academic_Articles", []):
            text = article.get("full_text")
            if text and len(text.strip()) > 200:
                chunks = split_into_token_chunks(text, max_tokens=2000, model=model)
                all_chunks.extend(chunks)

        if not all_chunks:
            # Try fallback: summarize from available titles + sources
            fallback_chunks = []
            for article in entry["health_info"].get("Academic_Articles", []):
                title = article.get("title", "").strip()
                source = article.get("source", "").strip()
                if len(title) > 20:
                    snippet = f"Title: {title}"
                    if source:
                        snippet += f" | Source: {source}"
                    fallback_chunks.append(snippet)

            if fallback_chunks:
                classified = [(chunk, classify_chunk(chunk, model=model)) for chunk in fallback_chunks]
                summaries = summarize_by_category(classified, model=model)
                entry["health_summary"] = summaries
            else:
                entry["health_summary"] = {"note": "No usable full-text content or fallback title."}
            continue

        # Regular summarization path
        classified = [(chunk, classify_chunk(chunk, model=model)) for chunk in all_chunks]
        summaries = summarize_by_category(classified, model=model)
        entry["health_summary"] = summaries

    return results

##### llama3gptq integration

In [11]:
# --- Summarization Functions ---
def generate_summary(text, category, ingredient_name, model, tokenizer):
    if category == "health_effects":
        instruction = f"From the following scientific findings, summarize how {ingredient_name} affects human health. Include both health benefits and health concerns clearly labeled. Limit to 5 bullet points."
    elif category == "restrictions":
        instruction = f"From the following scientific findings, summarize how {ingredient_name} affects human dietary restrictions (such as allergies, intolerances, and religious restrictions). Limit to 5 bullet points."
    else:
        raise ValueError("Invalid category")

    prompt = instruction + "\n\n" + text

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    lines = summary.splitlines()
    lines = lines[1:]
    bullets = []
    for i, line in enumerate(lines, start=1):
        stripped = line.lstrip()
        if not stripped:
            continue
        # Only keep lines that are likely bullet points (or convertable to them)
        if stripped.startswith("-"):
            content = stripped[1:].strip()
        else:
            content = stripped

        # Prepend number if missing
        if not re.match(r"^\d+\.\s", content):
            content = f"{i}. {content}"
        content = f"        {content}"
        bullets.append(content)
    bullets[0] = f"\n{bullets[0]}"
    # Deduplicate semantically
    unique_bullets = []
    for bullet in bullets:
        # Remove bullets that are too short or appear cutoff
        if len(bullet.strip()) < 50:
            continue
        if all(fuzz.ratio(bullet, b) < 90 for b in unique_bullets):
            unique_bullets.append(bullet)
        
    return "\n".join(unique_bullets[:5]) if unique_bullets else summary


def clean_and_chunk_sentences(text, health_keywords=None, max_sentences=20):
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    
    if health_keywords:
        sentences = [s for s in sentences if any(kw in s.lower() for kw in health_keywords)]
    
    # Chunk sentences into groups
    chunks = []
    for i in range(0, len(sentences), max_sentences):
        chunks.append(" ".join(sentences[i:i+max_sentences]))
    
    return chunks

def deduplicate_sentences(sentences, threshold=92):
    """Deduplicate semantically similar sentences based on fuzzy matching."""
    unique = []
    for s in sentences:
        if all(fuzz.ratio(s, u) < threshold for u in unique):
            unique.append(s)
    return unique

def truncate_to_token_limit(text, tokenizer, max_tokens=800):
    tokens = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(tokens, skip_special_tokens=True)

def filter_sentences(sentences, task):
    if task in keyword_config:
        keywords = keyword_config[task]
        return [s for s in sentences if any(word in s.lower() for word in keywords)]
    return sentences

def is_probably_human_study(title, abstract):
    non_human_terms = [
        "mouse", "mice", "rat", "rabbits", "pigs", "cattle", "chicken", "fish", "gobies", 
        "canine", "murine", "insect", "drosophila", "dog", "animal model", "rodent", "zebrafish"
    ]
    combined_text = (title + " " + abstract).lower()
    return not any(term in combined_text for term in non_human_terms)

def extract_relevant_sentences(text, aliases=None, title=None):
    outcome_terms = [
        "result", "conclusion", "finding", "found", "significant", "associated",
        "led to", "revealed", "observed", "showed", "demonstrated", "lead to"
    ]
    skip_phrases = {"this study aims", "background", "introduction", "study design", "was conducted", "was performed"}
    keywords = set(keyword_config["all"] + outcome_terms)

    aliases = [a.lower() for a in (aliases or [])]
    reference_text = ((title or "") + " " + text).lower()
    
    # Only check alias presence in the overall reference text
    if not any(alias in reference_text for alias in aliases):
        return []

    return [
        sent.text for sent in nlp(text).sents
        if any(kw in sent.text.lower() for kw in keywords)
        and not any(phrase in sent.text.lower() for phrase in skip_phrases)
    ]

def normalize_aliases(sentences, aliases, ingredient_name):
    """Replaces all aliases in the sentences with the main ingredient name."""
    normalized = []
    for sentence in sentences:
        for alias in sorted(aliases, key=len, reverse=True):  # replace longest matches first
            pattern = re.compile(rf"\b{re.escape(alias)}\b", flags=re.IGNORECASE)
            sentence = pattern.sub(ingredient_name, sentence)
        normalized.append(sentence)
    return normalized

def enrich_with_health_summaries_v2(results, model, tokenizer):
    health_keywords = keyword_config["all"]

    for entry in results:
        ingredient_name = entry['standard']
        aliases = entry.get("aliases", [ingredient_name])
        articles = entry["health_info"].get("Academic_Articles", [])
        print(f"\n🧪 Processing ingredient: {ingredient_name}")
        print(f"→ Total academic articles found: {len(articles)}")

        if not articles:
            entry["health_summary"] = {"note": "No academic articles available."}
            continue

        human_articles = [
            a for a in articles 
            if is_probably_human_study(a.get("title", ""), a.get("full_text", ""))
        ]
        print(f"→ Human-relevant articles retained: {len(human_articles)}")
        print(f"→ Articles rejected for non-human focus: {len(articles) - len(human_articles)}")

        if not human_articles:
            entry["health_summary"] = {"note": "No human-relevant articles found."}
            continue

        entry["health_summary"] = {}

        for category in ["health_effects", "restrictions"]:
            category_keywords = (
                keyword_config["benefits"] + keyword_config["concerns"]
                if category == "health_effects"
                else keyword_config["restrictions"]
            )

            category_summaries = []
            total_relevant_sentences = 0

            for article in human_articles:
                raw_text = article.get("full_text", "")
                title = article.get("title", "")
                if not raw_text:
                    continue

                relevant_sentences = extract_relevant_sentences(raw_text, aliases=aliases, title=title)
                total_relevant_sentences += len(relevant_sentences)

                filtered_sentences = [
                    s for s in relevant_sentences if any(kw in s.lower() for kw in category_keywords)
                ]
                filtered_sentences = deduplicate_sentences(filtered_sentences)
                filtered_sentences = normalize_aliases(filtered_sentences, aliases, ingredient_name)

                if not filtered_sentences:
                    continue

                combined_text = " ".join(filtered_sentences)
                truncated_input = truncate_to_token_limit(combined_text, tokenizer)

                summary = generate_summary(
                    truncated_input, category, ingredient_name, model, tokenizer
                )
                category_summaries.append(summary)

            # ✅ Fallback title-based summarization if nothing was found
            if not category_summaries:
                fallback_chunks = []
                for article in human_articles:
                    title = article.get("title", "").strip()
                    source = article.get("source", "").strip()
                    if len(title) > 20:
                        snippet = ""  # skip titles entirely
                        #snippet = f"Title: {title}"
                        if source:
                            snippet += f" | Source: {source}"
                        fallback_chunks.append(snippet)
                fallback_chunks = normalize_aliases(fallback_chunks, aliases, ingredient_name)
                if fallback_chunks:
                    # Combine chunks, truncate, and summarize
                    combined_text = " ".join(fallback_chunks)
                    truncated_input = truncate_to_token_limit(combined_text, tokenizer)
                    fallback_summary = generate_summary(
                        truncated_input, category, ingredient_name, model, tokenizer
                    )
                    entry["health_summary"][category] = fallback_summary
                else:
                    entry["health_summary"][category] = f"No relevant {category} information found."
            else:
                entry["health_summary"][category] = "\n".join(list(set(category_summaries)))
    return results

##### test run

In [12]:
dynamize_fruity_pebbles_ingredients = "Hydrolyzed Whey Protein Isolate, Whey Protein Isolate, Natural Flavors, Artificial Flavors, Gum Arabic, FD&C #3, FD&C Red #40, FD&C Blue#1, FD&C Blue #1, FD&C Yellow#5, Salt, Soy Lecithin, Sucralose, Stevia"
dynamize_fruity_pebbles_results = preprocess_ingredient_list_with_health(dynamize_fruity_pebbles_ingredients)

# Run LLM-based summarization on your enriched ingredient data
dynamize_fruity_pebbles_results = enrich_with_health_summaries_v2(dynamize_fruity_pebbles_results, model, tokenizer)

[Google CSE] Retrieved 10 snippets for keyword batch: ['fitness', 'healing', 'side effect']...
[Google CSE] Retrieved 10 snippets for keyword batch: ['therapy', 'absorption', 'metabolic']...
[Google CSE] Retrieved 10 snippets for keyword batch: ['kidney', 'hepatotoxic', 'halal']...
[Google CSE] Retrieved 10 snippets for keyword batch: ['aiding', 'treats', 'focus']...
[Google CSE] Retrieved 10 snippets for keyword batch: ['cognitive', 'restricted', 'reduces']...
[Google CSE] Retrieved 10 snippets for keyword batch: ['improves', 'irritation', 'danger']...
[NER Correction] 'Whey Protein Isolate' was labeled as PERSON, relabeling as INGREDIENT
[NER Correction] 'Whey Protein Isolate' was labeled as PERSON, relabeling as INGREDIENT
[NER Correction] 'Whey Protein Isolate' was labeled as PERSON, relabeling as INGREDIENT
[Correction] 'whey protein isolate' autocorrected to alias: 'whey protein isolate'
[Google CSE] Retrieved 10 snippets for keyword batch: ['fitness', 'healing', 'side effect']..

  attn_output = torch.nn.functional.scaled_dot_product_attention(



🧪 Processing ingredient: whey protein isolate
→ Total academic articles found: 38
→ Human-relevant articles retained: 27
→ Articles rejected for non-human focus: 11

🧪 Processing ingredient: natural flavors
→ Total academic articles found: 36
→ Human-relevant articles retained: 27
→ Articles rejected for non-human focus: 9

🧪 Processing ingredient: artificial flavors
→ Total academic articles found: 36
→ Human-relevant articles retained: 30
→ Articles rejected for non-human focus: 6

🧪 Processing ingredient: gum arabic
→ Total academic articles found: 39
→ Human-relevant articles retained: 32
→ Articles rejected for non-human focus: 7

🧪 Processing ingredient: fd&c blue#1
→ Total academic articles found: 0

🧪 Processing ingredient: fd&c blue #1
→ Total academic articles found: 0

🧪 Processing ingredient: fd&c yellow#5
→ Total academic articles found: 0

🧪 Processing ingredient: salt
→ Total academic articles found: 89
→ Human-relevant articles retained: 72
→ Articles rejected for non-

##### Output

In [13]:
# Print trusted API info cleanly
for entry in dynamize_fruity_pebbles_results:
    print(f"\n🧪 Ingredient: {entry['standard']}")
    print("→ Aliases:", entry["aliases"])

    print("\nTrusted API Info:")
    for source, data in entry["health_info"].items():
        if source == "NER_Snippets":
            continue
        print(f"  • {source}:")
        if isinstance(data, list) and data:
            for item in data:
                if isinstance(item, str):
                    print(f"     - {item}")
                elif isinstance(item, dict):
                    print(f"     - {item.get('title', '')}")
        elif isinstance(data, list):
            print("     - No results")
        else:
            print(f"     - {data}")


🧪 Ingredient: hydrolyzed whey protein isolate
→ Aliases: ['hydrolyzed whey protein isolate']

Trusted API Info:
  • PubMed:
     - No results
  • OpenFDA:
     - No results
  • RxNorm:
     - No results
  • Academic_Articles:
     - Unraveling the Biological Properties of Whey Peptides and Their Role as Emerging Therapeutics in Immune Tolerance
     - Effects of time-of-day resistance training on muscle strength, hormonal adaptations, and sleep quality during Ramadan fasting
     - Lactiplantibacillus plantarum LM1001 Improves Digestibility of Branched-Chain Amino Acids in Whey Proteins and Promotes Myogenesis in C2C12 Myotubes
     - Proteins and Amino Acids Treated with Atmospheric Plasma Show Significantly Increased Bioavailability in Humans
     - Do the anatomical and physiological properties of a muscle determine its adaptive response to different loading protocols?
     - Four Weeks of Time-Restricted Feeding Combined with Resistance Training Does Not Differentially Influence M

In [14]:
# Print the NER Entitiies cleanly
for entry in dynamize_fruity_pebbles_results:
    print(f"\n🧪 Ingredient: {entry['standard']}")
    print("→ Aliases:", entry["aliases"])

    print("\nNER Entities from Web Snippets:")
    if entry["health_info"]["NER_Snippets"]:
        for ent_text, ent_label in entry["health_info"]["NER_Snippets"]:
            print(f"     - {ent_text} ({ent_label})")
    else:
        print("     - No named entities found.")


🧪 Ingredient: hydrolyzed whey protein isolate
→ Aliases: ['hydrolyzed whey protein isolate']

NER Entities from Web Snippets:
     - Labrada (GPE)
     - MAayan Apr (PERSON)
     - Diet (ORG)
     - Digestion & Gut (ORG)
     - Dymatize ISO (ORG)
     - Dymatize ISO100 (PERSON)
     - Protein & Fitness (ORG)
     - Servings (ORG)
     - Peanut Butter Chocolate N-ISO (ORG)
     - Healthy Awards ® (PERSON)
     - Healthy Snacks (PERSON)
     - Protein Isolate - Fruity (ORG)
     - Gaming & Entertainment (ORG)
     - Mar 24, 2025 (PERSON)
     - Healthy Aging (PERSON)
     - Gluten Free (PERSON)
     - Health & Personal Care (See Top (ORG)
     - Essential Amino Acids (WORK_OF_ART)
     - Muscle Growth & Immune Health (ORG)
     - Health & Personal Care (ORG)
     - ISO100 (PERSON)
     - Energy & Physical (ORG)
     - Gut-Friendly Protein (ORG)
     - Applied Nutrition (ORG)
     - diet (PERSON)
     - Whey Protein Isolate (ORG)
     - Health & Household (ORG)
     - Grass-Fed (ORG)
   

In [15]:
# Print the results (Summaries) cleanly
for entry in dynamize_fruity_pebbles_results:
    print(f"\n🧪 Ingredient: {entry['standard']}")
    print("\n💬 LLM-Generated Health Summaries:")
    for category, summary in entry.get("health_summary", {}).items():
        print(f"  [{category.upper()}]: {summary}")
    print("\n")

save_search_cache()
save_alias_cache()
save_alias_frequency()


🧪 Ingredient: hydrolyzed whey protein isolate

💬 LLM-Generated Health Summaries:
  [HEALTH_EFFECTS]: 
        1. Improved Muscle Protein Synthesis: Hydrolyzed whey protein isolate (WPI) has been shown to enhance muscle protein synthesis, leading to increased muscle mass and strength in both resistance-trained and untrained individuals (Paddon-Jones et al., 2001).
        2. Faster Digestion and Absorption: The hydrolysis process breaks down the protein into smaller peptides and amino acids, making it easier for the body to digest and absorb quickly, providing a rapid source of amino acids for muscle recovery and growth (Tang et al., 1999).
        3. Enhanced Immune Function: WPI contains immunoglobulins, lactoferrin, and cysteine, which have been shown to support immune function and reduce inflammation, making it beneficial for individuals with compromised immune systems or
  [RESTRICTIONS]: 
        1. Hydrolyzed whey protein isolate (WPI) is a milk-derived protein that undergoes hy

##### Evaluation

In [17]:
# --- Reference summaries ---
reference_summaries = {
    "hydrolyzed whey protein isolate-health_effects": (
        "Hydrolyzed whey protein isolate is quickly absorbed by the body and supports rapid muscle recovery after exercise. "
        "It contains essential amino acids that promote muscle growth and repair. "
        "It may help maintain lean body mass and improve athletic performance. "
        "Its hydrolyzed form may be easier to digest than regular whey. "
        "It may also contribute to increased satiety and better metabolic health."
    ),
    "hydrolyzed whey protein isolate-restrictions": (
        "Hydrolyzed whey protein isolate is derived from milk and is not suitable for individuals with milk allergies or strict vegan diets. "
        "Though hydrolyzation reduces lactose, individuals with severe lactose intolerance may still experience discomfort. "
        "It is also unsuitable for those following dairy-free diets for ethical or health reasons."
    ),

    "whey protein isolate-health_effects": (
        "Whey protein isolate promotes muscle growth and supports recovery after exercise. "
        "It is rich in branched-chain amino acids (BCAAs) that are crucial for muscle synthesis. "
        "It may help regulate appetite and support weight management. "
        "It is low in fat and carbohydrates, making it ideal for lean mass gain. "
        "Some studies suggest it supports immune function due to its cysteine content."
    ),
    "whey protein isolate-restrictions": (
        "Whey protein isolate is not suitable for individuals with milk allergies or those following a vegan lifestyle. "
        "Although it is low in lactose, some individuals with lactose intolerance may still experience symptoms. "
        "People with kidney disease should consult a healthcare provider before consuming high-protein supplements."
    ),

    "natural flavors-health_effects": (
        "They generally do not have significant health effects in small amounts. "
        "The specific health impact is hard to assess due to the lack of transparency in flavor composition."
    ),
    "natural flavors-restrictions": (
        "Natural flavors can be derived from animal sources and may not be suitable for vegans or vegetarians. "
        "Individuals with food allergies should exercise caution, as natural flavors may contain allergenic substances. "
        "People following halal or kosher diets may need clarification from manufacturers to determine acceptability."
    ),

    "artificial flavors-health_effects": (
        "They provide no nutritional benefits. "
        "Some consumers prefer to avoid them due to concerns about long-term safety and chemical exposure."
    ),
    "artificial flavors-restrictions": (
        "Artificial flavors are not suitable for individuals seeking clean-label or all-natural products. "
        "Certain artificial flavoring agents may be restricted in specific countries due to regulatory guidelines. "
        "Individuals with chemical sensitivities may prefer to avoid them."
    ),
    "gum arabic-health_effects": (
        "Gum arabic is a natural fiber derived from the acacia tree and used as a stabilizer and emulsifier. "
        "It may support gut health due to its prebiotic properties. "
        "It has a low glycemic index and is considered safe for most people. "
        "Some evidence suggests it may help with cholesterol reduction and appetite regulation."
    ),
    "gum arabic-restrictions": (
        "Gum arabic may cause bloating or mild gastrointestinal discomfort in sensitive individuals. "
        "It is generally well tolerated but should be avoided in high doses by those with irritable bowel syndrome. "
        "Some people may be allergic to it, especially if they have tree pollen allergies."
    ),

    "fd&c #3-health_effects": (
        "It has been linked to thyroid tumors in animal studies, although not definitively in humans. "
        "Some studies suggest it may cause hyperactivity in children. "
        "It provides no nutritional benefit and is banned in certain countries for use in cosmetics."
    ),
    "fd&c #3-restrictions": (
        "Individuals with dye sensitivities or behavioral conditions such as ADHD may wish to avoid it. "
        "It is not suitable for consumers seeking clean-label or additive-free products."
    ),

    "fd&c red #40-health_effects": (
        "It has been associated with allergic reactions, particularly in individuals with aspirin sensitivity. "
        "Some research links it to hyperactivity in children, though findings are mixed. "
    ),
    "fd&c red #40-restrictions": (
        "It is not suitable for people with dye allergies. "
        "Those following strict diets like Feingold may avoid it."
    ),

    "fd&c blue #1-health_effects": (
        "It is generally considered safe but may cause allergic reactions in sensitive individuals. "
        "Some animal studies have shown adverse effects on nerve cells, although evidence in humans is limited."
    ),
    "fd&c blue #1-restrictions": (
        "FD&C Blue 1 may cause hypersensitivity reactions in some individuals. "
        "It is unsuitable for those avoiding synthetic dyes or practicing natural food diets."
    ),

    "fd&c yellow #5-health_effects": (
        "It has been linked to hyperactivity in children and allergic reactions in some individuals. "
        "It provides no nutritional value and is often viewed negatively by clean-eating proponents."
    ),
    "fd&c yellow #5-restrictions": (
        "FD&C Yellow 5 must be declared on food labels in the U.S. due to known allergic reactions. "
        "It is not acceptable in additive-free, organic, or dye-restricted diets."
    ),

    "salt-health_effects": (
        "Salt (sodium chloride) is essential for nerve and muscle function. "
        "However, excessive intake can lead to high blood pressure, heart disease, and stroke. "
        "It can cause water retention and kidney strain when consumed in large amounts. "
        "Moderation is key to balancing its benefits and risks."
    ),
    "salt-restrictions": (
        "Salt must be restricted in low-sodium diets, often prescribed for hypertension or cardiovascular disease. "
        "It may be avoided in renal diets to reduce fluid retention. "
        "Some religious fasting practices also restrict salt intake."
    ),

    "soy lecithin-health_effects": (
        "Soy lecithin is a natural emulsifier that may support cognitive health due to its choline content. "
        "It may help manage cholesterol levels and promote liver health. "
        "Some studies suggest potential antioxidant effects."
    ),
    "soy lecithin-restrictions": (
        "Soy lecithin is derived from soybeans and is not suitable for individuals with soy allergies. "
        "It is also unsuitable for those avoiding genetically modified ingredients, unless labeled non-GMO. "
        "People following soy-free or elimination diets should avoid it."
    ),

    "sucralose-health_effects": (
        "It is generally considered safe but may alter gut microbiota with long-term use. "
        "Some individuals report headaches or bloating after consumption. "
    ),
    "sucralose-restrictions": (
        "Sucralose is not suitable for individuals avoiding artificial sweeteners. "
        "Some people with digestive disorders may prefer to avoid it. "
        "It is not allowed in certain organic food certifications."
    ),

    "stevia-health_effects": (
        "It has no calories and may help manage blood sugar levels. "
        "It is considered safe for most people and may support weight management. "
        "Some studies suggest antioxidant and anti-inflammatory properties."
    ),
    "stevia-restrictions": (
        "Stevia may cause bloating or nausea in some sensitive individuals. "
        "Individuals allergic to ragweed may also react to stevia."
    ),
}


In [18]:
# --- Evaluation functions ---
def compute_recall_at_k(generated_bullets, reference_bullets, k=5):
    matches = sum(
        1 for b in generated_bullets[:k]
        if any(fuzz.partial_ratio(b.lower(), r.lower()) > 80 for r in reference_bullets)
    )
    return matches / min(k, len(reference_bullets)) if reference_bullets else 0

def compute_mrr(generated_bullets, reference_bullets):
    for rank, b in enumerate(generated_bullets, start=1):
        if any(fuzz.partial_ratio(b.lower(), r.lower()) > 80 for r in reference_bullets):
            return 1 / rank
    return 0

# Set up metric containers
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
model = SentenceTransformer("all-MiniLM-L6-v2")

recalls, mrrs, rouge1_scores, rougeL_scores = [], [], [], []
cosine_scores, bert_precisions, bert_recalls, bert_f1s = [], [], [], []

# Evaluation loop
for entry in dynamize_fruity_pebbles_results:
    ingredient = entry["standard"].lower()
    for category, summary in entry.get("health_summary", {}).items():
        key = f"{ingredient}-{category}"
        reference = reference_summaries.get(key)

        if not reference or not summary.strip():
            continue

        # Extract generated bullets
        generated_bullets = [
            line.strip()[2:].strip()
            for line in summary.splitlines()
            if re.match(r"^\d+\.", line.strip())
        ]
        if not generated_bullets:
            continue

        # Extract reference sentences
        reference_bullets = [s.strip() for s in reference.split('.') if s.strip()]
        if not reference_bullets:
            continue

        # Recall@K and MRR
        recalls.append(compute_recall_at_k(generated_bullets, reference_bullets, k=5))
        mrrs.append(compute_mrr(generated_bullets, reference_bullets))

        # ROUGE scores
        rouge_scores = scorer.score(summary, reference)
        rouge1_scores.append(rouge_scores["rouge1"].fmeasure)
        rougeL_scores.append(rouge_scores["rougeL"].fmeasure)

        # SBERT cosine similarity
        gen_emb = model.encode(" ".join(generated_bullets), convert_to_tensor=True)
        ref_emb = model.encode(reference, convert_to_tensor=True)
        cosine_scores.append(float(util.pytorch_cos_sim(gen_emb, ref_emb)[0][0]))

        # BERTScore
        P, R, F1 = bert_score(
            [" ".join(generated_bullets)], [reference],
            lang="en", verbose=False
        )
        bert_precisions.append(P[0].item())
        bert_recalls.append(R[0].item())
        bert_f1s.append(F1[0].item())

# --- Final Output ---
print("\n📊 Evaluation Metrics Summary:")
print(f"Avg Recall@5:        {np.mean(recalls):.3f}")
print(f"Avg MRR:             {np.mean(mrrs):.3f}")
print(f"Avg ROUGE-1:         {np.mean(rouge1_scores):.3f}")
print(f"Avg ROUGE-L:         {np.mean(rougeL_scores):.3f}")
print(f"Avg Cosine Similar.: {np.mean(cosine_scores):.3f}")
print(f"Avg BERTScore-F1:    {np.mean(bert_f1s):.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho


📊 Evaluation Metrics Summary:
Avg Recall@5:        0.019
Avg MRR:             0.056
Avg ROUGE-1:         0.252
Avg ROUGE-L:         0.162
Avg Cosine Similar.: 0.724
Avg BERTScore-F1:    0.861


### TODO:
- Reorganize & Cleanup Code
- Modify Markdown
- Create Surveys
- Run Survey Data
- Write Report
- Make Presentation
- Record Presentation
- Upload and Finit 🕺💃