## Final Project - NLP-Driven Ingredient Health and Dietary Restriction Analysis

*Name: Laura Obermaier*

*Stevens ID: 20027358*

#### Imports

In [None]:
import re
import csv
import requests
import spacy
import pubchempy as pcp
from rapidfuzz import process, fuzz
from collections import defaultdict
import json
import time
from spacy.lang.en import English
import tiktoken
from googleapiclient.discovery import build
from dotenv import load_dotenv
import os
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import torch
from nltk.tokenize import sent_tokenize
import nltk

#### Environment and Global Setup

In [None]:
# Read variables from a local .env file (if any) into the process environment,
# then pick up the Google Custom Search credentials used by search_web_snippets().
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")

# Raise the csv module's maximum field size to 2**20 characters; the Open Food
# Facts export parsed below is read with csv.DictReader.
csv.field_size_limit(2**20)

# Full spaCy pipeline (used for NER and sentence splitting) ...
nlp = spacy.load("en_core_web_sm")
# ... and a blank English pipeline that only carries the "sentencizer" component.
nlp_sentencizer = English()
nlp_sentencizer.add_pipe("sentencizer")

# Module-level state shared by the helpers below.
global_alias_set = set()              # known ingredient aliases (lower-cased strings)
search_cache = {}                     # ingredient -> list of Google CSE snippets
alias_frequency = defaultdict(int)    # alias -> occurrence count from Open Food Facts

# On-disk locations for the three caches above.
CACHE_FILE = "search_cache.json"
ALIAS_FREQ_FILE = "alias_frequency.json"
ALIAS_CACHE_FILE = "alias_cache.json"

# Restore previously cached search snippets so repeated runs can skip the API.
try:
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        search_cache = json.load(f)
except FileNotFoundError:
    # First run: nothing has been cached yet.
    search_cache = {}
except json.JSONDecodeError as e:
    # A truncated or hand-edited cache must not abort the whole notebook;
    # report it and start from an empty cache instead.
    print(f"[!] Search cache at {CACHE_FILE} is not valid JSON ({e}); starting empty.")
    search_cache = {}

# Hugging Face repo id of the GPTQ-quantized Mistral-7B-Instruct checkpoint that
# backs generate_summary().
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repo — confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
    trust_remote_code=True
)

# Load the quantized weights from the "main" revision; device_map="auto" leaves
# device placement to the library.
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    use_safetensors=True,
    device_map="auto",
    trust_remote_code=True,
    revision="main"
)

# Keyword lists used to (a) score NER context windows and (b) route sentences to
# the "benefits" / "concerns" / "restrictions" summaries. Matching elsewhere in
# this file is a plain lower-cased substring test.
keyword_config = {
    "benefits": [
        "benefit", "supports", "improves", "boosts", "enhances", "aids", "reduces", "prevents",
        "protects", "promotes", "stimulates", "strengthens", "aiding", "improving", "healing",
        "facilitates", "enhancing", "balances", "restores", "ameliorates", "treats", "alleviates",
        "relieves", "contributes to", "beneficial", "positive", "advantageous", "favorable",
        "healthy", "wellness", "nutrient", "nutritional", "immune", "well-being", "absorption",
        "energy", "fitness", "cognitive", "focus", "clarity", "relief", "anti-inflammatory",
        "cramps", "gut", "probiotic", "enzyme", "alkaline", "acidic", "bloating", "constipation"
    ],
    "concerns": [
        "risk", "toxic", "harm", "adverse", "cause", "increased", "linked to", "danger",
        "poisonous", "unsafe", "negatively", "exacerbates", "side effect",
        "carcinogenic", "neurotoxic", "hepatotoxic", "irritation", "may lead to", "trigger",
        # Was misspelled "overdoes", which never matched the word "overdose".
        "overdose", "reaction", "symptom", "pain", "toxins"
    ],
    "restrictions": [
        "allergy", "intolerance", "sensitivity", "restricted", "avoid", "not suitable",
        "contraindicated", "dietary restriction", "religious restriction", "vegan", "vegetarian",
        "halal", "haram", "gluten", "lactose", "kosher", "FODMAP", "contains", "may contain",
        "cross-contamination", "not recommended", "not advised", "not suitable for",
        "not safe for"
    ],
    "neutral": [
        "kidney", "liver", "cholesterol", "diabetes", "inflammation", "cancer", "cardiovascular",
        "digestion", "therapy", "treatment", "metabolism", "metabolic", "calories"
    ]
}

# Compose superset for health relevance filtering. Sorted so the list order is
# reproducible between runs (a bare set has no stable order).
keyword_config["all"] = sorted(set(
    keyword_config["benefits"] +
    keyword_config["concerns"] +
    keyword_config["restrictions"] +
    keyword_config["neutral"] +
    ["health", "nutrition", "disease"]
))

#### Cache Utilities

In [None]:
def save_search_cache():
    """Write the in-memory ``search_cache`` to ``CACHE_FILE`` as indented JSON.

    Best-effort: any failure is printed and otherwise ignored.
    """
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as cache_fh:
            json.dump(search_cache, cache_fh, ensure_ascii=False, indent=2)
    except Exception as err:
        print(f"[!] Failed to save search cache: {err}")

def save_alias_frequency(path=ALIAS_FREQ_FILE):
    """Persist the ``alias_frequency`` counts to ``path`` as indented JSON.

    Prints a confirmation on success; on any error prints a message and
    returns normally.
    """
    try:
        with open(path, "w", encoding="utf-8") as out_fh:
            json.dump(alias_frequency, out_fh, ensure_ascii=False, indent=2)
        print(f"[✓] Alias frequency saved to {path}")
    except Exception as err:
        print(f"[!] Error saving alias frequency: {err}")

def load_alias_frequency(path=ALIAS_FREQ_FILE):
    """Replace the global ``alias_frequency`` with the counts stored at ``path``.

    Returns:
        True when the file was read and parsed; False when it is missing or
        could not be loaded (a message is printed in both failure cases).
    """
    global alias_frequency
    try:
        with open(path, "r", encoding="utf-8") as in_fh:
            stored = json.load(in_fh)
            restored = defaultdict(int)
            # JSON values are coerced to int before the global is rebound, so
            # a bad value leaves the previous table untouched.
            for alias, count in stored.items():
                restored[alias] = int(count)
            alias_frequency = restored
        print(f"[✓] Loaded {len(alias_frequency)} alias frequencies from cache.")
        return True
    except FileNotFoundError:
        print(f"[ ] Alias frequency cache not found at {path}. Will regenerate.")
        return False
    except Exception as err:
        print(f"[!] Error loading alias frequency: {err}")
        return False
    
def save_alias_cache(path=ALIAS_CACHE_FILE):
    """Write the sorted contents of ``global_alias_set`` to ``path`` as JSON.

    Prints a confirmation on success; errors are printed and swallowed.
    """
    try:
        with open(path, "w", encoding="utf-8") as out_fh:
            ordered_aliases = sorted(global_alias_set)
            json.dump(ordered_aliases, out_fh, ensure_ascii=False, indent=2)
        print(f"[✓] Alias cache saved to {path}")
    except Exception as err:
        print(f"[!] Error saving alias cache: {err}")

def load_alias_cache(path=ALIAS_CACHE_FILE):
    """Replace the global ``global_alias_set`` with the aliases stored at ``path``.

    Returns:
        True when the cache was loaded; False when the file is absent or
        unreadable (the reason is printed).
    """
    global global_alias_set
    try:
        with open(path, "r", encoding="utf-8") as in_fh:
            stored_aliases = json.load(in_fh)
            global_alias_set = set(stored_aliases)
        print(f"[✓] Loaded {len(global_alias_set)} aliases from cache.")
        return True
    except FileNotFoundError:
        print(f"[ ] Alias cache not found at {path}. Will seed from source...")
        return False
    except Exception as err:
        print(f"[!] Error loading alias cache: {err}")
        return False

#### Alias + Name Handling

In [None]:
def is_relevant_alias(alias):
    """Heuristically decide whether ``alias`` looks like a usable ingredient name.

    Args:
        alias: Candidate synonym (any casing / surrounding whitespace).

    Returns:
        False for aliases that look like catalogue or lab entries (too many
        words, too long, CAS-number shaped, containing commas, several
        parentheses, or grade/packaging words); True otherwise. Aliases with
        long digit runs, percentages or symbols are kept only when they have
        been seen at least 5 times in ``alias_frequency``.
    """
    alias_clean = alias.strip().lower()
    if len(alias_clean.split()) > 4:
        return False
    if re.search(r'\d{3,}|\d+%|[^\w\s\-]', alias_clean):  # long numeric sequence or symbols
        # Use .get(): alias_frequency is a defaultdict, so indexing it would
        # insert a zero-count entry for every alias merely *checked* here and
        # those phantom entries would later be written to the frequency cache.
        if alias_frequency.get(alias_clean, 0) < 5:  # require higher frequency to keep
            return False
    if re.search(r'^\d{2,5}-\d{2,5}-\d$', alias_clean):  # CAS-registry-number shape
        return False
    if len(alias_clean) > 40:
        return False
    if alias_clean.count(',') > 0 or alias_clean.count('(') > 1:
        return False
    if any(keyword in alias_clean for keyword in ['acs', 'usp', 'grade', 'reference', 'powder', 'solution', 'mist']):
        return False
    return True

def get_pubchem_aliases(ingredient_name):
    """Look up ``ingredient_name`` on PubChem and return its filtered synonyms.

    Only the first matching compound is consulted. Synonyms that pass
    ``is_relevant_alias`` are lower-cased and de-duplicated.

    Returns:
        A list of unique lower-case synonyms (order unspecified); an empty
        list when nothing matches or the lookup raises.
    """
    try:
        hits = pcp.get_compounds(ingredient_name, 'name')
        if not hits:
            return []
        unique_synonyms = {
            synonym.lower()
            for synonym in hits[0].synonyms
            if is_relevant_alias(synonym)
        }
        return list(unique_synonyms)
    except Exception as err:
        print(f"[PubChem error for '{ingredient_name}']: {err}")
        return []

def update_alias_cache(aliases):
    """Add every non-empty alias, lower-cased and stripped, to ``global_alias_set``."""
    global_alias_set.update(
        entry.lower().strip() for entry in aliases if entry
    )

def fuzzy_match_alias(name, threshold=90):
    """Return the known alias most similar to ``name``, or None.

    Similarity is rapidfuzz's ``token_sort_ratio``; a candidate is accepted
    only when its score is at least ``threshold``. Prints a warning and
    returns None when the alias set has not been populated.
    """
    if not global_alias_set:
        print("[Warning] Alias set is empty — did you run seed_aliases_from_open_food_facts?")
        return None

    best = process.extractOne(name, global_alias_set, scorer=fuzz.token_sort_ratio)
    if best is None:
        return None

    candidate, similarity, _ = best
    if similarity < threshold:
        return None
    return candidate

def is_valid_alias(alias, reference, threshold=85):
    """Return True when ``alias`` is at least ``threshold`` similar to ``reference``.

    Both strings are compared case-insensitively with ``fuzz.ratio``.
    """
    similarity = fuzz.ratio(alias.lower(), reference.lower())
    return similarity >= threshold

def is_phonetically_valid(word):
    """Cheap junk filter based on the vowel share of ``word``.

    Returns False for strings shorter than 3 characters, strings without any
    consonant, and strings whose vowels make up less than 20% or more than
    80% of their letters. Non-alphabetic characters are ignored in the count.
    """
    lowered = word.lower()
    if len(lowered) < 3:
        return False

    vowel_count = 0
    consonant_count = 0
    for ch in lowered:
        if ch in "aeiou":
            vowel_count += 1
        elif ch.isalpha():
            consonant_count += 1

    if not consonant_count:
        return False

    vowel_share = vowel_count / (vowel_count + consonant_count)
    # Extremely low or high vowel share is treated as junk.
    return not (vowel_share < 0.2 or vowel_share > 0.8)

def standardize_ingredient_name(name, max_aliases=5):
    """Map a raw ingredient string to a canonical name plus a short alias list.

    PubChem synonyms are tried first; if PubChem yields nothing, the closest
    entry of the local alias set is used; otherwise the cleaned input is
    returned unchanged.

    Args:
        name: Raw ingredient text.
        max_aliases: Maximum number of aliases to return.

    Returns:
        ``(standard_name, aliases)`` where ``aliases`` is a non-empty list
        whose first element is ``standard_name``.
    """
    name = name.lower().strip()
    pubchem_aliases = get_pubchem_aliases(name)

    if not pubchem_aliases:
        # No PubChem hit: fall back to the locally seeded alias set.
        corrected = fuzzy_match_alias(name)
        if corrected:
            print(f"[Correction] '{name}' autocorrected to alias: '{corrected}'")
            return corrected, [corrected]
        return name, [name]

    update_alias_cache(pubchem_aliases)

    # Keep only aliases that look like ingredient names and resemble the query.
    candidates = []
    for alias in pubchem_aliases:
        if is_relevant_alias(alias) and is_valid_alias(alias, name):
            candidates.append(alias)

    def _rank(alias):
        # Exact match first, then highest similarity, then shortest.
        exactness = 0 if alias == name else 1
        return (exactness, -fuzz.token_sort_ratio(name, alias), len(alias))

    candidates.sort(key=_rank)
    top_aliases = candidates[:max_aliases]

    if not top_aliases:
        print(f"[Alias Fallback] No valid aliases for '{name}', reverting to original.")
        return name, [name]

    print(f"[Query Alias] {name} → Filtered aliases: {top_aliases}")
    return top_aliases[0], top_aliases

#### Open Food Facts Seeding

In [None]:
def seed_aliases_from_open_food_facts(limit=10000):
    """Seed ``global_alias_set`` / ``alias_frequency`` from the Open Food Facts export.

    Streams the tab-separated product dump, reads at most ``limit`` rows, and
    for every comma-separated ingredient in ``ingredients_text``:
    counts it in ``alias_frequency`` and links it with the ingredients found
    in the row's ``ingredients_text_<lang>`` columns (fr, de, es, it).

    Args:
        limit: Maximum number of CSV rows to read.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx HTTP status (previously an error page would have been
            parsed as if it were CSV).
    """
    url = "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
    alias_dict = defaultdict(set)
    langs = ['fr', 'de', 'es', 'it']

    # Context manager: the streamed connection is released even though we
    # stop reading long before the end of the file.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        response.encoding = 'utf-8'

        lines = (line.decode('utf-8') for line in response.iter_lines())
        reader = csv.DictReader(lines, delimiter='\t')

        for count, row in enumerate(reader):
            if count >= limit:
                break
            if count % 500 == 0:
                print(f"Processing row {count}...")

            # DictReader yields None for fields missing from short rows.
            ingredients_text = row.get("ingredients_text") or ""
            if not ingredients_text.strip():
                continue

            # Split the translated columns once per row instead of once per
            # ingredient; the result does not depend on the ingredient.
            translated = []
            for lang in langs:
                alt = row.get(f"ingredients_text_{lang}")
                if alt:
                    for alt_ing in alt.split(','):
                        alt_ing = alt_ing.strip().lower()
                        if alt_ing:
                            translated.append(alt_ing)

            for ing in ingredients_text.split(','):
                ing = ing.strip().lower()
                if not ing:
                    continue
                alias_dict[ing].add(ing)
                alias_frequency[ing] += 1

                for alt_ing in translated:
                    alias_dict[ing].add(alt_ing)
                    alias_dict[alt_ing].add(ing)

    for aliases in alias_dict.values():
        update_alias_cache(list(aliases))
    print(f"[✓] Seeded {len(global_alias_set)} unique aliases from Open Food Facts.")

# Try to restore both alias caches from disk first.
loaded_alias = load_alias_cache()
loaded_freqs = load_alias_frequency()

# If either cache is missing or unreadable, rebuild both from the first 5000
# Open Food Facts rows and write them back to disk.
if not (loaded_alias and loaded_freqs):
    seed_aliases_from_open_food_facts(limit=5000)
    save_alias_cache()
    save_alias_frequency()

#### External API Queries

In [None]:
def query_pubmed(ingredient, max_results=50):
    """Search PubMed for ``ingredient`` and return brief article summaries.

    Args:
        ingredient: Search term passed to ESearch.
        max_results: Maximum number of article ids to request.

    Returns:
        A list of ``{"title", "source", "pubdate"}`` dicts in ESearch order;
        an empty list when nothing is found or any request fails (the error
        is printed).
    """
    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    try:
        search_params = {"db": "pubmed", "term": ingredient, "retmode": "json", "retmax": max_results}
        search_resp = requests.get(search_url, params=search_params, timeout=30)
        search_resp.raise_for_status()
        ids = search_resp.json().get("esearchresult", {}).get("idlist", [])
        if not ids:
            return []

        # One ESummary call for all ids (comma-separated) instead of one call
        # per id: same data, and it stays within NCBI's request-rate limits.
        summary_params = {"db": "pubmed", "id": ",".join(ids), "retmode": "json"}
        summary_resp = requests.get(summary_url, params=summary_params, timeout=30)
        summary_resp.raise_for_status()
        records = summary_resp.json().get("result", {})

        summaries = []
        for pmid in ids:
            record = records.get(pmid)
            if record:
                summaries.append({
                    "title": record.get("title"),
                    "source": record.get("source"),
                    "pubdate": record.get("pubdate"),
                })
        return summaries
    except Exception as e:
        # Best-effort lookup: report and return nothing rather than abort.
        print(f"[PubMed error for '{ingredient}']: {e}")
        return []
    
def is_fda_entry_relevant(text, ingredient):
    """Return True when ``text`` mentions ``ingredient`` and no logistics keyword.

    The check is a case-insensitive substring test. Any occurrence of a
    recall/labelling/packaging-style keyword disqualifies the text outright.
    """
    irrelevant_keywords = ["recall", "undeclared", "labeling", "distribution", "pasteurization", "packaging", "incorrect", "contain"]
    lowered = text.lower()
    for keyword in irrelevant_keywords:
        if keyword in lowered:
            return False
    return ingredient.lower() in lowered

def query_openfda(ingredient):
    """Return relevant ``reason_for_recall`` texts from openFDA food enforcement data.

    Up to 5 enforcement records whose product description matches
    ``ingredient`` are fetched; each reason is kept only if
    ``is_fda_entry_relevant`` accepts it.

    Returns:
        A list of reason strings; an empty list when nothing matches or the
        request fails (the error is printed).
    """
    try:
        base_url = "https://api.fda.gov/food/enforcement.json"
        params = {"search": f"product_description:{ingredient}", "limit": 5}
        payload = requests.get(base_url, params=params, timeout=30).json()

        reasons = []
        for rec in payload.get("results", []):
            # Skip records without a reason instead of discarding the whole
            # response because of one KeyError.
            reason = rec.get("reason_for_recall")
            if reason and is_fda_entry_relevant(reason, ingredient):
                reasons.append(reason)
        return reasons
    except Exception as e:
        # Narrower than a bare ``except``: Ctrl-C / SystemExit still propagate.
        print(f"[OpenFDA error for '{ingredient}']: {e}")
        return []

def query_rxnorm(ingredient):
    """Look up the RxNorm concept ids (RxCUIs) registered under ``ingredient``.

    Returns:
        The ``idGroup.rxnormId`` list from the RxNav response; an empty list
        when there is no match or the request fails (the error is printed).
    """
    try:
        url = "https://rxnav.nlm.nih.gov/REST/rxcui.json"
        rxcui = requests.get(url, params={"name": ingredient}, timeout=30).json()
        return rxcui.get("idGroup", {}).get("rxnormId", [])
    except Exception as e:
        # Narrower than a bare ``except``: Ctrl-C / SystemExit still propagate.
        print(f"[RxNorm error for '{ingredient}']: {e}")
        return []
    
def query_academic_health_docs(ingredient, max_results=50):
    """Collect nutrition/health articles about ``ingredient`` from PMC and Europe PMC.

    Args:
        ingredient: Term searched as an exact phrase together with
            ``(nutrition OR health OR diet)``.
        max_results: Upper bound for each source query and for the combined
            result list.

    Returns:
        A list of article dicts, PubMed Central hits first, de-duplicated by
        title. Every dict has ``title``, ``url`` and ``full_text``; Europe PMC
        entries additionally carry ``source`` and ``pubdate`` and use the
        abstract as ``full_text`` (PMC entries only have the title there).
    """
    def query_europe_pmc(limit):
        try:
            base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
            query = f'"{ingredient}" AND (nutrition OR health OR diet)'
            params = {
                "query": query,
                "format": "json",
                "resultType": "core",
                "sort": "P_PDATE_D",
                "pageSize": limit
            }
            r = requests.get(base_url, params=params, timeout=30).json()
            results = []
            for record in r.get("resultList", {}).get("result", []):
                abstract = record.get("abstractText")
                # Very short abstracts carry too little signal to summarize.
                if not abstract or len(abstract.strip()) < 100:
                    continue
                results.append({
                    "title": record.get("title"),
                    "source": record.get("journalTitle"),
                    "pubdate": record.get("firstPublicationDate", record.get("pubYear")),
                    "url": f"https://europepmc.org/article/{record.get('source')}/{record.get('id')}",
                    "full_text": abstract.strip()
                })
            return results
        except Exception as e:
            print(f"[EuropePMC JSON API error for '{ingredient}']: {e}")
            return []

    def query_pubmed_central(limit):
        try:
            search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
            params = {
                "db": "pmc",
                "term": f'"{ingredient}" AND (nutrition OR health OR diet)',
                "retmode": "json",
                "retmax": limit,
                "sort": "pub+date"
            }
            r = requests.get(search_url, params=params, timeout=30).json()
            ids = r.get("esearchresult", {}).get("idlist", [])
            if not ids:
                return []

            # One ESummary call for all ids instead of one per id, which was
            # slow and prone to being rate-limited (failures were silently
            # skipped by a bare ``except``).
            summary_resp = requests.get(
                summary_url,
                params={"db": "pmc", "id": ",".join(ids), "retmode": "json"},
                timeout=30,
            ).json()
            records = summary_resp.get("result", {})

            summaries = []
            for pmc_id in ids:
                result = records.get(pmc_id)
                if not result:
                    continue
                title = result.get("title", "")
                url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/"
                summaries.append({"title": title, "url": url, "full_text": title})
            return summaries
        except Exception as e:
            print(f"[PubMedCentral error for '{ingredient}']: {e}")
            return []

    # Pass the caller's limit through: the helpers used to shadow it with
    # their own default of 50, so ``max_results`` only affected the final slice.
    pmc_results = query_pubmed_central(max_results)
    europepmc_results = query_europe_pmc(max_results)

    # Combine and deduplicate by title (entries without a title are dropped).
    seen_titles = set()
    unique_results = []
    for r in pmc_results + europepmc_results:
        if r["title"] and r["title"] not in seen_titles:
            unique_results.append(r)
            seen_titles.add(r["title"])
    return unique_results[:max_results]

#### NER + Semantic Extraction

In [None]:
def extract_entities(text, aliases=None, health_keywords=None):
    """Run spaCy NER over ``text`` and return the entities worth keeping.

    Args:
        text: Snippet to analyse.
        aliases: Optional collection of known ingredient aliases (lower-case);
            a PERSON entity found in it is relabelled ``INGREDIENT``.
        health_keywords: Optional keywords; the number found within 50
            characters of an entity is its relevance score, used only to
            choose between overlapping spans.

    Returns:
        A list of ``(entity_text, label)`` tuples in document order.
    """
    parsed = nlp(text)

    skip_labels = {
        "CARDINAL", "DATE", "ORDINAL", "PERCENT", "LANGUAGE", "TIME", "QUANTITY", "MONEY", "NORP", "EVENT"
    }
    banned_words = {
        "recall", "product", "distribution", "ingredient", "label", "cookie", "brownie", "package",
        "expiration", "sell", "pasteurization"
    }
    keywords = health_keywords or []

    candidates = []
    for ent in parsed.ents:
        surface = ent.text.strip()
        lowered = surface.lower()
        tag = ent.label_

        # Known aliases are sometimes mistaken for person names.
        if tag == "PERSON" and aliases and lowered in aliases:
            print(f"[NER Correction] '{surface}' was labeled as PERSON, relabeling as INGREDIENT")
            tag = "INGREDIENT"

        # Drop numeric/temporal labels, very short spans and logistics words.
        if tag in skip_labels or len(surface) < 3:
            continue
        if any(word in lowered for word in banned_words):
            continue

        # Relevance = number of health keywords within +/- 50 characters.
        window_start = max(0, ent.start_char - 50)
        window = text[window_start:ent.end_char + 50].lower()
        relevance = sum(1 for kw in keywords if kw in window)

        candidates.append((ent.start_char, ent.end_char, tag, surface, relevance))

    # Earliest span first; among spans starting together, the longest first.
    candidates.sort(key=lambda c: (c[0], c[0] - c[1]))

    kept = []
    for cand in candidates:
        overlaps_previous = bool(kept) and cand[0] < kept[-1][1]
        if not overlaps_previous:
            kept.append(cand)
        elif cand[4] > kept[-1][4]:
            # Overlap: the strictly more relevant span replaces the previous one.
            kept[-1] = cand

    return [(text[c[0]:c[1]], c[2]) for c in kept]

#### Web Scraping and Google CSE

In [None]:
def search_web_snippets(ingredient, num_results=50, api_key=None, cse_id=None):
    """Fetch Google Custom Search snippets about ``ingredient`` (cached).

    Args:
        ingredient: Term searched as an exact phrase together with
            ``(health OR diet OR nutrition)``.
        num_results: Number of results wanted. The API serves at most 10 per
            request and 100 per query, so this is fetched in pages of 10 and
            capped at 100. Each page counts against the API quota.
        api_key: Google API key.
        cse_id: Custom Search Engine id.

    Returns:
        A list of non-empty snippet strings. Results of a fully successful
        search are stored in ``search_cache``; if a request fails, whatever
        was collected so far is returned without being cached.

    Raises:
        ValueError: if ``api_key`` or ``cse_id`` is missing.
    """
    if not api_key or not cse_id:
        raise ValueError("Google API key and CSE ID are required.")

    if ingredient in search_cache:
        return search_cache[ingredient]

    query = f'"{ingredient}" AND (health OR diet OR nutrition)'

    page_size = 10   # largest ``num`` the API accepts; 50 used to fail every call
    max_total = 100  # the API does not serve results past the first 100
    wanted = max(0, min(num_results, max_total))

    snippets = []
    failed = False
    try:
        service = build("customsearch", "v1", developerKey=api_key)
        fetched = 0
        while fetched < wanted:
            batch = min(page_size, wanted - fetched)
            res = service.cse().list(
                q=query, cx=cse_id, num=batch, start=fetched + 1, sort='date'
            ).execute()
            items = res.get("items", [])
            snippets.extend(item["snippet"] for item in items if item.get("snippet"))
            fetched += len(items)
            if len(items) < batch:
                break  # no further pages available
    except Exception as e:
        print(f"[Google Search Error for '{ingredient}']: {e}")
        failed = True
        if not snippets:
            return []

    print(f"[Google CSE Success] Retrieved {len(snippets)} snippets for '{ingredient}'")
    print(f"[Google CSE Log] Query: {query}")
    for i, snippet in enumerate(snippets[:3]):
        print(f"  Snippet {i+1}: {snippet}")

    if not failed:
        # Only complete result sets are cached, so a partial set is retried later.
        search_cache[ingredient] = snippets
    return snippets

def semantic_scrape_summary(ingredient, api_key=None, cse_id=None):
    """Extract named entities from web snippets about ``ingredient``.

    Snippets come from ``search_web_snippets``; when none are found and
    stripping punctuation changes the term, the sanitized term is tried once.
    Each snippet is passed through ``extract_entities`` with the global alias
    set and the combined health keyword list.

    Returns:
        A de-duplicated list of ``(entity_text, label)`` tuples in first-seen
        order; an empty list when no snippets could be retrieved.
    """
    all_snippets = search_web_snippets(ingredient, api_key=api_key, cse_id=cse_id)

    if not all_snippets:
        fallback_term = re.sub(r'[^\w\s]', '', ingredient)
        # Re-querying with an identical (or empty) term would only burn API quota.
        if fallback_term and fallback_term != ingredient:
            print(f"[Fallback] Trying sanitized alias: '{fallback_term}'")
            all_snippets = search_web_snippets(fallback_term, api_key=api_key, cse_id=cse_id)

    if not all_snippets:
        print("[Fallback] Google CSE returned no snippets even after fallback. Skipping.")
        return []

    all_ents = []
    for i, snippet in enumerate(all_snippets):
        if not snippet.strip():
            continue

        print(f"\n[Google Snippet #{i+1} for '{ingredient}']:\n{snippet}")
        ents = extract_entities(snippet, aliases=global_alias_set, health_keywords=keyword_config["all"])
        if ents:
            print("  → Extracted Entities:")
            for text, label in ents:
                print(f"     - {text} ({label})")
        else:
            print("  → No entities found.")

        all_ents.extend(ents)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is reproducible (a set would shuffle it between runs).
    return list(dict.fromkeys(all_ents))

#### Main Processing Flow

In [None]:
def get_all_health_info(ingredient):
    """Query every structured source for ``ingredient`` and collect the results.

    Returns:
        A dict with the keys ``PubMed``, ``OpenFDA``, ``RxNorm`` and
        ``Academic_Articles``, each holding that source's result list.
    """
    sources = (
        ("PubMed", query_pubmed),
        ("OpenFDA", query_openfda),
        ("RxNorm", query_rxnorm),
        ("Academic_Articles", query_academic_health_docs),
    )
    return {label: lookup(ingredient) for label, lookup in sources}

def preprocess_ingredient_list_with_health(text):
    """Parse an ingredient list and gather health information for each item.

    The text is split on commas, newlines, semicolons, slashes and bullets.
    Every plausible ingredient is standardized, and up to five of its aliases
    are looked up in the structured APIs and via web-snippet NER.

    Args:
        text: Raw ingredient list, e.g. ``"whey protein isolate, sugar"``.

    Returns:
        One dict per accepted ingredient with the keys ``standard`` (canonical
        name), ``aliases`` (list) and ``health_info`` (dict with the keys
        ``PubMed``, ``OpenFDA``, ``RxNorm``, ``NER_Snippets`` and
        ``Academic_Articles``).
    """
    raw_ingredients = re.split(r'[\,\n;/••]+', text)
    processed = []
    # Lookup results per lower-cased term. A term shared by several
    # ingredients is queried only once, but every ingredient still receives
    # the data (previously later ingredients were skipped and ended up empty
    # with a misleading "No data found" warning).
    term_results = {}

    for raw in raw_ingredients:
        raw = raw.strip()
        if not raw:
            continue

        if not is_phonetically_valid(raw):
            print(f"[!] Skipping '{raw}' — unlikely to be a valid ingredient (too short or invalid vowel/consonant pattern)")
            continue

        standard, aliases = standardize_ingredient_name(raw)
        aliases = list(set([standard] + aliases))
        filtered_aliases = [a for a in aliases if is_relevant_alias(a)]

        # Sort and limit to top 5 by similarity
        ranked_aliases = sorted(
            filtered_aliases,
            key=lambda x: fuzz.token_sort_ratio(raw.lower(), x.lower()),
            reverse=True
        )
        selected_aliases = ranked_aliases[:5]

        combined_health_info = {
            "PubMed": [],
            "OpenFDA": [],
            "RxNorm": [],
            "NER_Snippets": [],
            "Academic_Articles": []
        }

        found_any_data = False

        for term in selected_aliases:
            term_key = term.lower()
            if term_key not in term_results:
                term_results[term_key] = (
                    get_all_health_info(term),
                    semantic_scrape_summary(term, api_key=GOOGLE_API_KEY, cse_id=GOOGLE_CSE_ID),
                )
            api_info, ner_info = term_results[term_key]

            if any(api_info[k] for k in api_info) or ner_info:
                found_any_data = True

            # Merge, skipping items already collected from another alias.
            for k, collected in combined_health_info.items():
                incoming = ner_info if k == "NER_Snippets" else api_info[k]
                collected.extend([i for i in incoming if i not in collected])

        if not found_any_data:
            print(f"⚠️ No data found for '{raw}'. Did you spell this correctly or is it too obscure?")

        processed.append({
            "standard": standard,
            "aliases": aliases,
            "health_info": combined_health_info
        })

    return processed

#### Tokenization and Summarization

In [None]:
def num_tokens(text, model="gpt-3.5-turbo"):
    """Return how many tokens ``text`` occupies under ``model``'s tiktoken encoding."""
    return len(tiktoken.encoding_for_model(model).encode(text))

def split_into_token_chunks(text, max_tokens=2000, model="gpt-3.5-turbo"):
    """Split ``text`` into sentence-aligned chunks of roughly ``max_tokens`` tokens.

    Sentences are accumulated until adding the next one would push the chunk
    over ``max_tokens``; the chunk is then closed and the sentence starts a
    new one. A single sentence longer than the limit becomes its own chunk.

    Returns:
        A list of stripped, non-empty chunk strings.
    """
    pieces = []
    buffer = ""
    for sentence in nlp_sentencizer(text).sents:
        if num_tokens(buffer + sentence.text, model) <= max_tokens:
            buffer = buffer + " " + sentence.text
            continue
        # Over budget: flush what we have and restart with this sentence.
        if buffer:
            pieces.append(buffer.strip())
        buffer = sentence.text

    tail = buffer.strip()
    if tail:
        pieces.append(tail)
    return pieces

def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    """Estimate the token count of a chat ``messages`` list for ``model``.

    Each message costs a fixed overhead of 4 tokens plus the encoded length
    of all its values; 2 more tokens are added for the reply primer.
    """
    encoder = tiktoken.encoding_for_model(model)
    per_message_overhead = 4  # each message key structure (role, content, etc.)
    total = 2  # every reply is primed with <|start|>assistant
    for message in messages:
        total += per_message_overhead
        total += sum(len(encoder.encode(value)) for value in message.values())
    return total

def classify_chunk(chunk, model="your-local-llm"):
    """Placeholder classifier: always reports the ``Benefits`` category.

    ``chunk`` and ``model`` are accepted for interface compatibility but not
    used yet; swap in a real model call (e.g. a HuggingFace pipeline) here.
    """
    simulated_verdict = "Category: [Benefits]\nReason: Simulated classification for now."
    return simulated_verdict

def summarize_by_category(classified_chunks, model="your-local-llm"):
    """Group classified chunks by category and return one summary per category.

    Args:
        classified_chunks: Iterable of ``(chunk, classification)`` pairs where
            ``classification`` is the text produced by ``classify_chunk``
            (``"Category: [<name>]\\nReason: ..."``).
        model: Unused for now; kept for interface compatibility.

    Returns:
        ``{category: summary}`` for every category present in the input.
        The summary text is still a placeholder. Chunks whose classification
        has no ``Category: [...]`` marker are filed under ``"Uncategorized"``.
    """
    # The previous version iterated over a brand-new, empty defaultdict and
    # therefore always returned {} regardless of its input.
    grouped = defaultdict(list)
    for chunk, classification in classified_chunks or []:
        found = re.search(r"Category:\s*\[([^\]]+)\]", classification or "")
        category = found.group(1).strip() if found else "Uncategorized"
        grouped[category].append(chunk)

    # Placeholder: simulate summary for now
    return {category: "Simulated summary." for category in grouped}

def enrich_with_health_summaries(results, model="gpt-3.5-turbo"):
    """Attach a ``health_summary`` to every entry of ``results`` (in place).

    For each entry, the ``full_text`` of its academic articles (when longer
    than 200 characters) is split into token chunks. If no article yields a
    chunk, ``"Title: ... | Source: ..."`` lines built from titles longer than
    20 characters are used instead. The chunks are classified and summarized
    per category.

    Args:
        results: Output of ``preprocess_ingredient_list_with_health``.
        model: Model name forwarded to the chunking/classification helpers.

    Returns:
        The same ``results`` list, each entry now carrying ``health_summary``
        (a ``{"note": ...}`` dict when nothing usable was found).
    """
    for entry in results:
        articles = entry["health_info"].get("Academic_Articles", [])

        chunks = []
        for article in articles:
            text = article.get("full_text")
            if text and len(text.strip()) > 200:
                chunks.extend(split_into_token_chunks(text, max_tokens=2000, model=model))

        if not chunks:
            # Try fallback: summarize from available titles + sources
            for article in articles:
                # ``or ""``: the keys may be present with a None value, which
                # used to crash on ``.strip()``.
                title = (article.get("title") or "").strip()
                source = (article.get("source") or "").strip()
                if len(title) > 20:
                    snippet = f"Title: {title}"
                    if source:
                        snippet += f" | Source: {source}"
                    chunks.append(snippet)

        if not chunks:
            entry["health_summary"] = {"note": "No usable full-text content or fallback title."}
            continue

        classified = [(chunk, classify_chunk(chunk, model=model)) for chunk in chunks]
        entry["health_summary"] = summarize_by_category(classified, model=model)

    return results

##### Mistral-7B-Instruct (GPTQ) integration

In [None]:
# --- New Summarization Functions ---
def generate_summary(text, category, ingredient_name, model, tokenizer):
    """Ask the LLM for up to five bullet points about ``ingredient_name``.

    Args:
        text: Evidence sentences appended to the instruction.
        category: ``"benefits"``, ``"concerns"`` or ``"restrictions"``;
            selects the instruction wording.
        ingredient_name: Name inserted into the instruction.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer.

    Returns:
        Up to five de-duplicated ``- ...`` lines joined by newlines, or the
        raw generated text when the model produced no bullet lines.

    Raises:
        ValueError: if ``category`` is not one of the three supported values.
    """
    if category == "benefits":
        instruction = f"From the following scientific findings, summarize how {ingredient_name} affects human health. Only include specific beneficial effects backed by studies. Limit to 5 bullet points. Format each point as a separate line starting with a dash."
    elif category == "concerns":
        instruction = f"From the following scientific findings, summarize how {ingredient_name} affects human health. Only include specific negative effects backed by studies. Limit to 5 bullet points. Format each point as a separate line starting with a dash."
    elif category == "restrictions":
        instruction = f"From the following scientific findings, summarize how {ingredient_name} affects human dietary restrictions (such as allergies, intolerances, and religious restrictions). Limit to 5 bullet points. Format each point as a separate line starting with a dash."
    else:
        raise ValueError("Invalid category")

    prompt = instruction + "\n\n" + text

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
        num_beams=4,
        early_stopping=True
    )

    # A causal LM returns prompt tokens followed by the completion. Decode only
    # the completion; otherwise the instruction and evidence text are echoed
    # into the summary and any "-" lines of the input pass as bullets.
    prompt_length = inputs["input_ids"].shape[1]
    summary = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Extract bullet lines
    bullets = [line.strip() for line in summary.splitlines() if line.strip().startswith("-")]

    # Drop near-duplicate bullets (string similarity of 90 or more)
    unique_bullets = []
    for bullet in bullets:
        if all(fuzz.ratio(bullet, b) < 90 for b in unique_bullets):
            unique_bullets.append(bullet)

    return "\n".join(unique_bullets[:5]) if unique_bullets else summary

def clean_and_chunk_sentences(text, health_keywords=None, max_sentences=20):
    """Split ``text`` into sentences and regroup them into fixed-size chunks.

    Args:
        text: Input text, sentence-split with the spaCy pipeline.
        health_keywords: When given (and non-empty), only sentences containing
            at least one keyword (case-insensitive substring) are kept.
        max_sentences: Number of sentences per chunk.

    Returns:
        A list of chunks, each the space-joined text of up to
        ``max_sentences`` consecutive sentences.
    """
    kept = [sent.text.strip() for sent in nlp(text).sents]

    if health_keywords:
        matching = []
        for sentence in kept:
            lowered = sentence.lower()
            if any(kw in lowered for kw in health_keywords):
                matching.append(sentence)
        kept = matching

    return [
        " ".join(kept[start:start + max_sentences])
        for start in range(0, len(kept), max_sentences)
    ]

def deduplicate_sentences(sentences, threshold=92):
    """Drop sentences that are near-duplicates of an earlier kept sentence.

    Two sentences count as duplicates when ``fuzz.ratio`` between them is at
    least ``threshold``. The first occurrence wins and order is preserved.
    """
    kept = []
    for candidate in sentences:
        is_duplicate = any(fuzz.ratio(candidate, earlier) >= threshold for earlier in kept)
        if not is_duplicate:
            kept.append(candidate)
    return kept

def truncate_to_token_limit(text, tokenizer, max_tokens=800):
    """Return ``text`` cut down to at most ``max_tokens`` tokens of ``tokenizer``.

    The text is encoded with truncation and decoded again without special tokens.
    """
    token_ids = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    shortened = tokenizer.decode(token_ids, skip_special_tokens=True)
    return shortened

def filter_sentences(sentences, task):
    """Keep the sentences that mention a keyword configured for ``task``.

    ``task`` is a key of ``keyword_config``; matching is a substring test
    against the lower-cased sentence. For an unknown ``task`` the input is
    returned unchanged.
    """
    if task not in keyword_config:
        return sentences

    wanted = keyword_config[task]
    matched = []
    for sentence in sentences:
        lowered = sentence.lower()
        if any(term in lowered for term in wanted):
            matched.append(sentence)
    return matched

# Whole-word (optionally plural) animal terms. Word boundaries matter: a plain
# substring test for "rat" also fires on "rate", "demonstrated" and
# "concentration", which rejected the majority of human studies.
_NON_HUMAN_TERM_RE = re.compile(
    r"\b(?:mouse|mice|rat|rabbit|pig|cattle|chicken|fish|goby|gobies|canine|murine|"
    r"insect|drosophila|dog|animal model|rodent|zebrafish)s?\b"
)


def is_probably_human_study(title, abstract):
    """Guess whether an article is about humans rather than animals.

    Args:
        title: Article title (``None`` is treated as empty).
        abstract: Abstract or other article text (``None`` is treated as empty).

    Returns:
        False when the lower-cased title or abstract contains one of the
        animal-study terms as a whole word (singular or with a trailing
        "s"); True otherwise.
    """
    combined_text = f"{title or ''} {abstract or ''}".lower()
    return _NON_HUMAN_TERM_RE.search(combined_text) is None

def extract_relevant_sentences(text, aliases=None):
    """Return the sentences of ``text`` that report a finding about the ingredient.

    A sentence is kept when it (1) contains a health keyword or an
    outcome-reporting term, (2) mentions at least one of ``aliases`` and
    (3) contains none of the boilerplate phrases. All tests are
    case-insensitive substring checks.

    Note:
        With ``aliases`` empty or ``None`` condition (2) can never hold, so
        the result is an empty list.
    """
    outcome_terms = [
        "result", "conclusion", "finding", "found", "significant", "associated",
        "led to", "revealed", "observed", "showed", "demonstrated", "lead to"
    ]
    skip_phrases = {"this study aims", "background", "introduction", "study design", "was conducted", "was performed"}

    keywords = set(keyword_config["all"] + outcome_terms)
    alias_terms = [a.lower() for a in (aliases or [])]

    selected = []
    for sent in nlp(text).sents:
        lowered = sent.text.lower()
        if not any(kw in lowered for kw in keywords):
            continue
        if not any(alias in lowered for alias in alias_terms):
            continue
        if any(phrase in lowered for phrase in skip_phrases):
            continue
        selected.append(sent.text)
    return selected

def enrich_with_health_summaries_v2(results, model, tokenizer):
    """Attach LLM-generated per-category health summaries to ``results`` (in place).

    For every entry, articles that look like animal studies are discarded.
    From the remaining articles the sentences relevant to the ingredient are
    extracted, filtered per category (benefits / concerns / restrictions),
    de-duplicated, truncated to the token limit and summarized by the model,
    one summary per article and category.

    Args:
        results: Output of ``preprocess_ingredient_list_with_health``.
        model: Causal LM passed on to ``generate_summary``.
        tokenizer: Matching tokenizer.

    Returns:
        The same ``results`` list; every entry gains ``health_summary``,
        either ``{"note": ...}`` or a dict with the three category keys.
    """
    for entry in results:
        ingredient_name = entry['standard']
        articles = entry["health_info"].get("Academic_Articles", [])
        print(f"\n🧪 Processing ingredient: {ingredient_name}")
        print(f"→ Total academic articles found: {len(articles)}")

        if not articles:
            entry["health_summary"] = {"note": "No academic articles available."}
            continue

        human_articles = [
            a for a in articles
            if is_probably_human_study(a.get("title", ""), a.get("full_text", ""))
        ]
        print(f"→ Human-relevant articles retained: {len(human_articles)}")
        print(f"→ Articles rejected for non-human focus: {len(articles) - len(human_articles)}")

        if not human_articles:
            entry["health_summary"] = {"note": "No human-relevant articles found."}
            continue

        # Sentence extraction does not depend on the category, so run it once
        # per article instead of once per article *and* category.
        aliases = entry.get("aliases", [])
        sentences_per_article = []
        for article in human_articles:
            raw_text = article.get("full_text") or ""
            if raw_text:
                sentences_per_article.append(extract_relevant_sentences(raw_text, aliases=aliases))
        total_relevant_sentences = sum(len(s) for s in sentences_per_article)

        entry["health_summary"] = {}

        for category in ["benefits", "concerns", "restrictions"]:
            category_summaries = []

            for relevant_sentences in sentences_per_article:
                filtered_sentences = deduplicate_sentences(
                    filter_sentences(relevant_sentences, category)
                )
                if not filtered_sentences:
                    continue

                # Truncate cleanly by tokens
                combined_text = " ".join(filtered_sentences)
                truncated_input = truncate_to_token_limit(combined_text, tokenizer)

                summary = generate_summary(
                    truncated_input, category, ingredient_name, model, tokenizer
                )
                category_summaries.append(summary)

            print(f"→ [{category.upper()}] Relevant sentences across all articles: {total_relevant_sentences}")
            print(f"→ [{category.upper()}] Summaries generated: {len(category_summaries)}")

            if category_summaries:
                # dict.fromkeys removes exact duplicates but keeps article
                # order, so the joined text is reproducible between runs.
                entry["health_summary"][category] = "\n".join(dict.fromkeys(category_summaries))
            else:
                entry["health_summary"][category] = f"No relevant {category} information found."

    return results

##### test run

In [None]:
# Smoke test: run the full lookup pipeline on a small comma-separated list.
sample4 = "whey protein isolate, sugar, edamame"
results4 = preprocess_ingredient_list_with_health(sample4)

# Run LLM-based summarization on your enriched ingredient data
results4 = enrich_with_health_summaries_v2(results4, model, tokenizer)

In [None]:
# Print the results cleanly
# One report per ingredient: aliases, structured API results, NER entities,
# then the LLM summaries.
for entry in results4:
    print(f"\n🧪 Ingredient: {entry['standard']}")
    print("→ Aliases:", entry["aliases"])

    print("\nTrusted API Info:")
    for source, data in entry["health_info"].items():
        # NER snippets get their own section below.
        if source == "NER_Snippets":
            continue
        print(f"  • {source}:")
        if isinstance(data, list) and data:
            for item in data:
                # Items are either plain strings or article dicts, for which
                # only the title is shown.
                if isinstance(item, str):
                    print(f"     - {item}")
                elif isinstance(item, dict):
                    print(f"     - {item.get('title', '')}")
        elif isinstance(data, list):
            print("     - No results")
        else:
            print(f"     - {data}")

    print("\nNER Entities from Web Snippets:")
    if entry["health_info"]["NER_Snippets"]:
        for ent_text, ent_label in entry["health_info"]["NER_Snippets"]:
            print(f"     - {ent_text} ({ent_label})")
    else:
        print("     - No named entities found.")

    print("\n💬 LLM-Generated Health Summaries:")
    for category, summary in entry.get("health_summary", {}).items():
        print(f"  [{category.upper()}]: {summary}")

# Persist all three caches so the next run can reuse them.
save_search_cache()
save_alias_cache()
save_alias_frequency()

##### LLM Improvements TO-DO:
- Remove duplicate bullet points across the per-article summaries
- Combine the per-article summaries into one summary per category

#### TO-DO: Add evaluation metrics