## Final Project - NLP-Driven Ingredient Health and Dietary Restriction Analysis

*Name: Laura Obermaier*

*Stevens ID: 20027358*

### Ingredient List Processing

##### imports and setup

In [None]:
import re
import csv
import requests
import spacy
import langdetect
from langdetect import detect
from googletrans import Translator
import pubchempy as pcp
from rapidfuzz import process, fuzz
from collections import defaultdict
import sys
import io
import pickle
import json
import time
from duckduckgo_search import DDGS
from bs4 import BeautifulSoup
from duckduckgo_search.exceptions import DuckDuckGoSearchException
from difflib import SequenceMatcher

csv.field_size_limit(2**20)
# Load English tokenizer
nlp = spacy.load("en_core_web_sm")
translator = Translator()

# Global cache to track discovered aliases
global_alias_set = set()

# Local search result cache
search_cache = {}

##### alias relivancy filter

In [None]:
def is_relevant_alias(alias):
    alias_clean = alias.strip().lower()

    if len(alias_clean.split()) > 4:
        return False
    if re.search(r'\d{3,}|\d+%|[^\w\s\-]', alias_clean):
        return False
    if re.search(r'^\d{2,5}-\d{2,5}-\d$', alias_clean): 
        return False

    return True

def similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

##### PubChem(aliases) integration

In [75]:
def get_pubchem_aliases(ingredient_name):
    try:
        compounds = pcp.get_compounds(ingredient_name, 'name')
        if compounds:
            synonyms = compounds[0].synonyms
            return list(set(s.lower() for s in synonyms))
        return []
    except Exception as e:
        print(f"[PubChem error for '{ingredient_name}']: {e}")
        return []
    
def update_alias_cache(aliases):
    for a in aliases:
        if a:
            global_alias_set.add(a.lower().strip())

##### fuzzy matching

In [76]:
def fuzzy_match_alias(name, threshold=90):
    if not global_alias_set:
        print("[Warning] Alias set is empty — did you run seed_aliases_from_open_food_facts?")
        return None
    result = process.extractOne(name, global_alias_set, scorer=fuzz.token_sort_ratio)
    if result is None:
        return None
    match, score, _ = result
    return match if score >= threshold else None

##### seed aliases from Open Food Facts

In [77]:
def seed_aliases_from_open_food_facts(limit=10000):
    url = "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
    response = requests.get(url, stream=True)
    response.encoding = 'utf-8'

    alias_dict = defaultdict(set)
    lines = (line.decode('utf-8') for line in response.iter_lines())
    reader = csv.DictReader(lines, delimiter='\t')

    langs = ['fr', 'de', 'es', 'it']
    count = 0

    for row in reader:
        if count % 500 == 0:
            print(f"Processing row {count}...")
        if count >= limit:
            break
        count += 1

        ingredients_text = row.get("ingredients_text", "")
        if not ingredients_text.strip():
            continue

        for ing in ingredients_text.split(','):
            ing = ing.strip().lower()
            if not ing:
                continue
            alias_dict[ing].add(ing)

            for lang in langs:
                key = f"ingredients_text_{lang}"
                alt = row.get(key)
                if alt:
                    for alt_ing in alt.split(','):
                        alt_ing = alt_ing.strip().lower()
                        if alt_ing:
                            alias_dict[ing].add(alt_ing)
                            alias_dict[alt_ing].add(ing)

    for aliases in alias_dict.values():
        update_alias_cache(list(aliases))
    print(f"[✓] Seeded {len(global_alias_set)} unique aliases from Open Food Facts.")

##### preprocessing

In [78]:
def standardize_ingredient_name(name):
    name = name.lower().strip()
    aliases = get_pubchem_aliases(name)
    if aliases:
        update_alias_cache(aliases)
        return aliases[0], aliases
    fuzzy = fuzzy_match_alias(name)
    if fuzzy:
        return fuzzy, [fuzzy]
    return name, [name]

def translate_to_english(text):
    try:
        lang = detect(text)
        return translator.translate(text, src=lang, dest='en').text if lang != 'en' else text
    except Exception:
        return text

##### Web-based Health info retrieval via trusted APIs

In [79]:
def query_pubmed(ingredient, max_results=5):
    try:
        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        params = {"db": "pubmed", "term": ingredient, "retmode": "json", "retmax": max_results}
        ids = requests.get(base_url, params=params).json()["esearchresult"].get("idlist", [])
        summaries = []
        for pmid in ids:
            summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
            r = requests.get(summary_url, params={"db": "pubmed", "id": pmid, "retmode": "json"}).json()
            result = r["result"].get(pmid)
            if result:
                summaries.append({"title": result.get("title"), "source": result.get("source"), "pubdate": result.get("pubdate")})
        return summaries
    except:
        return []

def query_openfda(ingredient):
    try:
        base_url = "https://api.fda.gov/food/enforcement.json"
        params = {"search": f"product_description:{ingredient}", "limit": 5}
        r = requests.get(base_url, params=params).json()
        return [rec["reason_for_recall"] for rec in r.get("results", [])]
    except:
        return []

def query_rxnorm(ingredient):
    try:
        url = "https://rxnav.nlm.nih.gov/REST/rxcui.json"
        rxcui = requests.get(url, params={"name": ingredient}).json()
        return rxcui.get("idGroup", {}).get("rxnormId", [])
    except:
        return []

def get_all_health_info(ingredient):
    return {
        "PubMed": query_pubmed(ingredient),
        "OpenFDA": query_openfda(ingredient),
        "RxNorm": query_rxnorm(ingredient)
    }

##### NER + Semantic Web Search

In [80]:
# --- NER + Semantic Web Search ---
ner_nlp = spacy.load("en_core_web_sm")  # use a separate instance

def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def search_web_snippets(ingredient, num_results=3):
    if ingredient in search_cache:
        return search_cache[ingredient]
    try:
        with DDGS() as ddgs:
            results = ddgs.text(f"{ingredient} health benefits site:.gov OR site:.org", max_results=num_results)
            snippets = [res['body'] for res in results if 'body' in res]
            search_cache[ingredient] = snippets
            time.sleep(1.2)  # delay to prevent rate-limit
            return snippets
    except Exception as e:
        print(f"[DuckDuckGo Error for '{ingredient}']: {e}")
        return []

def semantic_scrape_summary(ingredient):
    snippets = search_web_snippets(ingredient)
    all_ents = []
    for text in snippets:
        ents = extract_entities(text)
        all_ents.extend(ents)
    return all_ents

##### updating ingredient list processing based on health info

In [81]:
def preprocess_ingredient_list_with_health(text):
    raw_ingredients = re.split(r'[\,\n;/••]+', text)
    processed = []
    seen_terms = set()

    for raw in raw_ingredients:
        raw = raw.strip()
        if not raw:
            continue

        translated = translate_to_english(raw)
        standard, aliases = standardize_ingredient_name(translated)
        all_terms = list(set([standard] + aliases))

        combined_health_info = {"PubMed": [], "OpenFDA": [], "RxNorm": [], "NER_Snippets": []}

        for term in all_terms:
            if term.lower() in seen_terms:
                continue
            seen_terms.add(term.lower())

            api_info = get_all_health_info(term)
            ner_info = semantic_scrape_summary(term)

            for k in combined_health_info:
                if k == "NER_Snippets":
                    combined_health_info[k].extend([i for i in ner_info if i not in combined_health_info[k]])
                else:
                    combined_health_info[k].extend([i for i in api_info[k] if i not in combined_health_info[k]])

        processed.append({
            "standard": standard,
            "aliases": aliases,
            "health_info": combined_health_info
        })

    return processed

##### store aliases for reuse

In [82]:
ALIAS_CACHE_FILE = "alias_cache.json"

def save_alias_cache(path=ALIAS_CACHE_FILE):
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(sorted(global_alias_set), f, ensure_ascii=False, indent=2)
        print(f"[✓] Alias cache saved to {path}")
    except Exception as e:
        print(f"[!] Error saving alias cache: {e}")

def load_alias_cache(path=ALIAS_CACHE_FILE):
    global global_alias_set
    try:
        with open(path, "r", encoding="utf-8") as f:
            global_alias_set = set(json.load(f))
        print(f"[✓] Loaded {len(global_alias_set)} aliases from cache.")
        return True
    except FileNotFoundError:
        print(f"[ ] Alias cache not found at {path}. Will seed from source...")
        return False
    except Exception as e:
        print(f"[!] Error loading alias cache: {e}")
        return False

In [83]:
if not load_alias_cache():
    seed_aliases_from_open_food_facts(limit=5000)
    save_alias_cache()

[✓] Loaded 5833 aliases from cache.


##### test run

In [None]:
sample4 = "sodium chloride, ascorbic acid, curcuma longa, E300, sal, suagr"
results4 = preprocess_ingredient_list_with_health(sample4)

for entry in results4:
    print("\nIngredient:", entry["standard"])
    print("→ Aliases:", entry["aliases"])
    
    print("\nTrusted API Info:")
    for source, data in entry["health_info"]["TrustedAPIs"].items():
        print(f"  • {source}:")
        if isinstance(data, list) and data:
            for item in data:
                print(f"     - {item}")
        elif isinstance(data, list):
            print("     - No results")
        else:
            print(f"     - {data}")

    print("\nNER Entities from Web Snippets:")
    if entry["health_info"]["NER_Snippets"]:
        for ent_text, ent_label in entry["health_info"]["NER_Snippets"]:
            print(f"     - {ent_text} ({ent_label})")
    else:
        print("     - No named entities found.")

[DuckDuckGo Error for 'sodium chloride, jis special grade, >=99.5%']: https://html.duckduckgo.com/html 202 Ratelimit
[DuckDuckGo Error for 'sodium chloride 23.4% in plastic container']: https://lite.duckduckgo.com/lite/ 202 Ratelimit
[DuckDuckGo Error for 'corvatrol 0.9%']: https://lite.duckduckgo.com/lite/ 202 Ratelimit
[DuckDuckGo Error for 'vamousse pro lice treatment']: https://html.duckduckgo.com/html 202 Ratelimit
[DuckDuckGo Error for 'sykes 0.9% sodium chloride']: https://html.duckduckgo.com/html 202 Ratelimit
[DuckDuckGo Error for 'natrii chloridum']: https://lite.duckduckgo.com/lite/ 202 Ratelimit
[DuckDuckGo Error for 'amphenol-40']: https://lite.duckduckgo.com/lite/ 202 Ratelimit
[DuckDuckGo Error for 'walgreens saline mistextra strength']: https://lite.duckduckgo.com/lite/ 202 Ratelimit
[DuckDuckGo Error for 'sodium chloride']: https://html.duckduckgo.com/html 202 Ratelimit
[DuckDuckGo Error for 'saline nasal2oz']: https://lite.duckduckgo.com/lite/ 202 Ratelimit
[DuckDuckG