## Final Project - NLP-Driven Ingredient Health and Dietary Restriction Analysis

*Name: Laura Obermaier*

*Stevens ID: 20027358*

### Ingredient List Processing

##### imports and setup

In [57]:
import re
import csv
import requests
import spacy
import langdetect
from langdetect import detect
from googletrans import Translator
import pubchempy as pcp
from rapidfuzz import process, fuzz
from collections import defaultdict
import sys
import io
import pickle
import requests
import json

csv.field_size_limit(2**20)
# Load English tokenizer
nlp = spacy.load("en_core_web_sm")
translator = Translator()

# Global cache to track discovered aliases
global_alias_set = set()

##### PubChem(aliases) integration

In [58]:
def get_pubchem_aliases(ingredient_name):
    try:
        compounds = pcp.get_compounds(ingredient_name, 'name')
        if compounds:
            synonyms = compounds[0].synonyms
            return list(set(s.lower() for s in synonyms))
        return []
    except Exception as e:
        print(f"[PubChem error for '{ingredient_name}']: {e}")
        return []

##### fuzzy matching

In [59]:
def update_alias_cache(aliases):
    for a in aliases:
        if a:
            global_alias_set.add(a.lower().strip())

def fuzzy_match_alias(name, threshold=90):
    if not global_alias_set:
        print("[Warning] Alias set is empty — did you run seed_aliases_from_open_food_facts?")
        return None
    result = process.extractOne(name, global_alias_set, scorer=fuzz.token_sort_ratio)
    if result is None:
        return None
    match, score, _ = result
    return match if score >= threshold else None

##### seed aliases from Open Food Facts

In [60]:
def seed_aliases_from_open_food_facts(limit=10000):
    url = "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
    response = requests.get(url, stream=True)
    response.encoding = 'utf-8'

    alias_dict = defaultdict(set)

    # Read as tab-delimited!
    lines = (line.decode('utf-8') for line in response.iter_lines())
    reader = csv.DictReader(lines, delimiter='\t')

    langs = ['fr', 'de', 'es', 'it']
    count = 0

    for row in reader:
        if count % 500 == 0:
            print(f"Processing row {count}...")

        if count >= limit:
            break
        count += 1

        ingredients_text = row.get("ingredients_text", "")
        if not ingredients_text.strip():
            continue

        for ing in ingredients_text.split(','):
            ing = ing.strip().lower()
            if not ing:
                continue
            alias_dict[ing].add(ing)

            for lang in langs:
                key = f"ingredients_text_{lang}"
                alt = row.get(key)
                if alt:
                    for alt_ing in alt.split(','):
                        alt_ing = alt_ing.strip().lower()
                        if alt_ing:
                            alias_dict[ing].add(alt_ing)
                            alias_dict[alt_ing].add(ing)

    # Push to global alias set
    for aliases in alias_dict.values():
        update_alias_cache(list(aliases))

    print(f"[✓] Seeded {len(global_alias_set)} unique aliases from Open Food Facts.")


##### preprocessing

In [61]:
def standardize_ingredient_name(name):
    name = name.lower().strip()

    aliases = get_pubchem_aliases(name)
    if aliases:
        update_alias_cache(aliases)
        return aliases[0], aliases

    fuzzy = fuzzy_match_alias(name)
    if fuzzy:
        return fuzzy, [fuzzy]

    return name, [name]


def translate_to_english(text):
    try:
        lang = detect(text)
        return translator.translate(text, src=lang, dest='en').text if lang != 'en' else text
    except Exception:
        return text


def preprocess_ingredient_list(text):
    raw_ingredients = re.split(r'[,\n;/••]+', text)
    processed = []

    for raw in raw_ingredients:
        raw = raw.strip()
        if raw:
            translated = translate_to_english(raw)
            standard, aliases = standardize_ingredient_name(translated)
            processed.append({
                "standard": standard,
                "aliases": aliases,
            })

    return processed



##### store aliases for reuse

In [62]:
ALIAS_CACHE_FILE = "alias_cache.json"

def save_alias_cache(path=ALIAS_CACHE_FILE):
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(sorted(global_alias_set), f, ensure_ascii=False, indent=2)
        print(f"[✓] Alias cache saved to {path}")
    except Exception as e:
        print(f"[!] Error saving alias cache: {e}")

def load_alias_cache(path=ALIAS_CACHE_FILE):
    global global_alias_set
    try:
        with open(path, "r", encoding="utf-8") as f:
            global_alias_set = set(json.load(f))
        print(f"[✓] Loaded {len(global_alias_set)} aliases from cache.")
        return True
    except FileNotFoundError:
        print(f"[ ] Alias cache not found at {path}. Will seed from source...")
        return False
    except Exception as e:
        print(f"[!] Error loading alias cache: {e}")
        return False

##### usage

In [63]:
if not load_alias_cache():
    seed_aliases_from_open_food_facts(limit=5000)
    save_alias_cache()

[✓] Loaded 5833 aliases from cache.


In [64]:
print(sorted(list(global_alias_set))[:50])  # see a preview
print("Total aliases:", len(global_alias_set))

['& calendula & sunflowers', '& enzymes', "' farine de blé", "'salade 17", '( glycoside de steviol)', '(about 20g) 70 hydrolyzed gelatin', '(dont arôme de fumée).', '(iowhey®)', '(may contain hazelnut)', '(nur schoko kakaopulver entölt)', '(sucre', '(unbleached enriched flour', '-fdorvège', '0', '0 %', "0 added sugar and 0 mess. ingredients: organic apples. free top manufactured in a facility free from 12 rlergens from the top 12 allergens 00 certified non gmo project gluten free usda organic verified nongmoproject.org gfco.org p paleo brought to you by that's it nutrition lic. los angeles", '0% (varkens - en rundsvlees', '0% 13% dipotassium phosphate', '0%)', '0.1% sodium benzoate (preservative)', '000 calorie diet', '000 calorie diet. 50 mg cakdum caseinate 14 g 1daly value not established other ingre fructo ntein concentrate', '000 calories a day is used for general ution advice 0% 2% ingredients corn flour (processed with lime)', '000 calories diet. nutrition facts valeur nutritive

In [None]:
# Sample 1
sample1 = "Sal, Zucker, ascorbic acid, curcuma longa, Sodium chloride, E300"
results1 = preprocess_ingredient_list(sample1)
for r in results1:
    print(f"\n{r['standard']}\nAliases: {r['aliases']}")

# Sample 2 (with typos and fuzzy needs)
sample2 = "sal, suagr, ascorbic acid, kurkuma longa, table sald"
results2 = preprocess_ingredient_list(sample2)
for r in results2:
    print(f"\n{r['standard']}\nAliases: {r['aliases']}")


sal
Aliases: ['sal']

d-(+)-sucrose
Aliases: ['d-(+)-sucrose', 'sucrose [jan]', 'alpha-d-glucopyranosyl beta-d-fructofuranoside', '92004-84-7', 'dtxcid101288', 'sucrose biochemical grade', 'bmse000918', '85456-51-5', 'd70407', 'sugar', 'saccarose', 'hy-b1779r', 'sucrose, vetec(tm) reagent grade, rnase and dnase free', 'sucrose [ii]', 'd-saccharose 20000 microg/ml in water', 'sucrose, p.a., acs reagent', 'amerfond', '(2r,3r,4s,5s,6r)-2-(((2s,3s,4s,5r)-3,4-dihydroxy-2,5-bis(hydroxymethyl)tetrahydrofuran-2-yl)oxy)-6-(hydroxymethyl)tetrahydro-2h-pyran-3,4,5-triol', 'sucrose, for molecular biology, >=99.5% (gc)', 'sucraloxum [inn-latin]', 'os59592', '57-50-1', 'alpha-d-glucopyranosylbeta-d-fructofuranoside', 'sucrose, ultrapure', 'alpha-d-glc-(1-2)-beta-d-fru', 'sucraloxum', 'sucrose [ep impurity]', '8027-47-2', 'sr-01000883983', 'sucrose (jp18/nf)', 'glc(alpha1->2beta)fru', 's-67f', 'd-saccharose 1000 microg/ml in methanol', 'sucrose (ii)', 'd00025', 'sucrose, molecular biology grade', 'm

In [None]:
sample3 = "Sodium chloride, E300, Sal"
results3 = preprocess_ingredient_list(sample3)
for r in results3:
    print(f"\n{r['standard']}\nAliases: {r['aliases']}")


watesal a
Aliases: ['watesal a', 'sodium chloridenormal salt', 'ss salt', 'natrium muriaticum', 'sodium chloride 23.4% in plastic container', 'sodium chloride, jis special grade, >=99.5%', 'corvatrol 0.9%', 'vamousse pro lice treatment', 'sykes 0.9% sodium chloride', 'natrii chloridum', 'amphenol-40', 'walgreens saline mistextra strength', 'sodium chloride', 'saline nasal2oz', 'wln: na g', 'sodium chloride, anhydrous, beads, -10 mesh, 99.999% trace metals basis', 'cs-0909803', 'sodium chloride crystal optic disc, 32mm x 3mm, polished both sides', 'sodium standard for aas, ready-to-use, traceable to bam, in h2o', 'nacl solution, 1m', 'sodium chloride for dnase, rnase, and protease, none detected', 'aqua maris extra strong', 'mono-sodium chloride salt', 'veterinary 0.9% sodium chloride', 'sodium chloride, nist(r) srm(r) 919b', 'sodium chloride, puriss. p.a., >=99.5% (at)', 'sodium chloride, tested according to ph.eur.', 'clear mist', 'tid clean care', '0.9% saline', 'extra strenght sali

##### retrieve health info

In [68]:
def retrieve_health_info(ingredient_name):
    """
    Retrieve health-related effects or warnings about an ingredient.
    For now: stub function. Later: connect to APIs or web scraping.
    """
    # Simulate retrieved results
    fake_db = {
        "sodium chloride": "Excessive intake can lead to high blood pressure.",
        "ascorbic acid": "Supports immune function and acts as an antioxidant.",
        "e300": "E300 is Vitamin C, generally considered safe."
    }
    return fake_db.get(ingredient_name.lower(), "No info found.")

##### Web-based Health info retrieval via trusted APIs

In [69]:
# --- Trusted API Integration ---

def query_pubmed(ingredient, max_results=5):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": ingredient,
        "retmode": "json",
        "retmax": max_results,
    }
    ids = requests.get(base_url, params=params).json()["esearchresult"].get("idlist", [])

    summaries = []
    for pmid in ids:
        summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
        summary_params = {"db": "pubmed", "id": pmid, "retmode": "json"}
        r = requests.get(summary_url, params=summary_params).json()
        result = r["result"].get(pmid)
        if result:
            summaries.append({
                "title": result.get("title"),
                "source": result.get("source"),
                "pubdate": result.get("pubdate")
            })
    return summaries

def query_openfda(ingredient):
    base_url = "https://api.fda.gov/food/enforcement.json"
    params = {
        "search": f"product_description:{ingredient}",
        "limit": 5
    }
    try:
        r = requests.get(base_url, params=params).json()
        return [rec["reason_for_recall"] for rec in r.get("results", [])]
    except:
        return []

def query_rxnorm(ingredient):
    url = "https://rxnav.nlm.nih.gov/REST/rxcui.json"
    try:
        rxcui = requests.get(url, params={"name": ingredient}).json()
        return rxcui.get("idGroup", {}).get("rxnormId", [])
    except:
        return []

def get_all_health_info(ingredient):
    return {
        "PubMed": query_pubmed(ingredient),
        "OpenFDA": query_openfda(ingredient),
        "RxNorm": query_rxnorm(ingredient)
    }


##### NER + Semantic Web Search

In [70]:
# --- NER + Semantic Web Search ---

import spacy
from bs4 import BeautifulSoup
import time
from duckduckgo_search import DDGS
from duckduckgo_search.exceptions import DuckDuckGoSearchException

ner_nlp = spacy.load("en_core_web_sm")  # use a separate instance

def extract_entities(text):
    doc = ner_nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def search_web_snippets(ingredient, num_results=3):
    try:
        with DDGS() as ddgs:
            results = ddgs.text(f"{ingredient} health benefits site:.gov OR site:.org", max_results=num_results)
            return [res['body'] for res in results if 'body' in res]
    except DuckDuckGoSearchException as e:
        print(f"[DuckDuckGoSearch] Rate-limited or failed: {e}")
        time.sleep(10)  # Wait before retrying or skipping
        return []
    except Exception as e:
        print(f"[DuckDuckGoSearch] Unexpected error: {e}")
        return []

def semantic_scrape_summary(ingredient):
    snippets = search_web_snippets(ingredient)
    all_ents = []
    for text in snippets:
        ents = extract_entities(text)
        all_ents.extend(ents)
    return all_ents

##### updating ingredient list processing based on health info

In [71]:
def preprocess_ingredient_list_with_health(text):
    raw_ingredients = re.split(r'[,\n;/••]+', text)
    processed = []

    for raw in raw_ingredients:
        raw = raw.strip()
        if raw:
            translated = translate_to_english(raw)
            standard, aliases = standardize_ingredient_name(translated)

            # Add health info from multiple APIs and scraping
            health_info = {
                "TrustedAPIs": get_all_health_info(standard),
                "NER_Snippets": semantic_scrape_summary(standard)
            }

            processed.append({
                "standard": standard,
                "aliases": aliases,
                "health_info": health_info
            })

    return processed

##### test run

In [73]:
sample4 = "sodium chloride, ascorbic acid, curcuma longa, E300, sal, suagr"
results4 = preprocess_ingredient_list_with_health(sample4)

for entry in results4:
    print("\nIngredient:", entry["standard"])
    print("→ Aliases:", entry["aliases"])
    
    print("\nTrusted API Info:")
    for source, data in entry["health_info"]["TrustedAPIs"].items():
        print(f"  • {source}:")
        if isinstance(data, list) and data:
            for item in data:
                print(f"     - {item}")
        elif isinstance(data, list):
            print("     - No results")
        else:
            print(f"     - {data}")

    print("\nNER Entities from Web Snippets:")
    if entry["health_info"]["NER_Snippets"]:
        for ent_text, ent_label in entry["health_info"]["NER_Snippets"]:
            print(f"     - {ent_text} ({ent_label})")
    else:
        print("     - No named entities found.")

[DuckDuckGoSearch] Rate-limited or failed: https://lite.duckduckgo.com/lite/ 202 Ratelimit
[DuckDuckGoSearch] Rate-limited or failed: https://html.duckduckgo.com/html 202 Ratelimit
[DuckDuckGoSearch] Rate-limited or failed: https://lite.duckduckgo.com/lite/ 202 Ratelimit
[DuckDuckGoSearch] Rate-limited or failed: https://lite.duckduckgo.com/lite/ 202 Ratelimit
[DuckDuckGoSearch] Rate-limited or failed: https://html.duckduckgo.com/html 202 Ratelimit
[DuckDuckGoSearch] Rate-limited or failed: https://lite.duckduckgo.com/lite/ 202 Ratelimit

Ingredient: watesal a
→ Aliases: ['watesal a', 'sodium chloridenormal salt', 'ss salt', 'natrium muriaticum', 'sodium chloride 23.4% in plastic container', 'sodium chloride, jis special grade, >=99.5%', 'corvatrol 0.9%', 'vamousse pro lice treatment', 'sykes 0.9% sodium chloride', 'natrii chloridum', 'amphenol-40', 'walgreens saline mistextra strength', 'sodium chloride', 'saline nasal2oz', 'wln: na g', 'sodium chloride, anhydrous, beads, -10 mesh, 99