## Final Project - NLP-Driven Ingredient Health and Dietary Restriction Analysis

*Name: Laura Obermaier*

*Stevens ID: 20027358*

### Ingredient List Processing

##### imports and setup

In [37]:
import re
import csv
import requests
import spacy
import langdetect
from langdetect import detect
from googletrans import Translator
import pubchempy as pcp
from rapidfuzz import process, fuzz
from collections import defaultdict
import sys
import io
import pickle
import requests
import json

csv.field_size_limit(2**20)
# Load English tokenizer
nlp = spacy.load("en_core_web_sm")
translator = Translator()

# Global cache to track discovered aliases
global_alias_set = set()

##### PubChem(aliases) integration

In [38]:
def get_pubchem_aliases(ingredient_name):
    try:
        compounds = pcp.get_compounds(ingredient_name, 'name')
        if compounds:
            synonyms = compounds[0].synonyms
            return list(set(s.lower() for s in synonyms))
        return []
    except Exception as e:
        print(f"[PubChem error for '{ingredient_name}']: {e}")
        return []

##### fuzzy matching

In [39]:
def update_alias_cache(aliases):
    for a in aliases:
        if a:
            global_alias_set.add(a.lower().strip())

def fuzzy_match_alias(name, threshold=90):
    if not global_alias_set:
        print("[Warning] Alias set is empty — did you run seed_aliases_from_open_food_facts?")
        return None
    result = process.extractOne(name, global_alias_set, scorer=fuzz.token_sort_ratio)
    if result is None:
        return None
    match, score, _ = result
    return match if score >= threshold else None

##### seed aliases from Open Food Facts

In [40]:
def seed_aliases_from_open_food_facts(limit=10000):
    url = "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
    response = requests.get(url, stream=True)
    response.encoding = 'utf-8'

    alias_dict = defaultdict(set)

    # Read as tab-delimited!
    lines = (line.decode('utf-8') for line in response.iter_lines())
    reader = csv.DictReader(lines, delimiter='\t')

    langs = ['fr', 'de', 'es', 'it']
    count = 0

    for row in reader:
        if count % 500 == 0:
            print(f"Processing row {count}...")

        if count >= limit:
            break
        count += 1

        ingredients_text = row.get("ingredients_text", "")
        if not ingredients_text.strip():
            continue

        for ing in ingredients_text.split(','):
            ing = ing.strip().lower()
            if not ing:
                continue
            alias_dict[ing].add(ing)

            for lang in langs:
                key = f"ingredients_text_{lang}"
                alt = row.get(key)
                if alt:
                    for alt_ing in alt.split(','):
                        alt_ing = alt_ing.strip().lower()
                        if alt_ing:
                            alias_dict[ing].add(alt_ing)
                            alias_dict[alt_ing].add(ing)

    # Push to global alias set
    for aliases in alias_dict.values():
        update_alias_cache(list(aliases))

    print(f"[✓] Seeded {len(global_alias_set)} unique aliases from Open Food Facts.")


##### health info retrieval

In [41]:
def retrieve_health_info(ingredient_name):
    """
    Retrieve health-related effects or warnings about an ingredient.
    For now: stub function. Later: connect to APIs or web scraping.
    """
    # Simulate retrieved results
    fake_db = {
        "sodium chloride": "Excessive intake can lead to high blood pressure.",
        "ascorbic acid": "Supports immune function and acts as an antioxidant.",
        "e300": "E300 is Vitamin C, generally considered safe."
    }
    return fake_db.get(ingredient_name.lower(), "No info found.")

##### preprocessing

In [42]:
def standardize_ingredient_name(name):
    name = name.lower().strip()

    aliases = get_pubchem_aliases(name)
    if aliases:
        update_alias_cache(aliases)
        return aliases[0], aliases

    fuzzy = fuzzy_match_alias(name)
    if fuzzy:
        return fuzzy, [fuzzy]

    return name, [name]


def translate_to_english(text):
    try:
        lang = detect(text)
        return translator.translate(text, src=lang, dest='en').text if lang != 'en' else text
    except Exception:
        return text


def preprocess_ingredient_list_with_health(text):
    raw_ingredients = re.split(r'[,\n;/••]+', text)
    processed = []

    for raw in raw_ingredients:
        raw = raw.strip()
        if raw:
            translated = translate_to_english(raw)
            standard, aliases = standardize_ingredient_name(translated)
            health_info = retrieve_health_info(standard)
            processed.append({
                "standard": standard,
                "aliases": aliases,
                "health_info": health_info
            })

    return processed



##### store aliases for reuse

In [43]:
ALIAS_CACHE_FILE = "alias_cache.json"

def save_alias_cache(path=ALIAS_CACHE_FILE):
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(sorted(global_alias_set), f, ensure_ascii=False, indent=2)
        print(f"[✓] Alias cache saved to {path}")
    except Exception as e:
        print(f"[!] Error saving alias cache: {e}")

def load_alias_cache(path=ALIAS_CACHE_FILE):
    global global_alias_set
    try:
        with open(path, "r", encoding="utf-8") as f:
            global_alias_set = set(json.load(f))
        print(f"[✓] Loaded {len(global_alias_set)} aliases from cache.")
        return True
    except FileNotFoundError:
        print(f"[ ] Alias cache not found at {path}. Will seed from source...")
        return False
    except Exception as e:
        print(f"[!] Error loading alias cache: {e}")
        return False

##### usage

In [44]:
# Run ONLY once on startup!!!
seed_aliases(limit=5000)

[✓] Loaded 5833 aliases from cache.


In [45]:
print(sorted(list(global_alias_set))[:50])  # see a preview
print("Total aliases:", len(global_alias_set))

['& calendula & sunflowers', '& enzymes', "' farine de blé", "'salade 17", '( glycoside de steviol)', '(about 20g) 70 hydrolyzed gelatin', '(dont arôme de fumée).', '(iowhey®)', '(may contain hazelnut)', '(nur schoko kakaopulver entölt)', '(sucre', '(unbleached enriched flour', '-fdorvège', '0', '0 %', "0 added sugar and 0 mess. ingredients: organic apples. free top manufactured in a facility free from 12 rlergens from the top 12 allergens 00 certified non gmo project gluten free usda organic verified nongmoproject.org gfco.org p paleo brought to you by that's it nutrition lic. los angeles", '0% (varkens - en rundsvlees', '0% 13% dipotassium phosphate', '0%)', '0.1% sodium benzoate (preservative)', '000 calorie diet', '000 calorie diet. 50 mg cakdum caseinate 14 g 1daly value not established other ingre fructo ntein concentrate', '000 calories a day is used for general ution advice 0% 2% ingredients corn flour (processed with lime)', '000 calories diet. nutrition facts valeur nutritive

In [47]:
# Sample 1
sample = "Sal, Zucker, ascorbic acid, curcuma longa, Sodium chloride, E300"
results = preprocess_ingredient_list_with_health(sample)
for r in results:
    print(f"\n{r['standard']}\nAliases: {r['aliases']}\nHealth Info: {r['health_info']}")

# Sample 2 (with typos and fuzzy needs)
sample = "sal, suagr, ascorbic acid, kurkuma longa, table sald"
results = preprocess_ingredient_list_with_health(sample)
for r in results:
    print(f"\n{r['standard']}\nAliases: {r['aliases']}\nHealth Info: {r['health_info']}")


sal
Aliases: ['sal']
Health Info: No info found.

sucrose, grade ii, plant cell culture tested
Aliases: ['sucrose, grade ii, plant cell culture tested', 'db02772', 'd-sucrose', 'sucrose, >=99.5% (gc)', '57-50-1', 'os02339', 'sucrose, acs reagent', 'sugar [vandf]', 'sacharose', 'sucrose, usp', 'sugar,(s)', 'sucrose, saj first grade', 'os59592', 'saccharose', 'sucrose [jan]', 'sucrose, analytical standard, for enzymatic assay kit sca20', 'sucrose', 'compressible sugar', 'sugar spheres (nf)', 'bdbm50108105', 'beta-d-fructofuranosyl-(2&harr;1)-alpha-d-glucopyranoside', 'sucrose, purified', 'sucrose, ultrapure', 'sucrose [ii]', 'gne-410', 'sucraloxum [inn-latin]', 'sucraloxum (inn-latin)', 'sucrose, >=99.5%', 'sugar, white', 'sucrose, meets usp testing specifications', 'sucrose (tn)', 'ncgc00258948-01', 'chebi:65313', 'sucrose [ep monograph]', 'saccarose', '1-alpha-d-glucopyranosyl-2-beta-d-fructofuranoside', 'frost sugar', 'sucrose acs grade', "sugar, confectioner's (nf)", 'alpha-d-glc-(1

In [48]:
sample = "Sodium chloride, E300, Sal"
results = preprocess_ingredient_list_with_health(sample)
for r in results:
    print(f"\n{r['standard']}\nAliases: {r['aliases']}\nHealth Info: {r['health_info']}")


sodium chloride solution, 1 mol/l -1 n volumetric standard solution
Aliases: ['sodium chloride solution, 1 mol/l -1 n volumetric standard solution', 'adsorbanac', 'aqua maris daily clean', 'api pond pond salt', 'chebi:26710', 'muro 128', 'q2314', 'natrium muriaticum', 'trisodium trichloride', 'akos024457457', 'sel gemme8146', 'sodium chloride, nist(r) srm(r) 919b', 'usepa/opp pesticide code: 13905', 'vamousse lice treatment', 'homeopathic pediculicide', 'tid clean dent', '7647-14-5', 'hy-y0344h', 'sodium chloride1 gram', 'isotonic saline', 'natrum muriaticum kit refill', 'sodium standard for aas, ready-to-use, traceable to bam, in h2o', 'cs-0376163', 'b1655 [langual]', 'nacl solution, 1m', 'wln: na g', 'beanguard gargle', 'sodium chloride for dnase, rnase, and protease, none detected', 'akos024438089', 'sodium chloride, reagent grade, >=98%, +80 mesh particle size', 'sodium chloride brine, purified', 'natrii chloridum [who-ip latin]', 'white crystal', '0.9% sodium chloride', 'arm-a-vi

### Ingredient Data Retrieval

In [None]:
def retrieve_health_info(ingredient_name):
    """
    Retrieve health-related effects or warnings about an ingredient.
    For now: stub function. Later: connect to APIs or web scraping.
    """
    # Simulate retrieved results
    fake_db = {
        "sodium chloride": "Excessive intake can lead to high blood pressure.",
        "ascorbic acid": "Supports immune function and acts as an antioxidant.",
        "e300": "E300 is Vitamin C, generally considered safe."
    }
    return fake_db.get(ingredient_name.lower(), "No info found.")