# recipe preprocessing

In [33]:
# === Imports & Constants ======================================================
import re, math, ast, unicodedata, json, os
from datetime import datetime, timezone
from fractions import Fraction
from pathlib import Path
import pandas as pd
from tqdm import tqdm


In [44]:
# === Helpers =================================================================

def safe_str(x):
    """Return '' for None/NaN; else str(x)."""
    if x is None:
        return ""
    if isinstance(x, float) and math.isnan(x):
        return ""
    return str(x)

def norm(s: str) -> str:
    """Normalize Unicode, lowercase, collapse whitespace."""
    s = safe_str(s)
    s = unicodedata.normalize("NFKC", s).strip()
    return re.sub(r"\s+", " ", s).lower()

def kebab(s: str) -> str:
    """Convert to kebab-case for canonical tags."""
    s = norm(s)
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    return s

def slugify(s: str) -> str:
    """Generate a URL-friendly slug."""
    return kebab(s)

def safe_join(parts):
    """Join a list of strings safely."""
    return " ".join(safe_str(p) for p in parts if safe_str(p))

def now_mongo_date():
    """ISO date compatible with Mongo import."""
    return {"$date": datetime.now(timezone.utc).isoformat().replace("+00:00","Z")}


# === Ingredient Parsing ======================================================

UNITS = {"tsp","teaspoon","tbsp","tablespoon","cup","cups","g","kg","ml","l","pound","lb","oz"}

def to_number(qs: str):
    qs = (qs or "").strip()
    if not qs:
        return None
    try:
        if " " in qs:  # mixed number e.g. "1 1/2"
            a, b = qs.split(" ", 1)
            return float(Fraction(a) + Fraction(b))
        if "/" in qs:  # fraction
            return float(Fraction(qs))
        return float(qs)
    except Exception:
        return None

def parse_ingredient(raw: str):
    """Naive but robust parsing: qty + unit + name"""
    t = norm(raw)
    m = re.match(r"^([\d\s\/\.]+)\s*([a-zA-Z]+)?\s*(.*)$", t)
    qty, unit, rest = None, None, t
    if m:
        qty_s, unit_s, rest = m.groups()
        qty = to_number(qty_s) if qty_s else None
        if unit_s and unit_s.lower() in UNITS:
            unit = unit_s.lower()
        else:
            rest = ((unit_s or "") + " " + rest).strip() if unit_s else rest
    name = rest.strip(", ")
    return {"name": name, "qty": qty, "unit": unit, "raw": raw}


# === Tagging Lexicons & Constants ============================================

PREPROC_VERSION = "v1.0-tags"

DIETARY   = {"vegan","vegetarian","pescatarian","halal","kosher","gluten-free","dairy-free","nut-free","egg-free","low-carb","low-fat"}
ALLERGENS = {"gluten","dairy","egg","peanut","tree-nut","soy","shellfish","fish","sesame"}
FLAVOURS  = {"spicy","sweet","sour","bitter","salty","umami","smoky","tangy","herby","garlicky","citrusy","creamy","rich","fresh"}
TECHNIQUE = {"grill","roast","bake","fry","deep-fry","stir-fry","braise","stew","steam","poach","sous-vide","marinate","pickle","ferment"}

VEGAN_BLACKLIST = {"beef","chicken","pork","lamb","shrimp","prawn","fish","tuna","salmon","anchovy","egg","milk","butter","cheese","yogurt","honey","gelatin"}
VEGETARIAN_BLACKLIST = {"beef","chicken","pork","lamb","shrimp","prawn","fish","tuna","salmon","anchovy","gelatin"}
GLUTEN_SOURCES = {"wheat","flour","breadcrumbs","barley","rye","semolina","farina","spelt","bulgur","seitan","graham","soy sauce"}

ALLERGEN_MAP = {
    "wheat":"gluten","barley":"gluten","rye":"gluten","flour":"gluten","breadcrumbs":"gluten","spelt":"gluten","semolina":"gluten","bulgur":"gluten","seitan":"gluten",
    "milk":"dairy","butter":"dairy","cheese":"dairy","yogurt":"dairy","cream":"dairy","ghee":"dairy",
    "egg":"egg","eggs":"egg","albumen":"egg","mayonnaise":"egg",
    "peanut":"peanut","peanuts":"peanut",
    "almond":"tree-nut","walnut":"tree-nut","hazelnut":"tree-nut","cashew":"tree-nut","pistachio":"tree-nut","pecan":"tree-nut","nut":"tree-nut",
    "soy":"soy","tofu":"soy","tempeh":"soy","edamame":"soy","miso":"soy","soy sauce":"soy","tamari":"soy",
    "shrimp":"shellfish","prawn":"shellfish","crab":"shellfish","lobster":"shellfish","clam":"shellfish","scallop":"shellfish","mussel":"shellfish",
    "fish":"fish","anchovy":"fish","salmon":"fish","tuna":"fish","cod":"fish","haddock":"fish",
    "sesame":"sesame","tahini":"sesame"
}

FLAVOUR_FROM_ING = {
    "chili":"spicy","chilli":"spicy","jalapeno":"spicy","gochugaru":"spicy","harissa":"spicy","sriracha":"spicy","cayenne":"spicy","gochujang":"spicy umami",
    "anchovy":"umami","mushroom":"umami","kombu":"umami","parmesan":"umami","soy":"umami","miso":"umami",
    "lemon":"citrusy","lime":"citrusy","orange":"citrusy","sumac":"tangy",
    "smoked":"smoky","chipotle":"smoky","smoke":"smoky","paprika":"smoky",
    "garlic":"garlicky","basil":"herby","parsley":"herby","cilantro":"herby","dill":"herby","oregano":"herby","thyme":"herby","rosemary":"herby","mint":"herby","sage":"herby",
    "cream":"creamy","coconut milk":"creamy","yogurt":"creamy","butter":"rich","maple":"sweet","sugar":"sweet","honey":"sweet"
}

FLAVOUR_KEYWORDS = {
    "spicy": {"chili","chile","jalapeno","harissa","gochugaru","sriracha","cayenne","chilli"},
    "smoky": {"smoked","chipotle","smoky"},
    "sweet": {"sugar","honey","maple","syrup","molasses"},
    "herby": {"basil","parsley","cilantro","dill","oregano","thyme","rosemary","mint","sage"},
    "tangy": {"lemon","lime","vinegar","sumac","tamarind","yogurt"},
    "umami": {"miso","soy","tamari","anchovy","mushroom","kombu","parmesan"}
}

TECHNIQUE_PATTERNS = [
    ("grilled","grill"),("grill","grill"),
    ("roasted","roast"),("roast","roast"),
    ("baked","bake"),("bake","bake"),
    ("fried","fry"),("fry","fry"),("pan-fry","fry"),
    ("deep-fry","deep-fry"),("stir-fry","stir-fry"),
    ("braise","braise"),("stew","stew"),("steam","steam"),
    ("poach","poach"),("sous vide","sous-vide"),
    ("marinate","marinate"),("pickle","pickle"),("ferment","ferment")
]

ANIMAL = {
    "beef","steak","veal","pork","bacon","ham","sausage","lamb","mutton",
    "chicken","turkey","duck","goose",
    "fish","salmon","tuna","cod","anchovy","sardine","trout","herring",
    "shrimp","prawn","crab","lobster","clam","oyster","mussel",
    "gelatin","lard","broth","stock","bone","honey"
}

def derive_dietary(ingredient_names, title="", steps=""):
    """Return conservative dietary tags based on ingredients + title text."""
    names = {n.lower() for n in ingredient_names}
    text  = f"{title} {' '.join(ingredient_names)} {steps}".lower()

    vegan = not any(x in text for x in ANIMAL)
    vegetarian = not any(x in text for x in ANIMAL - {"milk","butter","cheese","yogurt","egg","honey"})
    gf = not any(x in text for x in GLUTEN_SOURCES)

    tags = []
    if vegan:
        tags.append("vegan")
    elif vegetarian:
        tags.append("vegetarian")
    if gf:
        tags.append("gluten-free")
    return tags





## tag assignment 

In [45]:
# === Tagging Functions =======================================================

# === Tagging Functions (final) ===============================================

def assign_structured_tags(title, ingredients_list, steps_list):
    """Deterministic rule-based tag assignment (with improved dietary)."""
    text_all = norm(" ".join([title] + ingredients_list + steps_list))
    ing_norm = [norm(i) for i in ingredients_list]

    # canonical ingredient tags
    ing_names = [kebab(parse_ingredient(i)["name"]) for i in ingredients_list]
    ingredient_tags = sorted({x for x in ing_names if x})

    # Allergens: scan ingredients + text (title/steps) for robustness
    allergens = set()
    for i in ing_norm + [text_all]:
        for k, v in ALLERGEN_MAP.items():
            if k in i:
                allergens.add(v)

    # Dietary: use your stricter function (title + steps aware)
    dietary = set(derive_dietary(ingredient_tags, title=title, steps=" ".join(steps_list)))

    # Flavour
    flavour = set()
    for i in ing_norm:
        for k, vs in FLAVOUR_FROM_ING.items():
            if k in i:
                flavour.update(vs.split())
    for label, keys in FLAVOUR_KEYWORDS.items():
        if any(k in text_all for k in keys):
            flavour.add(label)

    # Technique
    technique = {tag for pat, tag in TECHNIQUE_PATTERNS if pat in text_all}

    # Clamp to allowed sets (defensive)
    dietary  &= DIETARY
    allergens &= ALLERGENS
    flavour  &= FLAVOURS
    technique &= TECHNIQUE

    return {
        "ingredient_tags": sorted(ingredient_tags),
        "dietary_tags": sorted(dietary),
        "allergen_tags": sorted(allergens),
        "flavour_tags": sorted(flavour),
        "technique_tags": sorted(technique),
    }

def provenance_obj():
    """Attach provenance using Extended JSON so mongoimport accepts it."""
    return {
        "version": PREPROC_VERSION,
        "methods": ["rules"],
        "ts": {"$date": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")},
    }


## load dataset from data folder

In [46]:
# === Load CSV ================================================================
print("CWD:", os.getcwd())  
CSV_PATH = Path("data") / "recipe_nlg" / "full_dataset.csv"
print("CSV exists?", CSV_PATH.exists(), "→", CSV_PATH)

df = pd.read_csv(
    CSV_PATH,
    usecols=["title","ingredients","directions","link","source","NER"],
    low_memory=False
)

def to_list_from_cell(x):
    """Parse list-like strings safely."""
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return []
    s = safe_str(x).strip()
    try:
        v = ast.literal_eval(s)
        if isinstance(v, (list, tuple)):
            return [safe_str(i) for i in v if safe_str(i)]
        return [safe_str(v)] if safe_str(v) else []
    except Exception:
        if "|" in s:
            return [p.strip() for p in s.split("|") if p.strip()]
        if "\n" in s:
            return [p.strip() for p in s.split("\n") if p.strip()]
        return [s] if s else []


CWD: /Users/Lorena/Developer/FlavorNet/mongoDB
CSV exists? True → data/recipe_nlg/full_dataset.csv


In [47]:
df.shape

(2231142, 6)

In [48]:
df.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


### Original columns in `full_dataset.csv`

Included in **RecipeNLG** dataset:

| column | meaning |
|--------|----------|
| `title` | the recipe’s name (e.g., “No-Bake Nut Cookies”) |
| `ingredients` | list-like string of raw ingredient phrases (e.g., `["1 c. sugar", "1/2 c. milk", ...]`) |
| `directions` | list-like string of recipe steps |
| `link` | original web source URL |
| `source` | scraped website or author |
| `NER` | list of named entities / ingredient names extracted by NLP |

---

### Our preprocessing / augmentation

Each record is transformed into a **Mongo-ready JSON document** with normalized fields, structured tags, and provenance.  
The pipeline standardizes text, derives interpretable attributes for filtering, and prepares data for embedding-based similarity search.

| stage | purpose / transformation | output fields |
|--------|--------------------------|----------------|
| **Normalization** | Clean and lowercase all text, normalize Unicode, and slugify recipe titles for unique IDs. | `title`, `slug` |
| **Ingredient parsing** | Split each raw ingredient phrase into structured parts (`qty`, `unit`, `name`, `raw`). Also extract canonical ingredient names in kebab-case. | `ingredients[]`, `ingredient_tags[]` |
| **Dietary tagging** | Detect dietary categories from ingredients using deterministic rules (e.g., no animal products → `vegan`; no gluten sources → `gluten-free`). | `dietary_tags[]` |
| **Allergen detection** | Map ingredients to the top common allergens (gluten, dairy, egg, peanut, tree-nut, soy, shellfish, fish, sesame). | `allergen_tags[]` |
| **Flavour profiling** | Identify sensory descriptors from ingredients and text (e.g., chili → `spicy`, lemon → `citrusy`, miso → `umami`). | `flavour_tags[]` |
| **Technique extraction** | Parse cooking verbs in title + instructions to detect methods such as `bake`, `grill`, `stew`, `marinate`, etc. | `technique_tags[]` |
| **Metadata carry-over** | Preserve any available author, link, and source information. | `author`, `source_url` |
| **Provenance & versioning** | Record preprocessing version and method trace (e.g., `rules`, `llm`, `model`) for reproducibility. | `tags_provenance` |
| **Timestamping** | Attach UTC creation/update times for database imports. | `created_at`, `updated_at` |

---

### Result

Each recipe becomes a clean, semantically-rich document ready for:

- **Faceted search** in MongoDB (filter by dietary / flavour / technique / allergens)  
- **Vector similarity retrieval** in Qdrant (using embeddings on text + ingredients)  
- **Topic modeling** with BERTopic (to discover latent clusters such as *“weeknight creamy bakes”*)  
- **LLM enrichment** (for cuisine classification, sub-cuisine detection, or pairing suggestions)


## transform to mongo schema

In [49]:
# === Convert Rows to Mongo-Ready Documents ===================================

out_docs = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    title = safe_str(row.get("title"))
    ing_raw = to_list_from_cell(row.get("ingredients"))
    steps   = to_list_from_cell(row.get("directions"))
    
    tags = assign_structured_tags(title, ing_raw, steps)
    ings = [parse_ingredient(x) for x in ing_raw]

    doc = {
        "title": title,
        "slug": slugify(title),
        "ingredients": ings,
        "steps": steps,
        "tags": [],
        "dietary_tags": tags["dietary_tags"],
        "allergen_tags": tags["allergen_tags"],
        "flavour_tags": tags["flavour_tags"],
        "technique_tags": tags["technique_tags"],
        "ingredient_tags": tags["ingredient_tags"],
        "cuisine": None,
        "cuisine_confidence": None,
        "cuisine_method": [],
        "course": None,
        "tags_provenance": provenance_obj(),
        "servings": None,
        "times": {"prep_min": None, "cook_min": None, "total_min": None},
        "nutrition": {},
        "rating": {"value": None, "count": None},
        "images": [],
        "source_url": row.get("link"),
        "author": row.get("source"),
        "created_at": now_mongo_date(),
        "updated_at": now_mongo_date()
    }
    out_docs.append(doc)

print(f"Generated {len(out_docs)} recipe documents.")


100%|██████████| 2231142/2231142 [28:39<00:00, 1297.33it/s] 

Generated 2231142 recipe documents.





In [50]:
out_docs

[{'title': 'No-Bake Nut Cookies',
  'slug': 'no-bake-nut-cookies',
  'ingredients': [{'name': 'c . firmly packed brown sugar',
    'qty': 1.0,
    'unit': None,
    'raw': '1 c. firmly packed brown sugar'},
   {'name': 'c . evaporated milk',
    'qty': 0.5,
    'unit': None,
    'raw': '1/2 c. evaporated milk'},
   {'name': '. vanilla', 'qty': 0.5, 'unit': 'tsp', 'raw': '1/2 tsp. vanilla'},
   {'name': 'c . broken nuts (pecans)',
    'qty': 0.5,
    'unit': None,
    'raw': '1/2 c. broken nuts (pecans)'},
   {'name': '. butter or margarine',
    'qty': 2.0,
    'unit': 'tbsp',
    'raw': '2 Tbsp. butter or margarine'},
   {'name': 'c . bite size shredded rice biscuits',
    'qty': 3.5,
    'unit': None,
    'raw': '3 1/2 c. bite size shredded rice biscuits'}],
  'steps': ['In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.',
   'Stir over medium heat until mixture bubbles all over top.',
   'Boil and stir 5 minutes more. Take off heat.',
   'St

## mongo ready JSONL

In [51]:
# === Write JSONL =============================================================
def provenance_obj():
    return {
        "version": PREPROC_VERSION,
        "methods": ["rules"],
        "ts": {"$date": datetime.now(timezone.utc).isoformat().replace("+00:00","Z")}
    }
for d in out_docs:
    if d.get("tags_provenance") and isinstance(d["tags_provenance"].get("ts"), datetime):
        d["tags_provenance"]["ts"] = {"$date": d["tags_provenance"]["ts"].isoformat().replace("+00:00","Z")}

OUT_JSONL = Path("init") / "03_recipe_csv_sample.jsonl"
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)

with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for d in out_docs:
        f.write(json.dumps(d, ensure_ascii=False) + "\n")

print("Wrote:", OUT_JSONL, "records:", len(out_docs))


Wrote: init/03_recipe_csv_sample.jsonl records: 2231142


In [52]:
# preview

pd.DataFrame([
    {
        "title": d["title"],
        "dietary": d["dietary_tags"],
        "flavour": d["flavour_tags"],
        "technique": d["technique_tags"],
        "allergens": d["allergen_tags"]
    }
    for d in out_docs[:10]
])


Unnamed: 0,title,dietary,flavour,technique,allergens
0,No-Bake Nut Cookies,"[gluten-free, vegan]","[rich, sweet]",[bake],"[dairy, tree-nut]"
1,Jewell Ball'S Chicken,[gluten-free],"[creamy, umami]",[bake],[dairy]
2,Creamy Corn,"[gluten-free, vegan]","[creamy, garlicky, rich]",[],[dairy]
3,Chicken Funny,[gluten-free],"[creamy, umami]",[bake],"[dairy, tree-nut]"
4,Reeses Cups(Candy),[],"[rich, sweet]",[],"[dairy, peanut, tree-nut]"
5,Cheeseburger Potato Soup,[],"[creamy, rich]",[],"[dairy, gluten, tree-nut]"
6,Rhubarb Coffee Cake,[vegan],"[rich, sweet]",[],"[dairy, egg, gluten]"
7,Scalloped Corn,"[gluten-free, vegan]","[creamy, rich]",[bake],"[dairy, egg, shellfish]"
8,Nolan'S Pepper Steak,[],[],[],"[gluten, tree-nut]"
9,Millionaire Pie,[],"[citrusy, tangy]",[],"[dairy, tree-nut]"


### import into mongo -> rebuild_mongo.sh uses 03_recipe_csv_sample.jsonl