# recipe preprocessing

In [1]:

import pandas as pd, json, re, unicodedata, ast, math, os
from fractions import Fraction
from datetime import datetime, timezone
from tqdm.auto import tqdm


In [22]:
# === Helpers (one cell) =======================================================
import re, math, ast, unicodedata
from datetime import datetime, timezone
from fractions import Fraction

# --- Constants ----------------------------------------------------------------
UNITS   = {"tsp","teaspoon","tbsp","tablespoon","cup","cups","g","kg","ml","l","pound","lb","oz"}
ANIMAL  = {"beef","chicken","pork","lamb","shrimp","prawn","fish","tuna","salmon","anchovy","egg","milk","butter","cheese","yogurt","honey"}
GLUTEN  = {"wheat","flour","breadcrumbs","barley","rye","semolina","farina","spelt","bulgur","seitan","graham"}
FLAVOUR_KEYWORDS = {
    "spicy": {"chili","chile","jalapeno","harissa","gochugaru","sriracha","cayenne","chilli"},
    "smoky": {"smoked","chipotle","smoky"},
    "sweet": {"sugar","honey","maple","syrup","molasses"},
    "herby": {"basil","parsley","cilantro","dill","oregano","thyme","rosemary","mint","sage"},
    "tangy": {"lemon","lime","vinegar","sumac","tamarind","yogurt"},
    "umami": {"miso","soy","tamari","anchovy","mushroom","kombu","parmesan"}
}

# --- General utilities --------------------------------------------------------
def now_mongo_date():
    return {"$date": datetime.now(timezone.utc).isoformat().replace("+00:00","Z")}

def norm(s) -> str:
    # make sure we never pass None/NaN to unicodedata.normalize
    s = safe_str(s)
    s = unicodedata.normalize("NFKC", s).strip()
    return re.sub(r"\s+", " ", s).lower()


def slugify(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "-", norm(s)).strip("-")

def safe_str(x):
    """Return '' for None/NaN; else str(x)."""
    if x is None:
        return ""
    if isinstance(x, float) and math.isnan(x):
        return ""
    return str(x)

def safe_join(parts):
    """Join after converting each part to safe_str and dropping empties."""
    parts = [safe_str(p) for p in parts]
    return " ".join(p for p in parts if p)

# --- Parsing & tagging --------------------------------------------------------
def to_number(qs: str):
    qs = (qs or "").strip()
    if not qs: 
        return None
    try:
        if " " in qs:            # mixed number e.g., "1 1/2"
            a, b = qs.split(" ", 1)
            return float(Fraction(a) + Fraction(b))
        if "/" in qs:            # fraction e.g., "1/2"
            return float(Fraction(qs))
        return float(qs)
    except Exception:
        return None

def parse_ingredient(raw: str):
    t = norm(raw)
    # qty + unit + name (naive but solid starting point)
    m = re.match(r"^([\d\s\/\.]+)\s*([a-zA-Z]+)?\s*(.*)$", t)
    qty, unit, rest = None, None, t
    if m:
        qty_s, unit_s, rest = m.groups()
        qty = to_number(qty_s) if qty_s else None
        if unit_s and unit_s.lower() in UNITS:
            unit = unit_s.lower()
        else:
            rest = ((unit_s or "") + " " + rest).strip() if unit_s else rest
    name = rest.strip(", ")
    return {"name": name, "qty": qty, "unit": unit, "raw": raw}

def derive_dietary(ingredient_names):
    names = set(ingredient_names)
    vegan = not any(x in names for x in ANIMAL)
    vegetarian = not any(x in names for x in {"beef","chicken","pork","lamb","shrimp","prawn","fish","tuna","salmon","anchovy"})
    gf = not any(x in names for x in GLUTEN)
    tags = []
    if vegan:
        tags.append("vegan")
    elif vegetarian:
        tags.append("vegetarian")
    if gf:
        tags.append("gluten-free")
    return tags

def derive_flavour_keyword(all_text: str):
    t = norm(all_text)
    tags = []
    for label, keys in FLAVOUR_KEYWORDS.items():
        if any(k in t for k in keys):
            tags.append(label)
    return sorted(set(tags))

def to_list_from_cell(x):
    """
    Parse list-like strings safely. Always returns List[str] with NaNs removed.
    Accepts forms like "[""foo"", ""bar""]", or falls back to line/pipe split.
    """
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return []
    s = safe_str(x).strip()
    try:
        v = ast.literal_eval(s)
        if isinstance(v, (list, tuple)):
            return [safe_str(i) for i in v if safe_str(i)]
        return [safe_str(v)] if safe_str(v) else []
    except Exception:
        if "|" in s:
            return [p.strip() for p in s.split("|") if p.strip()]
        if "\n" in s:
            return [p.strip() for p in s.split("\n") if p.strip()]
        return [s] if s else []


## load dataset from data folder

In [23]:
from pathlib import Path
import os, glob
print("CWD:", os.getcwd())  

CSV_PATH = Path("data") / "recipe_nlg" / "full_dataset.csv"   
print("CSV exists?", CSV_PATH.exists(), "→", CSV_PATH)



CWD: /Users/Lorena/Developer/FlavorNet/mongoDB
CSV exists? True → data/recipe_nlg/full_dataset.csv


In [24]:
import pandas as pd

df = pd.read_csv(
    CSV_PATH,
    usecols=["title","ingredients","directions","link","source","NER"],
    low_memory=False  
)



In [25]:
df.shape

(2231142, 6)

In [26]:
df.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


### Original columns in `full_dataset.csv`

Inlcuded in **RecipeNLG** dataset:

| column | meaning |
|--------|----------|
| `title` | the recipe’s name (e.g., “No-Bake Nut Cookies”) |
| `ingredients` | list-like string of raw ingredient phrases (e.g., `["1 c. sugar", "1/2 c. milk", ...]`) |
| `directions` | list-like string of recipe steps |
| `link` | original web source URL |
| `source` | scraped website or author |
| `NER` | list of named entities / ingredient names extracted by NLP |

---

### Our preprocessing / augmentation


| new field | how it was produced | example |
|------------|--------------------|----------|
| `ingredients (parsed)` | converted each text ingredient into `{name, qty, unit, raw}` via `parse_ingredient()` | `{"name": "sugar", "qty": 1.0, "unit": "cup", ...}` |
| `ingredient_tags` | merged `NER` entities + parsed ingredient names → normalized lowercase unique list | `["sugar", "milk", "nuts", ...]` |
| `dietary_tags` | added by preprocessing (`derive_dietary()`): rule-based detection of vegan / vegetarian / gluten-free from `ingredient_tags` | `["vegetarian", "gluten-free"]` |
| `flavour_tags` | added by preprocessing (`derive_flavour_keyword()`): keyword matching on ingredients + directions for flavour profiles | `["sweet"]`, `["umami"]`, etc. |
| `slug` | generated URL-safe version of the title (`slugify()`) | `"no-bake-nut-cookies"` |
| `created_at`, `updated_at` | timestamps for Mongo | ISODate values |
| `times`, `nutrition`, `rating`, `images` | empty placeholders for schema compatibility | — |

---

### table  (`dietary`, `flavour`, `n_ings`)

- **`dietary`** → built by rule-based `derive_dietary()` function.  
- **`flavour`** → built by keyword `derive_flavour_keyword()` function.  
- **`n_ings`** → simple count of `len(d["ingredients"])` (sanity-check parsing).


## transform to mongo schema

In [27]:
out_docs = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    title = safe_str(row.get("title"))
    ing_raw = to_list_from_cell(row.get("ingredients"))
    steps   = to_list_from_cell(row.get("directions"))
    ner     = to_list_from_cell(row.get("NER"))
    
    ings = [parse_ingredient(x) for x in ing_raw]
    ner_tags = [norm(x) for x in ner if x]
    parsed_tags = [norm(i["name"]) for i in ings if i["name"]]
    ingredient_tags = sorted(set([t for t in ner_tags if t] + parsed_tags))
    
    dietary = derive_dietary(ingredient_tags)
    flavour = derive_flavour_keyword(safe_join([title] + ing_raw + steps))



    doc = {
        "title": title,
        "slug": slugify(title),
        "ingredients": ings,
        "steps": steps,
        "tags": [],
        "dietary_tags": dietary,
        "flavour_tags": flavour,
        "ingredient_tags": ingredient_tags,
        "cuisine": None,
        "course": None,
        "servings": None,
        "times": {"prep_min": None, "cook_min": None, "total_min": None},
        "nutrition": {},
        "rating": {"value": None, "count": None},
        "images": [],
        "source_url": row.get("link"),
        "author": row.get("source"),
        "created_at": now_mongo_date(),
        "updated_at": now_mongo_date()
    }
    out_docs.append(doc)

len(out_docs)


  0%|          | 0/2231142 [00:00<?, ?it/s]

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/Users/Lorena/.local/share/uv/python/cpython-3.12.11-macos-aarch64-none/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/Users/Lorena/Developer/FlavorNet/.venv/lib/python3.12/site-packages/tqdm/_monitor.py", line 84, in run
    instance.refresh(nolock=True)
  File "/Users/Lorena/Developer/FlavorNet/.venv/lib/python3.12/site-packages/tqdm/std.py", line 1347, in refresh
    self.display()
  File "/Users/Lorena/Developer/FlavorNet/.venv/lib/python3.12/site-packages/tqdm/notebook.py", line 171, in display
    rtext.value = right
    ^^^^^^^^^^^
  File "/Users/Lorena/Developer/FlavorNet/.venv/lib/python3.12/site-packages/traitlets/traitlets.py", line 716, in __set__
    self.set(obj, value)
  File "/Users/Lorena/Developer/FlavorNet/.venv/lib/python3.12/site-packages/traitlets/traitlets.py", line 706, in set
    obj._notify_trait(self.name, old_value, new_value)
  File "/Users/Lorena/

2231142

In [29]:
out_docs

[{'title': 'No-Bake Nut Cookies',
  'slug': 'no-bake-nut-cookies',
  'ingredients': [{'name': 'c . firmly packed brown sugar',
    'qty': 1.0,
    'unit': None,
    'raw': '1 c. firmly packed brown sugar'},
   {'name': 'c . evaporated milk',
    'qty': 0.5,
    'unit': None,
    'raw': '1/2 c. evaporated milk'},
   {'name': '. vanilla', 'qty': 0.5, 'unit': 'tsp', 'raw': '1/2 tsp. vanilla'},
   {'name': 'c . broken nuts (pecans)',
    'qty': 0.5,
    'unit': None,
    'raw': '1/2 c. broken nuts (pecans)'},
   {'name': '. butter or margarine',
    'qty': 2.0,
    'unit': 'tbsp',
    'raw': '2 Tbsp. butter or margarine'},
   {'name': 'c . bite size shredded rice biscuits',
    'qty': 3.5,
    'unit': None,
    'raw': '3 1/2 c. bite size shredded rice biscuits'}],
  'steps': ['In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.',
   'Stir over medium heat until mixture bubbles all over top.',
   'Boil and stir 5 minutes more. Take off heat.',
   'St

In [30]:
pd.DataFrame([{
    "title": d["title"][:40],
    "dietary": ",".join(d["dietary_tags"]),
    "flavour": ",".join(d["flavour_tags"]),
    "n_ings": len(d["ingredients"])
} for d in out_docs[:10]])


Unnamed: 0,title,dietary,flavour,n_ings
0,No-Bake Nut Cookies,"vegetarian,gluten-free",sweet,6
1,Jewell Ball'S Chicken,gluten-free,umami,4
2,Creamy Corn,"vegetarian,gluten-free",,6
3,Chicken Funny,gluten-free,umami,5
4,Reeses Cups(Candy),"vegetarian,gluten-free",sweet,5
5,Cheeseburger Potato Soup,"vegetarian,gluten-free",,10
6,Rhubarb Coffee Cake,vegetarian,sweet,10
7,Scalloped Corn,"vegetarian,gluten-free",,6
8,Nolan'S Pepper Steak,"vegan,gluten-free",,7
9,Millionaire Pie,"vegan,gluten-free",tangy,6


## mongo ready JSONL

In [32]:
from pathlib import Path
import json, os

OUT_JSONL = Path("init") / "03_recipe_csv_sample.jsonl"  
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)

with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for d in out_docs:
        f.write(json.dumps(d, ensure_ascii=False) + "\n")

print("Wrote:", OUT_JSONL, "records:", len(out_docs))


Wrote: init/03_recipe_csv_sample.jsonl records: 2231142


### import into mongo -> rebuild_mongo.sh uses 03_recipe_csv_sample.jsonl