
# Ingredient NER Training & Encoding Notebook

This notebook trains a spaCy NER model to identify ingredients in free text (e.g. raw columns from a new dataset).
It optionally bootstraps labels using your lexicon and then fine-tunes a model for better generalization, and finally
normalizes & encodes predictions using your existing pipeline artifacts.

**Output:**
- A trained spaCy pipeline that recognizes `INGREDIENT` entities.
- Optional lexicon-driven bootstrapping with `EntityRuler`.
- Normalization + encoding into integer IDs using your `IngredientEncoder` maps (or fitting one if not present).
- A few utility cells to apply the model to any new dataset (CSV/Parquet) and write out `NER_clean` + encoded IDs.

> **Dependencies:** `spacy>=3.6`, `pyarrow`, `pandas`, `scikit-learn`, `tqdm` (optional). For better results you can also install `spacy-transformers` and use a transformer backbone.


In [1]:
# --- Dependencies (install once in your venv) ---
# %pip install -U spacy spacy-transformers torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# %pip install -U pandas pyarrow scikit-learn tqdm

import os, torch, spacy
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # pick your NVIDIA GPU index
spacy.prefer_gpu()                        # Torch-backed components use GPU

print("torch.cuda.is_available():", torch.cuda.is_available())
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True
    print("Torch CUDA:", torch.version.cuda, "| GPU:", torch.cuda.get_device_name(0))




torch.cuda.is_available(): True
Torch CUDA: 12.1 | GPU: NVIDIA GeForce RTX 4070



## 1) Configuration

Adjust the paths below to point to your training corpus and pipeline artifacts.  
You can train from **CSV or Parquet**. The notebook expects:
- A text column (e.g., `raw_text`, `ingredients_text`, etc.) that contains free text.
- Optionally, a **lexicon** JSON (list of canonical ingredient phrases) to bootstrap labels.
- Optionally, a **dedupe mapping** JSONL (phrase → canonical) generated by your W2V or cosine dedupe stage.
- Optionally, `IngredientEncoder` maps to encode tokens into IDs.


In [2]:
from pathlib import Path

# === Data ===
DATA_IS_PARQUET = False
TRAIN_PATH      = Path("../../data/raw/wilmerarltstrmberg_data.csv")
NER_COL         = "NER"   # list-like column of ingredient strings

# Optional bootstrapping lexicon (list[str])
LEXICON_JSON    = None    # Path("../../data/lexicon/ingredients.json") or None

# Dedupe map from your pipeline (JSONL lines {"from":..., "to":...})
DEDUPE_JSONL    = Path("../../data/normalized/cosine_dedupe_map.jsonl")

# IngredientEncoder maps (Stage 6 artifacts)
ING_ID2TOK_JSON = Path("../../data/encoded/ingredient_id_to_token.json")
ING_TOK2ID_JSON = Path("../../data/encoded/ingredient_token_to_id.json")

# === Outputs ===
OUT_DIR     = Path("../../models/ingredient_ner_trf")
MODEL_DIR   = OUT_DIR / "model-best"
BOOT_DIR    = OUT_DIR / "bootstrapped"
TRAIN_DIR   = BOOT_DIR / "train"
VALID_DIR   = BOOT_DIR / "valid"
PRED_OUT    = Path("../../data/training/predictions.parquet")
for p in [OUT_DIR, TRAIN_DIR, VALID_DIR]: p.mkdir(parents=True, exist_ok=True)

# === Training hyperparams ===
TRANSFORMER_MODEL = "distilbert-base-uncased"  # good default; try roberta-base / bert-base-uncased / sentence-transformers/all-MiniLM-L6-v2
N_EPOCHS          = 10
LEARNING_RATE     = 5e-5                       # typical for transformers
DROPOUT           = 0.1
BATCH_START       = 256                        # dynamic batch schedule
BATCH_END         = 4096
ACCUM_STEPS       = 2                          # gradient accumulation
VALID_FRACTION    = 0.2
EARLY_STOP_PATIENCE = 2                        # epochs without F1 improvement
EVAL_SNAPSHOT_MAX   = 5000                     # cap validation examples per epoch
RANDOM_SEED      = 42
SHARD_SIZE       = 10_000                      # DocBin shard size



## 2) Load Data
Supports Parquet (preferred for large data) or CSV. The notebook expects a single free-text column.


In [3]:
import ast, json, pandas as pd, pyarrow.parquet as pq, spacy
from spacy.tokens import Doc, DocBin
from tqdm import tqdm

def load_data(path: Path, is_parquet: bool, col: str) -> pd.DataFrame:
    if is_parquet:
        pf = pq.ParquetFile(str(path))
        frames = [pf.read_row_group(i).to_pandas() for i in range(pf.num_row_groups)]
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.read_csv(path, dtype=str)
    if col not in df.columns:
        raise KeyError(f"Column '{col}' not found.")
    return df[[col]].dropna().reset_index(drop=True)

def parse_listlike(v):
    if isinstance(v, (list, tuple)): return [str(x).strip() for x in v if str(x).strip()]
    s = str(v).strip()
    if not s: return []
    for parser in (ast.literal_eval, json.loads):
        try:
            out = parser(s)
            if isinstance(out, (list, tuple)):
                return [str(x).strip() for x in out if str(x).strip()]
        except Exception:
            pass
    return [x.strip() for x in s.split(",") if x.strip()]

def join_with_offsets(tokens, sep=", "):
    text, spans, pos = [], [], 0
    for i, tok in enumerate(tokens):
        start, end = pos, pos + len(tok)
        text.append(tok); spans.append((start, end)); pos = end
        if i < len(tokens)-1: text.append(sep); pos += len(sep)
    return "".join(text), spans

df = load_data(TRAIN_PATH, DATA_IS_PARQUET, NER_COL)
print(f"Loaded rows: {len(df):,}")

blank = spacy.blank("en")
docs_from_ner = []
for lst in tqdm(df[NER_COL].tolist(), desc="Synthesizing"):
    toks = parse_listlike(lst)
    if not toks:
        docs_from_ner.append(blank.make_doc(""))
        continue
    text, offs = join_with_offsets(toks)
    d = blank.make_doc(text)
    ents = []
    for (a,b) in offs:
        sp = d.char_span(a,b, label="INGREDIENT", alignment_mode="contract")
        if sp is not None: ents.append(sp)
    d.ents = spacy.util.filter_spans(ents)
    docs_from_ner.append(d)

print("Docs:", len(docs_from_ner), "| Total ents:", sum(len(d.ents) for d in docs_from_ner))


Loaded rows: 2,231,142


  from .autonotebook import tqdm as notebook_tqdm
Synthesizing: 100%|██████████| 2231142/2231142 [01:58<00:00, 18788.30it/s]


Docs: 2231142 | Total ents: 18419930



## 3) Train/valid split → write DocBin shards (RAM-safe)


In [4]:
from sklearn.model_selection import train_test_split
import math, os

train_docs, valid_docs = train_test_split(docs_from_ner, test_size=VALID_FRACTION, random_state=RANDOM_SEED)
print(f"train={len(train_docs):,}, valid={len(valid_docs):,}")

def write_docbins(docs, out_dir: Path, shard_size: int = SHARD_SIZE):
    out_dir.mkdir(parents=True, exist_ok=True)
    n = len(docs); shards = math.ceil(n/shard_size)
    for i in range(shards):
        db = DocBin(store_user_data=False)
        for d in docs[i*shard_size : (i+1)*shard_size]:
            db.add(d)
        db.to_disk(out_dir / f"shard_{i:04d}.spacy")
    print(f"Wrote {n} docs to {out_dir} in {shards} shard(s).")

write_docbins(train_docs, TRAIN_DIR)
write_docbins(valid_docs, VALID_DIR)


train=1,784,913, valid=446,229
Wrote 1784913 docs to ..\..\models\ingredient_ner_trf\bootstrapped\train in 179 shard(s).
Wrote 446229 docs to ..\..\models\ingredient_ner_trf\bootstrapped\valid in 45 shard(s).


## 4) Build a transformer+NER pipeline (Torch) and helpers

In [None]:
import random, spacy
from spacy.training import Example
from spacy.tokens import DocBin

random.seed(RANDOM_SEED)

def build_nlp(model_name: str = "distilbert-base-uncased") -> spacy.language.Language:
    """
    Create a lean transformer-backed NER pipeline.
    Only passes `model.name` to avoid schema drift across versions.
    Requires: pip install spacy spacy-transformers torch
    """
    # IMPORTANT: run this in a fresh kernel/session, or ensure you don't reuse a mutated `nlp`
    nlp = spacy.blank("en")
    nlp.add_pipe("transformer", config={"model": {"name": model_name}})
    ner = nlp.add_pipe("ner")
    ner.add_label("INGREDIENT")
    return nlp

def iter_examples_from_docbins(nlp, dir_path: Path, shuffle=False):
    shard_paths = sorted(p for p in dir_path.glob("*.spacy"))
    if shuffle:
        random.shuffle(shard_paths)
    for sp_path in shard_paths:
        db = DocBin().from_disk(sp_path)
        for d in db.get_docs(nlp.vocab):
            ents = [(e.start_char, e.end_char, e.label_) for e in d.ents]
            yield Example.from_dict(nlp.make_doc(d.text), {"entities": ents})

def sample_validation(nlp, dir_path: Path, cap=EVAL_SNAPSHOT_MAX):
    out, n = [], 0
    for eg in iter_examples_from_docbins(nlp, dir_path, shuffle=False):
        out.append(eg); n += 1
        if n >= cap: break
    return out



## 5) Training loop (streamed, mixed precision, accumulation, early stopping)


In [7]:
# ----- cell: train_transformer_ner.py -----
import time, random
import torch, spacy
from spacy.training import Example

# Optional: small PyTorch speed knobs
torch.backends.cudnn.benchmark = True
try:
    torch.set_float32_matmul_precision("high")  # TF32 on Ampere+
except Exception:
    pass

def _fmt(x): 
    try: return f"{float(x):.3f}"
    except: return "0.000"

def _examples_from_docs(nlp, docs, shuffle=False):
    idxs = list(range(len(docs)))
    if shuffle:
        random.shuffle(idxs)
    for i in idxs:
        d = docs[i]
        ents = [(e.start_char, e.end_char, e.label_) for e in d.ents]
        yield Example.from_dict(nlp.make_doc(d.text), {"entities": ents})

def train_trf_ner(
    train_docs, valid_docs,
    model_name="distilbert-base-uncased",
    n_epochs=10, lr=5e-5, dropout=0.1,
    start_bs=64, end_bs=1024, seed=42,
):
    # fresh pipeline
    from build_nlp import build_nlp
    spacy.prefer_gpu()
    nlp = build_nlp(model_name)

    random.seed(seed)
    try:
        import numpy as np; np.random.seed(seed)
    except Exception: pass

    # warm init
    warm = []
    for i, eg in enumerate(_examples_from_docs(nlp, train_docs, shuffle=True)):
        warm.append(eg)
        if i >= min(256, max(16, len(train_docs)//100)):  # small warm set
            break
    optimizer = nlp.initialize(lambda: warm)
    if hasattr(optimizer, "learn_rate"):
        optimizer.learn_rate = float(lr)

    # compact snapshot for eval
    valid_snapshot = []
    for i, eg in enumerate(_examples_from_docs(nlp, valid_docs, shuffle=False)):
        valid_snapshot.append(eg)
        if i >= 1500:
            break

    def comp_bs(ep):
        if n_epochs <= 1: return end_bs
        r = ep / (n_epochs - 1)
        return int(start_bs * ((end_bs / start_bs) ** r))

    best_f1 = -1.0
    for ep in range(n_epochs):
        t0 = time.time()
        losses = {}
        bs = comp_bs(ep)

        buf = []
        for eg in _examples_from_docs(nlp, train_docs, shuffle=True):
            buf.append(eg)
            if len(buf) >= bs:
                nlp.update(buf, sgd=optimizer, drop=dropout, losses=losses)
                buf.clear()
        if buf:
            nlp.update(buf, sgd=optimizer, drop=dropout, losses=losses)

        # eval
        with nlp.select_pipes(enable=["transformer","ner"]):
            scores = nlp.evaluate(valid_snapshot)
        p, r, f1 = (scores.get("ents_p") or 0.0,
                    scores.get("ents_r") or 0.0,
                    scores.get("ents_f") or 0.0)

        print(f"Epoch {ep+1:02d}/{n_epochs} | bs={bs:<4d} | "
              f"loss={losses.get('ner',0):.2f} | P/R/F1={_fmt(p)}/{_fmt(r)}/{_fmt(f1)} | "
              f"{time.time()-t0:.1f}s | torch.cuda={torch.cuda.is_available()}")

        if f1 > best_f1 + 1e-6:
            best_f1 = f1
            MODEL_DIR.mkdir(parents=True, exist_ok=True)
            nlp.to_disk(MODEL_DIR)

    print(f"Best dev F1: {best_f1:.3f}")
    return spacy.load(MODEL_DIR)


nlp_trained = train_trf_ner()


TypeError: train_trf_ner() missing 2 required positional arguments: 'train_docs' and 'valid_docs'

## 6) Load dedupe map + encoder maps

In [None]:
import json
from typing import Dict, Union

def load_jsonl_map(path: Union[str, Path]) -> Dict[str, str]:
    m = {}
    p = Path(path)
    if not p.exists(): return m
    with open(p, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                obj = json.loads(line)
                m[str(obj["from"]).strip()] = str(obj["to"]).strip()
    return m

def load_encoder_maps(id2tok_path: Path, tok2id_path: Path):
    if not id2tok_path.exists() or not tok2id_path.exists():
        return None, None
    with open(id2tok_path, "r", encoding="utf-8") as f:
        id2tok_raw = json.load(f)
    with open(tok2id_path, "r", encoding="utf-8") as f:
        tok2id_raw = json.load(f)
    id2tok = {int(k): str(v) for k, v in id2tok_raw.items()}
    tok2id = {str(k): int(v) for k, v in tok2id_raw.items()}
    return id2tok, tok2id

dedupe_map = load_jsonl_map(DEDUPE_JSONL)
ing_id2tok, ing_tok2id = load_encoder_maps(ING_ID2TOK_JSON, ING_TOK2ID_JSON)
print(f"Dedupe map entries: {len(dedupe_map):,} | Encoder maps present: {ing_id2tok is not None}")


## 7) Inference → normalize → encode

In [None]:
import pandas as pd
from tqdm import tqdm

def normalize_token(s: str) -> str:
    return " ".join(str(s).strip().lower().split())

def apply_dedupe(tok: str, mapping: Dict[str, str]) -> str:
    return mapping.get(tok, tok) if mapping else tok

def predict_normalize_encode(
    nlp_dir: Path,
    data_path: Path,
    is_parquet: bool,
    text_col: str,
    dedupe: Dict[str, str],
    tok2id: Dict[str, int] = None,
    out_path: Path = None,
    batch_size: int = 256
) -> pd.DataFrame:
    import spacy, pyarrow as pa, pyarrow.parquet as pq
    nlp = spacy.load(nlp_dir)
    df_in = load_data(data_path, is_parquet, text_col)
    texts = df_in[text_col].astype(str).tolist()

    preds = []
    for doc in tqdm(nlp.pipe(texts, batch_size=batch_size, n_process=1, disable=[]), total=len(texts), desc="Infer"):
        seen, out = set(), []
        for ent in doc.ents:
            if ent.label_ != "INGREDIENT": continue
            t = apply_dedupe(normalize_token(ent.text), dedupe)
            if t and t not in seen:
                seen.add(t); out.append(t)
        preds.append(out)

    df_out = df_in.copy()
    df_out["NER_clean"] = preds

    if tok2id is not None:
        df_out["Ingredients"] = df_out["NER_clean"].apply(lambda lst: [tok2id.get(t, 0) for t in (lst or [])])

    if out_path is not None:
        try:
            pq.write_table(pa.Table.from_pandas(df_out, preserve_index=False).replace_schema_metadata(None), out_path)
            print(f"Wrote → {out_path}")
        except Exception as e:
            print("[WARN] Parquet write failed; falling back to CSV:", e)
            df_out.to_csv(out_path.with_suffix(".csv"), index=False)
    return df_out

# Example sanity-check on training source itself:
# _ = predict_normalize_encode(MODEL_DIR, TRAIN_PATH, DATA_IS_PARQUET, NER_COL, dedupe_map, ing_tok2id, out_path=PRED_OUT)
