# Ingredient NER — Training, Normalization & Encoding

This notebook trains a spaCy Named Entity Recognition (NER) model to identify **ingredients** in free text. 
It also supports **lexicon-driven bootstrapping** (optional) to auto-label raw text, and includes **normalization** + **encoding** to integer IDs using your pipeline artifacts.


In [1]:

# If needed, install dependencies (uncomment and run in your environment)
# !pip install -U spacy pyarrow pandas scikit-learn tqdm
# Optional for transformer-based NER:
# !pip install -U spacy-transformers torch
# Download a small English model if you don't have one:
# !python -m spacy download en_core_web_sm


## 0) Setup & optional installs

- Keep the `pip` cells commented out in a managed environment (e.g., when dependencies are preinstalled).
- If you’re missing `spacy-transformers` or `torch`, uncomment and run the `pip` cells.
- CUDA is optional; if available, we’ll use it automatically.


In [2]:

# %% Optional installs (uncomment as needed)
# %pip install -U spacy spacy-transformers torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# %pip install -U pandas pyarrow scikit-learn tqdm

import os, sys, json, ast, math, random, warnings
from dataclasses import dataclass, asdict
from pathlib import Path

import pandas as pd

try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    _HAS_PA = True
except Exception:
    _HAS_PA = False

import numpy as np
import spacy
from spacy.tokens import Doc, DocBin
from spacy.training import Example
from tqdm import tqdm

# Torch is optional (only for transformers / GPU); we gracefully degrade otherwise.
try:
    import torch
except Exception as _e:
    torch = None
    warnings.warn("Torch not available. Training will fall back to CPU tok2vec if transformers cannot be used.")

def set_global_seed(seed: int = 42):
    """Seed Python's `random`, NumPy and (when importable) PyTorch RNGs.

    Args:
        seed: Integer seed applied to every available generator.

    Torch seeding is best-effort: a failure there is reported as a warning
    rather than raised, so CPU-only runs are never blocked by it.
    """
    random.seed(seed)
    np.random.seed(seed)
    if torch is None:
        return
    try:
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
    except Exception as exc:
        # Previously swallowed silently, which could hide a non-reproducible run.
        warnings.warn(f"Could not seed torch RNGs: {exc}")

# Prefer GPU for spaCy-backed Torch components if present; otherwise pin spaCy to CPU.
# spaCy's top-level API offers `prefer_gpu`, `require_gpu` and `require_cpu`;
# there is no `spacy.prefer_cpu`, so the CPU branches call `require_cpu()`.
if torch is not None:
    if torch.cuda.is_available():
        # Only sets the variable when the user has not chosen a device already.
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
        spacy.prefer_gpu()
        if hasattr(torch.backends, "cudnn"):
            torch.backends.cudnn.benchmark = True
        print("✅ CUDA available:", torch.version.cuda if hasattr(torch, "version") else "unknown")
    else:
        spacy.require_cpu()
        print("ℹ️ CUDA not available. Using CPU.")
else:
    spacy.require_cpu()
    print("ℹ️ Torch not available. Using CPU.")




✅ CUDA available: 12.1



## 1) Configuration

Adjust the paths below to point to your training corpus and pipeline artifacts.  
You can train from **CSV or Parquet**. The notebook expects:
- A text column (e.g., `raw_text`, `ingredients_text`, etc.) that contains free text.
- Optionally, a **lexicon** JSON (list of canonical ingredient phrases) to bootstrap labels.
- Optionally, a **dedupe mapping** JSONL (phrase → canonical) generated by your W2V or cosine dedupe stage.
- Optionally, `IngredientEncoder` maps to encode tokens into IDs.


In [None]:

@dataclass
class DataCfg:
    """Locations and column names of the training data and pipeline artifacts.

    The doc-building cell uses `NER_LIST_COL` when that column is present in
    `TRAIN_PATH`; otherwise it falls back to `TEXT_COL` + `LEXICON_JSON`.
    """
    # Training data
    # True -> load_data reads TRAIN_PATH with pyarrow; False -> pandas.read_csv.
    DATA_IS_PARQUET: bool = False
    TRAIN_PATH: Path = Path("../../data/raw/wilmerarltstrmberg_data.csv")

    # If you already have a list-like NER column (e.g., ["salt", "sugar", ...]), set this:
    NER_LIST_COL: str = "NER"      # column containing list-like ingredient strings

    # If training from raw text + lexicon (bootstrapping), set TEXT_COL and LEXICON_JSON:
    TEXT_COL: str | None = None    # e.g., "ingredients_text" (free text). Leave None if not using.
    LEXICON_JSON: Path | None = None  # e.g., Path("../../data/lexicon/ingredients.json")

    # Optional dedupe map (JSONL lines: {"from": "...", "to": "..."})
    # Loaded by load_jsonl_map; a missing file simply yields an empty mapping.
    DEDUPE_JSONL: Path | None = Path("../../data/normalized/cosine_dedupe_map.jsonl")

    # IngredientEncoder maps (Stage 6 artifacts)
    # Both files must exist for load_encoder_maps to return anything but (None, None).
    ING_ID2TOK_JSON: Path | None = Path("../../data/encoded/ingredient_id_to_token.json")
    ING_TOK2ID_JSON: Path | None = Path("../../data/encoded/ingredient_token_to_id.json")

@dataclass
class TrainCfg:
    """Hyper-parameters shared by the split, sharding and training cells."""
    RANDOM_SEED: int = 42              # passed to set_global_seed and train_test_split
    VALID_FRACTION: float = 0.1        # share of docs held out for validation
    SHARD_SIZE: int = 2000             # default number of docs per DocBin shard (write_docbins)
    EVAL_SNAPSHOT_MAX: int = 1500      # cap on validation examples used for the per-epoch eval
    BATCH_SIZE: int = 256              # minibatch size for the in-memory trainer
    # Transformer options (fallback to tok2vec if not available)
    TRANSFORMER_MODEL: str = "distilbert-base-uncased"
    WINDOW: int = 64                   # `window` of the strided span getter
    STRIDE: int = 48                   # `stride` of the strided span getter
    LR: float = 5e-5                   # assigned to optimizer.learn_rate in train_ner
    DROPOUT: float = 0.1               # `drop` passed to nlp.update
    N_EPOCHS: int = 10
    FREEZE_LAYERS: int = 2             # number of lower transformer blocks build_nlp_transformer tries to freeze
    USE_AMP: bool = True               # toggles mixed_precision and the grad scaler in the transformer config
    CLEAR_CACHE_EVERY: int = 200       # updates between torch.cuda.empty_cache() calls
    EARLY_STOPPING_PATIENCE: int = 3   # epochs without F1 improvement

@dataclass
class OutCfg:
    """Output locations for the model, the DocBin shards and prediction files."""
    OUT_DIR: Path = Path("../../models/ingredient_ner_trf")                     # root folder, created by the config cell
    MODEL_DIR: Path = Path("../../models/ingredient_ner_trf/model-best")        # where the trained pipeline is saved / loaded from
    BOOT_DIR: Path  = Path("../../models/ingredient_ner_trf/bootstrapped")      # parent of the two shard folders below
    TRAIN_DIR: Path = Path("../../models/ingredient_ner_trf/bootstrapped/train")  # training DocBin shards
    VALID_DIR: Path = Path("../../models/ingredient_ner_trf/bootstrapped/valid")  # validation DocBin shards
    PRED_OUT: Path  = Path("../../data/training/predictions.parquet")           # base path for prediction outputs

# Instantiate the three config objects every later cell reads from.
DATA, TRAIN, OUT = DataCfg(), TrainCfg(), OutCfg()

# Create output directories
for p in (OUT.OUT_DIR, OUT.TRAIN_DIR, OUT.VALID_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Echo the effective configuration so the run is self-describing.
for _label, _cfg in (("Data config:", DATA), ("Train config:", TRAIN), ("Out config:", OUT)):
    print(_label, asdict(_cfg))

# Seed every RNG before any shuffling or model initialisation happens.
set_global_seed(TRAIN.RANDOM_SEED)


Data config: {'DATA_IS_PARQUET': False, 'TRAIN_PATH': WindowsPath('../../data/raw/wilmerarltstrmberg_data.csv'), 'NER_LIST_COL': 'NER', 'TEXT_COL': None, 'LEXICON_JSON': None, 'DEDUPE_JSONL': WindowsPath('../../data/normalized/cosine_dedupe_map.jsonl'), 'ING_ID2TOK_JSON': WindowsPath('../../data/encoded/ingredient_id_to_token.json'), 'ING_TOK2ID_JSON': WindowsPath('../../data/encoded/ingredient_token_to_id.json')}
Train config: {'RANDOM_SEED': 42, 'VALID_FRACTION': 0.1, 'SHARD_SIZE': 2000, 'EVAL_SNAPSHOT_MAX': 1500, 'TRANSFORMER_MODEL': 'distilbert-base-uncased', 'WINDOW': 64, 'STRIDE': 48, 'LR': 5e-05, 'DROPOUT': 0.1, 'N_EPOCHS': 10, 'FREEZE_LAYERS': 2, 'USE_AMP': True, 'CLEAR_CACHE_EVERY': 200, 'EARLY_STOPPING_PATIENCE': 3}
Out config: {'OUT_DIR': WindowsPath('../../models/ingredient_ner_trf'), 'MODEL_DIR': WindowsPath('../../models/ingredient_ner_trf/model-best'), 'BOOT_DIR': WindowsPath('../../models/ingredient_ner_trf/bootstrapped'), 'TRAIN_DIR': WindowsPath('../../models/ingredie


## 2) Utilities

Helpers for data loading, list-like parsing, and string normalization.


In [4]:

def load_data(path: Path, is_parquet: bool, col: str) -> pd.DataFrame:
    """Load a single column from a CSV or Parquet file as a clean DataFrame.

    The file's column names are checked first and only `col` is read, so wide
    files do not have to be loaded in full.

    Args:
        path: Location of the CSV or Parquet file.
        is_parquet: True to read with pyarrow, False to read as CSV (all values as `str`).
        col: Name of the column to return.

    Returns:
        A one-column DataFrame with null rows dropped and a fresh 0..n-1 index.

    Raises:
        RuntimeError: Parquet was requested but pyarrow is not importable.
        KeyError: `col` is not a column of the file.
    """
    if is_parquet:
        if not _HAS_PA:
            raise RuntimeError("pyarrow is required to read Parquet files. Please install pyarrow.")
        pf = pq.ParquetFile(str(path))
        available = list(pf.schema_arrow.names)
        if col not in available:
            raise KeyError(f"Column '{col}' not found in {available[:20]}...")
        # A single `read` also copes with files that contain zero row groups;
        # concatenating per-row-group frames raised on an empty list.
        df = pf.read(columns=[col]).to_pandas()
    else:
        available = list(pd.read_csv(path, nrows=0).columns)
        if col not in available:
            raise KeyError(f"Column '{col}' not found in {available[:20]}...")
        df = pd.read_csv(path, dtype=str, usecols=[col])
    return df[[col]].dropna().reset_index(drop=True)

def parse_listlike(v):
    """Coerce a list-like cell value into a list of non-empty, stripped strings.

    Lists and tuples are cleaned directly. Any other value is stringified and
    parsed first as a Python literal, then as JSON; if neither yields a
    list/tuple, the text is split on commas.
    """
    def _clean(items):
        stripped = (str(item).strip() for item in items)
        return [piece for piece in stripped if piece]

    if isinstance(v, (list, tuple)):
        return _clean(v)

    text = str(v).strip()
    if text == "":
        return []

    for parse in (ast.literal_eval, json.loads):
        try:
            parsed = parse(text)
            if isinstance(parsed, (list, tuple)):
                return _clean(parsed)
        except Exception:
            # Not parseable by this parser; try the next one, then the comma split.
            continue
    return _clean(text.split(","))

def join_with_offsets(tokens, sep: str = ", "):
    """Join `tokens` with `sep` and report each token's character span.

    Returns:
        (text, spans) where `spans[i]` is the half-open `(start, end)` range of
        `tokens[i]` inside `text`.
    """
    spans = []
    cursor = 0
    for tok in tokens:
        stop = cursor + len(tok)
        spans.append((cursor, stop))
        # The next token starts after this one plus one separator.
        cursor = stop + len(sep)
    return sep.join(tokens), spans

def normalize_token(s: str) -> str:
    """Lower-case `s` and collapse every whitespace run to a single space."""
    lowered = str(s).strip().lower()
    words = lowered.split()
    return " ".join(words)



## 3) Create training docs

Two options:

1. **From list-like NER column** (`DATA.NER_LIST_COL`): Each row is a list of ingredient strings.  
   We synthesize a text like `'salt, sugar, ...'` and label each ingredient span as `INGREDIENT`.
2. **From raw text + lexicon** (`DATA.TEXT_COL` & `DATA.LEXICON_JSON`): Load a lexicon and label spans using `EntityRuler`.


In [5]:

def docs_from_list_column(df: pd.DataFrame, col: str) -> list[Doc]:
    """Synthesize one labelled Doc per row from a list-like ingredient column.

    Each row is parsed with `parse_listlike`, joined into comma-separated text
    and every ingredient's character range is labelled `INGREDIENT`. Rows that
    parse to nothing still produce an (empty) Doc, so the output has one Doc
    per input row.
    """
    tokenizer_nlp = spacy.blank("en")
    docs = []
    for cell in tqdm(df[col].tolist(), desc="Synthesizing from list column"):
        ingredients = parse_listlike(cell)
        if ingredients:
            text, offsets = join_with_offsets(ingredients)
            doc = tokenizer_nlp.make_doc(text)
            candidates = (
                doc.char_span(begin, finish, label="INGREDIENT", alignment_mode="contract")
                for begin, finish in offsets
            )
            # char_span returns None when a range cannot be aligned to tokens.
            doc.ents = spacy.util.filter_spans([span for span in candidates if span is not None])
        else:
            doc = tokenizer_nlp.make_doc("")
        docs.append(doc)
    return docs

def load_lexicon(path: Path | None) -> list[str]:
    if path is None:
        return []
    p = Path(path)
    if not p.exists(): 
        warnings.warn(f"Lexicon not found at {p}. Skipping.")
        return []
    with open(p, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Expect either {"terms": [...]} or a simple list [...]
    if isinstance(data, dict) and "terms" in data:
        terms = data["terms"]
    else:
        terms = data
    # normalize
    terms = [normalize_token(t) for t in terms if str(t).strip()]
    terms = sorted(set(terms))
    print(f"Loaded {len(terms):,} lexicon terms.")
    return terms

def build_entity_ruler(
    nlp: spacy.language.Language,
    phrases: list[str],
    phrase_matcher_attr: str | None = "LOWER",
):
    """Add an `entity_ruler` to `nlp` that labels every phrase as `INGREDIENT`.

    Args:
        nlp: Pipeline that receives the ruler.
        phrases: Phrase patterns, one per lexicon term.
        phrase_matcher_attr: Token attribute the phrase patterns are matched on.
            Defaults to `"LOWER"` because `load_lexicon` lower-cases its terms
            while the raw text keeps its casing; exact-text matching would miss
            mentions such as "Salt". Pass `None` for exact-text matching.

    Returns:
        The ruler component that was added.
    """
    ruler_config = {"phrase_matcher_attr": phrase_matcher_attr} if phrase_matcher_attr else {}
    ruler = nlp.add_pipe("entity_ruler", config=ruler_config)
    ruler.add_patterns([{"label": "INGREDIENT", "pattern": term} for term in phrases])
    return ruler

def docs_from_text_plus_lexicon(df: pd.DataFrame, text_col: str, lexicon_terms: list[str]) -> list[Doc]:
    """Bootstrap labelled Docs by running a lexicon EntityRuler over raw text.

    Raises:
        ValueError: `lexicon_terms` is empty.
    """
    nlp = spacy.blank("en")
    if not lexicon_terms:
        raise ValueError("No lexicon terms provided; cannot bootstrap from raw text.")
    build_entity_ruler(nlp, lexicon_terms)

    def _label(text: str) -> Doc:
        # Tokenize, then run the pipeline (the ruler) on the resulting Doc.
        doc = nlp(nlp.make_doc(text))
        # Keep only INGREDIENT spans and resolve any overlaps.
        doc.ents = spacy.util.filter_spans([ent for ent in doc.ents if ent.label_ == "INGREDIENT"])
        return doc

    raw_texts = df[text_col].astype(str).tolist()
    return [_label(text) for text in tqdm(raw_texts, desc="Bootstrapping with EntityRuler")]

# ---- Build docs according to available columns ----
def _peek_columns(path: Path, is_parquet: bool) -> list[str]:
    """Return the column names of the training file without loading its rows."""
    if is_parquet:
        if not _HAS_PA:
            raise RuntimeError("pyarrow is required to read Parquet files. Please install pyarrow.")
        return list(pq.ParquetFile(str(path)).schema_arrow.names)
    return list(pd.read_csv(path, nrows=0).columns)

# The probe must honour DATA_IS_PARQUET; reading a Parquet file with
# `pd.read_csv` would fail before either source mode could be chosen.
_train_columns = _peek_columns(DATA.TRAIN_PATH, DATA.DATA_IS_PARQUET)

docs_all = []
if DATA.NER_LIST_COL and DATA.NER_LIST_COL in _train_columns:
    # Preferred mode: the file already carries list-like ingredient labels.
    df_list = load_data(DATA.TRAIN_PATH, DATA.DATA_IS_PARQUET, DATA.NER_LIST_COL)
    docs_all = docs_from_list_column(df_list, DATA.NER_LIST_COL)
    source_mode = "list-column"
elif DATA.TEXT_COL and DATA.LEXICON_JSON:
    # use bootstrapping from raw text + lexicon
    df_text = load_data(DATA.TRAIN_PATH, DATA.DATA_IS_PARQUET, DATA.TEXT_COL)
    lex = load_lexicon(DATA.LEXICON_JSON)
    docs_all = docs_from_text_plus_lexicon(df_text, DATA.TEXT_COL, lex)
    source_mode = "text+lexicon"
else:
    raise RuntimeError(
        "No valid data source inferred. Set DATA.NER_LIST_COL (list-like labels) "
        "or DATA.TEXT_COL + DATA.LEXICON_JSON (bootstrapping)."
    )

print(f"Docs prepared: {len(docs_all):,} | Source mode: {source_mode}")
print("Total labeled entities:", sum(len(d.ents) for d in docs_all))


  from .autonotebook import tqdm as notebook_tqdm
Synthesizing from list column: 100%|██████████| 2231142/2231142 [02:21<00:00, 15723.06it/s]


Docs prepared: 2,231,142 | Source mode: list-column
Total labeled entities: 18419930



## 4) Train/validation split and DocBin sharding

We write train/valid **DocBin** shards to disk to keep memory usage low and make resumes easy.


In [6]:

from sklearn.model_selection import train_test_split

# Hold out VALID_FRACTION of the docs for validation; the fixed random_state
# keeps the split identical across re-runs.
train_docs, valid_docs = train_test_split(
    docs_all, test_size=TRAIN.VALID_FRACTION, random_state=TRAIN.RANDOM_SEED
)
print(f"train={len(train_docs):,} | valid={len(valid_docs):,}")

def write_docbins(docs: list[Doc], out_dir: Path, shard_size: int = TRAIN.SHARD_SIZE):
    """Serialize `docs` into numbered DocBin shards inside `out_dir`.

    Args:
        docs: Docs to write, in order.
        out_dir: Target folder (created if missing).
        shard_size: Maximum docs per shard; values below 1 are treated as 1.

    Existing `shard_*.spacy` files in `out_dir` are removed first. The trainer
    reads every `*.spacy` file in the folder, so leftovers from an earlier run
    (different split or shard size) would otherwise be mixed into the data.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    for stale in out_dir.glob("shard_*.spacy"):
        stale.unlink()

    # Use the same clamped size for the shard count and for the slices.
    size = max(1, int(shard_size))
    n = len(docs)
    shards = math.ceil(n / size)
    for i in range(shards):
        db = DocBin(store_user_data=False)
        for d in docs[i * size:(i + 1) * size]:
            db.add(d)
        db.to_disk(out_dir / f"shard_{i:04d}.spacy")
    print(f"Wrote {n} docs to {out_dir} in {shards} shard(s).")

# Persist both splits as DocBin shards under the configured train/valid folders.
write_docbins(train_docs, OUT.TRAIN_DIR)
write_docbins(valid_docs, OUT.VALID_DIR)


train=2,008,027 | valid=223,115
Wrote 2008027 docs to ..\..\models\ingredient_ner_trf\bootstrapped\train in 1005 shard(s).
Wrote 223115 docs to ..\..\models\ingredient_ner_trf\bootstrapped\valid in 112 shard(s).



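## 5a) Train spaCy NER (simple in-memory baseline)
This cell trains a small, blank spaCy NER model directly from the in-memory `train_docs` / `valid_docs` prepared above (list-column or lexicon-bootstrapped annotations). For stronger performance, consider installing
`spacy-transformers` and swapping to a transformer backbone.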


In [None]:

from spacy.language import Language
import random
from spacy.training import Example

def train_spacy_ner(train_docs, valid_docs, n_epochs=10, lr=0.001, dropout=0.2, batch_size=128) -> Language:
    """Train a blank English NER pipeline in memory and return it.

    Args:
        train_docs: Docs whose `ents` are the gold training annotations.
        valid_docs: Docs used for the per-epoch P/R/F1 report.
        n_epochs: Number of passes over the training examples.
        lr: Learning rate assigned to the optimizer. It was previously accepted
            but never applied, so the optimizer's default was always used.
        dropout: Dropout rate passed to `nlp.update`.
        batch_size: Examples per update; values below 1 are treated as 1.

    Returns:
        The trained pipeline after the final epoch.
    """
    # Initialize a blank English pipeline and add NER
    nlp = spacy.blank("en")
    ner = nlp.add_pipe("ner")
    ner.add_label("INGREDIENT")

    def _to_examples(docs):
        """Rebuild each Doc as an Example that carries only its entity spans."""
        examples = []
        for d in docs:
            ents = [(ent.start_char, ent.end_char, ent.label_) for ent in d.ents]
            examples.append(Example.from_dict(nlp.make_doc(d.text), {"entities": ents}))
        return examples

    train_examples = _to_examples(train_docs)
    valid_examples = _to_examples(valid_docs)

    optimizer = nlp.initialize(lambda: train_examples)
    if hasattr(optimizer, "learn_rate"):
        optimizer.learn_rate = float(lr)
    print("Initialized", nlp.pipe_names)

    step = max(1, int(batch_size))
    for epoch in range(n_epochs):
        random.shuffle(train_examples)
        losses = {}
        # Minibatch training
        for i in range(0, len(train_examples), step):
            batch = train_examples[i:i + step]
            nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
        # Simple evaluation; "ner" is the only pipe, so nothing needs disabling.
        scores = nlp.evaluate(valid_examples)
        print(f"Epoch {epoch+1:02d}/{n_epochs} - Losses: {losses} - P/R/F1: "
              f"{scores['ents_p']:.3f}/{scores['ents_r']:.3f}/{scores['ents_f']:.3f}")
    return nlp

# Train the in-memory baseline with the shared hyper-parameters.
# NOTE(review): TRAIN.LR sits next to the transformer options in TrainCfg;
# confirm it is also the learning rate you want for this blank NER model.
nlp_trained = train_spacy_ner(
    train_docs=train_docs,
    valid_docs=valid_docs,
    n_epochs=TRAIN.N_EPOCHS,
    lr=TRAIN.LR,
    dropout=TRAIN.DROPOUT,
    # Use the minibatch size, not SHARD_SIZE (docs per DocBin shard on disk).
    batch_size=TRAIN.BATCH_SIZE
)

# Persist model
OUT.MODEL_DIR.mkdir(parents=True, exist_ok=True)
nlp_trained.to_disk(OUT.MODEL_DIR)
print(f"Saved model → {OUT.MODEL_DIR}")

AttributeError: 'TrainCfg' object has no attribute 'LEARNING_RATE'


## 5b) Build NER pipeline (transformer or tok2vec fallback)

We prefer **transformers** for better transfer learning. If `torch` or `spacy-transformers` is missing, or the transformer pipeline cannot be built, we fall back to a **tok2vec** model for CPU training.


In [None]:

def build_nlp_transformer() -> spacy.language.Language:
    """Build a small-window transformer + NER with optional layer freezing.

    Reads TRAIN.TRANSFORMER_MODEL, WINDOW, STRIDE, USE_AMP and FREEZE_LAYERS.

    Returns:
        A blank English pipeline with a `transformer` pipe followed by `ner`.

    Raises:
        RuntimeError: `spacy_transformers` cannot be imported.
    """
    try:
        import spacy_transformers  # noqa: F401
    except Exception as e:
        raise RuntimeError("spacy-transformers is not available.") from e

    nlp = spacy.blank("en")
    trf_cfg = {
        "model": {
            "@architectures": "spacy-transformers.TransformerModel.v3",
            "name": TRAIN.TRANSFORMER_MODEL,
            "tokenizer_config": {"use_fast": True},
            "transformer_config": {},
            "mixed_precision": bool(TRAIN.USE_AMP),
            "grad_scaler_config": {"enabled": bool(TRAIN.USE_AMP)},
            # Long texts are processed as overlapping windows of WINDOW tokens.
            "get_spans": {
                "@span_getters": "spacy-transformers.strided_spans.v1",
                "window": int(TRAIN.WINDOW),
                "stride": int(TRAIN.STRIDE),
            },
        },
        "set_extra_annotations": {
            "@annotation_setters": "spacy-transformers.null_annotation_setter.v1"
        },
        "max_batch_items": 4096,
    }
    nlp.add_pipe("transformer", config=trf_cfg)
    # NOTE(review): `ner` is added with its default config. Confirm it is really
    # wired to the upstream `transformer` (e.g. through a TransformerListener);
    # otherwise the transformer output would not be used by NER.
    ner = nlp.add_pipe("ner")
    ner.add_label("INGREDIENT")

    # Optional layer freezing
    # NOTE(review): this runs before `nlp.initialize()`. If the Hugging Face
    # weights are only loaded during initialization, the lookups below raise
    # and the freeze is skipped with a warning -- confirm it takes effect.
    if TRAIN.FREEZE_LAYERS > 0:
        try:
            trf = nlp.get_pipe("transformer").model
            hf = trf.transformer.model
            blocks = None
            if hasattr(hf, "transformer") and hasattr(hf.transformer, "layer"):  # distilbert
                blocks = hf.transformer.layer
            elif hasattr(hf, "encoder") and hasattr(hf.encoder, "layer"):        # bert/roberta
                blocks = hf.encoder.layer
            if blocks is not None:
                # Never try to freeze more blocks than the model has.
                k = min(TRAIN.FREEZE_LAYERS, len(blocks))
                for i in range(k):
                    for p in blocks[i].parameters():
                        p.requires_grad = False
                print(f"[transformer] Froze {k} lower layer(s).")
        except Exception as e:
            # Best-effort: a failed freeze must not abort pipeline construction.
            warnings.warn(f"Could not freeze layers: {e}")
    return nlp

def build_nlp_tok2vec() -> spacy.language.Language:
    """CPU-friendly tok2vec + NER fallback.

    Returns:
        A blank English pipeline with a `tok2vec` pipe followed by `ner`.
    """
    nlp = spacy.blank("en")
    nlp.add_pipe("tok2vec")
    # NOTE(review): `ner` is added with its default config. Confirm it consumes
    # the shared `tok2vec` pipe rather than its own embedded embedding layer.
    ner = nlp.add_pipe("ner")
    ner.add_label("INGREDIENT")
    print("Using tok2vec fallback (no transformers).")
    return nlp

def choose_nlp():
    """Return `(nlp, mode)`, preferring the transformer pipeline when possible.

    `mode` is `"transformer"` when torch and spacy-transformers are importable
    and the transformer pipeline builds; otherwise `"tok2vec"`. A failed
    transformer build is reported as a warning before falling back.
    """
    # No torch -> tok2vec.
    if torch is None:
        return build_nlp_tok2vec(), "tok2vec"

    # No spacy-transformers -> tok2vec, without a warning.
    try:
        import spacy_transformers  # noqa
    except Exception:
        return build_nlp_tok2vec(), "tok2vec"

    try:
        nlp = build_nlp_transformer()
    except Exception as e:
        warnings.warn(f"Falling back to tok2vec due to: {e}")
        return build_nlp_tok2vec(), "tok2vec"
    return nlp, "transformer"



## 6) Training loop with early stopping

We stream `Example`s from DocBin shards, update in micro-batches, evaluate on a snapshot of validation examples, and save **model-best** when F1 improves.


In [None]:

def iter_examples_from_docbins(nlp, dir_path: Path, shuffle: bool = False):
    """Yield one `Example` per Doc stored in the `*.spacy` shards of `dir_path`.

    Shards are visited in sorted order, or in random order when `shuffle` is
    True (docs inside a shard keep their stored order). Only one shard is
    loaded at a time.
    """
    shard_paths = sorted(dir_path.glob("*.spacy"))
    if shuffle:
        random.shuffle(shard_paths)
    for shard in shard_paths:
        stored_docs = DocBin().from_disk(shard).get_docs(nlp.vocab)
        for doc in stored_docs:
            gold = {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}
            # The predicted side is re-tokenized from raw text by the target pipeline.
            yield Example.from_dict(nlp.make_doc(doc.text), gold)

def sample_validation(nlp, dir_path: Path, cap: int = 1500):
    """Return at most `cap` validation examples, taken from the first shards.

    Args:
        nlp: Pipeline used to build the examples.
        dir_path: Folder containing the validation `*.spacy` shards.
        cap: Maximum number of examples; zero or negative yields an empty list
            (the previous loop always returned at least one example).
    """
    from itertools import islice

    limit = max(0, int(cap))
    return list(islice(iter_examples_from_docbins(nlp, dir_path, shuffle=False), limit))

def compounding_batch(epoch: int, total_epochs: int, start: int = 8, end: int = 16) -> int:
    """Geometrically interpolate a batch size from `start` to `end` over the epochs.

    Epoch 0 gives `start` and the last epoch gives `end`; with a single epoch
    (or fewer) `end` is returned directly. The result is never below 1.
    """
    if total_epochs <= 1:
        return end
    progress = epoch / (total_epochs - 1)
    growth = (end / start) ** progress
    size = int(round(start * growth))
    return size if size > 1 else 1

def train_ner(train_dir: Path, valid_dir: Path):
    """Train the NER pipeline from DocBin shards with early stopping.

    Examples are streamed shard by shard, updated in micro-batches whose size
    grows over the epochs, and scored once per epoch on a fixed validation
    snapshot. The pipeline is written to `OUT.MODEL_DIR` whenever F1 improves.

    Args:
        train_dir: Folder with the training `*.spacy` shards.
        valid_dir: Folder with the validation `*.spacy` shards.

    Returns:
        The pipeline as it stands after the last epoch that ran. This is not
        necessarily the best one; the best-scoring pipeline is the copy saved
        in `OUT.MODEL_DIR`.

    Raises:
        RuntimeError: `train_dir` yields no training examples.
    """
    nlp, mode = choose_nlp()
    print("Pipeline mode:", mode)

    # Warm init on a small sample. The previous bound `min(256, max(16, 100))`
    # always evaluated to 100, so it is spelled out here.
    warm_last_index = 100
    warm = []
    for i, eg in enumerate(iter_examples_from_docbins(nlp, train_dir, shuffle=True)):
        warm.append(eg)
        if i >= warm_last_index:
            break
    if not warm:
        raise RuntimeError(f"No training examples found under {train_dir}; write the DocBin shards first.")
    optimizer = nlp.initialize(lambda: warm)
    if hasattr(optimizer, "learn_rate"):
        optimizer.learn_rate = float(TRAIN.LR)

    # Validation snapshot for fast eval
    valid_snapshot = sample_validation(nlp, valid_dir, cap=TRAIN.EVAL_SNAPSHOT_MAX)

    best_f1 = -1.0
    bad_epochs = 0

    for epoch in range(TRAIN.N_EPOCHS):
        losses = {}
        micro_bs = compounding_batch(epoch, TRAIN.N_EPOCHS, start=8, end=16)
        buf = []
        updates = 0

        for eg in iter_examples_from_docbins(nlp, train_dir, shuffle=True):
            buf.append(eg)
            if len(buf) < micro_bs:
                continue
            nlp.update(buf, sgd=optimizer, drop=TRAIN.DROPOUT, losses=losses)
            buf.clear()
            updates += 1
            if (torch is not None) and torch.cuda.is_available() and updates % TRAIN.CLEAR_CACHE_EVERY == 0:
                torch.cuda.empty_cache()
        if buf:
            # Flush the final, partially filled micro-batch.
            nlp.update(buf, sgd=optimizer, drop=TRAIN.DROPOUT, losses=losses)
            buf.clear()

        # Quick eval with every pipe enabled, so the score reflects the pipeline
        # exactly as it is saved and later run after `spacy.load`. Disabling the
        # upstream embedding pipe would starve any NER that depends on it.
        scores = nlp.evaluate(valid_snapshot)
        p = float(scores.get("ents_p") or 0.0)
        r = float(scores.get("ents_r") or 0.0)
        f1 = float(scores.get("ents_f") or 0.0)

        print(f"Epoch {epoch+1:02d}/{TRAIN.N_EPOCHS} | μbs={micro_bs:<3d} | loss={losses.get('ner', 0):.1f} | P/R/F1={p:.3f}/{r:.3f}/{f1:.3f}")

        # Early stopping & best model save
        improved = f1 > best_f1 + 1e-6
        if improved:
            best_f1 = f1
            bad_epochs = 0
            OUT.MODEL_DIR.mkdir(parents=True, exist_ok=True)
            nlp.to_disk(OUT.MODEL_DIR)
            print(f"  ↳ Saved model-best → {OUT.MODEL_DIR} (F1={f1:.3f})")
        else:
            bad_epochs += 1
            if bad_epochs >= TRAIN.EARLY_STOPPING_PATIENCE:
                print(f"Early stopping after {bad_epochs} non-improving epoch(s).")
                break

    print("Best F1 observed:", best_f1 if best_f1 >= 0 else 0.0)
    return nlp




In [None]:
# Uncomment to train now (may take time depending on data and model size)
# nlp_trained = train_ner(OUT.TRAIN_DIR, OUT.VALID_DIR)
# print("Model saved to:", OUT.MODEL_DIR)


## 7) Normalization, dedupe & encoding maps

Utilities to (a) load the dedupe JSONL mapping, (b) load `IngredientEncoder` maps, and (c) apply predictions → normalized strings → integer IDs.


In [None]:

from typing import Dict, Union

def load_jsonl_map(path: Union[str, Path]) -> Dict[str, str]:
    """Read a JSONL dedupe map into a `{normalized_from: normalized_to}` dict.

    Each non-blank line is a JSON object with `"from"` and `"to"` keys. Lines
    whose normalized source or target is empty are skipped. A `None` or
    missing path yields an empty dict.
    """
    if path is None:
        return {}
    source = Path(path)
    if not source.exists():
        return {}

    mapping = {}
    with source.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            key = normalize_token(str(record.get("from", "")))
            value = normalize_token(str(record.get("to", "")))
            if key and value:
                # Later lines win when the same source phrase appears twice.
                mapping[key] = value
    return mapping

def load_encoder_maps(id2tok_path: Path | None, tok2id_path: Path | None):
    if not id2tok_path or not tok2id_path:
        return None, None
    if (not Path(id2tok_path).exists()) or (not Path(tok2id_path).exists()):
        return None, None
    with open(id2tok_path, "r", encoding="utf-8") as f:
        id2tok_raw = json.load(f)
    with open(tok2id_path, "r", encoding="utf-8") as f:
        tok2id_raw = json.load(f)
    id2tok = {int(k): str(v) for k, v in id2tok_raw.items()}
    tok2id = {str(k): int(v) for k, v in tok2id_raw.items()}
    return id2tok, tok2id

def apply_dedupe(tok: str, mapping: Dict[str, str]) -> str:
    """Return the canonical form of `tok`, or `tok` itself when it is not mapped."""
    if not mapping:
        return tok
    return mapping.get(tok, tok)

# Load the optional artifacts once; both degrade gracefully when files are absent
# (empty dedupe map, and (None, None) for the encoder maps).
dedupe_map = load_jsonl_map(DATA.DEDUPE_JSONL) if DATA.DEDUPE_JSONL else {}
ing_id2tok, ing_tok2id = load_encoder_maps(DATA.ING_ID2TOK_JSON, DATA.ING_TOK2ID_JSON)
print(f"Dedupe entries: {len(dedupe_map):,} | Encoder maps present: {ing_id2tok is not None}")



## 8) Inference helper

Load the saved model, run NER on a **target dataset**, normalize strings, dedupe (optional), and encode to IDs (optional).  
Sampling options are available to speed up quick checks.


In [None]:
# --- 8) Structured inference — original vs cleaned (wide + tall) ---
from __future__ import annotations

from pathlib import Path
from typing import Optional
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
import spacy
from IPython.display import display, HTML
from html import escape

def _unique_preserve_order(seq):
    seen = set()
    out = []
    for x in seq:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out

def _extract_ingredient_rows(doc, dedupe: Optional[dict] = None, tok2id: Optional[dict] = None):
    """Return a list of per-entity dicts with offsets + normalized/canonical forms."""
    rows = []
    for ent in doc.ents:
        if ent.label_ != "INGREDIENT":
            continue
        raw = ent.text
        norm = normalize_token(raw)
        canon = apply_dedupe(norm, dedupe)
        tok_id = tok2id.get(canon, 0) if tok2id else None
        rows.append({
            "raw": raw,
            "start": int(ent.start_char),
            "end": int(ent.end_char),
            "label": ent.label_,
            "norm": norm,
            "canonical": canon,
            "id": int(tok_id) if tok_id is not None else None,
        })
    return rows

def predict_normalize_encode_structured(
    nlp_dir: Path,
    data_path: Path,
    is_parquet: bool,
    text_col: str,
    dedupe: Optional[dict] = None,
    tok2id: Optional[dict] = None,
    out_path: Optional[Path] = None,
    batch_size: int = 256,
    # sampling knobs (use exactly one)
    sample_n: Optional[int] = None,
    sample_frac: Optional[float] = None,
    head_n: Optional[int] = None,
    start: int = 0,
    stop: Optional[int] = None,
    sample_seed: int = 42,
    # performance
    n_process: int = 1,  # keep 1 for GPU/transformers
):
    """
    Run the saved NER model over a text column and return structured predictions.

    Args:
        nlp_dir: Folder of the saved spaCy pipeline.
        data_path, is_parquet, text_col: Input file, its format and the text column.
        dedupe: Optional normalized-phrase -> canonical-phrase map.
        tok2id: Optional canonical-phrase -> integer-id map; enables the
            `Ingredients` column.
        out_path: Optional base path for the Parquet outputs.
        batch_size, n_process: Passed to `nlp.pipe`.
        sample_n, sample_frac, head_n, start, stop, sample_seed: Sampling options.
            Only one is applied, in the order head_n > sample_n > sample_frac >
            start/stop; a warning is issued when several are given.

    Returns:
      df_wide: one row per input, columns=[text_col, NER_raw, NER_clean, spans_json, Ingredients?]
      df_tall: one row per extracted entity with offsets and normalized/canonical forms;
               `row_id` is the position within the (sampled) input, not the file row.
    If out_path is set, writes two parquet files: <stem>_wide.parquet and <stem>_tall.parquet
    (the target folder is created if needed).
    """
    nlp = spacy.load(nlp_dir)

    df_in = load_data(data_path, is_parquet, text_col)

    # Apply ONE sampling strategy and say so when the caller asked for several.
    requested = [
        name
        for name, is_set in (
            ("head_n", head_n is not None),
            ("sample_n", sample_n is not None),
            ("sample_frac", sample_frac is not None),
            ("start/stop", start != 0 or stop is not None),
        )
        if is_set
    ]
    if len(requested) > 1:
        warnings.warn(
            f"Multiple sampling options given ({', '.join(requested)}); only '{requested[0]}' is applied."
        )

    if head_n is not None:
        df_in = df_in.head(head_n)
    elif sample_n is not None:
        df_in = df_in.sample(n=min(sample_n, len(df_in)), random_state=sample_seed)
    elif sample_frac is not None:
        df_in = df_in.sample(frac=min(max(sample_frac, 0.0), 1.0), random_state=sample_seed)
    elif start != 0 or stop is not None:
        df_in = df_in.iloc[start:stop]

    texts = df_in[text_col].astype(str).tolist()

    wide_rows = []
    tall_records = []

    for i, doc in enumerate(tqdm(nlp.pipe(texts, batch_size=batch_size, n_process=n_process),
                                 total=len(texts), desc="Infer (structured)")):
        rows = _extract_ingredient_rows(doc, dedupe=dedupe, tok2id=tok2id)

        raw_list   = _unique_preserve_order([r["raw"] for r in rows])
        clean_list = _unique_preserve_order([r["canonical"] for r in rows if r["canonical"]])
        # Ids are kept per entity (not de-duplicated), in document order.
        id_list    = [r["id"] for r in rows if r["id"] is not None] if tok2id else None

        # wide entry (compact)
        wide_entry = {
            text_col: texts[i],
            "NER_raw": raw_list,
            "NER_clean": clean_list,
            "spans_json": json.dumps(rows, ensure_ascii=False),  # arrow-friendly
        }
        if tok2id:
            wide_entry["Ingredients"] = id_list
        wide_rows.append(wide_entry)

        # tall entries (one row per entity, great for QA/exploration)
        for r in rows:
            tall_records.append({
                "row_id": i,
                text_col: texts[i],
                "ent_text": r["raw"],
                "start": r["start"],
                "end": r["end"],
                "label": r["label"],
                "norm": r["norm"],
                "canonical": r["canonical"],
                "id": r["id"],
            })

    # Keep the expected columns even when nothing was read or extracted, so the
    # preview/summary helpers and downstream code still find them.
    wide_columns = [text_col, "NER_raw", "NER_clean", "spans_json"] + (["Ingredients"] if tok2id else [])
    tall_columns = ["row_id", text_col, "ent_text", "start", "end", "label", "norm", "canonical", "id"]
    df_wide = pd.DataFrame(wide_rows) if wide_rows else pd.DataFrame(columns=wide_columns)
    df_tall = pd.DataFrame(tall_records) if tall_records else pd.DataFrame(columns=tall_columns)

    if out_path is not None:
        base = Path(out_path)
        # The prediction folder is not created by the config cell.
        base.parent.mkdir(parents=True, exist_ok=True)
        wide_path = base.with_name(base.stem + "_wide.parquet")
        tall_path = base.with_name(base.stem + "_tall.parquet")
        pq.write_table(pa.Table.from_pandas(df_wide, preserve_index=False).replace_schema_metadata(None), wide_path)
        pq.write_table(pa.Table.from_pandas(df_tall, preserve_index=False).replace_schema_metadata(None), tall_path)
        print(f"Wrote → {wide_path.name} and {tall_path.name} in {wide_path.parent}")

    return df_wide, df_tall

# ---- Quick visual helpers ----

def preview_side_by_side(df_wide: pd.DataFrame, text_col: str, n: int = 8):
    """Display the first `n` rows as an 'original vs cleaned' table."""
    cols = [text_col, "NER_raw", "NER_clean"]
    # The id column only exists when predictions were encoded with a tok2id map.
    if "Ingredients" in df_wide.columns:
        cols.append("Ingredients")
    display(df_wide.loc[:, cols].head(n))

def _render_marked(text: str, spans: list[dict]) -> str:
    """Mark entities inline; tooltip shows norm/canonical/id for quick QA."""
    spans = sorted(spans, key=lambda r: r["start"])
    pos = 0
    out = []
    for r in spans:
        out.append(escape(text[pos:r["start"]]))
        frag = escape(text[r["start"]:r["end"]])
        tip  = f'norm="{r["norm"]}" | canonical="{r["canonical"]}" | id={r["id"] if r["id"] is not None else "-"}'
        out.append(f'<mark title="{escape(tip)}">{frag}</mark>')
        pos = r["end"]
    out.append(escape(text[pos:]))
    return "".join(out)

def html_preview(df_wide: pd.DataFrame, text_col: str, n: int = 8):
    """Render the first `n` rows as inline HTML.

    Each row shows the original text with its entities highlighted (offsets are
    read back from the row's `spans_json`) and the cleaned ingredient list below.

    Args:
        df_wide: Wide prediction frame containing `text_col`, `spans_json` and `NER_clean`.
        text_col: Name of the column holding the original text.
        n: Number of rows to render.
    """
    rows = []
    for _, row in df_wide.head(n).iterrows():
        spans = json.loads(row["spans_json"])
        # `marked` is already HTML-escaped by _render_marked.
        marked = _render_marked(row[text_col], spans)
        # A missing or empty NER_clean value renders as an empty list.
        cleaned = ", ".join(row.get("NER_clean") or [])
        rows.append(f"""
        <div class="one">
          <div class="orig">{marked}</div>
          <div class="clean"><strong>NER_clean:</strong> {escape(cleaned)}</div>
        </div>
        """)
    # Minimal styling for the preview cards.
    style = """
    <style>
      .one{border:1px solid #ddd; padding:10px; margin:8px 0; border-radius:6px;}
      .orig{margin-bottom:6px; line-height:1.5}
      mark{padding:0 2px; border-radius:3px}
      .clean{font-family:monospace}
    </style>
    """
    display(HTML(style + "\n".join(rows)))

def describe_predictions(df_wide: pd.DataFrame, top_k: int = 20):
    """Print coverage figures and display the `top_k` most frequent cleaned predictions."""
    cleaned = df_wide["NER_clean"]
    top_counts = cleaned.explode().value_counts().head(top_k)

    rows_with_pred = cleaned.map(bool).sum()
    print(f"Rows: {len(df_wide):,} | rows with ≥1 pred: {rows_with_pred:,}")

    mean_unique = cleaned.map(len).mean()
    print(f"Mean #unique preds/row: {mean_unique:.2f}")

    display(top_counts.to_frame("freq"))


In [None]:
# Example call (keeps your paths/variables):
# Run inference on a 10k-row random sample and keep both result frames.
# NOTE(review): `data_path` points at the training file while `text_col` is
# "ingredients_text"; load_data raises KeyError if that column is absent --
# adjust both to your dataset.
df_wide, df_tall = predict_normalize_encode_structured(
    nlp_dir=OUT.MODEL_DIR,
    data_path=DATA.TRAIN_PATH,             # replace with your new dataset path as needed
    is_parquet=DATA.DATA_IS_PARQUET,
    text_col="ingredients_text",      # <-- change this to your free-text column
    dedupe=dedupe_map,
    tok2id=ing_tok2id,
    out_path=OUT.PRED_OUT.with_name("pred_sample10k.parquet"),  # writes *_wide and *_tall
    sample_n=10_000,
    sample_seed=123,
    n_process=1                       # keep 1 when using transformers/GPU
)

# Side-by-side table and HTML highlights
# (text_col here must match the text_col passed to the call above)
preview_side_by_side(df_wide, text_col="ingredients_text", n=10)
html_preview(df_wide, text_col="ingredients_text", n=10)

# Quick distributional sanity check
describe_predictions(df_wide)
