# Ingredient Normalization (Data-Driven, PMI-Based)

This notebook implements a **fully data-driven** normalization of ingredient phrases at scale (~2 M rows). It:

1. Streams your dataset to count unigrams, bigrams, trigrams, and 4-grams
2. Computes PMI-style association scores
3. Builds a canonical vocabulary (no hard-coded lists)
4. Segments each NER item by greedy longest-match using that vocabulary
5. Writes a cleaned column (`NER_clean`) back to disk

### Designed for large CSVs: uses chunked ingestion and optional on-disk checkpoints.

### Ingredient Normalization Pipeline

<img src="./ingredient_row_sequence.png" alt="Ingredient normalization pipeline" width="850"/>


In [1]:
# # Optional installs (run once if missing)
# !pip install pyarrow

# # Update requirements.txt 
# !pip freeze > ../requirements.txt

## Config and global imports

In [2]:
from pathlib import Path
import os, gc, json, math, re, ast
from collections import Counter
from typing import List, Tuple, Iterable

import numpy as np
import pandas as pd
from tqdm import tqdm

RAW_DATA_PATH = Path("../data/wilmerarltstrmberg_data.csv")  # full raw CSV
# Small sample CSV for quick iteration/tests; the passes below read from this path.
DATA_PATH = Path("../data/sample_data.csv")
# One-off sample creation: uncomment both lines when DATA_PATH does not exist yet
# (drop nrows to copy the full data).
# df = pd.read_csv(RAW_DATA_PATH, nrows=10_000, dtype=str) # remove nrows for full data and comment if path already exists
# df.to_csv(DATA_PATH, index=False) # comment if path already exists

# Output locations.
OUTPUT_PATH = Path("../data/recipes_data_clean.parquet")  # segmented output (no spell map)
VOCAB_JSON = Path("../data/ingredient_vocab_stats.json")  # saved canonical vocabulary
NER_COL = "NER"  # CSV column holding the ingredient list of each row
CHUNK_SIZE = 200_000  # rows per pandas chunk when streaming CSVs

# Thresholds handed to StatsNormalizer: minimum n-gram counts and minimum
# PMI scores (natural log) required for an n-gram to enter the vocabulary.
MIN_UNIGRAM = 50
MIN_BIGRAM = 50
MIN_TRIGRAM = 30
PMI_BIGRAM = 3.0
PMI_TRIGRAM = 2.0

## StatsNormalizer (1 to 4 gram)


In [None]:
class StatsNormalizer:
    """
    N-gram normalizer with:
      - Streaming counts (1..4)
      - PMI-based canon + entropy/child-share gates
      - Greedy 4->3->2->1 segmentation
      - Span-preserving fuzzy snap (rapidfuzz if available)
      - Dynamic 4/3/2 fallbacks when canon misses strong collocations
      - CSV->Parquet writer

    Public:
      ingest_csv, ingest_df, build_vocab,
      segment_item, transform_df, transform_csv_to_parquet,
      save_vocab, load_vocab

    Typical flow: ``ingest_csv`` (or ``ingest_df``) -> ``build_vocab`` ->
    ``segment_item`` / ``transform_df`` / ``transform_csv_to_parquet``.
    """

    # Tokens are maximal runs of lowercase ASCII letters and apostrophes;
    # compiled once instead of on every _tok call.
    _TOKEN_RE = re.compile(r"[a-z']+")

    # ---- utilities ----
    @staticmethod
    def _tok(s):
        """Lowercase ``str(s)`` and return its letter/apostrophe runs as a token list."""
        return StatsNormalizer._TOKEN_RE.findall(str(s).lower())

    @staticmethod
    def _ngrams(tokens, n):
        """Yield every contiguous ``n``-token window of ``tokens`` as a tuple."""
        for i in range(len(tokens) - n + 1):
            yield tuple(tokens[i:i+n])

    @staticmethod
    def _parse_ner_entry(entry):
        """
        Turn one raw NER cell into a list of non-empty, stripped item strings.

        The cell is first tried as a Python list literal; when it is not one
        (or does not parse), it is split on commas instead. ``None``, NaN and
        blank cells give ``[]``.
        """
        if entry is None or (isinstance(entry, float) and pd.isna(entry)):
            return []
        s = str(entry).strip()
        if not s:
            return []
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Exactly the errors literal_eval raises for malformed input;
            # such cells fall through to the comma split below.
            parsed = None
        if isinstance(parsed, list):
            return [str(x).strip() for x in parsed if str(x).strip()]
        return [x.strip() for x in s.split(",") if x.strip()]

    @staticmethod
    def _entropy(foll):
        """Shannon entropy (natural log) of a follower-count mapping; 0.0 when empty."""
        if not foll:
            return 0.0
        tot = sum(foll.values())
        if tot == 0:
            return 0.0
        H = 0.0
        for v in foll.values():
            p = v / tot
            H -= p * math.log(p + 1e-12)
        return H

    # ---- init ----
    def __init__(self,
                 max_ngram=4,
                 # canon thresholds
                 min_unigram=50, min_bigram=50, min_trigram=30, min_fourgram=20,
                 pmi_bigram=3.0, pmi_trigram=2.0, pmi_fourgram=2.0,
                 # trigram gates
                 min_child_share=0.12, max_right_entropy=1.0,
                 # 4-gram gates (trigram-head branching)
                 min_child_share4=0.05, max_right_entropy3=1.3,
                 # dynamic fallbacks
                 pmi_bigram_fallback=2.6,  min_bigram_fallback=20,
                 pmi_trigram_fallback=2.2, min_trigram_fallback=12,
                 min_child_share_fallback=0.06, max_right_entropy_fallback=1.4,
                 pmi_fourgram_fallback=1.8, min_fourgram_fallback=10,
                 min_child_share4_fallback=0.04, max_right_entropy3_fallback=1.5,
                 # snap settings
                 snap_score_cutoff=92, snap_near_perfect=96):
        """
        Store thresholds and create empty counters.

        Args:
            max_ngram: Longest n-gram (1..4) that is counted and matched.
            min_*gram / pmi_*gram: Minimum count and minimum PMI score an
                n-gram needs to enter the canonical vocabulary.
            min_child_share / max_right_entropy: Extra trigram gates
                (share of the leading bigram's count; entropy of the tokens
                that follow the leading bigram).
            min_child_share4 / max_right_entropy3: The same gates for
                4-grams, measured against the leading trigram.
            *_fallback: Thresholds applied at segmentation time to n-grams
                that are not in the canonical vocabulary.
            snap_score_cutoff / snap_near_perfect: rapidfuzz scores used by
                the fuzzy snap (minimum score; score that waives the
                same-length requirement).
        """
        from collections import defaultdict
        self.max_ngram = int(max_ngram)

        self.min_unigram, self.min_bigram, self.min_trigram, self.min_fourgram = (
            min_unigram, min_bigram, min_trigram, min_fourgram
        )
        # Taken from the arguments only, so the class does not depend on
        # notebook-level globals being defined.
        self.pmi_bigram, self.pmi_trigram, self.pmi_fourgram = pmi_bigram, pmi_trigram, pmi_fourgram

        self.min_child_share, self.max_right_entropy = min_child_share, max_right_entropy
        self.min_child_share4, self.max_right_entropy3 = min_child_share4, max_right_entropy3

        self.pmi_bigram_fallback, self.min_bigram_fallback = pmi_bigram_fallback, min_bigram_fallback
        self.pmi_trigram_fallback, self.min_trigram_fallback = pmi_trigram_fallback, min_trigram_fallback
        self.min_child_share_fallback, self.max_right_entropy_fallback = (
            min_child_share_fallback, max_right_entropy_fallback
        )
        self.pmi_fourgram_fallback, self.min_fourgram_fallback = pmi_fourgram_fallback, min_fourgram_fallback
        self.min_child_share4_fallback, self.max_right_entropy3_fallback = (
            min_child_share4_fallback, max_right_entropy3_fallback
        )

        self.snap_score_cutoff, self.snap_near_perfect = snap_score_cutoff, snap_near_perfect

        self.token_total = 0
        self.c1, self.c2, self.c3, self.c4 = Counter(), Counter(), Counter(), Counter()

        self._followers  = defaultdict(Counter)  # (a,b)->c
        self._followers3 = defaultdict(Counter)  # (a,b,c)->d

        self.canon = set()
        self._canon_ready = False
        self._canon_phrases = None
        self._canon_buckets = None
        # None = not probed yet, False = rapidfuzz import failed once.
        self._rf_available = None

    # ---- ingest ----
    def ingest_df(self, df, ner_col="NER"):
        """
        Add the n-gram counts of every item in ``df[ner_col]`` to the counters.

        N-grams never cross item boundaries: each parsed item is tokenized
        and counted on its own.
        """
        for entry in df[ner_col]:
            for item in self._parse_ner_entry(entry):
                t = self._tok(item)
                if not t:
                    continue

                self.c1.update(t)
                self.token_total += len(t)

                # _ngrams yields nothing when the item is shorter than n.
                if self.max_ngram >= 2:
                    self.c2.update(self._ngrams(t, 2))

                if self.max_ngram >= 3:
                    for tri in self._ngrams(t, 3):
                        self.c3[tri] += 1
                        self._followers[tri[:2]][tri[2]] += 1

                if self.max_ngram >= 4:
                    for quad in self._ngrams(t, 4):
                        self.c4[quad] += 1
                        self._followers3[quad[:3]][quad[3]] += 1

    def ingest_csv(self, csv_path, ner_col="NER", chunksize=200_000):
        """Stream ``csv_path`` in chunks of ``chunksize`` rows through ``ingest_df``."""
        for chunk in pd.read_csv(csv_path, chunksize=chunksize, dtype=str):
            self.ingest_df(chunk, ner_col=ner_col)
            del chunk; gc.collect()

    # ---- stats ----
    def _right_entropy(self, ab):
        """Entropy of the tokens observed right after bigram ``ab``."""
        return self._entropy(self._followers.get(ab))

    def _right_entropy3(self, abc):
        """Entropy of the tokens observed right after trigram ``abc``."""
        return self._entropy(self._followers3.get(abc))

    def _child_share(self, abc):
        """Count of trigram ``abc`` divided by the count of its leading bigram."""
        cabc = self.c3.get(abc, 0)
        cab  = self.c2.get(abc[:2], 0)
        return (cabc / cab) if cab else 0.0

    def _child_share4(self, abcd):
        """Count of 4-gram ``abcd`` divided by the count of its leading trigram."""
        cabcd = self.c4.get(abcd, 0)
        cabc  = self.c3.get(abcd[:3], 0)
        return (cabcd / cabc) if cabc else 0.0

    def _pmi_bigram(self, ab):
        """
        PMI-style score ``log(p(ab) / (p(a) * p(b)))`` for bigram ``ab``.

        All probabilities use ``token_total`` as the denominator. Returns
        ``-1e9`` when the bigram (or either token) has never been counted.
        """
        a, b = ab
        cab = self.c2.get(ab, 0)
        if cab == 0 or self.token_total == 0: return -1e9
        pa = self.c1.get(a,0)/self.token_total
        pb = self.c1.get(b,0)/self.token_total
        if pa == 0 or pb == 0:
            # Only reachable with inconsistent counters; avoid dividing by zero.
            return -1e9
        pab = cab/self.token_total
        return math.log((pab/(pa*pb))+1e-12)

    def _pmi_trigram(self, abc):
        """Mean of the bigram scores of the two adjacent pairs in ``abc``."""
        a,b,c = abc
        return (self._pmi_bigram((a,b)) + self._pmi_bigram((b,c)))/2.0

    def _pmi_fourgram(self, abcd):
        """Mean of the bigram scores of the three adjacent pairs in ``abcd``."""
        a,b,c,d = abcd
        return (self._pmi_bigram((a,b)) + self._pmi_bigram((b,c)) + self._pmi_bigram((c,d))) / 3.0

    # ---- build canon ----
    def build_vocab(self):
        """
        Rebuild ``self.canon`` from the current counters.

        Unigrams need the minimum count; bigrams need count and PMI;
        trigrams and 4-grams additionally pass the child-share and
        right-entropy gates. Cached snap buckets are invalidated.
        """
        self.canon.clear()

        for w,c in self.c1.items():
            if c >= self.min_unigram:
                self.canon.add((w,))

        for ab,c in self.c2.items():
            if c >= self.min_bigram and self._pmi_bigram(ab) >= self.pmi_bigram:
                self.canon.add(ab)

        for abc,c in self.c3.items():
            if c < self.min_trigram: continue
            if self._pmi_trigram(abc) < self.pmi_trigram: continue
            if self._child_share(abc) < self.min_child_share: continue
            if self._right_entropy(abc[:2]) > self.max_right_entropy: continue
            self.canon.add(abc)

        for abcd,c in self.c4.items():
            if c < self.min_fourgram: continue
            if self._pmi_fourgram(abcd) < self.pmi_fourgram: continue
            if self._child_share4(abcd) < self.min_child_share4: continue
            if self._right_entropy3(abcd[:3]) > self.max_right_entropy3: continue
            self.canon.add(abcd)

        self._canon_ready = True
        self._canon_phrases = None
        self._canon_buckets = None

    # ---- snap helpers ----
    def _canon_bucket_init(self):
        """Cache canon phrases as strings, grouped by their first token."""
        self._canon_phrases = [" ".join(p) for p in self.canon]
        buckets = {}
        for ph in self._canon_phrases:
            ft = ph.split()[0] if ph else ""
            buckets.setdefault(ft, []).append(ph)
        self._canon_buckets = buckets

    def _snap_span(self, tokens, i, n):
        """
        Try snapping tokens[i:i+n] to a canon phrase with same length (or near-perfect match).
        Uses rapidfuzz if available; otherwise no-op.

        Returns ``(candidate_tokens, n)`` or ``None``. The consumed length is
        always ``n``, even when a near-perfect candidate has a different
        number of tokens.
        """
        if i + n > len(tokens):
            return None
        if getattr(self, "_rf_available", None) is False:
            return None  # an earlier import attempt already failed
        try:
            from rapidfuzz import process, fuzz
        except ImportError:
            # Remember the failure: a missing module is not cached by Python,
            # so retrying the import for every token would be slow.
            self._rf_available = False
            return None
        self._rf_available = True

        if self._canon_phrases is None or self._canon_buckets is None:
            self._canon_bucket_init()

        span = " ".join(tokens[i:i+n])
        # Compare against phrases sharing the first token; if there are none,
        # fall back to the whole canon.
        bucket = self._canon_buckets.get(tokens[i], self._canon_phrases)
        match = process.extractOne(span, bucket, scorer=fuzz.WRatio, score_cutoff=self.snap_score_cutoff)
        if not match:
            return None

        # Robust to RF versions: tuple like (choice, score, idx) or object with indexing
        cand = match[0]
        score = match[1]

        if len(cand.split()) == n or score >= self.snap_near_perfect:
            return cand.split(), n
        return None

    # segmentation
    def _longest_match(self, toks, i):
        """
        Pick the phrase that starts at ``toks[i]``.

        Order of attempts: exact canon match (4 -> 3 -> 2 -> 1), fuzzy snap
        (3, then 2), count/PMI fallbacks (4 -> 3 -> 2), and finally the bare
        token. Returns ``(phrase_tuple, tokens_consumed)``.

        Raises:
            RuntimeError: if no vocabulary has been built or loaded yet.
        """
        if not self._canon_ready:
            raise RuntimeError("build_vocab() first")

        # Exact canon first (4 -> 3 -> 2 -> 1)
        if self.max_ngram >= 4 and i+3 < len(toks) and tuple(toks[i:i+4]) in self.canon:
            return tuple(toks[i:i+4]), 4
        if self.max_ngram >= 3 and i+2 < len(toks) and tuple(toks[i:i+3]) in self.canon:
            return tuple(toks[i:i+3]), 3
        if self.max_ngram >= 2 and i+1 < len(toks) and tuple(toks[i:i+2]) in self.canon:
            return tuple(toks[i:i+2]), 2
        if (toks[i],) in self.canon:
            return (toks[i],), 1

        # Snap to canon (3, then 2)
        if self.max_ngram >= 3:
            snapped = self._snap_span(toks, i, 3)
            if snapped: return tuple(snapped[0]), snapped[1]
        if self.max_ngram >= 2:
            snapped = self._snap_span(toks, i, 2)
            if snapped: return tuple(snapped[0]), snapped[1]

        # Dynamic fallbacks (4,3,2)
        if self.max_ngram >= 4 and i+3 < len(toks):
            abcd = (toks[i], toks[i+1], toks[i+2], toks[i+3])
            cabcd = self.c4.get(abcd, 0)
            if cabcd >= self.min_fourgram_fallback:
                if (self._pmi_fourgram(abcd) >= self.pmi_fourgram_fallback and
                    self._child_share4(abcd) >= self.min_child_share4_fallback and
                    self._right_entropy3(abcd[:3]) <= self.max_right_entropy3_fallback):
                    return abcd, 4

        if self.max_ngram >= 3 and i+2 < len(toks):
            abc = (toks[i], toks[i+1], toks[i+2])
            cabc = self.c3.get(abc, 0)
            if cabc >= self.min_trigram_fallback:
                if (self._pmi_trigram(abc) >= self.pmi_trigram_fallback and
                    self._child_share(abc) >= self.min_child_share_fallback and
                    self._right_entropy(abc[:2]) <= self.max_right_entropy_fallback):
                    return abc, 3

        if self.max_ngram >= 2 and i+1 < len(toks):
            ab = (toks[i], toks[i+1])
            cab = self.c2.get(ab, 0)
            if cab >= self.min_bigram_fallback and self._pmi_bigram(ab) >= self.pmi_bigram_fallback:
                return ab, 2

        return (toks[i],), 1

    def segment_item(self, text):
        """
        Split one ingredient string into phrases by greedy longest match.

        Duplicate phrases are dropped (first occurrence wins), and so is a
        phrase that equals the last token of the phrase kept just before it.
        """
        t = self._tok(text)
        out, i = [], 0
        while i < len(t):
            phrase, k = self._longest_match(t, i)
            out.append(" ".join(phrase))
            i += k

        # de-dup while keeping order
        seen, clean = set(), []
        for x in out:
            if x not in seen:
                clean.append(x); seen.add(x)

        # drop immediate repetition of previous tail
        pruned = []
        for x in clean:
            if pruned and x == pruned[-1].split()[-1]:
                continue
            pruned.append(x)
        return pruned

    # dataframe & IO
    def transform_df(self, df, ner_col="NER", out_col="NER_clean", dedupe_row=False):
        """
        Segment every item of ``df[ner_col]`` and store the phrase lists in ``df[out_col]``.

        ``df`` is modified in place and also returned. With ``dedupe_row``
        set, repeated phrases within a row are dropped (order kept).
        """
        results = []
        for v in df[ner_col]:
            segs = [seg for item in self._parse_ner_entry(v) for seg in self.segment_item(item)]
            if dedupe_row:
                seen, uniq = set(), []
                for s in segs:
                    if s not in seen:
                        uniq.append(s); seen.add(s)
                results.append(uniq)
            else:
                results.append(segs)
        df[out_col] = results
        return df

    @staticmethod
    def _sanitize_for_arrow(df, list_col="NER_clean"):
        """
        Return a copy of ``df`` whose values convert cleanly to Arrow.

        ``list_col`` becomes a list of strings per row; every other
        object-dtype column becomes plain strings (containers are JSON
        encoded, missing values become ``""``).
        """
        df = df.copy()

        # Ensure list[str]
        def _to_list_of_str(x):
            if isinstance(x, (list, tuple)):
                return [str(y) for y in x]
            if x is None or (isinstance(x, float) and pd.isna(x)):
                return []
            try:
                val = json.loads(x)
            except (TypeError, ValueError, RecursionError):
                # Not JSON (or not a string at all): treat as a single item.
                val = None
            if isinstance(val, list):
                return [str(y) for y in val]
            return [str(x)]

        def _to_scalar_str(v):
            if isinstance(v, (set, frozenset)):
                # Sets are not JSON serializable; emit them as a sorted list.
                v = sorted(v, key=str)
            if isinstance(v, (list, tuple, dict)):
                return json.dumps(v, ensure_ascii=False)
            return "" if v is None or (isinstance(v, float) and pd.isna(v)) else str(v)

        if list_col in df.columns:
            df[list_col] = df[list_col].apply(_to_list_of_str)

        # Minimal sanitization for non-list columns
        for col in df.columns:
            if col == list_col:
                continue
            s = df[col]
            if s.dtype == object:
                df[col] = s.map(_to_scalar_str)
        return df

    @staticmethod
    def _to_arrow_table(df, list_col="NER_clean"):
        """Convert a sanitized frame to an Arrow table with ``list_col`` typed list<string>."""
        import pyarrow as pa
        list_type = pa.list_(pa.string())
        table = pa.Table.from_pandas(df, preserve_index=False).replace_schema_metadata(None)
        idx = table.schema.get_field_index(list_col)
        if idx >= 0 and table.schema.field(idx).type != list_type:
            # Inference can yield e.g. list<null> when every row is empty;
            # rebuild the column with the explicit type so all chunks agree.
            column = pa.array(df[list_col].tolist(), type=list_type)
            table = table.set_column(idx, pa.field(list_col, list_type), column)
        return table

    def transform_csv_to_parquet(self, csv_path, out_path, ner_col="NER", chunksize=200_000):
        """
        Stream ``csv_path`` in chunks, add ``NER_clean`` and write one zstd Parquet file.

        The schema of the first chunk is used for the whole file. The writer
        is closed even when a chunk fails. No file is created when the CSV
        yields no chunks.
        """
        import pyarrow.parquet as pq
        writer = None
        schema = None
        try:
            for chunk in pd.read_csv(csv_path, chunksize=chunksize, dtype=str):
                chunk = self.transform_df(chunk, ner_col=ner_col, out_col="NER_clean")
                chunk = self._sanitize_for_arrow(chunk, list_col="NER_clean")
                table = self._to_arrow_table(chunk, list_col="NER_clean")

                if writer is None:
                    schema = table.schema
                    writer = pq.ParquetWriter(out_path, schema, compression="zstd")
                elif table.schema != schema:
                    table = table.cast(schema)
                writer.write_table(table)

                del chunk, table; gc.collect()
        finally:
            if writer is not None:
                writer.close()

    # persistence
    def save_vocab(self, path):
        """Write ``token_total`` and the sorted canon phrases to ``path`` as JSON."""
        data = {"token_total": int(self.token_total),
                "canon": [" ".join(p) for p in sorted(self.canon)]}
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    @classmethod
    def load_vocab(cls, path):
        """
        Build a normalizer with default thresholds from a ``save_vocab`` file.

        Only the canon and ``token_total`` are restored. The n-gram counters
        stay empty, so the count-based fallbacks never fire on a loaded
        instance.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        obj = cls()
        obj.token_total = int(data.get("token_total", 0))
        obj.canon = set(tuple(p.split()) for p in data["canon"])
        obj._canon_ready = True
        obj._canon_phrases = None
        obj._canon_buckets = None
        return obj


## PASS 1: Count n-grams

In [4]:
# Thresholds for this run, collected in one place before construction.
normalizer_options = {
    "max_ngram": 4,
    "min_unigram": MIN_UNIGRAM,
    "min_bigram": MIN_BIGRAM,
    "min_trigram": MIN_TRIGRAM,
    "pmi_bigram": PMI_BIGRAM,
    "pmi_trigram": PMI_TRIGRAM,
    "min_child_share": 0.01,     # keep more trigrams
    "max_right_entropy": 1.0,
}
normalizer = StatsNormalizer(**normalizer_options)

# Stream the CSV once to accumulate the n-gram counts.
print("Streaming counts from", DATA_PATH)
normalizer.ingest_csv(DATA_PATH, NER_COL, CHUNK_SIZE)
print("Total tokens:", normalizer.token_total)


Streaming counts from ..\data\sample_data.csv
Total tokens: 105242


## PASS 2: Build vocabulary

In [5]:
# Derive the canonical vocabulary from the counts, then persist it.
normalizer.build_vocab()
normalizer.save_vocab(path=VOCAB_JSON)
canon_size = len(normalizer.canon)
print("Canonical phrases:", canon_size)
print("Saved vocab:", VOCAB_JSON)



Canonical phrases: 373
Saved vocab: ..\data\ingredient_vocab_stats.json


## PASS A: Build a global spell/fuzzy map (threaded, fast)

In [6]:
from concurrent.futures import ThreadPoolExecutor
from rapidfuzz import process, fuzz
from spellchecker import SpellChecker
import threading

UNIQ_PATH = Path("../data/ner_unique.txt")
MAP_PATH = Path("../data/ner_spell_map.jsonl")
FTHRESH = 90
BATCH = 2000
WORKERS = min(16, 2 * (os.cpu_count() or 2))

# 1) Stream the CSV and gather every distinct raw NER item.
seen = set()
for chunk in pd.read_csv(DATA_PATH, chunksize=CHUNK_SIZE, dtype=str):
    for entry in chunk[NER_COL]:
        seen.update(item for item in normalizer._parse_ner_entry(entry) if item)
items = sorted(seen)
UNIQ_PATH.write_text("\n".join(items), encoding="utf-8")
print("Unique NER items:", len(items))

# 2) Canon phrases as strings, as a set, and grouped by first token.
canon_phrases = [" ".join(phrase_tokens) for phrase_tokens in normalizer.canon]
canon_set = set(canon_phrases)
buckets = {}
for phrase in canon_phrases:
    first_token = next(iter(phrase.split()), "")
    if first_token not in buckets:
        buckets[first_token] = []
    buckets[first_token].append(phrase)

# One SpellChecker per worker thread, created lazily on first use.
_tls = threading.local()
vocab_tokens = []
for ph in canon_phrases:
    vocab_tokens.extend(ph.split())


def _get_spell():
    """Return the calling thread's SpellChecker, creating it on first use."""
    checker = getattr(_tls, "sc", None)
    if checker is not None:
        return checker
    checker = SpellChecker(distance=2)
    # Add the canon tokens to the checker's word list.
    checker.word_frequency.load_words(vocab_tokens)
    _tls.sc = checker
    return checker

def _fix_batch(batch, score_cutoff=92, near_perfect=96):
    """
    Token-level spell; phrase snap only if same length or near-perfect.

    Args:
        batch: Iterable of raw NER item strings.
        score_cutoff: Minimum rapidfuzz WRatio score for a canon candidate
            to be considered. Defaults to the value previously hard-coded
            here (the notebook-level FTHRESH is not used by this function).
        near_perfect: Score at or above which a candidate is accepted even
            when its token count differs from the input's.

    Returns:
        A list of ``(raw, fixed)`` pairs, one per input item, in order.
        Items with no tokens map to themselves.
    """
    sc = _get_spell()
    out = []
    for raw in batch:
        toks = normalizer._tok(raw)
        if not toks:
            out.append((raw, raw))
            continue

        # correction() may return None; keep the original token then.
        toks2 = [sc.correction(t) or t for t in toks]
        corrected = " ".join(toks2)
        fixed = corrected

        if corrected not in canon_set:
            # Prefer phrases sharing the first token; otherwise search all.
            choices = buckets.get(toks2[0], canon_phrases)
            match = process.extractOne(corrected, choices, scorer=fuzz.WRatio,
                                       score_cutoff=score_cutoff)
            if match:
                cand, score = match[0], match[1]
                same_length = len(cand.split()) == len(toks2)
                if same_length or score >= near_perfect:
                    fixed = cand

        out.append((raw, fixed))
    return out

def _chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

# Fan the batches out to worker threads and write one JSON line per item.
batches = list(_chunks(items, BATCH))
with ThreadPoolExecutor(max_workers=WORKERS) as ex, open(MAP_PATH, "w", encoding="utf-8") as out:
    progress = tqdm(ex.map(_fix_batch, batches), total=len(batches), desc="Spell/Fuzzy map")
    for res in progress:
        out.writelines(
            json.dumps({"raw": raw, "fixed": fixed}) + "\n"
            for raw, fixed in res
        )
print("Wrote spell map:", MAP_PATH)


Unique NER items: 3969


Spell/Fuzzy map: 100%|██████████| 2/2 [00:38<00:00, 19.14s/it]

Wrote spell map: ..\data\ner_spell_map.jsonl





## PASS B: Apply the map during write (fast runtime)

In [7]:
import pyarrow as pa, pyarrow.parquet as pq

OUTPUT_PATH_SPELL = Path("../data/recipes_data_clean_spell.parquet")

# Read the JSONL spell map into a raw -> fixed dictionary.
spell_map = {}
with open(MAP_PATH, "r", encoding="utf-8") as f:
    spell_map.update(
        (record["raw"], record["fixed"]) for record in map(json.loads, f)
    )

def _apply_map(entry):
    """Parse one NER cell and replace each item that has a spell-map entry."""
    parsed_items = normalizer._parse_ner_entry(entry)
    return [spell_map.get(raw_item, raw_item) for raw_item in parsed_items]

def write_with_spellmap(normalizer, csv_path, out_path, ner_col="NER", chunksize=CHUNK_SIZE):
    """
    Stream a CSV, spell-map and segment its NER column, and write one Parquet file.

    Each chunk gets a ``NER_spellchecked`` column (items passed through
    ``_apply_map``) and a ``NER_clean`` column (segmented phrases), and is
    appended to ``out_path`` with zstd compression, level 1.

    Args:
        normalizer: Object providing ``transform_df`` and ``_sanitize_for_arrow``.
        csv_path: Input CSV, read with ``dtype=str``.
        out_path: Destination Parquet file. Not created when the CSV yields
            no chunks.
        ner_col: Name of the raw NER column.
        chunksize: Rows per chunk.
    """
    list_type = pa.list_(pa.string())
    writer = None
    schema = None
    try:
        for chunk in pd.read_csv(csv_path, chunksize=chunksize, dtype=str):
            chunk["NER_spellchecked"] = chunk[ner_col].apply(_apply_map)
            chunk = normalizer.transform_df(chunk, ner_col="NER_spellchecked", out_col="NER_clean")
            chunk = normalizer._sanitize_for_arrow(chunk, list_col="NER_clean")

            table = pa.Table.from_pandas(chunk, preserve_index=False).replace_schema_metadata(None)
            idx = table.schema.get_field_index("NER_clean")
            if table.schema.field(idx).type != list_type:
                # Inference can yield e.g. list<null> when every row is empty;
                # rebuild the column with the explicit type so all chunks agree.
                column = pa.array(chunk["NER_clean"].tolist(), type=list_type)
                table = table.set_column(idx, pa.field("NER_clean", list_type), column)

            if writer is None:
                # The first chunk fixes the schema for the whole file.
                schema = table.schema
                writer = pq.ParquetWriter(out_path, schema, compression="zstd", compression_level=1)
            elif table.schema != schema:
                table = table.cast(schema)
            writer.write_table(table)
            del chunk, table; gc.collect()
    finally:
        # Close even when a chunk fails, so the file handle is released.
        if writer is not None:
            writer.close()

# Run PASS B over the sample CSV.
write_with_spellmap(
    normalizer,
    DATA_PATH,
    OUTPUT_PATH_SPELL,
    ner_col=NER_COL,
    chunksize=CHUNK_SIZE,
)
print("Wrote cleaned (spell-mapped) file:", OUTPUT_PATH_SPELL)



Wrote cleaned (spell-mapped) file: ..\data\recipes_data_clean_spell.parquet


## PASS C: Quick evaluation (optional, light & informative)

In [8]:
from collections import Counter

# Input paths for the evaluation helpers defined below.
BASELINE_PATH = Path("../data/recipes_data_clean.parquet")       # from PASS 3 below (same path as OUTPUT_PATH)
SPELL_PATH    = Path("../data/recipes_data_clean_spell.parquet") # from Pass B (same path as OUTPUT_PATH_SPELL)
MAP_PATH      = Path("../data/ner_spell_map.jsonl")              # re-declares the PASS A value unchanged

def scan_parquet(path, columns=("NER_clean",)):
    """Yield the selected columns of a Parquet file, one row group at a time, as DataFrames."""
    parquet_file = pq.ParquetFile(path)
    group_index = 0
    while group_index < parquet_file.num_row_groups:
        group_table = parquet_file.read_row_group(group_index, columns=list(columns))
        frame = group_table.to_pandas()
        yield frame
        # Release this row group before reading the next one.
        del frame, group_table
        gc.collect()
        group_index += 1

def parquet_vocab_and_lengths(path):
    """
    Summarize the ``NER_clean`` column of a Parquet file.

    Returns a dict with the row count, the mean number of phrases per row,
    the total number of phrases, the number of distinct phrases, and the
    set of distinct phrases itself.
    """
    vocab = set()
    total_rows = 0
    total_items = 0
    for df_part in scan_parquet(path, columns=("NER_clean",)):
        phrase_lists = df_part["NER_clean"]
        total_rows += len(df_part)
        total_items += phrase_lists.apply(len).sum()
        for lst in phrase_lists:
            vocab.update(lst)

    if total_rows:
        avg_len = total_items / total_rows
    else:
        avg_len = 0.0

    summary = {}
    summary["rows"] = total_rows
    summary["avg_ingredients_per_row"] = avg_len
    summary["total_ingredients_emitted"] = int(total_items)
    summary["vocab_size"] = len(vocab)
    summary["vocab"] = vocab
    return summary

def count_in_csv(csv_path, ner_col, target_set, chunksize=200_000):
    """Count how often each member of ``target_set`` occurs as a raw NER item in the CSV."""
    cnt = Counter()
    reader = pd.read_csv(csv_path, chunksize=chunksize, dtype=str)
    for chunk in reader:
        for entry in chunk[ner_col]:
            cnt.update(
                item
                for item in normalizer._parse_ner_entry(entry)
                if item in target_set
            )
        del chunk
        gc.collect()
    return cnt


## PASS 3: Segment & write


In [9]:
# Segment the sample CSV without the spell map and write the baseline Parquet file.
normalizer.transform_csv_to_parquet(
    DATA_PATH,
    OUTPUT_PATH,
    ner_col=NER_COL,
    chunksize=CHUNK_SIZE,
)
print("Wrote cleaned file:", OUTPUT_PATH)


Wrote cleaned file: ..\data\recipes_data_clean.parquet


## Sanity tests


In [10]:
tests = [
    "brown sugar ham",
    "brown sugar",
    "dark brown sugar",
    "cream of mushroom soup",
    "extra lean ground beef",
    "graham cracker crusts",
]
for s in tests:
    seg = normalizer.segment_item(s)
    joined = " | ".join(seg)
    # Only inputs containing 'brown sugar' are checked; the rest just have to run.
    if "brown sugar" not in s:
        continue
    assert "brown sugar" in joined, (s, seg)
print("Span tests passed.")

# Show the first rows of the written output, raw next to cleaned.
df_check = pd.read_parquet(OUTPUT_PATH).head(10)
df_check[[NER_COL, "NER_clean"]]

Span tests passed.


Unnamed: 0,NER,NER_clean
0,"[""bite size shredded rice biscuits"", ""vanilla""...","[bite, size, shredded, rice, biscuits, vanilla..."
1,"[""cream of mushroom soup"", ""beef"", ""sour cream...","[cream of mushroom soup, beef, sour cream, chi..."
2,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...","[frozen, corn, pepper, cream, cheese, garlic, ..."
3,"[""chicken gravy"", ""cream of mushroom soup"", ""c...","[chicken, gravy, cream of mushroom soup, chick..."
4,"[""graham cracker crumbs"", ""powdered sugar"", ""p...","[graham cracker, crumbs, powdered, sugar, pean..."
5,"[""sour cream"", ""bacon"", ""pepper"", ""extra lean ...","[sour cream, bacon, pepper, extra, lean ground..."
6,"[""buttermilk"", ""egg"", ""sugar"", ""vanilla"", ""sod...","[buttermilk, egg, sugar, vanilla, soda, flour,..."
7,"[""egg"", ""pepper"", ""crackers"", ""cream-style cor...","[egg, pepper, crackers, cream style, corn, who..."
8,"[""oil"", ""tomatoes"", ""green peppers"", ""water"", ...","[oil, tomatoes, green peppers, water, onions, ..."
9,"[""condensed milk"", ""lemons"", ""graham cracker c...","[condensed milk, lemons, graham cracker, crust..."
