# Multi-dataset normalizer/encoder


## 1. Load normalizer + maps

In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
from typing import Iterable, Dict, List, Optional
import pyarrow as pa
import pyarrow.parquet as pq

# Placeholder token that a freshly constructed IngredientEncoder maps to UNK_ID.
UNK_TOKEN = "<UNK>"
# Reserved id: the encoder returns it for any token missing from its vocabulary.
UNK_ID = 0

In [3]:
def _ensure_list_of_str(x) -> List[str]:
    """Coerce various stored formats (list/tuple/np.ndarray/strified list) to list[str]."""
    if x is None:
        return []
    if isinstance(x, (list, tuple, np.ndarray)):
        return [str(t).strip() for t in x if str(t).strip()]
    s = str(x).strip()
    if not s:
        return []
    # leave strings as-is (already-normalized phrases) â€“ do not re-segment here
    # if you have comma-joined strings in some datasets, you can split by comma here
    return [s]

## IngredientEncoder:


In [4]:
class IngredientEncoder:
    """Vocabulary that maps ingredient tokens to integer ids and back.

    ``token_to_id`` and ``id_to_token`` are kept as mutual inverses. Tokens that
    are not in the vocabulary encode to ``UNK_ID``.
    """

    def __init__(self, token_to_id: Optional[Dict[str, int]] = None):
        """Create an encoder.

        Args:
            token_to_id: Existing token -> id mapping to start from. When omitted
                the vocabulary contains only ``UNK_TOKEN`` -> ``UNK_ID``.
        """
        if token_to_id is None:
            token_to_id = {UNK_TOKEN: UNK_ID}
        # Copy so later fitting never mutates the caller's dict.
        self.token_to_id: Dict[str, int] = dict(token_to_id)
        self.id_to_token: Dict[int, str] = {v: k for k, v in self.token_to_id.items()}

    # ---- fitting / vocab building ----
    def fit_from_series(self, listcol: Iterable, min_freq: int = 1) -> None:
        """Build/extend the vocab from an iterable of token lists (e.g. df['NER_clean']).

        ``listcol`` is consumed in a single pass, so a generator is accepted.
        Tokens seen fewer than ``min_freq`` times in ``listcol`` are ignored and
        therefore stay out-of-vocabulary (they encode to ``UNK_ID``). Tokens
        already in the vocabulary keep their ids.
        """
        freq: Dict[str, int] = {}
        for entry in listcol:
            for tok in _ensure_list_of_str(entry):
                freq[tok] = freq.get(tok, 0) + 1

        # Deterministic order: by (-freq, token) so mappings are reproducible.
        kept = sorted(
            ((tok, count) for tok, count in freq.items() if count >= min_freq),
            key=lambda item: (-item[1], item[0]),
        )

        # New ids continue after the largest id already in use.
        next_id = max(self.id_to_token, default=UNK_ID) + 1
        for tok, _ in kept:
            if tok not in self.token_to_id:
                self.token_to_id[tok] = next_id
                self.id_to_token[next_id] = tok
                next_id += 1

    def fit_from_parquet(self, path: Path, col: str = "NER_clean", min_freq: int = 1) -> None:
        """Build/extend the vocab from one column of a Parquet file.

        Row groups are read one at a time, but their entries are fed to a single
        ``fit_from_series`` call. This makes ``min_freq`` and the frequency
        ordering apply to the whole file rather than to each row group
        separately (a token seen once in each of two row groups has a total
        frequency of 2).
        """
        pf = pq.ParquetFile(path)

        def _entries():
            # Lazily yield every cell of `col`, one row group in memory at a time.
            for rg in range(pf.num_row_groups):
                yield from pf.read_row_group(rg, columns=[col]).to_pandas()[col]

        self.fit_from_series(_entries(), min_freq=min_freq)

    # ---- transform / encode ----
    def encode_list(self, tokens: Iterable[str]) -> List[int]:
        """Encode one token list to ids; unknown tokens become ``UNK_ID``.

        An input with no usable tokens yields ``[UNK_ID]`` rather than an empty list.
        """
        out = [self.token_to_id.get(tok, UNK_ID) for tok in _ensure_list_of_str(tokens)]
        return out or [UNK_ID]

    def transform_series_to_idlists(self, listcol: Iterable) -> List[List[int]]:
        """Encode every entry of ``listcol`` with ``encode_list``."""
        return [self.encode_list(x) for x in listcol]

    def transform_df(self, df: pd.DataFrame, ingredients_col: str = "NER_clean",
                     dataset_id: int = 1) -> pd.DataFrame:
        """Return a DataFrame with columns: Dataset ID | Index | Ingredients (list[int]).

        ``Index`` is the 0-based row position within ``df`` (not ``df.index``).
        """
        enc = self.transform_series_to_idlists(df[ingredients_col])
        out = pd.DataFrame({
            "Dataset ID": dataset_id,
            "Index": np.arange(len(df), dtype=np.int64),
            "Ingredients": enc
        })
        return out

    # ---- persistence ----
    def save_maps(self, id_to_token_path: Path, token_to_id_path: Path) -> None:
        """Write both maps as UTF-8 JSON: ``{id: token}`` and ``{token: id}``.

        Parent directories of both target paths are created if missing.
        """
        id_to_token_path = Path(id_to_token_path)
        token_to_id_path = Path(token_to_id_path)
        # The two files may live in different directories; create both parents.
        id_to_token_path.parent.mkdir(parents=True, exist_ok=True)
        token_to_id_path.parent.mkdir(parents=True, exist_ok=True)

        with open(id_to_token_path, "w", encoding="utf-8") as f:
            # write as {id: token}
            json.dump({int(i): t for i, t in self.id_to_token.items()},
                      f, ensure_ascii=False, indent=2)
        with open(token_to_id_path, "w", encoding="utf-8") as f:
            # write as {token: id}
            json.dump({t: int(i) for t, i in self.token_to_id.items()},
                      f, ensure_ascii=False, indent=2)

    @classmethod
    def load_maps(cls, id_to_token_path: Optional[Path] = None,
                  token_to_id_path: Optional[Path] = None):
        """Build an encoder from a saved map.

        The id -> token file is preferred when it exists; otherwise the
        token -> id file is used. If neither path is given or exists, an empty
        encoder (only ``UNK_TOKEN``) is returned.
        """
        id_path = Path(id_to_token_path) if id_to_token_path else None
        tok_path = Path(token_to_id_path) if token_to_id_path else None

        if id_path is not None and id_path.exists():
            with open(id_path, "r", encoding="utf-8") as f:
                raw = json.load(f)
            # JSON object keys are always strings; coerce ids back to int
            # while inverting the map.
            return cls(token_to_id={tok: int(idx) for idx, tok in raw.items()})

        if tok_path is not None and tok_path.exists():
            with open(tok_path, "r", encoding="utf-8") as f:
                raw = json.load(f)
            # values may come back as strings; coerce
            return cls(token_to_id={tok: int(idx) for tok, idx in raw.items()})

        return cls()  # empty (only <UNK>)

In [5]:
# ---- inputs ----
# Source parquet used for both vocab fitting and encoding (swap in the baseline parquet if preferred).
PARQUET_FOR_VOCAB = Path("../data/recipes_data_clean_spell.parquet")
ING_COL = "NER_clean"

# ---- outputs ----
OUT_DIR = Path("../data/multiset")
OUT_DIR.mkdir(parents=True, exist_ok=True)
UNIFIED_PARQUET = OUT_DIR / "datasets_unified.parquet"
UNIFIED_CSV = OUT_DIR / "datasets_unified.csv"
REF_ING_JSON = OUT_DIR / "ingredient_id_to_token.json"      # id -> token
REF_ING_REV_JSON = OUT_DIR / "ingredient_token_to_id.json"  # token -> id

# Step 1: learn the vocabulary from the sample parquet.
enc = IngredientEncoder()
enc.fit_from_parquet(PARQUET_FOR_VOCAB, col=ING_COL, min_freq=1)

# Step 2: encode that same parquet into id lists.
# A single in-memory read is enough for the sample; large files can be streamed per row-group instead.
df_sample = pd.read_parquet(PARQUET_FOR_VOCAB, columns=[ING_COL])
encoded_df = enc.transform_df(df_sample, ingredients_col=ING_COL, dataset_id=1)

# Step 3a: Parquet output with an explicit list<int64> ingredients column.
list_like = (list, tuple, np.ndarray)
ingredient_rows = [
    row if isinstance(row, list_like) else []  # anything that is not list-like is stored as an empty list
    for row in encoded_df["Ingredients"]
]
ingredients_array = pa.array(ingredient_rows, type=pa.list_(pa.int64()))

column_names = ["Dataset ID", "Index", "Ingredients"]
column_arrays = [
    pa.array(encoded_df["Dataset ID"].to_numpy(dtype=np.int32)),
    pa.array(encoded_df["Index"].to_numpy(dtype=np.int64)),
    ingredients_array,
]
table = pa.Table.from_arrays(column_arrays, names=column_names)
pq.write_table(table, UNIFIED_PARQUET, compression="zstd")

# Step 3b: CSV fallback (the id lists end up stringified).
encoded_df.to_csv(UNIFIED_CSV, index=False)

# Step 3c: reference maps (true inverses of each other).
enc.save_maps(REF_ING_JSON, REF_ING_REV_JSON)

print("Done.")
print(f"Encoded rows: {len(encoded_df)}")
print(f"Vocab size (incl <UNK>): {len(enc.token_to_id)}")
print(f"Written:\n  {UNIFIED_PARQUET}\n  {UNIFIED_CSV}\n  {REF_ING_JSON}\n  {REF_ING_REV_JSON}")

FileNotFoundError: [WinError 2] Failed to open local file '../data/recipes_data_clean_spell.parquet'. Detail: [Windows error 2] The system cannot find the file specified.


In [None]:
# Sanity check: read back the unified Parquet at UNIFIED_PARQUET and preview its first rows.
pq.read_table(UNIFIED_PARQUET).to_pandas().head()

Unnamed: 0,Dataset ID,Index,Ingredients
0,1,0,"[39, 57, 238, 154, 14, 738, 86, 1900, 2545, 164]"
1,1,1,"[353, 352, 12, 563, 165, 6, 9, 126, 1499, 46, ..."
2,1,2,"[3, 14, 32, 314]"
3,1,3,"[15, 16, 2, 244, 50, 9, 4, 1]"
4,1,4,"[2966, 22]"


## Streaming & Inference helpers for IngredientEncoder 

In [7]:
from pathlib import Path
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def fit_encoder_from_parquet_streaming(
    encoder,
    parquet_path: Path,
    col: str = "NER_clean",
    min_freq: int = 1
):
    """
    Stream row-groups from a Parquet file to build/extend the encoder vocab.

    Row groups are read one at a time, but all of their entries are fed to a
    single ``encoder.fit_from_series`` call. Calling it once per row group would
    apply ``min_freq`` (and the frequency-based id ordering) to each row group
    in isolation, dropping tokens whose total frequency meets ``min_freq`` but
    whose per-row-group frequency does not.

    Args:
        encoder: Object exposing ``fit_from_series(iterable, min_freq=...)``.
        parquet_path: Parquet file to read.
        col: Column holding the token lists.
        min_freq: Minimum whole-file frequency for a token to enter the vocab.

    Returns:
        The same ``encoder`` instance, for chaining.
    """
    pf = pq.ParquetFile(parquet_path)

    def _iter_entries():
        # Lazily yield each cell of `col`; only one row group is materialized at a time.
        for rg in range(pf.num_row_groups):
            df_rg = pf.read_row_group(rg, columns=[col]).to_pandas()
            yield from df_rg[col]

    encoder.fit_from_series(_iter_entries(), min_freq=min_freq)
    return encoder


def encode_parquet_streaming(
    encoder,
    parquet_path: Path,
    out_parquet_path: Path,
    dataset_id: int = 1,
    col: str = "NER_clean",
    compression: str = "zstd"
):
    """
    Stream-encode a large Parquet file row-group by row-group into a unified Parquet
    with schema: ["Dataset ID" (int32), "Index" (int64), "Ingredients" (list<int64>)].

    Args:
        encoder: Object exposing ``transform_series_to_idlists(iterable)``.
        parquet_path: Input Parquet file.
        out_parquet_path: Output Parquet file; its parent directory is created if missing.
        dataset_id: Value written to every row of "Dataset ID".
        col: Input column holding the token lists.
        compression: Compression codec passed to the Parquet writer.

    Returns:
        ``out_parquet_path``.
    """
    pf = pq.ParquetFile(parquet_path)

    # Prepare Arrow schema up-front; every chunk is built against it so the
    # writer and the tables it receives cannot drift apart.
    target_schema = pa.schema([
        pa.field("Dataset ID", pa.int32()),
        pa.field("Index", pa.int64()),
        pa.field("Ingredients", pa.list_(pa.int64())),
    ])

    out_parquet_path.parent.mkdir(parents=True, exist_ok=True)

    # The context manager closes the writer even if a row group fails to read
    # or encode, so the file handle is never leaked.
    with pq.ParquetWriter(out_parquet_path, target_schema, compression=compression) as writer:
        global_index_start = 0  # running row offset so "Index" is continuous across row groups
        for rg in range(pf.num_row_groups):
            # Pull only required column
            df_rg = pf.read_row_group(rg, columns=[col]).to_pandas()

            # Encode this chunk
            id_lists = encoder.transform_series_to_idlists(df_rg[col])
            n_rows = len(id_lists)

            # Build Arrow arrays
            ds_ids = pa.array(np.full(n_rows, dataset_id, dtype=np.int32))
            idxs = pa.array(
                np.arange(global_index_start, global_index_start + n_rows, dtype=np.int64)
            )
            ingr = pa.array(
                # Anything that is not list-like is written as an empty list.
                [(lst if isinstance(lst, (list, tuple, np.ndarray)) else []) for lst in id_lists],
                type=pa.list_(pa.int64())
            )
            tbl = pa.Table.from_arrays([ds_ids, idxs, ingr], schema=target_schema)

            writer.write_table(tbl)
            global_index_start += n_rows

            # Drop this chunk's references before the next row group is read.
            del df_rg, id_lists, ds_ids, idxs, ingr, tbl

    return out_parquet_path


def encode_dataframe_inference(
    encoder,
    df: pd.DataFrame,
    ingredients_col: str = "NER_clean",
    dataset_id: int = 1
) -> pd.DataFrame:
    """
    Inference-time convenience wrapper for a small in-memory DataFrame.

    Delegates to ``encoder.transform_df`` and hands back its result, documented
    there as a DataFrame with columns ["Dataset ID", "Index", "Ingredients"].
    """
    encoded_batch = encoder.transform_df(
        df,
        ingredients_col=ingredients_col,
        dataset_id=dataset_id,
    )
    return encoded_batch

## For large datasets, use streaming functions:


In [8]:
enc = IngredientEncoder()

source_parquet = Path("../data/recipes_data_clean_spell.parquet")
id_to_token_json = Path("../data/multiset/ingredient_id_to_token.json")
token_to_id_json = Path("../data/multiset/ingredient_token_to_id.json")
unified_out = Path("../data/multiset/datasets_unified.parquet")

# Step 1: stream the source file to build the vocab
# (load previously saved maps instead if ids must stay stable across runs).
fit_encoder_from_parquet_streaming(enc, source_parquet, col="NER_clean", min_freq=1)

# Optional: keep the maps on disk so they can be reused later.
enc.save_maps(id_to_token_json, token_to_id_json)

# Step 2: stream-encode the same file, row-group by row-group, into the unified Parquet.
encode_parquet_streaming(
    enc,
    parquet_path=source_parquet,
    out_parquet_path=unified_out,
    dataset_id=1,
    col="NER_clean",
)


FileNotFoundError: [WinError 2] Failed to open local file '../data/recipes_data_clean_spell.parquet'. Detail: [Windows error 2] The system cannot find the file specified.


## For smaller datasets

In [13]:
# Open datasets_unified.parquet and preview its first rows as a sanity check.
# NOTE(review): this reads from ../data/encoded/, while the cells above write to
# ../data/multiset/ — confirm which location is intended.
pq.read_table("../data/encoded/datasets_unified.parquet").to_pandas().head()

Unnamed: 0,Dataset ID,Index,Ingredients
0,1,0,"[1, 2, 3, 4, 5, 6]"
1,1,1,"[7, 8, 9]"
2,1,2,"[10, 11, 12, 13, 6, 14]"
3,1,3,"[15, 7, 16, 17]"
4,1,4,"[18, 3, 19, 20, 6]"
