In [3]:
!pip install rapidfuzz



import json
import re
import unicodedata
from collections import defaultdict
import pandas as pd
from rapidfuzz import fuzz

# =========================================================
# 1️⃣  Helpers
# =========================================================

def nfkc(s: str) -> str:
    """Return *s* in Unicode NFKC (compatibility-composed) normal form."""
    normalized = unicodedata.normalize("NFKC", s)
    return normalized

def clean_text(s):
    """Tidy a raw name: NFKC-normalize, unify spaces and dashes, trim the edges.

    Returns None for non-string input or when nothing is left after cleanup;
    otherwise returns the cleaned string.
    """
    if not isinstance(s, str):
        return None
    text = nfkc(s)
    # Applied in order: the whitespace collapse must come after the
    # exotic-space replacement so those spaces are collapsed as well.
    substitutions = (
        (r"[\u00A0\u2000-\u200B]", " "),  # no-break, typographic and zero-width spaces
        (r"[\u2212\u2010-\u2015]", "-"),  # minus sign and Unicode hyphen/dash variants
        (r"\s+", " "),                     # runs of whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Trim whitespace first, then stray leading/trailing separators.
    text = text.strip().strip(" ,;:")
    return text if text else None

def norm_key(s):
    """Build a matching key: accent-folded, lowercase, ASCII alnum/space/hyphen only.

    Accented letters are folded to their base letter (``"Açaí"`` -> ``"acai"``)
    instead of being deleted, so accented and unaccented spellings of the same
    name share one key and do not collide with unrelated names.

    Returns None for empty or non-string input, or when no key characters
    remain after normalization.
    """
    if not s or not isinstance(s, str):
        return None
    # NFKD applies the same compatibility mappings as NFKC but leaves accents
    # as separate combining marks, which can then be dropped.
    decomposed = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    # Map the minus sign and Unicode dash variants to an ASCII hyphen.
    s = re.sub(r"[\u2212\u2010-\u2015]", "-", s)
    # Keep only ASCII lowercase letters, digits, spaces and hyphens.
    s = re.sub(r"[^a-z0-9 \-]", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s or None

def to_list(x):
    """Coerce *x* to a list.

    A list is returned as-is (same object), a string is wrapped in a
    one-element list, and anything else (including None) yields ``[]``.
    """
    if isinstance(x, list):
        return x
    return [x] if isinstance(x, str) else []

def unique_preserve_order(items):
    """Return *items* de-duplicated by ``norm_key``, keeping first occurrences.

    None entries and entries whose normalized key is empty are dropped. The
    surviving items are returned in their original form and original order.
    """
    seen_keys = set()
    kept = []
    for item in items:
        key = None if item is None else norm_key(item)
        if key and key not in seen_keys:
            seen_keys.add(key)
            kept.append(item)
    return kept

def ascii_fold(s):
    """Strip accents and transliterate a few Greek letters to ASCII words.

    Non-string input is returned unchanged. Only alpha, beta, gamma and mu are
    transliterated; any other non-ASCII character without a decomposition is
    passed through as-is, so the result is not guaranteed to be pure ASCII.
    """
    if not isinstance(s, str):
        return s
    # Decompose and drop combining marks BEFORE the Greek replacement, so that
    # look-alike code points (e.g. the micro sign U+00B5, which NFKD maps to
    # Greek mu U+03BC) and accented Greek letters are transliterated too.
    decomposed = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    greek_map = {
        "\u03b2": "beta",   # β
        "\u03bc": "micro",  # μ
        "\u03b1": "alpha",  # α
        "\u03b3": "gamma",  # γ
    }
    for g, r in greek_map.items():
        s = s.replace(g, r)
    return s

# Generic descriptors that pollute embeddings; they are removed from names.
GENERIC_TERMS = ["extract", "powder", "root", "leaf", "oil", "seed", "juice"]


def strip_generic_terms(s):
    """Remove whole-word GENERIC_TERMS (case-insensitive) from *s*.

    Falsy input (None or "") is returned unchanged. After removal, runs of
    two or more whitespace characters are collapsed to one space and the
    result is trimmed; it may be an empty string.
    """
    if not s:
        return s
    alternation = "|".join(GENERIC_TERMS)
    without_terms = re.sub(rf"\b({alternation})\b", "", s, flags=re.IGNORECASE)
    return re.sub(r"\s{2,}", " ", without_terms).strip()

# =========================================================
# 2️⃣  Load source JSONs
# =========================================================

# Load both source files; the loops below iterate each one as a sequence of
# dict records.
with open("pharmaceutical_data.json", "r", encoding="utf-8") as f:
    pharma = json.load(f)

with open("novel_foods_catalogue.json", "r", encoding="utf-8") as f:
    novel = json.load(f)

# One dict per source record: canonical_name, latin_name, variants, sources.
rows = []

# ---------------- Pharmaceutical ----------------
for rec in pharma:
    canonical = clean_text(rec.get("name"))
    if not canonical:
        continue
    # A name made up only of generic terms (e.g. "Seed oil") strips to "";
    # keep the cleaned name in that case so the row is never left nameless.
    canonical_name = strip_generic_terms(canonical) or canonical
    variants = [canonical] + [clean_text(x) for x in to_list(rec.get("synonyms"))]
    variants = [strip_generic_terms(v) for v in variants if v]
    # Putting canonical_name first guarantees it is among the variants; in
    # the normal case it duplicates variants[0] and is de-duplicated away.
    variants = unique_preserve_order([canonical_name] + variants)
    rows.append({
        "canonical_name": canonical_name,
        "latin_name": None,
        "variants": variants,
        "sources": ["pharma"]
    })

# ---------------- Novel Foods ----------------
for rec in novel:
    canonical = clean_text(rec.get("novel_food_name"))
    if not canonical:
        continue
    canonical_name = strip_generic_terms(canonical) or canonical
    # rec.get() yields None for a missing key and clean_text(None) is None,
    # so no explicit key-presence check is needed.
    latin_name = clean_text(rec.get("latin_name"))
    common_name = clean_text(rec.get("common_name"))
    syns = [clean_text(x) for x in to_list(rec.get("synonyms"))]

    # Variant order: canonical, common name, latin name, then synonyms.
    variants = [canonical]
    if common_name:
        variants.append(common_name)
    if latin_name:
        variants.append(latin_name)
    variants.extend(syns)
    variants = [strip_generic_terms(v) for v in variants if v]
    variants = unique_preserve_order([canonical_name] + variants)

    rows.append({
        "canonical_name": canonical_name,
        "latin_name": strip_generic_terms(latin_name),
        "variants": variants,
        "sources": ["novel"]
    })

# =========================================================
# 3️⃣  Merge duplicates by canonical
# =========================================================
# Group rows by the normalized form of their canonical name. The first row
# seen for a key becomes the merged entry; later rows contribute their latin
# name (if the entry has none), variants and sources.
merged = {}
for idx, r in enumerate(rows):
    name = r["canonical_name"]
    # norm_key() returns None when nothing matchable is left in the name.
    # Using None as the dict key would merge all such unrelated rows into
    # one entry, so fall back to the exact raw name, and for nameless rows
    # to a per-row key that never merges. Tuples cannot collide with the
    # string keys produced by norm_key().
    key = norm_key(name) or (("raw", name) if name else ("unkeyed", idx))
    if key not in merged:
        merged[key] = r
    else:
        m = merged[key]
        if (not m["latin_name"]) and r["latin_name"]:
            m["latin_name"] = r["latin_name"]
        m["variants"] = unique_preserve_order(m["variants"] + r["variants"])
        m["sources"] = unique_preserve_order(m["sources"] + r["sources"])

cleaned = list(merged.values())

# =========================================================
# 4️⃣  Fuzzy merge near-duplicates (≥95% lexical similarity)
# =========================================================
# Greedy fuzzy grouping: each not-yet-absorbed row becomes the anchor of a
# group and absorbs every other not-yet-absorbed row whose canonical name
# scores >= 95 against it. The anchor's canonical name is kept for the group.
# NOTE(review): at this threshold sufficiently long names that differ by a
# single character can merge (e.g. "Vitamin B12" vs "Vitamin B1" scores ~95)
# — confirm this is acceptable for the data.
print("🔎 Performing fuzzy merge for near-duplicate canonicals...")
df = pd.DataFrame(cleaned)
used = set()
merged_rows = []
for i, row in df.iterrows():
    if i in used:
        continue
    anchor_name = row["canonical_name"]
    is_match = df["canonical_name"].apply(
        lambda x: fuzz.token_sort_ratio(x, anchor_name) >= 95
    )
    # Rows already absorbed by an earlier group must not be absorbed again,
    # otherwise their variants and sources end up in several output entries.
    is_match &= ~df.index.isin(used)
    # The anchor always belongs to its own group, even when its name is
    # missing and therefore does not match itself.
    is_match.at[i] = True
    similar = df[is_match]
    used.update(similar.index)
    merged_rows.append({
        "canonical_name": anchor_name,
        # Prefer the anchor's latin name, else the first non-empty one in the group.
        "latin_name": row["latin_name"] or next((x for x in similar["latin_name"] if x), None),
        "variants": unique_preserve_order(
            [v for group in similar["variants"] for v in group]
        ),
        "sources": unique_preserve_order(
            [src for group in similar["sources"] for src in group]
        ),
    })

df = pd.DataFrame(merged_rows)

# =========================================================
# 5️⃣  Accent & Greek normalization for stability
# =========================================================
# Add a lowercase, accent-folded companion column for each name column.
for name_col in ("canonical_name", "latin_name"):
    df[f"{name_col}_ascii"] = df[name_col].apply(ascii_fold).str.lower()

# =========================================================
# 6️⃣  Save outputs
# =========================================================
# Order rows by canonical name and renumber the index from zero.
df = df.sort_values("canonical_name").reset_index(drop=True)

# Write the table as CSV (without the index column).
df.to_csv("cleaned_supplements_highacc.csv", index=False, encoding="utf-8")

# Write the same rows as a JSON array of objects, keeping non-ASCII text as-is.
records = df.to_dict(orient="records")
with open("cleaned_supplements_highacc.json", "w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)

# Summary for the notebook output.
print(f"✅ Final entries: {len(df)}")
for line in (
    "📦 Saved:",
    " - cleaned_supplements_highacc.csv",
    " - cleaned_supplements_highacc.json",
):
    print(line)
print(df.head(5))


Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
🔎 Performing fuzzy merge for near-duplicate canonicals...
✅ Final entries: 1090
📦 Saved:
 - cleaned_supplements_highacc.csv
 - cleaned_supplements_highacc.json
                 canonical_name latin_name  \
0  3, 3'-Diindolylmethane (DIM)       None   
1          4-hydroxy isoleucine       None   
2                         5-HTP       None   
3            5-Hydroxitryptofan       None   
4           5-