In [None]:
# ===============================================================
# Supplement Synonym Search - Canonical-Only Training
# ===============================================================

import os
import json
import time
import re
import unicodedata
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import fuzz

# Optional FAISS acceleration
try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False


# -----------------------
# Config
# -----------------------
# Input dataset (.json or .csv); it must provide a "canonical_name" field.
DATA_PATH      = "cleaned_supplements_highacc.json"   # canonical_name field required
# SentenceTransformer model used to embed both canonical names and queries.
MODEL_NAME     = "paraphrase-multilingual-MiniLM-L12-v2"  # base model
# MODEL_NAME   = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"  # higher accuracy, slower

# On-disk cache artifacts written and read by build_or_load_index().
EMB_PATH       = "embeddings_canonical.npy"        # embedding matrix (np.save)
INDEX_PATH     = "index_canonical.faiss"           # FAISS index (only when faiss is installed)
LOOKUP_PATH    = "index_lookup_canonical.csv"      # canonical names, row-aligned with the embeddings
META_PATH      = "index_meta_canonical.json"       # model name + row count used to validate the cache

# Search tuning used by search_name().
TOP_K_DEFAULT  = 5      # default number of results returned
RECALL_K       = 50     # number of semantic candidates fetched before lexical re-ranking
ALPHA_SEM      = 0.70   # weight of the semantic score; the lexical score gets (1 - ALPHA_SEM)
MIN_CONFIDENCE = 0.55   # candidates whose blended score is below this are dropped


# ===============================================================
# Helpers
# ===============================================================
def normalize_text(s: str) -> str:
    """Normalize a query or canonical name to lowercase ASCII words.

    The text is compatibility-decomposed (NFKD) and its combining marks are
    dropped, so accented letters keep their base letter ("ö" -> "o") instead
    of being replaced by a space, which used to split words apart
    (e.g. "tiggarnöt" -> "tiggarn t"). Afterwards the text is lowercased,
    Unicode dashes and the minus sign become "-", every character outside
    ``[a-z0-9 -]`` becomes a space, and runs of whitespace are collapsed.

    Letters that have no Unicode decomposition (e.g. "ø", "ß") are still
    replaced by a space.

    Args:
        s: Text to normalize.

    Returns:
        The normalized string, or "" when ``s`` is not a ``str``.
    """
    if not isinstance(s, str):
        return ""
    # NFKD splits "ö" into "o" + U+0308; removing the combining mark keeps "o".
    decomposed = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    # Unify minus sign and the hyphen/dash family to an ASCII hyphen.
    s = re.sub(r"[\u2212\u2010-\u2015]", "-", s)
    s = re.sub(r"[^a-z0-9 \-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


def load_canonical_data():
    """Load the dataset at DATA_PATH and return its cleaned canonical names.

    Supports ``.json`` (anything ``pd.DataFrame`` accepts) and ``.csv`` files.
    Missing names are discarded, the remaining ones are normalized with
    ``normalize_text``, and empty or duplicate results are removed.

    Returns:
        A DataFrame with a single ``canonical_name`` column and a fresh
        0..n-1 index.

    Raises:
        ValueError: If the file extension is unsupported or the data has no
            ``canonical_name`` column.
    """
    path_lower = DATA_PATH.lower()
    if path_lower.endswith(".json"):
        with open(DATA_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        df = pd.DataFrame(data)
    elif path_lower.endswith(".csv"):
        df = pd.read_csv(DATA_PATH, encoding="utf-8")
    else:
        raise ValueError(f"Unsupported file type: {DATA_PATH}")

    if "canonical_name" not in df.columns:
        raise ValueError("Input file must contain 'canonical_name'.")

    # Drop missing values BEFORE casting to str: astype(str) turns NaN/None
    # into the literal strings "nan"/"None", which would otherwise survive
    # the emptiness filter and be indexed as bogus canonical names.
    names = df["canonical_name"].dropna().astype(str).apply(normalize_text)
    names = names[names != ""]
    df = (
        names.drop_duplicates()
        .to_frame(name="canonical_name")
        .reset_index(drop=True)
    )

    print(f"✅ Loaded {len(df)} canonical entries from {DATA_PATH}.")
    return df


def build_or_load_index(df):
    """Return the model, embeddings, optional FAISS index and lookup table.

    A cached build (META_PATH, EMB_PATH, LOOKUP_PATH and optionally
    INDEX_PATH) is reused only when its metadata matches the current model
    name, row count AND a fingerprint of the canonical names, so editing
    names without changing their count no longer serves stale embeddings.
    Caches written before the fingerprint existed are rebuilt once.

    Args:
        df: DataFrame with a ``canonical_name`` column (one row per name).

    Returns:
        Tuple ``(model, embeddings, index, df_lookup)``. ``index`` is ``None``
        when FAISS is unavailable (or its cached file is missing); callers
        then fall back to cosine similarity over ``embeddings``.
    """
    import hashlib  # local import: keeps this block self-contained

    print("⚙️ Loading model:", MODEL_NAME)
    model = SentenceTransformer(MODEL_NAME)

    texts = df["canonical_name"].tolist()
    # Order-sensitive digest of the names; embeddings are row-aligned with df.
    fingerprint = hashlib.sha256(
        "\n".join(map(str, texts)).encode("utf-8")
    ).hexdigest()

    # Check cache
    if all(os.path.exists(p) for p in (META_PATH, EMB_PATH, LOOKUP_PATH)):
        try:
            with open(META_PATH, "r", encoding="utf-8") as f:
                meta = json.load(f)
            cache_is_current = (
                meta.get("row_count") == len(df)
                and meta.get("model") == MODEL_NAME
                and meta.get("fingerprint") == fingerprint
            )
            if cache_is_current:
                print("🔁 Loading cached embeddings/index...")
                embeddings = np.load(EMB_PATH)
                # dtype=str + keep_default_na=False: names such as "na" or
                # "null" must stay strings instead of being parsed as NaN.
                df_lookup = pd.read_csv(
                    LOOKUP_PATH, dtype=str, keep_default_na=False
                )
                if len(embeddings) != len(df_lookup):
                    raise ValueError("cached embeddings/lookup size mismatch")
                index = None
                if FAISS_AVAILABLE and os.path.exists(INDEX_PATH):
                    index = faiss.read_index(INDEX_PATH)
                    if index.ntotal != len(df_lookup):
                        raise ValueError("cached FAISS index size mismatch")
                    print("✅ FAISS index loaded from disk.")
                else:
                    print("⚠️ FAISS not installed or missing; using cosine similarity.")
                return model, embeddings, index, df_lookup
            print("♻️ Cache mismatch or read error — rebuilding index.")
        except Exception as exc:
            # Best-effort cache: any failure just triggers a rebuild, but the
            # cause is reported instead of being silently discarded.
            print("♻️ Cache mismatch or read error — rebuilding index.")
            print(f"   ({type(exc).__name__}: {exc})")

    # Build from scratch
    print("⚙️ Building embeddings from canonical names...")

    t0 = time.time()
    embeddings = model.encode(
        texts, show_progress_bar=True, normalize_embeddings=True
    )
    print(f"⏱️ Embedding complete in {time.time() - t0:.1f}s for {len(texts)} entries")

    np.save(EMB_PATH, embeddings)
    df.to_csv(LOOKUP_PATH, index=False)

    if FAISS_AVAILABLE:
        # Inner product on L2-normalized vectors equals cosine similarity.
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(np.array(embeddings, dtype="float32"))
        faiss.write_index(index, INDEX_PATH)
        print("✅ FAISS index built and cached.")
    else:
        index = None
        print("⚠️ FAISS not installed — using cosine similarity at query time.")

    # Metadata is written last so it only ever describes a complete cache;
    # an interrupted build leaves old/missing metadata and forces a rebuild.
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(
            {
                "model": MODEL_NAME,
                "row_count": len(df),
                "fingerprint": fingerprint,
            },
            f,
        )

    return model, embeddings, index, df


def search_name(query, model, embeddings, index, df_lookup, top_k=TOP_K_DEFAULT):
    """Find the canonical names that best match ``query``.

    Up to RECALL_K candidates are retrieved by embedding similarity (FAISS
    when an index is supplied, otherwise cosine similarity over
    ``embeddings``). Each candidate is then re-scored as
    ``ALPHA_SEM * semantic + (1 - ALPHA_SEM) * lexical`` and kept only when
    that blended score reaches MIN_CONFIDENCE.

    Args:
        query: Free-text ingredient name.
        model: Encoder exposing ``encode(list[str], normalize_embeddings=True)``.
        embeddings: Embedding matrix row-aligned with ``df_lookup``.
        index: FAISS index over ``embeddings``, or ``None``.
        df_lookup: DataFrame with a ``canonical_name`` column.
        top_k: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``canonical_name``, ``semantic``,
        ``lexical`` and ``score``, sorted by ``score`` descending. Empty when
        the query normalizes to nothing, the lookup table is empty, or no
        candidate reaches the confidence threshold.
    """
    q_norm = normalize_text(query)
    n_candidates = min(RECALL_K, len(df_lookup))
    # A blank query still embeds to *some* vector and used to return
    # arbitrary "confident" matches; there is nothing meaningful to search.
    if not q_norm or n_candidates == 0:
        return []

    q_emb = model.encode([q_norm], normalize_embeddings=True)

    # Step 1: Semantic retrieval
    if FAISS_AVAILABLE and index is not None:
        scores, idx = index.search(np.asarray(q_emb, dtype="float32"), n_candidates)
        idx, scores = idx[0], scores[0]
    else:
        sims = cosine_similarity(q_emb, embeddings)[0]
        idx = np.argsort(sims)[::-1][:n_candidates]
        scores = sims[idx]

    # Step 2: Combine with lexical (fuzzy) re-rank
    names = df_lookup["canonical_name"]
    results = []
    for i, s in zip(idx, scores):
        if i < 0:
            # FAISS pads with -1 when it returns fewer hits than requested.
            continue
        name = names.iloc[i]
        # Canonical names are stored normalized, so compare them against the
        # normalized query; the raw query's case/punctuation would otherwise
        # depress the lexical score.
        lex = fuzz.token_sort_ratio(q_norm, name) / 100
        final = ALPHA_SEM * float(s) + (1 - ALPHA_SEM) * lex
        if final >= MIN_CONFIDENCE:
            results.append({
                "canonical_name": name,
                "semantic": round(float(s), 3),
                "lexical": round(lex, 3),
                "score": round(final, 3)
            })

    results.sort(key=lambda x: x["score"], reverse=True)
    return results[:top_k]


# ===============================================================
# CLI
# ===============================================================
def main():
    """Run the interactive search prompt.

    Loads the canonical names, builds or loads the search index, then reads
    ingredient names from standard input until the user types 'exit' (or
    input ends / is interrupted), printing the ranked matches for each query.
    Blank input is ignored and the prompt is shown again.
    """
    print("🧠 Supplement Synonym Search (Canonical-Only)")
    df = load_canonical_data()
    model, embeddings, index, df_lookup = build_or_load_index(df)

    print("\n✅ Ready. Type any ingredient name (or 'exit' to quit).")
    while True:
        try:
            query = input("\n🔍 Enter ingredient name: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin (piped input) or Ctrl-C / kernel interrupt:
            # leave the loop cleanly instead of dumping a traceback.
            print("\n👋 Exiting program. Goodbye!")
            break

        if query.lower() == "exit":
            print("👋 Exiting program. Goodbye!")
            break
        if not query:
            # Nothing typed; searching for an empty string is meaningless.
            continue

        results = search_name(query, model, embeddings, index, df_lookup)

        print(f"\nResults for '{query}':")
        print("=" * 70)
        if not results:
            print("No confident match (below threshold). Try another term.")
        else:
            for r in results:
                print(
                    f"Canonical: {r['canonical_name']}\n"
                    f"  Scores → semantic: {r['semantic']}, lexical: {r['lexical']}, final: {r['score']}\n"
                    f"{'-'*70}"
                )


if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


🧠 Supplement Synonym Search (Canonical-Only)
✅ Loaded 1090 canonical entries from cleaned_supplements_highacc.json.
⚙️ Loading model: paraphrase-multilingual-MiniLM-L12-v2




⚙️ Building embeddings from canonical names...


Batches: 100%|██████████| 35/35 [00:13<00:00,  2.62it/s]

⏱️ Embedding complete in 13.4s for 1090 entries
✅ FAISS index built and cached.

✅ Ready. Type any ingredient name (or 'exit' to quit).






Results for '':
Canonical: gaba
  Scores → semantic: 0.915, lexical: 0.0, final: 0.641
----------------------------------------------------------------------
Canonical: tein
  Scores → semantic: 0.896, lexical: 0.0, final: 0.627
----------------------------------------------------------------------
Canonical: oleamid
  Scores → semantic: 0.886, lexical: 0.0, final: 0.62
----------------------------------------------------------------------
Canonical: tiggarn t
  Scores → semantic: 0.855, lexical: 0.0, final: 0.599
----------------------------------------------------------------------
Canonical: vitpil
  Scores → semantic: 0.847, lexical: 0.0, final: 0.593
----------------------------------------------------------------------

Results for 'alba':
Canonical: abies alba
  Scores → semantic: 0.912, lexical: 0.571, final: 0.81
----------------------------------------------------------------------
Canonical: salix alba
  Scores → semantic: 0.908, lexical: 0.571, final: 0.807
---------------