In [2]:
# connect googledrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
# ========= Step2（CSV-only）：用 NDNS_NOVA_DATABASE.new2023.csv 做 Fuzzy / TF-IDF / SBERT =========
# 输出：/content/drive/MyDrive/UPF-HFI/NOVA classification/outcome/step2_from_CSV_matches.xlsx

import sys, subprocess, os, re, unicodedata, warnings, numpy as np, pandas as pd

# ---- 依赖 ----
def ensure(module_name, pip_name=None):
    try: __import__(module_name)
    except ModuleNotFoundError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pip_name or module_name])
        __import__(module_name)

ensure("rapidfuzz")
ensure("sklearn", "scikit-learn")
ensure("openpyxl")
ensure("sentence_transformers", "sentence-transformers")

from rapidfuzz import process, fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ---- 路径（按需改 Step1 路径；CSV 已固定为你提供的文件） ----
STEP1_PATH   = "/content/drive/MyDrive/UPF-HFI/NOVA classification/outcome/step1.xlsx"
REF_CSV_PATH = "/content/drive/MyDrive/UPF-HFI/NOVA classification/ref/NDNS_NOVA_DATABASE.new2023.csv"
OUT_PATH     = "/content/drive/MyDrive/UPF-HFI/NOVA classification/outcome/step2_from_CSV_matches.xlsx"
THRESH_PCT   = 80.0

# ========= 工具函数 =========
def norm(s):
    if pd.isna(s): return ""
    s = str(s).lower()
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r"[^a-z0-9&/\+\-\%\(\) ]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def pick_col(df, candidates, name_for_err, required=True):
    low = {c.lower(): c for c in df.columns}
    for c in candidates:
        if c in df.columns: return c
        if c.lower() in low: return low[c.lower()]
    if required:
        raise KeyError(f"找不到 {name_for_err} 列（候选：{candidates}）；当前列名：{list(df.columns)}")
    return None

def looks_star(x):
    return ('*' in str(x)) if x is not None else False

# ========= Step1：筛出未匹配项（空 / 含* / 群组NOVA列含*）=========
def load_unmatched_from_step1(path):
    df = pd.read_excel(path, engine="openpyxl")

    desc_col   = pick_col(df, ["Descriptionen","Description","food_desc","description","desc","item",
                               "short_desc","food_name","food","name","intake24_desc"], "食物描述")
    nova1_col  = pick_col(df, ["NOVA_step1","nova_step1","nova"], "NOVA_step1")
    group_main = pick_col(df, ["Foodgroupen","group","food_group","main_group","category",
                               "ndns_group","ref_group"], "食物分组", required=False)
    subgroup   = pick_col(df, ["Subgroupcode_clean","Subgroupcode","subgroup","sub_group","subgroup_code"],
                               "子组", required=False)

    gn_cols = [c for c in df.columns if ("group" in c.lower() and "nova" in c.lower())]

    mask_empty      = df[nova1_col].isna() | df[nova1_col].astype(str).str.strip().eq("")
    mask_star_self  = df[nova1_col].astype(str).map(looks_star)
    mask_star_group = pd.Series(False, index=df.index)
    if gn_cols:
        mask_star_group = pd.DataFrame({c: df[c].astype(str).map(looks_star) for c in gn_cols}).any(axis=1)

    unmatched_mask = mask_empty | mask_star_self | mask_star_group
    un = df[unmatched_mask].copy()

    def build_text(row):
        base = norm(row[desc_col]); tags = []
        if group_main and pd.notna(row.get(group_main)): tags.append(norm(row[group_main]))
        if subgroup   and pd.notna(row.get(subgroup))  : tags.append(norm(row[subgroup]))
        return base if not tags else f"{base} [group:{'|'.join(tags)}]"

    un["__text__"] = un.apply(build_text, axis=1)
    un["__id__"]   = un.index.astype(str)

    print(f"[Step1] 未匹配总数: {len(un)} | 空: {int(mask_empty.sum())} | 自身含*: {int(mask_star_self.sum())}"
          + (f" | 群组含*: {int(mask_star_group.sum())}" if gn_cols else " | 群组含*: 0"))
    if gn_cols: print(f"[Step1] 检出群组NOVA列：{gn_cols}")
    return un, desc_col, (group_main or subgroup)

# ========= 读取 CSV 参考库 =========
def read_csv_robust(path):
    # 自动推断分隔符与编码
    for enc in ["utf-8", "utf-8-sig", "latin1"]:
        try:
            return pd.read_csv(path, sep=None, engine="python", encoding=enc)
        except Exception:
            continue
    # 最后兜底
    return pd.read_csv(path)

def load_ref_from_csv(path_csv):
    if not os.path.exists(path_csv):
        raise FileNotFoundError(f"参考 CSV 不存在：{path_csv}")
    raw = read_csv_robust(path_csv)
    if raw.empty:
        raise RuntimeError("参考 CSV 为空。")

    # 兼容列名
    desc_c = pick_col(raw, ["Descriptionen","Description","ref_desc","food_desc","desc","name",
                            "label","item","food_name","Foodname","ndns_desc","ndns_name"],
                      "参考-描述")
    nova_c = pick_col(raw, ["NOVA","nova","nova_class","class","nova_code","nova_numeric"],
                      "参考-NOVA")
    group_c = pick_col(raw, ["Foodgroupen","ref_group","group","food_group","category",
                             "main_group","group_desc","Foodgroup"],
                       "参考-分组", required=False)

    ref = pd.DataFrame({
        "ref_desc":  raw[desc_c].astype(str),
        "ref_group": raw[group_c].astype(str) if group_c else "",
        "ref_nova":  raw[nova_c],
        "__file__":  os.path.basename(path_csv)
    })
    ref = ref.dropna(subset=["ref_nova"]).reset_index(drop=True)
    ref["ref_text"] = ref["ref_desc"].map(norm) + " [group:" + ref["ref_group"].map(norm) + "]"
    print(f"[参考库-CSV] 记录数：{len(ref)} | 来自：{os.path.basename(path_csv)}")
    return ref

# ========= 三种匹配 =========
def match_fuzzy(un_df, ref_df, thresh_pct=80.0):
    ref_list = ref_df["ref_text"].tolist()
    hits = []
    for t in un_df["__text__"].tolist():
        if not t: hits.append({"best_idx": None, "score_pct": 0.0}); continue
        m = process.extractOne(t, ref_list, scorer=fuzz.WRatio)
        if m is None: hits.append({"best_idx": None, "score_pct": 0.0})
        else:
            _, sc, pos = m; hits.append({"best_idx": int(pos), "score_pct": float(sc)})
    res = pd.DataFrame(hits); res["accepted"] = res["score_pct"] >= thresh_pct
    rate = res["accepted"].mean() if len(res) else 0.0
    print(f"[Fuzzy] 覆盖率：{rate:.1%}（{res['accepted'].sum()}/{len(res)}），阈值={thresh_pct:.0f}%")
    res = pd.concat([un_df[["__id__","__text__"]].reset_index(drop=True), res], axis=1)
    take = ref_df.loc[res["best_idx"].dropna().astype(int), ["ref_desc","ref_group","ref_nova","ref_text"]].reset_index(drop=True)
    res.loc[res["best_idx"].notna(), ["ref_desc","ref_group","ref_nova","ref_text"]] = take.values
    res["method"] = "fuzzy"; return res

def match_tfidf(un_df, ref_df, thresh_pct=80.0):
    vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
    X_ref = vec.fit_transform(ref_df["ref_text"].tolist())
    X_q   = vec.transform(un_df["__text__"].tolist())
    S = cosine_similarity(X_q, X_ref)
    best_idx = S.argmax(axis=1); best_score = S[np.arange(S.shape[0]), best_idx]
    res = pd.DataFrame({"best_idx": best_idx.astype(int), "score_pct": best_score * 100.0})
    res["accepted"] = res["score_pct"] >= thresh_pct
    rate = res["accepted"].mean() if len(res) else 0.0
    print(f"[TF-IDF] 覆盖率：{rate:.1%}（{res['accepted'].sum()}/{len(res)}），阈值={thresh_pct:.0f}%")
    res = pd.concat([un_df[["__id__","__text__"]].reset_index(drop=True), res], axis=1)
    take = ref_df.loc[res["best_idx"], ["ref_desc","ref_group","ref_nova","ref_text"]].reset_index(drop=True)
    res[["ref_desc","ref_group","ref_nova","ref_text"]] = take.values
    res["method"] = "tfidf"; return res

def match_sbert(un_df, ref_df, thresh_pct=80.0):
    try:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer("all-MiniLM-L6-v2")
        emb_ref = model.encode(ref_df["ref_text"].tolist(), show_progress_bar=False, normalize_embeddings=True)
        emb_q   = model.encode(un_df["__text__"].tolist(), show_progress_bar=False, normalize_embeddings=True)
        S = cosine_similarity(emb_q, emb_ref)
        best_idx = S.argmax(axis=1); best_score = S[np.arange(S.shape[0]), best_idx]
        res = pd.DataFrame({"best_idx": best_idx.astype(int), "score_pct": best_score * 100.0})
        res["accepted"] = res["score_pct"] >= thresh_pct
        rate = res["accepted"].mean() if len(res) else 0.0
        print(f"[SBERT] 覆盖率：{rate:.1%}（{res['accepted'].sum()}/{len(res)}），阈值={thresh_pct:.0f}%")
        res = pd.concat([un_df[["__id__","__text__"]].reset_index(drop=True), res], axis=1)
        take = ref_df.loc[res["best_idx"], ["ref_desc","ref_group","ref_nova","ref_text"]].reset_index(drop=True)
        res[["ref_desc","ref_group","ref_nova","ref_text"]] = take.values
        res["method"] = "sbert"; return res
    except Exception as e:
        print(f"[SBERT] 跳过（原因：{e}）")
        return pd.DataFrame(columns=["__id__","__text__","best_idx","score_pct","accepted",
                                     "ref_desc","ref_group","ref_nova","ref_text","method"])

# ========= 主流程 =========
unmatched, DESC_COL, GROUP_COL = load_unmatched_from_step1(STEP1_PATH)
ref_all = load_ref_from_csv(REF_CSV_PATH)

# 三法匹配
res_fuzzy = match_fuzzy(unmatched, ref_all, THRESH_PCT)
res_tfidf = match_tfidf(unmatched, ref_all, THRESH_PCT)
res_sbert = match_sbert(unmatched, ref_all, THRESH_PCT)

# 共识：取最高分
cands = [d for d in [res_fuzzy, res_tfidf, res_sbert] if not d.empty]
if cands:
    long = pd.concat(cands, ignore_index=True)
    idx_max = long.groupby("__id__")["score_pct"].idxmax()
    consensus = long.loc[idx_max].copy().reset_index(drop=True)
    consensus["accepted"] = consensus["score_pct"] >= THRESH_PCT
    rate_cons = consensus["accepted"].mean() if len(consensus) else 0.0
    print(f"[Consensus] 覆盖率：{rate_cons:.1%}（{consensus['accepted'].sum()}/{len(consensus)}），阈值={THRESH_PCT:.0f}%")
else:
    consensus = pd.DataFrame()
    print("[Consensus] 无可用候选。")

# 导出
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
with pd.ExcelWriter(OUT_PATH, engine="openpyxl") as w:
    unmatched.to_excel(w, index=False, sheet_name="unmatched_input")
    if not res_fuzzy.empty: res_fuzzy.to_excel(w, index=False, sheet_name="fuzzy")
    if not res_tfidf.empty: res_tfidf.to_excel(w, index=False, sheet_name="tfidf")
    if not res_sbert.empty: res_sbert.to_excel(w, index=False, sheet_name="sbert")
    if not consensus.empty: consensus.to_excel(w, index=False, sheet_name="consensus")
print(f"✅ 已保存：{OUT_PATH}")

# 低分样例（便于手工复核）
def peek_low(df, k=8):
    if df.empty: return
    bad = df.loc[~df["accepted"]].sort_values("score_pct", ascending=True).head(k)
    if bad.empty: return
    print("\n—— 低分样例（前几条）——")
    for _, r in bad.iterrows():
        print(f"- ({r['method']:6}) {r['score_pct']:5.1f}% | {r['__text__']}  ==>  {str(r.get('ref_desc',''))[:80]}")

for d in [res_fuzzy, res_tfidf, res_sbert, consensus]:
    peek_low(d, k=6)


[Step1] 未匹配总数: 8413 | 空: 356 | 自身含*: 8057 | 群组含*: 0
[参考库-CSV] 记录数：32210 | 来自：NDNS_NOVA_DATABASE.new2023.csv
[Fuzzy] 覆盖率：98.0%（8246/8413），阈值=80%
[TF-IDF] 覆盖率：1.2%（99/8413），阈值=80%


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[SBERT] 覆盖率：29.2%（2454/8413），阈值=80%
[Consensus] 覆盖率：98.8%（8312/8413），阈值=80%
✅ 已保存：/content/drive/MyDrive/UPF-HFI/NOVA classification/outcome/step2_from_CSV_matches.xlsx

—— 低分样例（前几条）——
- (fuzzy )  65.0% | tahini [group:misc|56r]  ==>  BEER:BEST BITTER;CANNED EG. WHITBREAD TROPHY;TANKARD;WORTHIN
- (fuzzy )  68.4% | flaked almonds [group:nuts|56r]  ==>  NOUGAT
- (fuzzy )  68.4% | flaked almonds [group:nuts|56r]  ==>  NOUGAT
- (fuzzy )  68.8% | melon honeydew [group:fresh fruit|40r]  ==>  HONEYGAR
- (fuzzy )  68.8% | melon honeydew [group:fresh fruit|40r]  ==>  HONEYGAR
- (fuzzy )  68.8% | melon honeydew [group:fresh fruit|40r]  ==>  HONEYGAR

—— 低分样例（前几条）——
- (tfidf )  11.4% |  [group:nan]  ==>  PEPPER
- (tfidf )  11.4% |  [group:nan]  ==>  PEPPER
- (tfidf )  11.4% |  [group:nan]  ==>  PEPPER
- (tfidf )  11.4% |  [group:nan]  ==>  PEPPER
- (tfidf )  11.4% |  [group:nan]  ==>  PEPPER
- (tfidf )  11.4% |  [group:nan]  ==>  PEPPER

—— 低分样例（前几条）——
- (sbert )  54.8% | bottle gourd dumpling cu