<a href="https://colab.research.google.com/github/NINGTANG1124/UPF-HFI/blob/main/notebooks/intake24_nova_matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the file paths used in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Step 1: load the food-level intake data (the Descriptionen and Foodgroupen columns are used below)
import pandas as pd
import re

file_path = "/content/drive/MyDrive/UPF-HFI/Bradford_original data/1. Dietmasterfile_foodlevel_clean.xls"
intake_df = pd.read_excel(file_path)


In [None]:
# Step 2: clean the Descriptionen column
def clean_description(desc):
    """Normalise a free-text food description for text matching.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a single space, and runs of whitespace are
    collapsed. Punctuation is replaced by a space rather than deleted so that
    "semi-skimmed" becomes "semi skimmed" instead of the fused token
    "semiskimmed". This is the same normalisation this notebook applies to the
    ``FoodName_clean`` column of the NOVA reference tables, so both sides of
    the later text comparison are tokenised alike.

    Args:
        desc: Raw description. Any value accepted by ``str()``; missing
            values (``None`` / NaN) are allowed.

    Returns:
        The cleaned description, or ``""`` when ``desc`` is missing.
    """
    if pd.isna(desc):
        return ""
    text = str(desc).lower()
    text = re.sub(r"[^\w\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Store the cleaned description in a new column; the raw Descriptionen column is left as it is.
intake_df["food_name_clean"] = intake_df["Descriptionen"].apply(clean_description)
# Replace missing food-group labels with an empty string.
intake_df["Foodgroupen"] = intake_df["Foodgroupen"].fillna("")


In [None]:
# Step 3: rule-based matching on the description field (broad rules first, specific rules last)

# "raw" must be matched as a whole word: a plain substring test also fires on
# "strawberry" and "prawn", which would wrongly short-circuit every later rule.
_RAW_WORD = re.compile(r"\braw\b")

# Ordered (keywords, NOVA group, reason) rules, tested after the "raw" check.
# The first rule with any keyword occurring in the description wins, so the
# order below is significant.
_DESCRIPTION_RULES = (
    # NOVA 1: water
    (("tap water", "still water", "filtered water"), 1, "water"),
    # NOVA 3: homemade / boiled
    (("homemade", "home made"), 3, "homemade item"),
    (("boiled", "mashed potato"), 3, "boiled or mashed"),
    (("porridge made with milk",), 3, "porridge w/ milk"),
    # NOVA 4: processed items
    (("takeaway",), 4, "fast food"),
    (("ice cream topping", "breakfast cereal", "milkshake"), 4, "dessert/snack item"),
    (("flavour", "instant"), 4, "instant/flavoured"),
    (("cracker", "biscuit", "weetabix"), 4, "snack item"),
    # "home made" ketchup never reaches this rule: the homemade rule above returns first.
    (("ketchup",), 4, "processed ketchup"),
    (("squash", "cordial", "carbonated"), 4, "sweetened drink"),
    (("margarine", "clover spread", "flora"), 4, "processed fat"),
    (("nutella",), 4, "branded sweet spread"),
    # NOVA 1: plain dairy. Flavoured products never reach this rule: the
    # "flavour" rule above returns first.
    (("natural yoghurt", "whole milk", "fromage frais"), 1, "plain dairy"),
)


def match_nova_by_description_v3(text):
    """Classify a food description into a NOVA group using keyword rules.

    The description is converted with ``str()`` and lower-cased. The whole
    word "raw" is checked first; after that the rules in
    ``_DESCRIPTION_RULES`` are tried in order using substring matching, and
    the first hit is returned.

    Args:
        text: Food description; any value accepted by ``str()``.

    Returns:
        A ``(nova_group, reason)`` tuple, or ``(None, None)`` when no rule
        matches.
    """
    text = str(text).lower()

    if _RAW_WORD.search(text):
        return 1, "raw"

    for keywords, nova, reason in _DESCRIPTION_RULES:
        if any(keyword in text for keyword in keywords):
            return nova, reason

    return None, None


In [None]:
# Step 4: rule-based matching on the food-group field (broad rules first, specific rules last)
def match_nova_by_group_v2(group):
    """Classify a food-group label into a NOVA group using substring rules.

    The label is converted with ``str()``, lower-cased and stripped. Rules are
    tried in order; a rule fires when at least one of its wanted terms occurs
    in the label and none of its excluded terms does.

    Args:
        group: Food-group label; any value accepted by ``str()``.

    Returns:
        A ``(nova_group, reason)`` tuple, or ``(None, None)`` when no rule
        matches.
    """
    label = str(group).lower().strip()

    # (wanted terms, excluded terms, NOVA group, reason)
    rules = (
        # NOVA 1
        (("fresh fruit",), (), 1, "fruit (group)"),
        (("dried fruit",), (), 1, "dried fruit (group)"),
        (("vegetables",), ("fried",), 1, "vegetables (group)"),
        # NOVA 3
        (("monounsaturated",), (), 3, "culinary fat (mono)"),
        (("dairy fat spreads", "hard marg"), (), 3, "dairy fat spread"),
        # NOVA 4
        (
            ("other breakfast cereals", "muesli", "bran flakes"),
            (),
            4,
            "processed cereal (group)",
        ),
        (("ice cream", "desserts and lollies"), (), 4, "ice cream (group)"),
        (
            (
                "sweets", "toffee", "boiled sweets", "gums", "jellies",
                "mints", "liquorice", "raw jelly", "popcorn",
            ),
            (),
            4,
            "sweets/snack (group)",
        ),
    )

    for wanted, excluded, nova, reason in rules:
        has_wanted = any(term in label for term in wanted)
        has_excluded = any(term in label for term in excluded)
        if has_wanted and not has_excluded:
            return nova, reason

    return None, None

In [None]:
# Step 5: apply the matching rules (description first, food group as fallback)
nova_cols = ["NOVA_by_desc", "Match_reason"]

# Build each result frame with explicit column labels and the matching row
# index, so every assignment below lines up by label.
desc_matches = pd.DataFrame(
    [match_nova_by_description_v3(desc) for desc in intake_df["Descriptionen"]],
    index=intake_df.index,
    columns=nova_cols,
)
for col in nova_cols:
    intake_df[col] = desc_matches[col]

mask_unmatched = intake_df["NOVA_by_desc"].isna()

# `.loc[rows, cols] = DataFrame` aligns the right-hand side on column labels.
# A frame produced by `apply(lambda x: pd.Series(...))` has columns 0 and 1,
# which never match "NOVA_by_desc" / "Match_reason", so the fallback would only
# write NaN. Labelled columns assigned one at a time avoid that.
group_matches = pd.DataFrame(
    [match_nova_by_group_v2(grp) for grp in intake_df.loc[mask_unmatched, "Foodgroupen"]],
    index=intake_df.index[mask_unmatched],
    columns=nova_cols,
)
for col in nova_cols:
    intake_df.loc[mask_unmatched, col] = group_matches[col]

# Step 6: preview the first 20 matched rows
matched_sample = intake_df.loc[
    intake_df["NOVA_by_desc"].notna(),
    ["food_name_clean", "Foodgroupen", "NOVA_by_desc", "Match_reason"],
].head(20)
matched_sample

In [None]:
# Copy the rule-based result into NOVA_step1, the column name the later steps read.
intake_df["NOVA_step1"] = intake_df["NOVA_by_desc"]


In [None]:
# Columns written out after the rule-based step.
cols_to_save = [
    "food_name_clean", "Descriptionen", "Foodgroupen",
    "NOVA_step1", "Match_reason"
]

# Write the Step 1 result to Google Drive as an Excel workbook (no index column).
intake_df[cols_to_save].to_excel("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step1.xlsx", index=False)


In [None]:
# Write the same columns as CSV; the TF-IDF step later reloads this file.
intake_df[cols_to_save].to_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step1.csv", index=False)


🔹 Step 2: TF-IDF 高阈值匹配（>0.99）【数据源：VKesaite】。匹配对象：Intake 描述 vs FoodName 字段。特点：英国 NDNS 数据，语义贴合度高。匹配后字段：Matched_NOVA、Source = 'tfidf_vk_099'、Similarity_score。

🔹 Step 3: TF-IDF 中阈值匹配（>0.85）【数据源：Giulia FNDDS】。匹配对象：Intake 描述 vs FoodName/Description 字段（视结构而定）。特点：匹配面广但风格偏美式，可作为第二权重匹配源补充空值。匹配后：Source = 'tfidf_giulia_085'。

🔹 Step 4: TF-IDF 或 SBERT 语义匹配（>0.85）【数据源：OFF】。两种方式都可用：TF-IDF 匹配 product_name 字段；SBERT 匹配描述（推荐 MiniLM）。用于最后补充空值，提高 recall（召回率）。匹配后：Source = 'tfidf_off' 或 'sbert_off'。

🔹 Step 5: 整合 + 人工补全 + Final 输出

In [None]:
# Clean the NOVA reference files
# NDNS reference table
ndns_df = pd.read_csv(
    "/content/drive/MyDrive/UPF-HFI/nova/NDNS_NOVA_DATABASE.new2023.csv",
    encoding="ISO-8859-1",
)
ndns_df.columns = ndns_df.columns.str.strip()
# Keep only rows where both the food name and the NOVA value are present.
ndns_df = ndns_df[["FoodName", "NOVA"]].dropna()

# Normalise names: lower-case, punctuation -> space, collapse whitespace, trim.
ndns_names = ndns_df["FoodName"].str.lower()
ndns_names = ndns_names.str.replace(r"[^\w\s]", " ", regex=True)
ndns_names = ndns_names.str.replace(r"\s+", " ", regex=True)
ndns_df["FoodName_clean"] = ndns_names.str.strip()

# One row per cleaned name (the first occurrence is kept).
ndns_df = ndns_df.drop_duplicates(subset=["FoodName_clean"])


In [None]:
# Load the FNDDS training table and print its (whitespace-stripped) column names.
giulia_df = pd.read_excel("/content/drive/MyDrive/UPF-HFI/nova/Training Data Original Given by NOVA Researchers - Corrections by Giulia Babak FNDDS 2009-10.xls")
giulia_df.columns = giulia_df.columns.str.strip()
print(giulia_df.columns.tolist())  # inspect to find the correct column names


In [None]:
# US reference table (FNDDS 2009-10)
giulia_df = pd.read_excel(
    "/content/drive/MyDrive/UPF-HFI/nova/Training Data Original Given by NOVA Researchers - Corrections by Giulia Babak FNDDS 2009-10.xls"
)
giulia_df.columns = giulia_df.columns.str.strip()

# Keep complete rows only and give the columns the same names as the NDNS table.
giulia_df = giulia_df[["Main_food_description", "SR_nova_group"]].dropna()
giulia_df = giulia_df.rename(
    columns={"Main_food_description": "FoodName", "SR_nova_group": "NOVA"}
)

# Normalise names: lower-case, punctuation -> space, collapse whitespace, trim.
giulia_names = giulia_df["FoodName"].str.lower()
giulia_names = giulia_names.str.replace(r"[^\w\s]", " ", regex=True)
giulia_names = giulia_names.str.replace(r"\s+", " ", regex=True)
giulia_df["FoodName_clean"] = giulia_names.str.strip()

# One row per cleaned name (the first occurrence is kept).
giulia_df = giulia_df.drop_duplicates(subset=["FoodName_clean"])


In [None]:
# Open Food Facts reference table
# `json` is imported here because no earlier cell of this notebook imports it.
import json

off_clean = []
with open("/content/drive/MyDrive/UPF-HFI/nova/openfoodfacts-popular-24.json", "r", encoding="utf-8") as f:
    # The file is read one JSON document per line.
    for line in f:
        # Keep the try body minimal: only the parse itself can raise JSONDecodeError.
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip lines that are not valid JSON
        if not isinstance(entry, dict):
            continue  # skip anything that is not a JSON object

        name = entry.get("product_name") or entry.get("abbreviated_product_name")
        nova = entry.get("nova_group")
        # A usable record needs a non-empty text name and a truthy NOVA value;
        # a non-string name would otherwise fail on .lower().
        if not (isinstance(name, str) and name and nova):
            continue

        # Same normalisation as the other reference tables.
        name_clean = re.sub(r"[^\w\s]", " ", name.lower())
        name_clean = re.sub(r"\s+", " ", name_clean).strip()
        off_clean.append({"FoodName_clean": name_clean, "NOVA": nova})

# Naming the columns keeps drop_duplicates working even when no record was usable.
off_df = pd.DataFrame(off_clean, columns=["FoodName_clean", "NOVA"]).drop_duplicates(subset=["FoodName_clean"])


In [None]:
# Save the three cleaned reference tables as CSV files (relative paths, i.e. the current working directory).
ndns_df.to_csv("NDNS_clean.csv", index=False)
giulia_df.to_csv("Giulia_clean.csv", index=False)
off_df.to_csv("OFF_clean.csv", index=False)


In [None]:
# Step 2: TF-IDF matching for the rows Step 1 left unclassified (against the pooled NOVA references)
# These names are imported here because no earlier cell of this notebook imports them.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Pool the reference tables. drop_duplicates keeps the first occurrence, so
#    the concatenation order decides which source wins for a repeated name.
nova_pool = pd.concat([ndns_df, giulia_df, off_df], ignore_index=True)
nova_pool = nova_pool.drop_duplicates(subset=["FoodName_clean"])

# 2. Reload the intake data together with the Step 1 result.
intake_df = pd.read_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step1.csv")

# 3. Rows without a Step 1 result; rows without a cleaned name cannot be matched.
mask_missing = intake_df["NOVA_step1"].isna()
query_texts = intake_df.loc[mask_missing, "food_name_clean"].dropna()
query_texts_index = query_texts.index

# 4./5. Score each distinct description once. The same text always gets the
#       same best match, so de-duplicating only shrinks the similarity matrix.
result_cols = ["NOVA_step2", "TFIDF_score", "TFIDF_match_name"]
unique_texts = query_texts.drop_duplicates().tolist()
if unique_texts:
    vectorizer = TfidfVectorizer()
    tfidf_ref = vectorizer.fit_transform(nova_pool["FoodName_clean"])
    tfidf_query = vectorizer.transform(unique_texts)

    similarity_matrix = cosine_similarity(tfidf_query, tfidf_ref)
    best_match_idx = similarity_matrix.argmax(axis=1)
    best_match_score = similarity_matrix.max(axis=1)

    # argmax yields row positions in tfidf_ref, which follow nova_pool's row
    # order, hence the positional .iloc lookup.
    best_rows = nova_pool.iloc[best_match_idx]
    best_by_text = pd.DataFrame(
        {
            "NOVA_step2": best_rows["NOVA"].to_numpy(),
            "TFIDF_score": best_match_score,
            "TFIDF_match_name": best_rows["FoodName_clean"].to_numpy(),
        },
        index=unique_texts,
    )
else:
    # Nothing to match: keep the result columns so the later cells still find them.
    best_by_text = pd.DataFrame(columns=result_cols)

# 6. Write back. Assigning a Series aligns on the row index, so rows that were
#    not queried get NaN in all three columns.
for col in result_cols:
    intake_df[col] = query_texts.map(best_by_text[col])
intake_df["TFIDF_score"] = intake_df["TFIDF_score"].astype(float)

# Discard matches below the similarity threshold; the score itself is kept for inspection.
threshold = 0.85
intake_df.loc[intake_df["TFIDF_score"] < threshold, ["NOVA_step2", "TFIDF_match_name"]] = [None, None]

# Save the result
intake_df.to_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step2.csv", index=False)


In [None]:
# Step 2 diagnostics
# 1. Number and share of Step 2 targets that received a match
total_tfidf_targets = intake_df["NOVA_step1"].isna().sum()
matched_tfidf = intake_df["NOVA_step2"].notna().sum()
match_rate_tfidf = matched_tfidf / total_tfidf_targets

print(f"🔍 Step 2（TF-IDF）匹配成功数: {matched_tfidf} / {total_tfidf_targets} = {match_rate_tfidf:.2%}")

# 2. Descriptive statistics of the similarity scores
print("\n📊 TF-IDF 匹配得分描述性统计：")
print(intake_df["TFIDF_score"].describe())

# 3. Lowest-scoring examples (score < 0.85). NaN compares as False, so rows
#    without a score are excluded by the comparison itself.
below_cutoff = intake_df["TFIDF_score"] < 0.85
low_confidence = intake_df[below_cutoff].sort_values(by="TFIDF_score")
low_confidence[["food_name_clean", "TFIDF_match_name", "TFIDF_score", "NOVA_step2"]].head(10)


In [None]:
# Step 3: merge the Step 1 and Step 2 results into the final NOVA column

def combine_nova(row):
    """Return the row's NOVA value, preferring Step 1 over Step 2.

    Args:
        row: Mapping-like row with ``"NOVA_step1"`` and ``"NOVA_step2"``.

    Returns:
        The first non-missing value among ``NOVA_step1`` and ``NOVA_step2``,
        or ``None`` when both are missing.
    """
    for column in ("NOVA_step1", "NOVA_step2"):
        value = row[column]
        if pd.notna(value):
            return value
    return None

# Evaluate combine_nova once per row (axis=1) to fill the final NOVA column.
intake_df["NOVA_final"] = intake_df.apply(combine_nova, axis=1)

# Also record where each value came from (Step 1 / Step 2 / no match)
def get_reason(row):
    """Name the step that supplied the row's NOVA value.

    Args:
        row: Mapping-like row with ``"NOVA_step1"`` and ``"NOVA_step2"``.

    Returns:
        ``"Keyword"`` when ``NOVA_step1`` is present, otherwise ``"TF-IDF"``
        when ``NOVA_step2`` is present, otherwise ``"Unmatched"``.
    """
    if pd.notna(row["NOVA_step1"]):
        return "Keyword"
    return "TF-IDF" if pd.notna(row["NOVA_step2"]) else "Unmatched"

# Label every row with the step that supplied its NOVA value.
intake_df["Match_source"] = intake_df.apply(get_reason, axis=1)

# Save the combined result
intake_df.to_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step3.csv", index=False)


In [None]:
# Step 3 diagnostics:
# 1. Distribution of match sources (counts, then percentages)
print("\n📊 匹配来源分布统计：")
print(intake_df["Match_source"].value_counts(dropna=False))
print("\n📊 匹配来源百分比：")
print(intake_df["Match_source"].value_counts(normalize=True, dropna=False).map("{:.2%}".format))

# 2. Optional: cross-tabulate Match_source against Foodgroupen for a deeper look
# pd.crosstab(intake_df["Foodgroupen"], intake_df["Match_source"])


🎯 Step 4：使用 SBERT 对剩余 NOVA_final 为空的食物进行语义匹配补全

In [None]:
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer, util
import torch


In [None]:
# 2. Load the pretrained sentence-embedding model (all-MiniLM-L6-v2 is the recommended choice)
model = SentenceTransformer("all-MiniLM-L6-v2")


In [None]:
# Step 4: SBERT matching for the rows whose NOVA_final is still empty
# 1. Prepare the candidate set by encoding every cleaned name in nova_pool
# (nova_pool must already exist and have a FoodName_clean column)
ref_texts = nova_pool["FoodName_clean"].tolist()
ref_embeddings = model.encode(ref_texts, convert_to_tensor=True)


In [None]:
# 2. Select the rows to match (NOVA_final still empty)
unmatched_df = intake_df[intake_df["NOVA_final"].isna()].copy()

# Drop missing names once and derive both the row labels and the text list
# from the same Series, so the two stay in the same order.
pending_names = unmatched_df["food_name_clean"].dropna()
query_indices = pending_names.index
query_texts = pending_names.tolist()

query_embeddings = model.encode(query_texts, convert_to_tensor=True)


In [None]:
# 3. Compute the semantic similarity and take the best reference entry per query
cosine_scores = util.pytorch_cos_sim(query_embeddings, ref_embeddings)
top_scores, top_indices = torch.max(cosine_scores, dim=1)

# Convert the winning positions to NumPy once and look up the reference rows by position.
best_positions = top_indices.cpu().numpy()
best_refs = nova_pool.iloc[best_positions]

# Write the results back.
# NOTE(review): no minimum-score cut-off is applied here, unlike the TF-IDF
# step; every queried row receives its best match. Confirm this is intended.
intake_df.loc[query_indices, "SBERT_score"] = top_scores.cpu().numpy()
intake_df.loc[query_indices, "SBERT_match_name"] = best_refs["FoodName_clean"].values
intake_df.loc[query_indices, "NOVA_step4"] = best_refs["NOVA"].values


In [None]:
# 4. Update the final columns: NOVA_final and Match_source
# Where the earlier steps found nothing but Step 4 did, take the SBERT result.
intake_df["NOVA_final"] = intake_df["NOVA_final"].combine_first(intake_df["NOVA_step4"])


# Update the match source in the same way.
def _source_after_sbert(row):
    """Return "SBERT" for rows filled only by Step 4, else the existing source."""
    filled_by_sbert_only = (
        pd.notna(row["NOVA_step4"])
        and pd.isna(row["NOVA_step1"])
        and pd.isna(row["NOVA_step2"])
    )
    return "SBERT" if filled_by_sbert_only else row["Match_source"]


intake_df["Match_source"] = intake_df.apply(_source_after_sbert, axis=1)


In [None]:
# Move NOVA_final to the last column
col_order = [col for col in intake_df.columns if col != "NOVA_final"] + ["NOVA_final"]
intake_df = intake_df[col_order]

# Save as CSV
intake_df.to_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step4_final.csv", index=False)


In [None]:
# Count the rows that still have no usable NOVA value: missing, empty string, or "NC".
# Missing values are tested with isna(): after the CSV round-trip they are NaN,
# and `isin([None, ...])` is not a reliable test for NaN.
final_nova = intake_df["NOVA_final"]
unmatched_final = intake_df[final_nova.isna() | final_nova.isin(["", "NC"])]
print(f"❌ 实际未匹配上的食物数量（含 NC）：{len(unmatched_final)}")


In [None]:
# Extract the rows without a usable NOVA value: missing, empty string, or "NC".
# Missing values are tested with isna(): after the CSV round-trip they are NaN,
# and `isin([None, ...])` is not a reliable test for NaN.
final_nova = intake_df["NOVA_final"]
unmatched_final = intake_df[final_nova.isna() | final_nova.isin(["", "NC"])]

# Count how often each cleaned food name occurs among them
nc_counts = unmatched_final["food_name_clean"].value_counts().reset_index()
nc_counts.columns = ["food_name_clean", "count"]

# Show the 30 most frequent unmatched entries
print("🍽️ 高频未匹配食物（前30）：")
print(nc_counts.head(30))

# Optional: export the counts as a CSV file
nc_counts.to_csv("/content/high_freq_nc_foods.csv", index=False)
