<a href="https://colab.research.google.com/github/NINGTANG1124/UPF-HFI/blob/main/notebook/intake24_nova_matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# connect googledrive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Step 1: 读取 intake 数据（含 Descriptionen 和 FoodGroupen）
import pandas as pd
import re

file_path = "/content/drive/MyDrive/UPF-HFI/Bradford_original data/1. Dietmasterfile_foodlevel_clean.xls"
intake_df = pd.read_excel(file_path)


In [3]:
# step2 清洗 Description 和 Foodgroup
intake_df["Foodgroupen_clean"] = (
    intake_df["Foodgroupen"].astype(str).str.lower().str.strip().str.replace(r"\s+", " ", regex=True)
)

intake_df["Descriptionen_clean"] = (
    intake_df["Descriptionen"].astype(str).str.lower().str.strip().str.replace(r"\s+", " ", regex=True)
)


In [4]:
# description
def match_nova_by_description(text):
    text = str(text).lower().strip()

    # === NOVA 1: 饮用水 ===
    if any(w in text for w in ["tap water", "still water", "filtered water", "plain water"]):
        if "flavour" not in text:
            return 1, "plain water (description)"

    # === NOVA 1: 微观乳制品（plain）===
    if any(w in text for w in ["semi skimmed milk", "skimmed milk", "whole milk"]) and "flavour" not in text:
        return 1, "plain milk"
    if any(w in text for w in ["natural yoghurt", "fromage frais"]) and "flavour" not in text:
        return 1, "plain yoghurt"

    # === NOVA 1: 精确 raw/unprocessed ===
    import re

    if re.search(r'\braw\b', text):
        return 1, "raw (word-bound)"

    # === NOVA 3: 自制、轻加工 ===
    if any(w in text for w in ["homemade", "home made"]):
        return 3, "homemade"
    if any(w in text for w in ["boiled", "mashed potato", "baked potato", "jacket potato"]):
        return 3, "boiled/baked/jacket"

    # === NOVA 4: 工业加工麦片（如sachet类）===
    if "porridge sachet" in text or ("porridge" in text and "oat so simple" in text):
        return 4, "sachet porridge (description)"

    # === NOVA 4: takeaway 快餐类 ===
    if "takeaway" in text or "take away" in text:
        return 4, "takeaway food"

    # === NOVA 4: 零食/甜食/加工脂肪 ===
    if any(w in text for w in ["jam", "conserve", "marmalade", "chocolate spread", "ice cream topping", "marzipan"]):
        return 4, "spread/syrup"
    if any(w in text for w in ["cracker", "savoury biscuit", "cheddar biscuit", "cream cracker"]):
        return 4, "processed snack"
    if any(w in text for w in ["sweets", "gums", "jellies", "boiled sweets", "mints", "liquorice", "popcorn"]):
        return 4, "sweet snack"
    if any(w in text for w in ["ice cream", "dessert", "milkshake"]):
        return 4, "processed dessert"
    if any(w in text for w in ["margarine", "clover spread", "flora"]):
        return 4, "processed fat"
    if "flavoured milk" in text or "chocolate milk" in text:
        return 4, "flavoured milk"
    if "ketchup" in text and "home made" not in text:
        return 4, "processed ketchup"
    if "instant" in text and "porridge" not in text:
        return 4, "instant food"
    # === NOVA 4: takeaway 快餐类 ===
    if "takeaway" in text or "take away" in text:
        return 4, "takeaway food"

    return None, None


In [5]:
# group
def match_nova_by_group(group, description):
    group = str(group).lower().strip()
    description = str(description).lower().strip()

    # === NOVA 1: group 精确匹配 water 类 ===
    if group.strip() in ["water", "tap water", "filtered water"]:
        return 1, "water (group)"

    # === NOVA 1: 未加工果蔬、牛奶、酸奶 ===
    if "fresh fruit" in group:
        return 1, "fruit (group)"
    if "dried fruit" in group:
        return 1, "dried fruit (group)"
    if "vegetables" in group and "fried" not in group:
        return 1, "vegetables (group)"
    if any(word in group for word in ["semi skimmed milk", "skimmed milk", "whole milk"]):
        if "flavour" not in description and "fruit" not in description:
            return 1, "milk (group)"
    if any(word in group for word in ["natural yoghurt", "fromage frais"]):
        if "flavour" not in description and "fruit" not in description:
            return 1, "yoghurt/plain dairy (group)"

    # === NOVA 3: 最小加工脂肪 ===
    if any(w in group for w in ["olive oil", "rapeseed oil", "sunflower oil", "vegetable oil", "butter"]):
        return 3, "culinary fat/oil (group)"

    # === NOVA 4: 糖浆、早餐谷物、加工脂肪 ===
    if any(w in group for w in ["margarine", "fat spread", "flora", "dairy fat spreads", "hard marg"]):
        return 4, "processed fat (group)"
    if any(w in group for w in ["jam", "conserve", "marmalade"]):
        return 4, "preserves (group)"
    if "other breakfast cereals" in group or "muesli" in group or "bran flakes" in group:
        return 4, "processed cereal (group)"

    return None, None


In [6]:
print(intake_df.columns)


Index(['SurveyID', 'UserID', 'Source', 'Starttime', 'Submissiontime',
       'Timetocomplete', 'Cookingoilused', 'Diet', 'Foodamount',
       'Reasonforunusualfoodamount',
       ...
       'Modification_Identification', 'discontinued', 'NDNS_Checks',
       'UserID_specific', 'Day', 'weekday', 'ratio', 'UserID_clean',
       'Foodgroupen_clean', 'Descriptionen_clean'],
      dtype='object', length=168)


In [7]:
# ✅ Step 3: 定义主函数 match_nova()
def match_nova(row):
    description = row["Descriptionen_clean"]
    group = row["Foodgroupen_clean"]

    # Step 1: try description
    nova, reason = match_nova_by_description(description)
    if nova is not None:
        return pd.Series([nova, "description: " + reason])

    # Step 2: fallback to group
    nova, reason = match_nova_by_group(group, description)
    if nova is not None:
        return pd.Series([nova, "group: " + reason])

    return pd.Series([None, None])


In [8]:
intake_df[["NOVA_step1", "match_reason"]] = intake_df.apply(match_nova, axis=1)

In [9]:
cols_to_save = [
    "Descriptionen",
    "Descriptionen_clean",
    "Foodgroupen",
    "Foodgroupen_clean",
    "NOVA_step1",      # 改这里
    "match_reason"
]
intake_df[cols_to_save].to_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step1.csv", index=False)


🔹 Step 2: TF-IDF 高阈值匹配（>0.99）【数据源：VKesaite】 Intake 描述 vs FoodName 字段 特点：英国 NDNS 数据，语义贴合度高 匹配后字段： Matched_NOVA Source = 'tfidf_vk_099' Similarity_score

🔹 Step 3: TF-IDF 中阈值匹配（>0.85）【数据源：Giulia FNDDS】 Intake 描述 vs FoodName/Description 字段（视结构而定） 特点：匹配面广但风格偏美式 可作为第二权重匹配源补充空值 匹配后： Source = 'tfidf_giulia_085'

🔹 Step 4: TF-IDF 或 SBERT 语义匹配（>0.85）【数据源：OFF】 两种方式都可用： TF-IDF 匹配 product_name 字段 SBERT 匹配描述（推荐 MiniLM ） 用于最后补充空值，提高 recall（召回率） 匹配后： Source = 'tfidf_off' 或 'sbert_off'

🔹 Step 5: 整合 + 人工补全 + Final 输出

In [10]:
# nova文件数据清洗
# ndns
ndns_df = pd.read_csv("/content/drive/MyDrive/UPF-HFI/nova/NDNS_NOVA_DATABASE.new2023.csv", encoding="ISO-8859-1")
ndns_df.columns = ndns_df.columns.str.strip()
ndns_df = ndns_df[["FoodName", "NOVA"]].dropna()
ndns_df["FoodName_clean"] = ndns_df["FoodName"].str.lower().str.replace(r"[^\w\s]", " ", regex=True).str.replace(r"\s+", " ", regex=True).str.strip()
ndns_df = ndns_df.drop_duplicates(subset=["FoodName_clean"])


In [11]:
# 美国的
giulia_df = pd.read_excel("/content/drive/MyDrive/UPF-HFI/nova/Training Data Original Given by NOVA Researchers - Corrections by Giulia Babak FNDDS 2009-10.xls")
giulia_df.columns = giulia_df.columns.str.strip()

giulia_df = giulia_df[["Main_food_description", "SR_nova_group"]].dropna()
giulia_df = giulia_df.rename(columns={"Main_food_description": "FoodName", "SR_nova_group": "NOVA"})

giulia_df["FoodName_clean"] = giulia_df["FoodName"].str.lower().str.replace(r"[^\w\s]", " ", regex=True).str.replace(r"\s+", " ", regex=True).str.strip()
giulia_df = giulia_df.drop_duplicates(subset=["FoodName_clean"])


In [12]:
import json
import re
import pandas as pd

# off的
off_clean = []
with open("/content/drive/MyDrive/UPF-HFI/nova/openfoodfacts-popular-24.json", "r", encoding="utf-8") as f:
    for line in f:
        try:
            entry = json.loads(line)
            if not isinstance(entry, dict):
                continue  # 跳过非对象
            name = entry.get("product_name") or entry.get("abbreviated_product_name")
            nova = entry.get("nova_group")
            if name and nova:
                name_clean = re.sub(r"[^\w\s]", " ", name.lower())
                name_clean = re.sub(r"\s+", " ", name_clean).strip()
                off_clean.append({"FoodName_clean": name_clean, "NOVA": nova})
        except json.JSONDecodeError:
            continue  # 忽略错误行

off_df = pd.DataFrame(off_clean).drop_duplicates(subset=["FoodName_clean"])


In [13]:
ndns_df.to_csv("NDNS_clean.csv", index=False)
giulia_df.to_csv("Giulia_clean.csv", index=False)
off_df.to_csv("OFF_clean.csv", index=False)


In [14]:
# ✅ Step 2：TF-IDF 匹配未完成部分（基于 NOVA 对照池）

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# ✅ 1. 构建 TF-IDF 匹配参考库（NOVA pool）
nova_pool = pd.concat([ndns_df, giulia_df, off_df], ignore_index=True)
nova_pool = nova_pool.drop_duplicates(subset=["FoodName_clean"])  # 注意：我们用 FoodName_clean 作为 reference

# ✅ 2. 加载 intake 数据（包含 Step1 的结果）
intake_df = pd.read_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step1.csv")

# ✅ 3. 选出 Step1 未匹配的项（缺失 NOVA_step1 的）
mask_missing = intake_df["NOVA_step1"].isna()
query_texts = intake_df.loc[mask_missing, "Descriptionen_clean"].dropna()
query_texts_index = query_texts.index

# ✅ 4. 构建 TF-IDF 向量器并转换为向量
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()
tfidf_ref = vectorizer.fit_transform(nova_pool["FoodName_clean"])  # 参考库：FoodName_clean
tfidf_query = vectorizer.transform(query_texts)                    # 查询项：Descriptionen_clean

# ✅ 5. 计算相似度得分和索引
similarity_matrix = cosine_similarity(tfidf_query, tfidf_ref)
best_match_idx = similarity_matrix.argmax(axis=1)
best_match_score = similarity_matrix.max(axis=1)

# ✅ 6. 从匹配位置提取 nova 分数与匹配名称
matched_nova = nova_pool.iloc[best_match_idx]["NOVA"].values
matched_name = nova_pool.iloc[best_match_idx]["FoodName_clean"].values

# ✅ 7. 回写进 intake 数据
intake_df.loc[query_texts_index, "NOVA_step2"] = matched_nova
intake_df.loc[query_texts_index, "TFIDF_score"] = best_match_score
intake_df.loc[query_texts_index, "TFIDF_match_name"] = matched_name

# ✅ 8. 可选：过滤低于阈值的匹配结果（设置为 None）
threshold = 0.85
intake_df.loc[intake_df["TFIDF_score"] < threshold, ["NOVA_step2", "TFIDF_match_name"]] = [None, None]

# ✅ 9. 保存最终结果
intake_df.to_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step2.csv", index=False)


In [16]:
# 🧩 Step 3：合并 Step1 与 Step2 匹配结果，形成最终 NOVA 列

def combine_nova(row):
    if pd.notna(row["NOVA_step1"]):
        return row["NOVA_step1"]
    elif pd.notna(row["NOVA_step2"]):
        return row["NOVA_step2"]
    else:
        return None

intake_df["NOVA_final"] = intake_df.apply(combine_nova, axis=1)

# 同时保留来源（说明匹配来源是 Step1 / Step2 / None）
def get_reason(row):
    if pd.notna(row["NOVA_step1"]):
        return "Keyword"
    elif pd.notna(row["NOVA_step2"]):
        return "TF-IDF"
    else:
        return "Unmatched"

intake_df["Match_source"] = intake_df.apply(get_reason, axis=1)

# ✅ 保存最终结果
intake_df.to_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step3.csv", index=False)


🎯 Step 4：使用 SBERT 对剩余 NOVA_final 为空的食物进行语义匹配补全

In [17]:
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer, util
import torch


In [18]:
# 2. 加载预训练模型（推荐 all-MiniLM-L6-v2）：
model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [19]:
# 🛠 Step 4：对 NOVA_final 为空的食物进行 SBERT 匹配
# ✅ 1. 准备候选库（对 nova_pool 编码）
# 确保你之前准备好的 nova_pool 有 FoodName_clean 列
ref_texts = nova_pool["FoodName_clean"].tolist()
ref_embeddings = model.encode(ref_texts, convert_to_tensor=True)


In [20]:
# ✅ 2. 选取待匹配食物（NOVA_final 为空）
unmatched_df = intake_df[intake_df["NOVA_final"].isna()].copy()
query_texts = unmatched_df["Descriptionen_clean"].dropna().tolist()
query_indices = unmatched_df["Descriptionen_clean"].dropna().index

query_embeddings = model.encode(query_texts, convert_to_tensor=True)


In [21]:
# ✅ 3. 计算语义相似度并提取匹配结果
cosine_scores = util.pytorch_cos_sim(query_embeddings, ref_embeddings)
top_scores, top_indices = torch.max(cosine_scores, dim=1)

# 写入结果
intake_df.loc[query_indices, "SBERT_score"] = top_scores.cpu().numpy()
intake_df.loc[query_indices, "SBERT_match_name"] = nova_pool.iloc[top_indices.cpu().numpy()]["FoodName_clean"].values
intake_df.loc[query_indices, "NOVA_step4"] = nova_pool.iloc[top_indices.cpu().numpy()]["NOVA"].values


In [22]:
# ✅ 4. 更新最终列：NOVA_final + Match_source
# 如果 Step3 没找到但 Step4 找到了，使用 SBERT 匹配结果
intake_df["NOVA_final"] = intake_df["NOVA_final"].combine_first(intake_df["NOVA_step4"])

# 同样更新匹配来源
intake_df["Match_source"] = intake_df.apply(lambda row: (
    "SBERT" if pd.notna(row["NOVA_step4"]) and pd.isna(row["NOVA_step1"]) and pd.isna(row["NOVA_step2"])
    else row["Match_source"]
), axis=1)


In [23]:
print(intake_df.columns.tolist())


['Descriptionen', 'Descriptionen_clean', 'Foodgroupen', 'Foodgroupen_clean', 'NOVA_step1', 'match_reason', 'NOVA_step2', 'TFIDF_score', 'TFIDF_match_name', 'NOVA_final', 'Match_source', 'SBERT_score', 'SBERT_match_name', 'NOVA_step4']


In [24]:
col_order = [
    "Descriptionen", "Descriptionen_clean", "Foodgroupen", "Foodgroupen_clean",

    # === Step 1: Keyword 匹配 ===
    "NOVA_step1", "match_reason",

    # === Step 2: TF-IDF 匹配 ===
    "NOVA_step2", "TFIDF_match_name", "TFIDF_score",

    # === Step 3: SBERT 匹配 ===
    "NOVA_step4", "SBERT_match_name", "SBERT_score",

    # === 最终结果 ===
    "NOVA_final", "Match_source"
]


# 按列顺序导出
intake_df = intake_df[col_order]
intake_df.to_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step4_final.csv", index=False)


In [25]:
unmatched_final = intake_df[intake_df["NOVA_final"].isin([None, "", "NC"])]
print(f"未匹配上的食物数量：{len(unmatched_final)}")


未匹配上的食物数量：414


In [26]:
# 提取未匹配的行（None, "", "NC"）
unmatched_final = intake_df[intake_df["NOVA_final"].isin([None, "", "NC"])]

# 用 Descriptionen_clean 统计频率
nc_counts = unmatched_final["Descriptionen_clean"].value_counts().reset_index()
nc_counts.columns = ["Descriptionen_clean", "count"]


# 展示前 30 个高频未匹配条目
print("高频未匹配食物（前30）：")
print(nc_counts.head(30))

# 可选：导出成 CSV 文件
nc_counts.to_csv("/content/high_freq_nc_foods.csv", index=False)


高频未匹配食物（前30）：
                                  Descriptionen_clean  count
0   childrens' chewable vitamins with vitamin a (2...    240
1                      vitamin d 400iu (10ug), tablet     34
2        childrens' multivitamin and minerals, tablet     21
3   bassetts chewy early health vitamins with a (4...     21
4                   multivitamin and minerals, tablet     17
5   childrens' vitamin c (120mg) plus zinc (3mg), ...     12
6   prescription iron supplement, 27.5mg (e.g. syt...     12
7                     childrens' multivitamins, drops     12
8                    vitamin d 1000 iu (25ug), tablet      7
9   wellkid multivitamin (age 4-12) (e.g. vitabiot...      7
10  childrens' chewable multivitamins (age 3 plus)...      5
11       calcium (250mg) & magnesium (157mg), capsule      4
12         calcium (500mg) & vitamin d (10ug), tablet      4
13                         magnesium (100 mg), tablet      4
14                          vitamin c (200mg), tablet      3
15        