<a href="https://colab.research.google.com/github/NINGTANG1124/UPF-HFI/blob/main/Untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; every data path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Read intake data (including Descriptionen and FoodGroupen)
import pandas as pd
import re

# Food-level intake workbook. Later cells read its "Descriptionen" and
# "Foodgroupen" columns for matching.
file_path = "/content/drive/MyDrive/UPF-HFI/Bradford_original data/1. Dietmasterfile_foodlevel_clean.xls"
intake_df = pd.read_excel(file_path)

# Define text cleaning function
def clean_text(col):
    """Normalise a text column so it can be used for joins and keyword matching.

    Parameters
    ----------
    col : pandas.Series
        Raw column; values may be strings, numbers or missing (NaN/None).

    Returns
    -------
    pandas.Series
        Strings that are lower-cased, stripped of leading/trailing whitespace
        and have every internal whitespace run collapsed to a single space.
        Missing values are returned as the empty string "".
    """
    normalised = (
        col.astype(str)
        .str.lower()
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
    )
    # astype(str) turns NaN/None into the literal text "nan"/"none". Blank
    # those out so a missing value can never join or match as if it were
    # real text, while the result stays a plain string column.
    return normalised.mask(col.isna(), "")

# Add a normalised "<column>_clean" copy of each text field used for matching.
for clean_name, raw_name in (
    ("Foodgroupen_clean", "Foodgroupen"),
    ("Descriptionen_clean", "Descriptionen"),
):
    intake_df[clean_name] = clean_text(intake_df[raw_name])


In [3]:
# Path to the att3 workbook
att3_path = "/content/drive/MyDrive/UPF-HFI/nova matching files/att3-excel.xlsx"

# Load att3
att3 = pd.read_excel(att3_path)

# Normalise the "Subsidiary food group name" column (lower-case, trimmed,
# whitespace collapsed) so it can be joined to the cleaned intake food group.
att3["desc_clean"] = att3["Subsidiary food group name"]\
    .astype(str).str.lower().str.strip().str.replace(r"\s+", " ", regex=True)

# Keep only rows that can actually be used for matching:
#   * the subsidiary name must be present -- astype(str) above turns a missing
#     name into the literal "nan", which would otherwise join to intake rows
#     whose food group is also missing;
#   * the NOVA group must be present and must not be the "*" marker
#     (compared after stripping, so a padded "* " is excluded as well).
_att3_nova = att3["NOVA food group"]
_att3_has_name = att3["Subsidiary food group name"].notna()
_att3_has_nova = _att3_nova.notna() & (_att3_nova.astype(str).str.strip() != "*")
att3_no_star = att3[_att3_has_name & _att3_has_nova].copy()


# Matching

## NDNS 1-8

In [4]:
# Step 1: join intake rows to att3 on the cleaned food group name.
# Collapse att3 to one row per (cleaned name, NOVA group) first: a left merge
# emits one output row per matching right-hand row, so repeated names would
# silently duplicate intake records.
att3_lookup = (
    att3_no_star[["desc_clean", "NOVA food group", "Subsidiary food group name"]]
    .drop_duplicates(subset=["desc_clean", "NOVA food group"])
)

# validate="many_to_one" raises pandas.errors.MergeError if a cleaned name is
# still mapped to more than one NOVA group after de-duplication; that conflict
# needs a manual decision rather than silent row duplication.
merged = intake_df.merge(
    att3_lookup,
    how="left",
    left_on="Foodgroupen_clean",
    right_on="desc_clean",
    validate="many_to_one",
)

# Add the Step 1 result columns
merged["NOVA_step1"] = merged["NOVA food group"]
# Tag matched rows with a reason; unmatched rows get None.
merged["match_reason"] = merged["NOVA_step1"].apply(lambda x: "att3_group_match" if pd.notna(x) else None)
merged["matched_att3_group"] = merged["Subsidiary food group name"]


In [5]:
# Columns kept in the Step 1 review file: raw text, cleaned text and the
# Step 1 result columns.
step1_export_cols = [
    "Descriptionen",
    "Foodgroupen",
    "Descriptionen_clean",
    "Foodgroupen_clean",
    "NOVA_step1",
    "match_reason",
    "matched_att3_group",
]
output_step1 = merged[step1_export_cols]

# Write to Excel without the index column (CSV would work as well).
output_step1.to_excel(
    "/content/drive/MyDrive/UPF-HFI/0723 outcome/step1_match.xlsx",
    index=False,
)


In [6]:
# Step 1 coverage: share of rows in `merged` that received a NOVA_step1 value.
total_count = len(merged)
matched_count = merged["NOVA_step1"].notna().sum()
match_rate = matched_count / total_count

print(
    f"Step 1 匹配成功数量：{matched_count}",
    f"总样本数量：{total_count}",
    f"匹配率：{match_rate:.2%}",
    sep="\n",
)


Step 1 匹配成功数量：411
总样本数量：22217
匹配率：1.85%


In [7]:
# Keep only the rows that Step 1 matched, for manual review.
step1_hit_mask = merged["NOVA_step1"].notna()
matched_only = merged.loc[step1_hit_mask].copy()

# Save the matched rows to their own workbook (index column omitted).
matched_only.to_excel(
    "/content/drive/MyDrive/UPF-HFI/0723 outcome/step1_matched_only.xlsx",
    index=False,
)


In [37]:
# Show every column name of the intake table as a plain Python list.
print(list(intake_df.columns))


['SurveyID', 'UserID', 'Source', 'Starttime', 'Submissiontime', 'Timetocomplete', 'Cookingoilused', 'Diet', 'Foodamount', 'Reasonforunusualfoodamount', 'Proxy', 'ProxyIssues', 'MealIndex', 'MealID', 'Mealname', 'Mealtime', 'Foodsource', 'FoodIndex', 'Searchterm', 'FoodID', 'Intake24foodcode', 'Descriptionen', 'Descriptionlocal', 'Nutrienttablename', 'Nutrienttablecode', 'Foodgroupcode', 'Foodgroupen', 'Foodgrouplocal', 'Readymeal', 'Brand', 'Asservedweightfactor', 'Servingsizegml', 'Servingimage', 'Leftoversgml', 'Leftoversimage', 'Portionsizegml', 'Reasonableamount', 'MissingfoodID', 'Missingfooddescription', 'Missingfoodportionsize', 'Missingfoodleftovers', 'Subgroupcode', 'Water', 'Totalnitrogen', 'Nitrogenconversionfactor', 'Protein', 'Fat', 'Carbohydrate', 'Energykcal', 'EnergykJ', 'Alcohol', 'Englystfibre', 'Starch', 'Totalsugars', 'AOAC', 'Nonmilkextrinsicsugars', 'Intrinsicandmilksugars', 'Glucose', 'Fructose', 'Maltose', 'Lactose', 'Sucrose', 'OthersugarsUK', 'FSTablesugar', '

## Media keyword

In [45]:
import pandas as pd

# Rule table for keyword matching (only examples taken from the media list).
# Each rule is a dict with exactly three keys:
#   "keyword" - text searched for in the cleaned description / food group
#   "nova"    - NOVA class (1-4) assigned when the keyword is found
#   "reason"  - short label recorded with the match
# The matcher in this notebook uses plain substring containment, so short
# keywords also hit longer words (e.g. "tea" is contained in "steak").
nova_example_rules_strict = [

    # --- NOVA 1: Minimally Processed Foods ---
    {"keyword": "fresh apple", "nova": 1, "reason": "MPF: fresh fruit"},
    {"keyword": "dried apple", "nova": 1, "reason": "MPF: dried fruit"},
    # One rule per keyword: the original single entry listed "banana" and
    # "apple" in one dict literal, which is not valid Python.
    {"keyword": "banana", "nova": 1, "reason": "MPF: fresh fruit"},
    {"keyword": "apple", "nova": 1, "reason": "MPF: fresh fruit"},
    {"keyword": "frozen peas", "nova": 1, "reason": "MPF: frozen vegetable"},
    {"keyword": "fresh spinach", "nova": 1, "reason": "MPF: fresh vegetable"},
    {"keyword": "frozen spinach", "nova": 1, "reason": "MPF: frozen vegetable"},
    {"keyword": "boiled potato", "nova": 1, "reason": "MPF: boiled vegetable"},
    {"keyword": "milk", "nova": 1, "reason": "MPF: milk"},
    {"keyword": "plain yoghurt", "nova": 1, "reason": "MPF: plain yoghurt"},
    {"keyword": "tea", "nova": 1, "reason": "MPF: tea"},
    {"keyword": "coffee", "nova": 1, "reason": "MPF: coffee"},
    {"keyword": "water", "nova": 1, "reason": "MPF: water"},
    {"keyword": "fruit juice", "nova": 1, "reason": "MPF: fresh juice"},
    {"keyword": "vegetable juice", "nova": 1, "reason": "MPF: fresh juice"},
    {"keyword": "oat flakes", "nova": 1, "reason": "MPF: cereal grain"},
    {"keyword": "wheat flour", "nova": 1, "reason": "MPF: ground grain"},
    {"keyword": "corn flour", "nova": 1, "reason": "MPF: ground grain"},
    {"keyword": "cassava flour", "nova": 1, "reason": "MPF: ground root"},
    {"keyword": "herbs", "nova": 1, "reason": "MPF: herb"},
    {"keyword": "spices", "nova": 1, "reason": "MPF: spice"},
    {"keyword": "egg", "nova": 1, "reason": "MPF: egg"},
    {"keyword": "fresh chicken", "nova": 1, "reason": "MPF: fresh poultry"},
    {"keyword": "fresh beef", "nova": 1, "reason": "MPF: fresh meat"},
    {"keyword": "fresh fish", "nova": 1, "reason": "MPF: fresh fish"},
    {"keyword": "legumes", "nova": 1, "reason": "MPF: legumes"},
    {"keyword": "uncooked", "nova": 1, "reason": "MPF: raw/uncooked ingredient"},

    # --- NOVA 2: Processed Culinary Ingredients ---
    {"keyword": "vegetable oil", "nova": 2, "reason": "PCI: extracted oil"},
    {"keyword": "olive oil", "nova": 2, "reason": "PCI: extracted oil"},
    {"keyword": "sunflower oil", "nova": 2, "reason": "PCI: extracted oil"},
    {"keyword": "butter", "nova": 2, "reason": "PCI: dairy fat"},
    {"keyword": "lard", "nova": 2, "reason": "PCI: animal fat"},
    {"keyword": "sugar", "nova": 2, "reason": "PCI: extracted sugar"},
    {"keyword": "molasses", "nova": 2, "reason": "PCI: extracted sugar"},
    {"keyword": "honey", "nova": 2, "reason": "PCI: natural sugar"},
    {"keyword": "starch", "nova": 2, "reason": "PCI: extracted starch"},
    {"keyword": "salt", "nova": 2, "reason": "PCI: salt"},

    # --- NOVA 3: Processed Foods ---
    {"keyword": "canned peas", "nova": 3, "reason": "PF: canned vegetable"},
    {"keyword": "canned beans", "nova": 3, "reason": "PF: canned legumes"},
    {"keyword": "salted peanuts", "nova": 3, "reason": "PF: salted nuts"},
    {"keyword": "salted cashews", "nova": 3, "reason": "PF: salted nuts"},
    {"keyword": "salted sunflower seeds", "nova": 3, "reason": "PF: salted seeds"},
    {"keyword": "smoked salmon", "nova": 3, "reason": "PF: smoked fish"},
    {"keyword": "canned tuna", "nova": 3, "reason": "PF: canned fish"},
    {"keyword": "ham", "nova": 3, "reason": "PF: cured meat"},
    {"keyword": "bacon", "nova": 3, "reason": "PF: cured meat"},
    {"keyword": "cheddar cheese", "nova": 3, "reason": "PF: cheese"},
    {"keyword": "white bread", "nova": 3, "reason": "PF: unpackaged bread"},
    {"keyword": "wholemeal bread", "nova": 3, "reason": "PF: unpackaged bread"},
    {"keyword": "fruit in syrup", "nova": 3, "reason": "PF: sweetened fruit"},

    # --- NOVA 4: Ultra-Processed Foods ---
    {"keyword": "instant noodles", "nova": 4, "reason": "UPF: instant noodles"},
    {"keyword": "instant soup", "nova": 4, "reason": "UPF: instant soup"},
    {"keyword": "carbonated drink", "nova": 4, "reason": "UPF: soft drink"},
    {"keyword": "cola", "nova": 4, "reason": "UPF: soft drink"},
    {"keyword": "fruit drink", "nova": 4, "reason": "UPF: fruit flavoured drink"},
    {"keyword": "energy drink", "nova": 4, "reason": "UPF: energy drink"},
    {"keyword": "chocolate", "nova": 4, "reason": "UPF: confectionery"},
    {"keyword": "ice cream", "nova": 4, "reason": "UPF: ice cream"},
    {"keyword": "margarine", "nova": 4, "reason": "UPF: spread"},
    {"keyword": "packaged bread", "nova": 4, "reason": "UPF: mass-produced bread"},
    {"keyword": "burger", "nova": 4, "reason": "UPF: reconstituted meat"},
    {"keyword": "hot dog", "nova": 4, "reason": "UPF: reconstituted meat"},
    {"keyword": "fish fingers", "nova": 4, "reason": "UPF: reconstituted fish"},
    {"keyword": "pizza", "nova": 4, "reason": "UPF: ready to heat"},
    {"keyword": "lasagna", "nova": 4, "reason": "UPF: ready to heat"},
    {"keyword": "breakfast cereal", "nova": 4, "reason": "UPF: breakfast cereal"},
    {"keyword": "cereal bar", "nova": 4, "reason": "UPF: cereal bar"},
    {"keyword": "cake", "nova": 4, "reason": "UPF: packaged cake"},
    {"keyword": "biscuit", "nova": 4, "reason": "UPF: sweet snack"},
    {"keyword": "pastry", "nova": 4, "reason": "UPF: sweet snack"},
    {"keyword": "fruit yoghurt", "nova": 4, "reason": "UPF: flavoured yoghurt"},
    {"keyword": "cocoa drink", "nova": 4, "reason": "UPF: cocoa drink"},
    {"keyword": "meal replacement", "nova": 4, "reason": "UPF: meal substitute"},
    {"keyword": "protein powder", "nova": 4, "reason": "UPF: protein supplement"},

]


In [41]:
# --- 2. Build the rule DataFrame ---
nova_rules_df = pd.DataFrame(nova_example_rules_strict)

# Fail fast on a malformed rule table: a rule that lacks one of these fields
# would otherwise surface as NaN and flow silently into the keyword matching.
_required_rule_cols = ["keyword", "nova", "reason"]
_missing_rule_cols = [c for c in _required_rule_cols if c not in nova_rules_df.columns]
if _missing_rule_cols:
    raise ValueError(f"nova_example_rules_strict is missing fields: {_missing_rule_cols}")
if nova_rules_df[_required_rule_cols].isna().any().any():
    raise ValueError("every rule needs a 'keyword', 'nova' and 'reason' value")

# --- 3. Matching function (with priority selection) ---
def match_nova_with_conflict_handling(row, nova_rules_df):
    """Return the highest-NOVA keyword rule that hits one intake row.

    A rule hits when its keyword is a substring of the row's
    "Descriptionen_clean" or "Foodgroupen_clean" text. Matching is plain
    substring containment, so a short keyword also hits longer words that
    contain it (e.g. "tea" is contained in "steak").

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "Descriptionen_clean" and "Foodgroupen_clean".
    nova_rules_df : pandas.DataFrame
        Rule table with "keyword", "nova" and "reason" columns.

    Returns
    -------
    pandas.Series
        Two values, [nova, reason], of the winning rule, or [None, None] when
        no keyword hits. Among rules that share the highest NOVA class, the
        one listed first in the rule table wins.
    """
    hits = [
        (rule.nova, rule.reason)
        for rule in nova_rules_df.itertuples(index=False)
        if rule.keyword in row["Descriptionen_clean"]
        or rule.keyword in row["Foodgroupen_clean"]
    ]
    if len(hits) == 0:
        return pd.Series([None, None])
    # max() keeps the first maximal element, i.e. the earliest rule in table
    # order among those with the highest NOVA class.
    top_nova, top_reason = max(hits, key=lambda hit: hit[0])
    return pd.Series([top_nova, top_reason])

# --- 4. Make sure the Step 1 columns exist on intake_df ---
# Each column is created on its own so the final-selection cell can always
# read both, even when only one of them was produced by an earlier cell.
# NOTE(review): the att3 merge stores its NOVA_step1 on `merged`, not on
# intake_df, so these placeholders presumably apply on a fresh run -- confirm
# whether Step 1 results are meant to be carried over here.
for _step1_col in ("NOVA_step1", "match_reason_step1"):
    if _step1_col not in intake_df.columns:
        intake_df[_step1_col] = None

# --- 5. Step 2 matching: only rows that Step 1 did not classify ---
_step2_cols = ["NOVA_step2", "match_reason_step2"]
step2_target_df = intake_df[intake_df["NOVA_step1"].isna()].copy()
if step2_target_df.empty:
    # A row-wise apply on an empty frame does not yield the two result
    # columns, so create them explicitly.
    for _step2_col in _step2_cols:
        step2_target_df[_step2_col] = None
else:
    step2_target_df[_step2_cols] = step2_target_df.apply(
        lambda row: match_nova_with_conflict_handling(row, nova_rules_df),
        axis=1
    )

# Write the Step 2 result back by index. DataFrame.update() only modifies
# columns that already exist on the caller, so it never created these two
# columns on intake_df; plain index-aligned assignment does, and rows that
# were not Step 2 targets end up as NaN.
for _step2_col in _step2_cols:
    intake_df[_step2_col] = step2_target_df[_step2_col].reindex(intake_df.index)

# --- 6. Build the final NOVA result ---
def select_nova_final(row):
    """Choose the NOVA class and reason for one row, preferring Step 1.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "NOVA_step1"/"match_reason_step1"; when Step 1 is
        missing, "NOVA_step2"/"match_reason_step2" are read as well.

    Returns
    -------
    pandas.Series
        Two values, [nova, reason], taken from the first step whose NOVA
        value is not missing, or [None, None] when both are missing.
    """
    step_columns = (
        ("NOVA_step1", "match_reason_step1"),
        ("NOVA_step2", "match_reason_step2"),
    )
    for nova_col, reason_col in step_columns:
        if pd.notna(row[nova_col]):
            return pd.Series([row[nova_col], row[reason_col]])
    return pd.Series([None, None])

# Resolve Step 1 vs Step 2 for every row, then store the pair of results.
nova_final_pairs = intake_df.apply(select_nova_final, axis=1)
intake_df[["NOVA_step2_output", "match_reason_step2_output"]] = nova_final_pairs

In [44]:
# Export the Step 2 matching result (key fields only).
cols_to_export = [
    "Descriptionen",
    "Foodgroupen",
    "Descriptionen_clean",
    "Foodgroupen_clean",
    "NOVA_step2",
    "match_reason_step2",
]
step2_export = intake_df[cols_to_export]
step2_export.to_csv(
    "/content/drive/MyDrive/UPF-HFI/0723 outcome/step2_match.csv",
    index=False,
)


🧠 那「优先级」到底是什么意思？
简单说就是：

遇到多个关键词时，先匹配优先级高的。
我们人为规定 NOVA 的级别中：

NOVA 4（超加工食品）最“激进”最重要，优先级最高
其次是 NOVA 3
然后是 NOVA 2
最后才是 NOVA 1（原型食品）

优先级规则会让程序这样判断：
看描述里有没有 NOVA 4 的关键词（比如：instant noodles, chocolate, biscuits）
没有再看有没有 NOVA 3 的关键词（比如：salted, smoked）
没有再看 NOVA 2 的关键词（比如：butter, vegetable oils）
最后才看 NOVA 1 的词（比如：milk, flakes, drinking water）

In [None]:
# Step 1: keyword dictionary keyed by NOVA class.
# The classifier walks the classes from 4 down to 1 and, within a class,
# tries the terms in the order listed here, so list order decides which term
# is reported when several terms of the same class occur in a description.
# NOTE(review): "ice-cream" is hyphenated here, while the rule table earlier
# in this notebook uses "ice cream" -- confirm which spelling the data uses.
nova_text_keywords = {
    # NOVA 4: ultra-processed (highest priority)
    4: [
        "soft drinks",
        "packaged snacks",
        "chocolate",
        "confectionery",
        "ice-cream",
        "mass-produced",
        "biscuits",
        "pastries",
        "cakes",
        "cake mixes",
        "cereal bars",
        "energy bars",
        "milk drinks",
        "fruit yoghurt",
        "fruit drinks",
        "cocoa drinks",
        "instant sauces",
        "meal replacement",
        "nuggets",
        "sticks",
        "sausages",
        "burgers",
        "hot dogs",
        "reconstituted",
        "instant soups",
        "instant noodles",
        "instant desserts",
    ],
    # NOVA 3: processed
    3: [
        "canned",
        "bottled",
        "in brine",
        "salted",
        "sugared",
        "dried",
        "cured",
        "smoked",
        "meats",
        "cheeses",
        "fruits in syrup",
        "unpackaged breads",
    ],
    # NOVA 2: processed culinary ingredients
    2: [
        "vegetable oils",
        "butter",
        "lard",
        "sugar",
        "molasses",
        "honey",
        "starches",
        "salt",
    ],
    # NOVA 1: minimally processed
    1: [
        "fresh",
        "squeezed",
        "chilled",
        "frozen",
        "grains",
        "legumes",
        "meat",
        "poultry",
        "fish",
        "eggs",
        "milk",
        "fruit juice",
        "vegetable juice",
        "flakes",
        "flour",
        "corn",
        "wheat",
        "oats",
        "cassava",
        "seeds",
        "herbs",
        "spices",
        "plain yoghurt",
        "tea",
        "coffee",
        "drinking water",
    ],
}


In [None]:
# 第二步：写初筛函数（关键词+Foodgroup条件判断）
import numpy as np

def classify_step1_with_group(row):
    """Classify one intake row by the first keyword found in its description.

    NOVA classes in `nova_text_keywords` are tried from the highest key to the
    lowest, and the terms of each class in their listed order. The first term
    that is a substring of "Descriptionen_clean" decides the result.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "Descriptionen_clean" and "Foodgroupen_clean".

    Returns
    -------
    pandas.Series
        Two values, [nova, reason]. Two group-dependent overrides return
        class 4 with a dedicated reason: the term "milk" when the food group
        contains "milk drinks", and the term "fruit yoghurt" when the food
        group contains "dairy". [NaN, NaN] when no term is found.
    """
    description = row["Descriptionen_clean"]
    food_group = row["Foodgroupen_clean"]

    # Highest NOVA class first.
    for level in sorted(nova_text_keywords, reverse=True):
        for term in nova_text_keywords[level]:
            if term not in description:
                continue
            # Group-dependent overrides take precedence over the generic hit.
            if term == "milk" and "milk drinks" in food_group:
                return pd.Series([4, f"text_match: milk + group={food_group}"])
            if term == "fruit yoghurt" and "dairy" in food_group:
                return pd.Series([4, "text_match: fruit yoghurt + dairy group"])
            # Generic hit: report the class the term is listed under.
            return pd.Series([level, f"text_match: {term}"])

    return pd.Series([np.nan, np.nan])


In [None]:
# Step 3: run the text classifier on every row and store the Step 1 result.
step1_text_pairs = intake_df.apply(classify_step1_with_group, axis=1)
intake_df[["NOVA_step1", "match_reason1"]] = step1_text_pairs


In [None]:
# Step 4 (optional): export the matched rows to inspect the result.
output_cols = ["Descriptionen", "Foodgroupen", "NOVA_step1", "match_reason1"]

has_step1 = intake_df["NOVA_step1"].notna()
matched_df = intake_df.loc[has_step1].copy()

matched_df[output_cols].to_csv(
    "/content/drive/MyDrive/UPF-HFI/0723 outcome/NOVA_step1_textmatch.csv",
    index=False,
)
