<a href="https://colab.research.google.com/github/NINGTANG1124/UPF-HFI/blob/main/notebooks/NDNS-NOVA-intake24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Read intake data (including Descriptionen and FoodGroupen)
import pandas as pd
import re

file_path = "/content/drive/MyDrive/UPF-HFI/Bradford_original data/1. Dietmasterfile_foodlevel_clean.xls"
intake_df = pd.read_excel(file_path)

# Define text cleaning function
def clean_text(col):
    return col.astype(str).str.lower().str.strip().str.replace(r"\s+", " ", regex=True)

# Apply to key fields
intake_df["Foodgroupen_clean"] = clean_text(intake_df["Foodgroupen"])
intake_df["Descriptionen_clean"] = clean_text(intake_df["Descriptionen"])


In [3]:
# 设置 att3 文件路径
att3_path = "/content/drive/MyDrive/UPF-HFI/nova matching files/att3-excel.xlsx"

# 读取 att3 文件
att3 = pd.read_excel(att3_path)

# 清洗 Subsidiary food group name 列
att3["desc_clean"] = att3["Subsidiary food group name"]\
    .astype(str).str.lower().str.strip().str.replace(r"\s+", " ", regex=True)

# 筛选出非 "*" 的匹配项
att3_no_star = att3[att3["NOVA food group"] != "*"].copy()


# Matching

## NDNS 1-8

In [4]:
# 合并匹配
merged = intake_df.merge(
    att3_no_star[["desc_clean", "NOVA food group", "Subsidiary food group name"]],
    how="left",
    left_on="Foodgroupen_clean",
    right_on="desc_clean"
)

# 添加新列
merged["NOVA_step1"] = merged["NOVA food group"]
merged["match_reason"] = merged["NOVA_step1"].apply(lambda x: "att3_group_match" if pd.notna(x) else None)
merged["matched_att3_group"] = merged["Subsidiary food group name"]


In [5]:
# 保留关键字段用于输出
output_step1 = merged[
    ["Descriptionen", "Foodgroupen", "Descriptionen_clean", "Foodgroupen_clean",
     "NOVA_step1", "match_reason", "matched_att3_group"]
]

# 保存到 Excel（或 CSV 也可以）
output_step1.to_excel("/content/drive/MyDrive/UPF-HFI/0723 outcome/step1_match.xlsx", index=False)


In [7]:
matched_count = merged["NOVA_step1"].notna().sum()
total_count = len(merged)
match_rate = matched_count / total_count

print(f"Step 1 Number of successful matches：{matched_count}")
print(f"Total sample size：{total_count}")
print(f"Match Rate：{match_rate:.2%}")


Step 1 Number of successful matches：411
Total sample size：22217
Match Rate：1.85%


In [8]:
# 导出匹配成功的部分（方便人工检查）
matched_only = merged[merged["NOVA_step1"].notna()].copy()

# 保存匹配成功部分到单独文件
matched_only.to_excel("/content/drive/MyDrive/UPF-HFI/0723 outcome/step1_matched_only.xlsx", index=False)


## Media keyword

In [23]:
import pandas as pd

# 构建规则表（仅包含来自 media 的示例）
nova_example_rules_strict = [

    # --- NOVA 1: Minimally Processed Foods ---
    {"keyword": "uncooked oats", "nova": 1, "reason": "MPF"},
    {"keyword": "fresh apple", "nova": 1, "reason": "MPF"},
    {"keyword": "dried apple", "nova": 1, "reason": "MPF"},
    {"keyword": "banana", "nova": 1, "reason": "MPF"},
    {"keyword": "orange", "nova": 1, "reason": "MPF"},
    {"keyword": "pear", "nova": 1, "reason": "MPF"},
    {"keyword": "plum", "nova": 1, "reason": "MPF"},
    {"keyword": "grapes", "nova": 1, "reason": "MPF"},
    {"keyword": "frozen peas", "nova": 1, "reason": "MPF"},
    {"keyword": "fresh spinach", "nova": 1, "reason": "MPF"},
    {"keyword": "frozen spinach", "nova": 1, "reason": "MPF"},
    {"keyword": "boiled potato", "nova": 1, "reason": "MPF"},
    {"keyword": "milk", "nova": 1, "reason": "MPF"},
    {"keyword": "plain yoghurt", "nova": 1, "reason": "MPF"},
    {"keyword": "tea", "nova": 1, "reason": "MPF"},
    {"keyword": "coffee", "nova": 1, "reason": "MPF"},
    {"keyword": "water", "nova": 1, "reason": "MPF"},
    {"keyword": "fruit juice", "nova": 1, "reason": "MPF"},
    {"keyword": "vegetable juice", "nova": 1, "reason": "MPF"},
    {"keyword": "oats", "nova": 1, "reason": "MPF"},
    {"keyword": "wheat flour", "nova": 1, "reason": "MPF"},
    {"keyword": "corn flour", "nova": 1, "reason": "MPF"},
    {"keyword": "cassava flour", "nova": 1, "reason": "MPF"},
    {"keyword": "herbs", "nova": 1, "reason": "MPF"},
    {"keyword": "spices", "nova": 1, "reason": "MPF"},
    {"keyword": "egg", "nova": 1, "reason": "MPF"},
    {"keyword": "fresh chicken", "nova": 1, "reason": "MPF"},
    {"keyword": "fresh beef", "nova": 1, "reason": "MPF"},
    {"keyword": "fresh fish", "nova": 1, "reason": "MPFh"},
    {"keyword": "legumes", "nova": 1, "reason": "MPF"},

    # --- NOVA 2: Processed Culinary Ingredients ---
    {"keyword": "vegetable oil", "nova": 2, "reason": "PCI"},
    {"keyword": "olive oil", "nova": 2, "reason": "PCI"},
    {"keyword": "sunflower oil", "nova": 2, "reason": "PCI"},
    {"keyword": "butter", "nova": 2, "reason": "PCI"},
    {"keyword": "lard", "nova": 2, "reason": "PCI"},
    {"keyword": "sugar", "nova": 2, "reason": "PCI"},
    {"keyword": "molasses", "nova": 2, "reason": "PCI"},
    {"keyword": "honey", "nova": 2, "reason": "PCI"},
    {"keyword": "starch", "nova": 2, "reason": "PCI"},
    {"keyword": "salt", "nova": 2, "reason": "PCI"},

    # --- NOVA 3: Processed Foods ---
    {"keyword": "canned peas", "nova": 3, "reason": "PF"},
    {"keyword": "canned beans", "nova": 3, "reason": "PF"},
    {"keyword": "salted peanuts", "nova": 3, "reason": "PF"},
    {"keyword": "salted cashews", "nova": 3, "reason": "PF"},
    {"keyword": "salted sunflower seeds", "nova": 3, "reason": "PF"},
    {"keyword": "smoked salmon", "nova": 3, "reason": "PF"},
    {"keyword": "canned tuna", "nova": 3, "reason": "PF"},
    {"keyword": "ham", "nova": 3, "reason": "PF"},
    {"keyword": "bacon", "nova": 3, "reason": "PF"},
    {"keyword": "cheddar cheese", "nova": 3, "reason": "PF"},
    {"keyword": "fruits in syrup", "nova": 3, "reason": "PF"},

    # --- NOVA 4: Ultra-Processed Foods ---
    {"keyword": "instant noodles", "nova": 4, "reason": "UPF"},
    {"keyword": "instant soup", "nova": 4, "reason": "UPF"},
    {"keyword": "carbonated drink", "nova": 4, "reason": "UPF"},
    {"keyword": "cola", "nova": 4, "reason": "UPF"},
    {"keyword": "fruit drink", "nova": 4, "reason": "UPF"},
    {"keyword": "energy drink", "nova": 4, "reason": "UPF"},
    {"keyword": "chocolate", "nova": 4, "reason": "UPF"},
    {"keyword": "ice cream", "nova": 4, "reason": "UPF"},
    {"keyword": "margarine", "nova": 4, "reason": "UPF"},
    {"keyword": "packaged bread", "nova": 4, "reason": "UPF"},
    {"keyword": "burger", "nova": 4, "reason": "UPF"},
    {"keyword": "hot dog", "nova": 4, "reason": "UPF"},
    {"keyword": "fish fingers", "nova": 4, "reason": "UPF"},
    {"keyword": "pizza", "nova": 4, "reason": "UPF"},
    {"keyword": "lasagna", "nova": 4, "reason": "UPF"},
    {"keyword": "breakfast cereal", "nova": 4, "reason": "UPF"},
    {"keyword": "cereal bar", "nova": 4, "reason": "UPF"},
    {"keyword": "cake", "nova": 4, "reason": "UPF"},
    {"keyword": "biscuit", "nova": 4, "reason": "UPF"},
    {"keyword": "pastry", "nova": 4, "reason": "UPF"},
    {"keyword": "fruit yoghurt", "nova": 4, "reason": "UPF"},
    {"keyword": "cocoa drink", "nova": 4, "reason": "UPF"},
    {"keyword": "meal replacement", "nova": 4, "reason": "UPF"},
    {"keyword": "protein powder", "nova": 4, "reason": "UPF"},

]


In [24]:
# --- 2. 构建规则 DataFrame ---
nova_rules_df = pd.DataFrame(nova_example_rules_strict)

# --- 3. 匹配函数（含优先级选择） ---
def match_nova_with_conflict_handling(row, nova_rules_df):
    matches = []
    for _, rule in nova_rules_df.iterrows():
        if rule["keyword"] in row["Descriptionen_clean"] or rule["keyword"] in row["Foodgroupen_clean"]:
            matches.append(rule)
    if not matches:
        return pd.Series([None, None])
    best_match = sorted(matches, key=lambda x: x["nova"], reverse=True)[0]
    return pd.Series([best_match["nova"], best_match["reason"]])

# --- 4. 若无 Step1，初始化空列 ---
if "NOVA_step1" not in intake_df.columns:
    intake_df["NOVA_step1"] = None
    intake_df["match_reason_step1"] = None

# --- 5. Step2 匹配：只对 Step1 没命中的样本 ---
# 提取需要 Step2 匹配的记录
step2_target_df = intake_df[intake_df["NOVA_step1"].isna()].copy()

# 应用关键词规则匹配
step2_target_df[["NOVA_step2", "match_reason_step2"]] = step2_target_df.apply(
    lambda row: match_nova_with_conflict_handling(row, nova_rules_df),
    axis=1
)

# 把结果合并更新回 intake_df
intake_df.update(step2_target_df[["NOVA_step2", "match_reason_step2"]])

In [25]:
# 标记 Step2 应更新的行（NOVA_step1 是空的）
step2_mask = intake_df["NOVA_step1"].isna()

# 执行 Step2 匹配逻辑
step2_target_df = intake_df[step2_mask].copy()
step2_target_df[["NOVA_step2", "match_reason_step2"]] = step2_target_df.apply(
    lambda row: match_nova_with_conflict_handling(row, nova_rules_df),
    axis=1
)

# 直接赋值，确保新列写入 intake_df（而非 update）
intake_df.loc[step2_mask, "NOVA_step2"] = step2_target_df["NOVA_step2"]
intake_df.loc[step2_mask, "match_reason_step2"] = step2_target_df["match_reason_step2"]


In [26]:
print(intake_df.columns)


Index(['SurveyID', 'UserID', 'Source', 'Starttime', 'Submissiontime',
       'Timetocomplete', 'Cookingoilused', 'Diet', 'Foodamount',
       'Reasonforunusualfoodamount',
       ...
       'Day', 'weekday', 'ratio', 'UserID_clean', 'Foodgroupen_clean',
       'Descriptionen_clean', 'NOVA_step1', 'match_reason_step1', 'NOVA_step2',
       'match_reason_step2'],
      dtype='object', length=172)


In [27]:
# 只导出关键列，推荐保存到独立文件
step2_output_cols = [
    "Descriptionen", "Foodgroupen", "Descriptionen_clean", "Foodgroupen_clean",
    "NOVA_step2", "match_reason_step2"
]

intake_df[step2_output_cols].dropna(subset=["NOVA_step2"]).to_csv(
    "/content/drive/MyDrive/UPF-HFI/0723 outcome/step2_matched_only.csv", index=False
)

In [28]:
# 合并 step1 和 step2 成 final（可选）
def select_nova_partial_final(row):
    if pd.notna(row["NOVA_step1"]):
        return pd.Series([row["NOVA_step1"], row["match_reason_step1"]])
    elif pd.notna(row["NOVA_step2"]):
        return pd.Series([row["NOVA_step2"], row["match_reason_step2"]])
    else:
        return pd.Series([None, None])

intake_df[["NOVA_partial_final", "match_reason_partial_final"]] = intake_df.apply(
    select_nova_partial_final, axis=1
)

In [30]:
matched_count = intake_df["NOVA_step2"].notna().sum()
total_count = len(intake_df)
print(f"Step 2 匹配数量: {matched_count} / {total_count} （{matched_count / total_count:.2%}）")


Step 2 匹配数量: 13931 / 22217 （62.70%）
