In [1]:
# connect googledrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Step 1: 读取 intake 数据（含 Descriptionen 和 FoodGroupen）
import pandas as pd
import re

file_path = "/content/drive/MyDrive/UPF-HFI/Bradford_original data/1. Dietmasterfile_foodlevel_clean.xls"
intake_df = pd.read_excel(file_path)


In [3]:
# Step 2: 清洗 Descriptionen 列
def clean_description(desc):
    if pd.isna(desc):
        return ""
    desc = str(desc).lower()
    desc = re.sub(r"[()\.,\-]", "", desc)
    desc = re.sub(r"\s+", " ", desc)
    return desc.strip()

intake_df["food_name_clean"] = intake_df["Descriptionen"].apply(clean_description)
intake_df["Foodgroupen"] = intake_df["Foodgroupen"].fillna("")


In [4]:
# Step 3: 定义描述字段的规则匹配函数（宏观→微观）
def match_nova_by_description_v3(text):
    text = str(text).lower()

    # === NOVA 1: 宏观 raw / water ===
    if "raw" in text:
        return 1, "raw"
    if any(word in text for word in ["tap water", "still water", "filtered water"]):
        return 1, "water"

    # === NOVA 3: 宏观 homemade / boiled ===
    if "homemade" in text or "home made" in text:
        return 3, "homemade item"
    if "boiled" in text or "mashed potato" in text:
        return 3, "boiled or mashed"
    if "porridge made with milk" in text:
        return 3, "porridge w/ milk"

    # === NOVA 4: 宏观加工类 ===
    if "takeaway" in text:
        return 4, "fast food"
    if any(word in text for word in ["ice cream topping", "breakfast cereal", "milkshake"]):
        return 4, "dessert/snack item"
    if any(word in text for word in ["flavour", "instant"]):
        return 4, "instant/flavoured"
    if any(word in text for word in ["cracker", "biscuit", "weetabix"]):
        return 4, "snack item"
    if "ketchup" in text and "home made" not in text:
        return 4, "processed ketchup"
    if any(word in text for word in ["squash", "cordial", "carbonated"]):
        return 4, "sweetened drink"
    if any(word in text for word in ["margarine", "clover spread", "flora"]):
        return 4, "processed fat"
    if "nutella" in text:
        return 4, "branded sweet spread"

    # === NOVA 1: 微观乳制品类 ===
    if any(word in text for word in ["natural yoghurt", "whole milk", "fromage frais"]) and "flavour" not in text:
        return 1, "plain dairy"

    return None, None


In [5]:
# Step 4: 定义 group 字段的匹配函数（宏观→微观）
def match_nova_by_group_v2(group):
    group = str(group).lower().strip()

    # === NOVA 1 ===
    if "fresh fruit" in group:
        return 1, "fruit (group)"
    if "dried fruit" in group:
        return 1, "dried fruit (group)"
    if "vegetables" in group and "fried" not in group:
        return 1, "vegetables (group)"

    # === NOVA 3 ===
    if "monounsaturated" in group:
        return 3, "culinary fat (mono)"
    if "dairy fat spreads" in group or "hard marg" in group:
        return 3, "dairy fat spread"

    # === NOVA 4 ===
    if "other breakfast cereals" in group or "muesli" in group or "bran flakes" in group:
        return 4, "processed cereal (group)"
    if "ice cream" in group or "desserts and lollies" in group:
        return 4, "ice cream (group)"
    if any(word in group for word in ["sweets", "toffee", "boiled sweets", "gums", "jellies", "mints", "liquorice", "raw jelly", "popcorn"]):
        return 4, "sweets/snack (group)"

    return None, None

In [6]:
# Step 5: 应用匹配逻辑（优先 description，再补 group）
intake_df[["NOVA_by_desc", "Match_reason"]] = intake_df["Descriptionen"].apply(
    lambda x: pd.Series(match_nova_by_description_v3(x))
)

mask_unmatched = intake_df["NOVA_by_desc"].isna()
intake_df.loc[mask_unmatched, ["NOVA_by_desc", "Match_reason"]] = intake_df.loc[mask_unmatched, "Foodgroupen"].apply(
    lambda x: pd.Series(match_nova_by_group_v2(x))
)

# Step 6: 展示匹配样本结果
matched_sample = intake_df[["food_name_clean", "Foodgroupen", "NOVA_by_desc", "Match_reason"]].query("NOVA_by_desc.notna()").head(20)
matched_sample

Unnamed: 0,food_name_clean,Foodgroupen,NOVA_by_desc,Match_reason
0,natural yoghurt,Whole milk yoghurt/fromage frais,1.0,plain dairy
1,spinach raw not baby spinach,"Other vegetables (excluding potato): carrots, ...",1.0,raw
3,savoury cracker eg sesame and poppy thins butt...,"Savoury biscuits and baked goods eg crackers, ...",4.0,snack item
4,water from tap including hot water filtered water,Water,1.0,water
8,tomato based pasta sauce home made,"Other vegetables (excluding potato): carrots, ...",3.0,homemade item
11,red cabbage raw,"Other vegetables (excluding potato): carrots, ...",1.0,raw
12,jam/conserve berries eg strawberry,"Preserves and syrups: glace cherries, honey, j...",1.0,raw
14,water from tap including hot water filtered water,Water,1.0,water
19,water from tap including hot water filtered water,Water,1.0,water
24,water from tap including hot water filtered water,Water,1.0,water
