<a href="https://colab.research.google.com/github/NINGTANG1124/UPF-HFI/blob/main/notebooks/intake24_nova_matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
# connect googledrive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 0. Data Processing

## 0.1 Intake24 Data

In [46]:
# Read intake data (including Descriptionen and FoodGroupen)
import pandas as pd
import re

file_path = "/content/drive/MyDrive/UPF-HFI/Bradford_original data/1. Dietmasterfile_foodlevel_clean.xls"
intake_df = pd.read_excel(file_path)

# Define text cleaning function
def clean_text(col):
    return col.astype(str).str.lower().str.strip().str.replace(r"\s+", " ", regex=True)

# Apply to key fields
intake_df["Foodgroupen_clean"] = clean_text(intake_df["Foodgroupen"])
intake_df["Descriptionen_clean"] = clean_text(intake_df["Descriptionen"])


## 0.12 NDNS NOVA

In [50]:
# 设置 att3 文件路径
att3_path = "/content/drive/MyDrive/UPF-HFI/nova matching files/att3-excel.xlsx"

# 读取 att3 文件
att3 = pd.read_excel(att3_path)

# 清洗 Subsidiary food group name 列
att3["desc_clean"] = att3["Subsidiary food group name"]\
    .astype(str).str.lower().str.strip().str.replace(r"\s+", " ", regex=True)

# 筛选出非 "*" 的匹配项
att3_no_star = att3[att3["NOVA food group"] != "*"].copy()


In [51]:
# 假设 intake_df 已清洗并包含 "Foodgroupen_clean"

# 合并匹配
merged = intake_df.merge(
    att3_no_star[["desc_clean", "NOVA food group", "Subsidiary food group name"]],
    how="left",
    left_on="Foodgroupen_clean",
    right_on="desc_clean"
)

# 添加新列
merged["NOVA_step1"] = merged["NOVA food group"]
merged["match_reason"] = merged["NOVA_step1"].apply(lambda x: "att3_group_match" if pd.notna(x) else None)
merged["matched_att3_group"] = merged["Subsidiary food group name"]


In [52]:
# 只保存关键字段
output_path = "/content/drive/MyDrive/UPF-HFI/0722outcome/intake_with_nova_step1.csv"
cols_to_save = [
    "Descriptionen", "Descriptionen_clean",
    "Foodgroupen", "Foodgroupen_clean",
    "NOVA_step1", "match_reason", "matched_att3_group"
]

merged[cols_to_save].to_csv(output_path, index=False)


In [53]:
# 仅保留匹配成功的行
matched_df = merged[merged["NOVA_step1"].notna()].copy()

# 保存匹配成功的样本
matched_output_path = "/content/drive/MyDrive/UPF-HFI/0722outcome/intake_nova_matched_step1_only.csv"

matched_df[cols_to_save].to_csv(matched_output_path, index=False)


In [54]:
num_matched = matched_df.shape[0]
total = merged.shape[0]
print(f"匹配率：{num_matched} / {total} = {num_matched/total:.2%}")


匹配率：411 / 22217 = 1.85%


## 0.2 NOVA Data

In [47]:
# Nova file data cleaning
ndns_df = pd.read_csv("/content/drive/MyDrive/UPF-HFI/nova/NDNS_NOVA_DATABASE.new2023.csv", encoding="ISO-8859-1")
ndns_df.columns = ndns_df.columns.str.strip()
ndns_df = ndns_df[["FoodName", "NOVA"]].dropna()
ndns_df["FoodName_clean"] = ndns_df["FoodName"].str.lower().str.replace(r"[^\w\s]", " ", regex=True).str.replace(r"\s+", " ", regex=True).str.strip()
ndns_df = ndns_df.drop_duplicates(subset=["FoodName_clean"])

# Giulia file data cleaning
giulia_df = pd.read_excel("/content/drive/MyDrive/UPF-HFI/nova/Training Data Original Given by NOVA Researchers - Corrections by Giulia Babak FNDDS 2009-10.xls")
giulia_df.columns = giulia_df.columns.str.strip()

giulia_df = giulia_df[["Main_food_description", "SR_nova_group"]].dropna()
giulia_df = giulia_df.rename(columns={"Main_food_description": "FoodName", "SR_nova_group": "NOVA"})

giulia_df["FoodName_clean"] = giulia_df["FoodName"].str.lower().str.replace(r"[^\w\s]", " ", regex=True).str.replace(r"\s+", " ", regex=True).str.strip()
giulia_df = giulia_df.drop_duplicates(subset=["FoodName_clean"])

# off file data cleaning
import json
import re
import pandas as pd

off_clean = []
with open("/content/drive/MyDrive/UPF-HFI/nova/openfoodfacts-popular-24.json", "r", encoding="utf-8") as f:
    for line in f:
        try:
            entry = json.loads(line)
            if not isinstance(entry, dict):
                continue

            name = entry.get("product_name") or entry.get("abbreviated_product_name")
            nova = entry.get("nova_group")

            if name and nova:
                name_clean = re.sub(r"[^\w\s]", " ", name.lower())
                name_clean = re.sub(r"\s+", " ", name_clean).strip()
                off_clean.append({"FoodName_clean": name_clean, "NOVA": nova})

        except json.JSONDecodeError:
            continue

off_df = pd.DataFrame(off_clean).drop_duplicates(subset=["FoodName_clean"])



KeyboardInterrupt: 

In [49]:
ndns_df.to_csv("NDNS_clean.csv", index=False)

In [48]:
ndns_df.to_csv("NDNS_clean.csv", index=False)
giulia_df.to_csv("Giulia_clean.csv", index=False)
off_df.to_csv("OFF_clean.csv", index=False)


NameError: name 'giulia_df' is not defined

# 1. Keyword Matching

## Media-Dicken matching

In [38]:
import pandas as pd
import numpy as np

# 示例函数：完全依据 media 文件中提到的分类规则
def media_based_nova_classification(row):
    desc = row['Descriptionen'].lower()

    # === Media rule: Homemade mayonnaise, salad cream, or French dressing → NOVA 4
    if 'homemade' in desc and any(x in desc for x in ['mayonnaise', 'salad cream', 'french dressing']):
        return 4

    # === Media rule: Homemade buns, cakes, pastries, puddings, pancakes → NOVA 3
    if 'homemade' in desc and any(x in desc for x in ['bun', 'cake', 'pastry', 'pudding', 'pancake']):
        return 3

    # === Media rule: Homemade pies or dumplings → NOVA 3
    if 'homemade' in desc and any(x in desc for x in ['pie', 'dumpling']):
        return 3

    # === Media rule: Homemade gravy → NOVA 1 (unless oxo or granule, then NOVA 3)
    if 'homemade' in desc and 'gravy' in desc:
        if 'oxo' in desc or 'granule' in desc:
            return 3
        else:
            return 1

    # === Media rule: Homemade stews, meat-based meals → NOVA 1
    if 'homemade' in desc and ('stew' in desc or 'meat' in desc):
        return 1

    # === Media rule: Homemade cottage pie, shepherd’s pie → NOVA 1
    if 'homemade' in desc and ('cottage pie' in desc or 'shepherd' in desc):
        return 1

    # === Media rule: Homemade battered/breaded fish → NOVA 3
    if 'homemade' in desc and 'fish' in desc and any(x in desc for x in ['battered', 'breaded']):
        return 3

    # === Media rule: General battered/breaded fish → NOVA 4
    if 'fish' in desc and any(x in desc for x in ['battered', 'breaded']):
        return 4

    # === Media rule: Canned vegetables in brine/syrup → NOVA 3, otherwise NOVA 1
    if 'canned vegetable' in desc or 'tinned vegetable' in desc:
        if 'brine' in desc or 'syrup' in desc:
            return 3
        else:
            return 1

    # === Media rule: Canned fruit → NOVA 3
    if 'canned fruit' in desc:
        return 3

    # === Media rule: Processed meats (bacon, ham, gammon, sliced meat) → NOVA 4
    if any(x in desc for x in ['bacon', 'ham', 'gammon', 'sliced meat']):
        return 4

    # === Media rule: Specialty cured meats (prosciutto, parma, serrano) → NOVA 3
    if any(x in desc for x in ['prosciutto', 'parma', 'serrano']):
        return 3

    # === Media rule: Nut butters (e.g., peanut butter) → NOVA 3
    if 'nut butter' in desc:
        return 3

    # === Media rule: Gluten-free substitutes → NOVA 4
    if 'gluten free' in desc:
        return 4

    # === Media rule: Jams, marmalade, lemon curd → NOVA 4
    if any(x in desc for x in ['jam', 'marmalade', 'lemon curd']):
        return 4

    # === Media rule: Breakfast cereal or muesli → NOVA 4
    if 'breakfast cereal' in desc or 'muesli' in desc:
        return 4

    # === Media rule: Plain porridge → NOVA 1
    if 'porridge' in desc and 'plain' in desc:
        return 1

    # === Media rule: Plant-based milks → NOVA 1
    if 'plant-based milk' in desc:
        return 1

    # === Media rule: Creams → NOVA 2
    if any(x in desc for x in ['single cream', 'double cream', 'crème fraiche']):
        return 2

    # === Media rule: Chinese takeaway meals with soy sauce → NOVA 3
    if 'soy sauce' in desc and any(x in desc for x in ['chow mein', 'chop suey']):
        return 3

    # === Media rule: Stir-fry dishes → NOVA 1
    if 'stir fry' in desc:
        return 1

    return np.nan  # 如果都不匹配就返回空值



In [41]:
# 推荐保存更多列，方便后续检查
cols_to_save = [
    "FoodID", "Descriptionen", "Foodgroupen",  # 基础食物信息
    "NOVA_step1", "NOVA_step2"                 # 匹配结果
]

# 保存文件（保留已匹配上的）
intake_df[cols_to_save].dropna(subset=["NOVA_step2"]).to_csv(output_path, index=False)


In [44]:
def media_based_nova_classification(row):
    print(">>>", row['Descriptionen'])  # 先看看到底传进来了什么

    desc = str(row['Descriptionen']).lower()  # 记得强制转为字符串+小写
    ...


In [42]:
intake_df["NOVA_step2"].value_counts(dropna=True)


Unnamed: 0_level_0,count
NOVA_step2,Unnamed: 1_level_1
4.0,11


In [43]:
# 举例：看看哪些食物被 "stir fry" 匹配到了（media 规则里归类为 NOVA 1）
intake_df[intake_df["Descriptionen"].str.lower().str.contains("stir fry", na=False)]


Unnamed: 0,SurveyID,UserID,Source,Starttime,Submissiontime,Timetocomplete,Cookingoilused,Diet,Foodamount,Reasonforunusualfoodamount,...,weekday,ratio,UserID_clean,Foodgroupen_clean,Descriptionen_clean,NOVA_att3_match,NOVA_source,NOVA_step1,match_reason,NOVA_step2
1741,7bd01e5e-73ec-428d-8a2d-bdbb1f13c98c,BFD043c,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,2024-01-29T20:47:07.412Z,2024-01-29T20:52:10.020Z,5,Corn oil,Not on a special diet,Usual,,...,1,"3 week, 1 weekend",BFD043,noodles and noodle dishes,"chicken chow mein, stir fry (with noodles)",,,,,
1831,c765d96c-933d-48ac-b652-d334d2873994,BFD045a,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,2024-01-29T17:39:57.967Z,2024-01-29T17:49:01.179Z,9,Sunflower oil,Not on a special diet,Usual,,...,1,"3 week, 1 weekend",BFD045,chicken/turkey dishes,oriental chicken stir fry (including noodles) ...,,,,,
3412,3892f8bf-17f2-4a42-9736-797ae7b8b273,BFD069c,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,2024-02-09T21:24:36.596Z,2024-02-09T21:35:22.519Z,10,Did not use,Not on a special diet,Usual,,...,1,"2 week, 2 weekend",BFD069,noodles and noodle dishes,"vegetable chow mein, stir fry (with noodles)",,,,,
4851,5232186d-cc03-409e-b7b6-37b77e174056,BFD094a,Mozilla/5.0 (Linux; Android 11; SM-A705FN) App...,2024-02-11T19:33:26.415Z,2024-02-11T19:52:28.753Z,19,Sunflower oil,Not on a special diet,More,Travelling / out of house / visiting friends,...,2,"3 week, 1 weekend",BFD094,"other vegetables (excluding potato): carrots, ...",mixed vegetables stir fry (cooked from frozen),,,,,
15571,8af8812e-5818-4060-b846-a94cda2edbf6,BFD284a,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,2024-06-01T00:10:46.186Z,2024-06-01T00:29:39.634Z,18,Sunflower oil,Not on a special diet,Less,Sickness/tiredness,...,2,"3 week, 1 weekend",BFD284,chicken/turkey dishes,chicken stir fry (meat and vegetables with sauce),,,,,
16055,137a351d-da54-4ab7-816d-2926eee11704,BFD291c,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,2024-06-23T18:35:56.198Z,2024-06-23T18:59:18.366Z,23,Sunflower oil,Not on a special diet,Usual,,...,2,"3 week, 1 weekend",BFD291,noodles and noodle dishes,"chicken chow mein, stir fry (with noodles)",,,,,
16084,893d7ada-bb7d-42c8-8639-3144a84661a6,BFD291d,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,2024-06-23T18:59:28.320Z,2024-06-23T19:16:26.278Z,16,Other: Sesame,Not on a special diet,Usual,,...,2,"3 week, 1 weekend",BFD291,pork and pork dishes,"pork chow mein, stir fry (with noodles)",,,,,
16771,ea501800-092b-486c-ae13-f55a4d77183e,BFD303d,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,2024-06-23T17:58:05.399Z,2024-06-23T18:05:24.728Z,7,Vegetable oil (rapeseed),Not on a special diet,More,Weekend day / Friday,...,2,"3 week, 1 weekend",BFD303,noodles and noodle dishes,"vegetable chow mein, stir fry (with noodles)",,,,,
16829,74d029f8-f0c0-4364-8765-f0e7b286792f,BFD304d,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,2024-06-23T17:38:21.851Z,2024-06-23T17:53:42.965Z,15,Vegetable oil (rapeseed),Not on a special diet,More,Weekend day / Friday,...,2,"3 week, 1 weekend",BFD304,noodles and noodle dishes,"vegetable chow mein, stir fry (with noodles)",,,,,
18005,9fa4dbc1-54b8-4b0b-af28-78d28e86b5bd,BFD327a,Mozilla/5.0 (Linux; Android 10; K) AppleWebKit...,2024-07-08T10:16:23.850Z,2024-07-08T10:33:01.586Z,16,Olive oil,Not on a special diet,Usual,,...,1,"3 week, 1 weekend",BFD327,chicken/turkey dishes,oriental chicken stir fry (including noodles) ...,,,,,


## 1.1 Description Part

In [None]:
def match_nova_by_description(text):
    text = str(text).lower().strip()

    # === NOVA 1: water ===
    if any(w in text for w in ["tap water", "still water", "filtered water", "plain water"]):
        if "flavour" not in text:
            return 1, "plain water (description)"

    # plain dairy
    if any(w in text for w in ["semi skimmed milk", "skimmed milk", "whole milk"]) and "flavour" not in text:
        return 1, "plain milk"
    if any(w in text for w in ["natural yoghurt", "fromage frais"]) and "flavour" not in text:
        return 1, "plain yoghurt"

    # raw/unprocessed ===
    # raw
    import re

    if re.search(r'\braw\b', text):
        return 1, "raw (word-bound)"

    # uncooked oat
    if re.search(r"\buncooked\b", text) and re.search(r"\boat(s)?\b", text):
        return 1, "raw cereal: oats (description)"


    # === NOVA 3: homemade/light-processed ===
    if any(w in text for w in ["homemade", "home made"]):
        return 3, "homemade"
    if any(w in text for w in ["boiled", "mashed potato", "baked potato", "jacket potato"]):
        return 3, "boiled/baked/jacket"

    # === NOVA 4: sachet porridge ===
    if "porridge sachet" in text or ("porridge" in text and "oat so simple" in text):
        return 4, "sachet porridge (description)"

    # takeaway
    if "takeaway" in text or "take away" in text:
        return 4, "takeaway food"

    # sweets/dessert/snack
    if any(w in text for w in ["jam", "conserve", "marmalade", "chocolate spread", "ice cream topping", "marzipan"]):
        return 4, "spread/syrup"
    if any(w in text for w in ["cracker", "savoury biscuit", "cheddar biscuit", "cream cracker"]):
        return 4, "processed snack"
    if any(w in text for w in ["sweets", "gums", "jelly", "boiled sweets", "mints", "liquorice", "popcorn"]):
        return 4, "sweet snack"
    if any(w in text for w in ["ice cream", "dessert", "milkshake"]):
        return 4, "processed dessert"
    if any(w in text for w in ["margarine", "clover spread", "flora"]):
        return 4, "processed fat"
    if "flavoured milk" in text or "chocolate milk" in text:
        return 4, "flavoured milk"
    if "ketchup" in text and "home made" not in text:
        return 4, "processed ketchup"
    if "instant" in text and "porridge" not in text:
        return 4, "instant food"


    return None, None


## 1.2 Group Part

In [None]:
def match_nova_by_group(group, description):
    group = str(group).lower().strip()
    description = str(description).lower().strip()

    # === NOVA 1: group water ===
    if group.strip() in ["water", "tap water", "filtered water"]:
        return 1, "water (group)"

    # milk/yoghurt
    if "fresh fruit" in group:
        return 1, "fruit (group)"
    if "dried fruit" in group:
        return 1, "dried fruit (group)"
    if "vegetables" in group and "fried" not in group:
        return 1, "vegetables (group)"
    if any(word in group for word in ["semi skimmed milk", "skimmed milk", "whole milk"]):
        if "flavour" not in description and "fruit" not in description:
            return 1, "milk (group)"
    if any(word in group for word in ["natural yoghurt", "fromage frais"]):
        if "flavour" not in description and "fruit" not in description:
            return 1, "yoghurt/plain dairy (group)"

    # === NOVA 3: some fat ===
    if any(w in group for w in ["olive oil", "rapeseed oil", "sunflower oil", "vegetable oil", "butter"]):
        return 3, "culinary fat/oil (group)"

    # === NOVA 4 ===
    if any(w in group for w in ["margarine", "fat spread", "flora", "dairy fat spreads", "hard marg"]):
        return 4, "processed fat (group)"
    if any(w in group for w in ["jam", "conserve", "marmalade"]):
        return 4, "preserves (group)"
    if "other breakfast cereals" in group or "muesli" in group or "bran flakes" in group:
        return 4, "processed cereal (group)"

    return None, None


In [None]:
print(intake_df.columns)


Index(['SurveyID', 'UserID', 'Source', 'Starttime', 'Submissiontime',
       'Timetocomplete', 'Cookingoilused', 'Diet', 'Foodamount',
       'Reasonforunusualfoodamount',
       ...
       'Modification_Identification', 'discontinued', 'NDNS_Checks',
       'UserID_specific', 'Day', 'weekday', 'ratio', 'UserID_clean',
       'Foodgroupen_clean', 'Descriptionen_clean'],
      dtype='object', length=168)


## 1.3 Combined Rule Matching & Application

In [None]:
def match_nova(row):
    description = row["Descriptionen_clean"]
    group = row["Foodgroupen_clean"]

    # try description
    nova, reason = match_nova_by_description(description)
    if nova is not None:
        return pd.Series([nova, "description: " + reason])

    # fallback to group
    nova, reason = match_nova_by_group(group, description)
    if nova is not None:
        return pd.Series([nova, "group: " + reason])

    return pd.Series([None, None])


In [None]:
intake_df[["NOVA_step1", "match_reason"]] = intake_df.apply(match_nova, axis=1)

## 1.4 Save outcome_step1

In [None]:
output_path = "/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_step1.csv"

cols_to_save = [
    "Descriptionen",
    "Descriptionen_clean",
    "Foodgroupen",
    "Foodgroupen_clean",
    "NOVA_step1",
    "match_reason"
]
intake_df[cols_to_save].to_csv(output_path, index=False)


# 2. TF-IDF Matching

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 2.1: Merge three NOVA control files and clean them
ndns_df['source'] = 'NDNS'
giulia_df['source'] = 'Giulia'
off_df['source'] = 'OFF'

nova_all = pd.concat([ndns_df, giulia_df, off_df], axis=0)
nova_all = nova_all.drop_duplicates(subset='FoodName_clean').reset_index(drop=True)

# Step 2.2: Select only the samples missing from NOVA_step1
to_match_mask = intake_df['NOVA_step1'].isna()
intake_tfidf_df = intake_df[to_match_mask].copy()

# Step 2.3: TF-IDF vectorization and similarity calculation
vectorizer = TfidfVectorizer()
tfidf_intake = vectorizer.fit_transform(intake_tfidf_df['Descriptionen_clean'])
tfidf_nova = vectorizer.transform(nova_all['FoodName_clean'])

cos_sim = cosine_similarity(tfidf_intake, tfidf_nova)
best_match_idx = cos_sim.argmax(axis=1)
best_scores = cos_sim.max(axis=1)

# Step 2.4: Only keep the matching results with score >= 0.85
score_threshold = 0.85
valid_mask = best_scores >= score_threshold

intake_tfidf_df.loc[valid_mask, 'TFIDF_score'] = best_scores[valid_mask]
intake_tfidf_df.loc[valid_mask, 'TFIDF_match_name'] = nova_all.iloc[best_match_idx[valid_mask]]['FoodName_clean'].values
intake_tfidf_df.loc[valid_mask, 'NOVA_step2'] = nova_all.iloc[best_match_idx[valid_mask]]['NOVA'].values
intake_tfidf_df.loc[valid_mask, 'Match_source'] = nova_all.iloc[best_match_idx[valid_mask]]['source'].values

# Step 2.5: Merge the results back to the original intake_df
intake_df.loc[intake_tfidf_df.index, 'TFIDF_score'] = intake_tfidf_df['TFIDF_score']
intake_df.loc[intake_tfidf_df.index, 'TFIDF_match_name'] = intake_tfidf_df['TFIDF_match_name']
intake_df.loc[intake_tfidf_df.index, 'NOVA_step2'] = intake_tfidf_df['NOVA_step2']
intake_df.loc[intake_tfidf_df.index, 'Match_source'] = intake_tfidf_df['Match_source']


In [None]:
output_path = "/content/drive/MyDrive/UPF-HFI/outcome/intake_with_step2_tfidf.csv"

columns_to_save = [
    'Descriptionen', 'Descriptionen_clean',
    'NOVA_step1', 'match_reason',
    'NOVA_step2', 'TFIDF_score', 'TFIDF_match_name', 'Match_source'
]
intake_df[columns_to_save].to_csv(output_path, index=False)


# 3. SBERT

In [None]:
from sentence_transformers import SentenceTransformer, util

# Step 3.1: Filtering samples not matched by Step1 and Step2
mask_sbert = intake_df['NOVA_step1'].isna() & intake_df['NOVA_step2'].isna()
intake_sbert_df = intake_df[mask_sbert].copy()

# Step 3.2: Loading the SBERT Model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Step 3.3: Encoding into sentence vectors
intake_embeddings = model.encode(intake_sbert_df['Descriptionen_clean'].tolist(), convert_to_tensor=True)
nova_embeddings = model.encode(nova_all['FoodName_clean'].tolist(), convert_to_tensor=True)

# Step 3.4: Similarity Matrix
cos_scores = util.pytorch_cos_sim(intake_embeddings, nova_embeddings)
best_match_scores, best_match_indices = cos_scores.max(dim=1)

# Step 3.5: Threshold control
threshold = 0.8
score_array = best_match_scores.cpu().numpy()
index_array = best_match_indices.cpu().numpy()

# High score matching rows
high_mask = score_array >= threshold
low_mask = ~high_mask

# Fill in the high score match results
intake_sbert_df.loc[high_mask, 'SBERT_score'] = score_array[high_mask]
intake_sbert_df.loc[high_mask, 'SBERT_match_name'] = nova_all.iloc[index_array[high_mask]]['FoodName_clean'].values
intake_sbert_df.loc[high_mask, 'NOVA_step3'] = nova_all.iloc[index_array[high_mask]]['NOVA'].values
intake_sbert_df.loc[high_mask, 'SBERT_match_source'] = nova_all.iloc[index_array[high_mask]]['source'].values

# For low score matches, only the match information is recorded, but NOVA is not populated
intake_sbert_df.loc[low_mask, 'SBERT_score'] = score_array[low_mask]
intake_sbert_df.loc[low_mask, 'SBERT_match_name'] = nova_all.iloc[index_array[low_mask]]['FoodName_clean'].values
intake_sbert_df.loc[low_mask, 'SBERT_flag'] = 'low_confidence'

# Merge back to master table
intake_df.loc[intake_sbert_df.index, ['SBERT_score', 'SBERT_match_name', 'NOVA_step3', 'SBERT_match_source', 'SBERT_flag']] = \
    intake_sbert_df[['SBERT_score', 'SBERT_match_name', 'NOVA_step3', 'SBERT_match_source', 'SBERT_flag']]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
output_path = "/content/drive/MyDrive/UPF-HFI/outcome/intake_with_step3_sbert.csv"

columns_to_save = [
    'Descriptionen', 'Descriptionen_clean',
    'NOVA_step1', 'match_reason',
    'NOVA_step2', 'TFIDF_score', 'TFIDF_match_name',
    'NOVA_step3', 'SBERT_score', 'SBERT_match_name',
    'Match_source', 'SBERT_match_source', 'SBERT_flag'
]
intake_df[columns_to_save].to_csv(output_path, index=False)


In [None]:
# Calculate the number of unmatched samples
unmatched_mask = (
    intake_df['NOVA_step1'].isna() &
    intake_df['NOVA_step2'].isna() &
    intake_df['NOVA_step3'].isna()
)
unmatched_count = unmatched_mask.sum()

# Total sample size
total_count = len(intake_df)
matched_count = total_count - unmatched_count
matched_percent = matched_count / total_count * 100

print(f"Number of matched samples:{matched_count} / {total_count}")
print(f"Match Coverage:{matched_percent:.2f}%")
print(f"Number of samples not yet matched:{unmatched_count}")


Number of matched samples:18175 / 22217
Match Coverage:81.81%
Number of samples not yet matched:4042


# 4. Validation conflicts

Check if the samples matched successfully in Step 2 (TF-IDF) and Step 3 (SBERT) have different NOVA results for matching in the two models.

In [None]:
# Comparing the NOVA of Step2 and Step3
conflict_df = intake_df[
    intake_df['NOVA_step2'].notna() &
    intake_df['NOVA_step3'].notna() &
    (intake_df['NOVA_step2'] != intake_df['NOVA_step3'])
].copy()

# Add a conflict description column
conflict_df['Conflict_type'] = 'Step2 vs Step3 NOVA mismatch'

print(f'Model conflict analysis only completed, a total of {len(conflict_df)} samples of NOVA classification conflicts for TF-IDF vs SBERT were found.')


Model conflict analysis only completed, a total of 0 samples of NOVA classification conflicts for TF-IDF vs SBERT were found.


# 5. Combine all of the above

In [None]:
print(intake_df.columns.tolist())


['SurveyID', 'UserID', 'Source', 'Starttime', 'Submissiontime', 'Timetocomplete', 'Cookingoilused', 'Diet', 'Foodamount', 'Reasonforunusualfoodamount', 'Proxy', 'ProxyIssues', 'MealIndex', 'MealID', 'Mealname', 'Mealtime', 'Foodsource', 'FoodIndex', 'Searchterm', 'FoodID', 'Intake24foodcode', 'Descriptionen', 'Descriptionlocal', 'Nutrienttablename', 'Nutrienttablecode', 'Foodgroupcode', 'Foodgroupen', 'Foodgrouplocal', 'Readymeal', 'Brand', 'Asservedweightfactor', 'Servingsizegml', 'Servingimage', 'Leftoversgml', 'Leftoversimage', 'Portionsizegml', 'Reasonableamount', 'MissingfoodID', 'Missingfooddescription', 'Missingfoodportionsize', 'Missingfoodleftovers', 'Subgroupcode', 'Water', 'Totalnitrogen', 'Nitrogenconversionfactor', 'Protein', 'Fat', 'Carbohydrate', 'Energykcal', 'EnergykJ', 'Alcohol', 'Englystfibre', 'Starch', 'Totalsugars', 'AOAC', 'Nonmilkextrinsicsugars', 'Intrinsicandmilksugars', 'Glucose', 'Fructose', 'Maltose', 'Lactose', 'Sucrose', 'OthersugarsUK', 'FSTablesugar', '

In [None]:
rename_map = {
    "NOVA_step1": "NOVA_keyword",
    "match_reason": "Keyword_reason",
    "NOVA_step2": "NOVA_tfidf",
    "TFIDF_match_name": "TFIDF_match",
    "Match_source": "TFIDF_source",
    "NOVA_step3": "NOVA_sbert",
    "SBERT_match_name": "SBERT_match",
    "SBERT_match_source": "SBERT_source"
}
intake_df.rename(columns=rename_map, inplace=True)

# Step 5.1 Checking for contradictory matches between models
intake_df['Model_disagree_flag'] = (
    intake_df['TFIDF_match'].notna() &
    intake_df['SBERT_match'].notna() &
    (intake_df['TFIDF_match'] != intake_df['SBERT_match'])
)

intake_df['NOVA_disagree_flag'] = (
    intake_df['Model_disagree_flag'] &
    (intake_df['NOVA_tfidf'] != intake_df['NOVA_sbert'])
)

intake_df['Decision_flag'] = intake_df['NOVA_disagree_flag'].map(
    lambda x: "manual_review" if x else "auto_accept"
)

# Step 5.2 Select Final Trusted Value: Keyword > SBERT > TF-IDF
final_mask = intake_df['Decision_flag'] == "auto_accept"
intake_df['NOVA_final'] = None

intake_df.loc[final_mask & intake_df['NOVA_keyword'].notna(), 'NOVA_final'] = intake_df['NOVA_keyword']
intake_df.loc[final_mask & intake_df['NOVA_final'].isna() & intake_df['NOVA_sbert'].notna(), 'NOVA_final'] = intake_df['NOVA_sbert']
intake_df.loc[final_mask & intake_df['NOVA_final'].isna() & intake_df['NOVA_tfidf'].notna(), 'NOVA_final'] = intake_df['NOVA_tfidf']

# Step 5.3 Add column
def get_matched_name(row):
    if pd.notna(row['NOVA_keyword']):
        return row['TFIDF_match']
    elif pd.notna(row['NOVA_sbert']):
        return row['SBERT_match']

def get_final_source(row):
    if pd.notna(row['NOVA_keyword']):
        return 'keyword'
    elif pd.notna(row['NOVA_sbert']):
        return row['SBERT_source']
    elif pd.notna(row['NOVA_tfidf']):
        return row['TFIDF_source']
    else:
        return None

def get_final_reason(row):
    if pd.notna(row['NOVA_keyword']):
        return row['Keyword_reason']
    elif pd.notna(row['NOVA_sbert']):
        return 'SBERT matched'
    elif pd.notna(row['NOVA_tfidf']):
        return 'TF-IDF matched'
    else:
        return 'Unmatched'

intake_df['Matched_name'] = intake_df.apply(get_matched_name, axis=1)
intake_df['Final_source'] = intake_df.apply(get_final_source, axis=1)
intake_df['Final_reason'] = intake_df.apply(get_final_reason, axis=1)

# Step 5.4 Save the final result
final_cols = [
    "Descriptionen", "Descriptionen_clean",
    "NOVA_keyword", "Keyword_reason",
    "NOVA_tfidf", "TFIDF_score", "TFIDF_match", "TFIDF_source",
    "NOVA_sbert", "SBERT_score", "SBERT_match", "SBERT_source",
    "Model_disagree_flag", "NOVA_disagree_flag", "Decision_flag",
    "NOVA_final","Matched_name","Final_source", "Final_reason"
]

intake_df[final_cols].to_csv("/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_final.csv", index=False)


In [None]:
slim_cols = [
    "Descriptionen",
    "NOVA_final",
    "Matched_name",
    "Final_reason"
]

intake_df[slim_cols].to_csv(
    "/content/drive/MyDrive/UPF-HFI/outcome/intake_with_nova_final_slim.csv",
    index=False
)


# 6. Unmatched-18.19%(4042)

In [None]:
# Add the Failure_reason column to the summary table
def get_failure_reason(row):
    if pd.notna(row['NOVA_final']):
        return None
    elif row['Decision_flag'] == 'manual_review':
        return 'Model disagreement'
    elif pd.isna(row['NOVA_keyword']) and pd.isna(row['NOVA_tfidf']) and pd.isna(row['NOVA_sbert']):
        return 'No match found'
    else:
        return 'Conflict unresolved'

intake_df['Failure_reason'] = intake_df.apply(get_failure_reason, axis=1)

# Filter out unmatched rows
unmatched_df = intake_df[intake_df['NOVA_final'].isna()].copy()

# Save
unmatched_df[["Descriptionen", "Failure_reason"]].to_csv(
    "/content/drive/MyDrive/UPF-HFI/outcome/intake_unmatched_summary.csv",
    index=False
)


In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 6.1: Get unmatched samples
unmatched_df = intake_df[intake_df['NOVA_final'].isna()].copy()

# Step 6.2: TF-IDF fallback（score ≥ 0.6）
vectorizer = TfidfVectorizer()
tfidf_intake = vectorizer.fit_transform(unmatched_df['Descriptionen_clean'])
tfidf_nova = vectorizer.transform(nova_all['FoodName_clean'])
cos_sim = cosine_similarity(tfidf_intake, tfidf_nova)

tfidf_best_idx = cos_sim.argmax(axis=1)
tfidf_best_scores = cos_sim.max(axis=1)
tfidf_thresh = 0.6
tfidf_mask = tfidf_best_scores >= tfidf_thresh

unmatched_df.loc[tfidf_mask, 'NOVA_final_fallback'] = nova_all.iloc[tfidf_best_idx[tfidf_mask]]['NOVA'].values
unmatched_df.loc[tfidf_mask, 'Fallback_source'] = nova_all.iloc[tfidf_best_idx[tfidf_mask]]['source'].values
unmatched_df.loc[tfidf_mask, 'Fallback_match_name'] = nova_all.iloc[tfidf_best_idx[tfidf_mask]]['FoodName_clean'].values
unmatched_df.loc[tfidf_mask, 'Fallback_reason'] = 'TF-IDF fallback'

# Step 6.3: SBERT fallback match (score ≥ 0.6, TF-IDF misses only)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
sbert_input = unmatched_df['Descriptionen_clean'].tolist()
sbert_embed_intake = model.encode(sbert_input, convert_to_tensor=True)
sbert_embed_nova = model.encode(nova_all['FoodName_clean'].tolist(), convert_to_tensor=True)

sbert_sim = util.pytorch_cos_sim(sbert_embed_intake, sbert_embed_nova)
sbert_best_scores, sbert_best_idx = sbert_sim.max(dim=1)

sbert_thresh = 0.6

bool_mask = sbert_best_scores.cpu().numpy() >= sbert_thresh
sbert_mask = pd.Series(bool_mask, index=unmatched_df.index)

# Get matches with index alignment
matched_indices = sbert_best_idx.cpu().numpy()[sbert_mask.values]
matched_rows = unmatched_df[sbert_mask]

# Fill in the fallback match result
unmatched_df.loc[matched_rows.index, 'NOVA_final_fallback'] = nova_all.iloc[matched_indices]['NOVA'].values
unmatched_df.loc[matched_rows.index, 'Fallback_source'] = nova_all.iloc[matched_indices]['source'].values
unmatched_df.loc[matched_rows.index, 'Fallback_match_name'] = nova_all.iloc[matched_indices]['FoodName_clean'].values
unmatched_df.loc[matched_rows.index, 'Fallback_reason'] = 'SBERT fallback'

In [None]:
# Step 6.4: Filter out successful fallback rows and save them
fallback_df = unmatched_df[unmatched_df['NOVA_final_fallback'].notna()][[
    "Descriptionen", "NOVA_final_fallback", "Fallback_source","Fallback_match_name","Fallback_reason"
]]

fallback_df.to_csv(
    "/content/drive/MyDrive/UPF-HFI/outcome/intake_unmatched_fallback_matched.csv",
    index=False
)

print(f'Fallback match complete, total {len(fallback_df)} strips unmatched sample')


Fallback match complete, total 3809 strips unmatched sample


In [None]:
# Step6.5: Getting unmatched samples of fallback failures
failed_fallback_df = unmatched_df[unmatched_df['NOVA_final_fallback'].isna()]

failure_counts = (
    failed_fallback_df['Descriptionen']
    .value_counts()
    .reset_index()
)
failure_counts.columns = ['Descriptionen', 'Count']

# Export
failure_counts.to_csv(
    "/content/drive/MyDrive/UPF-HFI/outcome/fallback_failure_counts.csv",
    index=False
)

# Displayed
print("Description of all fallback failure samples:")
print(failure_counts.head(30))


Description of all fallback failure samples:
                                        Descriptionen  Count
0                                        Vimto, still     61
1                                  Pom Bears (crisps)     52
2                                            Pringles     18
3                                         Cheesy mash     15
4                                               Fanta     10
5                             Magnum classic or white     10
6                                            Smarties      8
7                        Lollipops (e.g. Chupa Chups)      7
8                                   Fox's party rings      6
9                            Walker Sensations crisps      6
10                                          Poppadums      5
11                                         Fanta Zero      4
12                                     Malteser bunny      3
13                                      7 up / Sprite      3
14                                      