In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Load the OpenFoodFacts product export (tab-separated, despite the .csv suffix).
def _split_tag_list(raw):
    """Turn the raw text of a tag cell into a list of tags.

    Leading/trailing ``[``, ``]`` and ``'`` characters are stripped first,
    then leading/trailing double quotes, and the remainder is split on commas.
    An empty cell therefore becomes ``[""]`` rather than a missing value.
    """
    return raw.strip("[]'").strip('"').split(",")


df2 = pd.read_csv(
    "en.openfoodfacts.org.products.csv",
    sep="\t",
    # Both tag columns are parsed into Python lists while the file is read.
    converters={
        "ingredients_tags": _split_tag_list,
        "ingredients_analysis_tags": _split_tag_list,
    },
)

In [6]:
# Remove unnecessary columns
# `df2` is overwritten by this cell, so a second run would find the columns
# already gone. errors="ignore" drops only the labels that are still present,
# which keeps the cell safe to re-run instead of raising KeyError.
df2 = df2.drop(columns=["code", "url", "creator", "created_t",
                        "last_modified_t", "last_modified_datetime",
                        "last_modified_by", "packaging", "packaging_tags",
                        "packaging_en", "packaging_text", "brands", "brands_tags",
                        "categories", "categories_tags", "origins", "origins_tags",
                        "origins_en", "manufacturing_places", "manufacturing_places_tags",
                        "labels", "labels_tags", "labels_en", "emb_codes", "emb_codes_tags",
                        "first_packaging_code_geo", "cities", "cities_tags", "purchase_places",
                        "stores", "countries", "countries_tags", "countries_en",
                        "ingredients_text", "allergens", "allergens_en", "traces",
                        "traces_tags", "traces_en", "additives_n", "additives",
                        "additives_tags", "additives_en", "nutriscore_score",
                        "nutriscore_grade", "nova_group", "pnns_groups_1",
                        "pnns_groups_2", "states", "states_tags", "states_en",
                        "brand_owner", "ecoscore_score", "ecoscore_grade",
                        "owner", "data_quality_errors_tags", "unique_scans_n",
                        "popularity_tags", "completeness", "last_image_t",
                        "last_image_datetime", "image_url", "image_small_url",
                        "image_ingredients_url", "image_ingredients_small_url",
                        "image_nutrition_url", "image_nutrition_small_url",
                        "butyric-acid_100g", "caproic-acid_100g", "caprylic-acid_100g",
                        "capric-acid_100g", "lauric-acid_100g", "myristic-acid_100g",
                        "palmitic-acid_100g", "stearic-acid_100g", "arachidic-acid_100g",
                        "behenic-acid_100g", "lignoceric-acid_100g", "cerotic-acid_100g",
                        "montanic-acid_100g", "melissic-acid_100g", "abbreviated_product_name",
                        "cocoa_100g", "chlorophyl_100g", "carbon-footprint_100g",
                        "carbon-footprint-from-meat-or-fish_100g", "nutrition-score-fr_100g",
                        "nutrition-score-uk_100g", "glycemic-index_100g", "water-hardness_100g",
                        "choline_100g", "phylloquinone_100g", "beta-glucan_100g", "inositol_100g",
                        "carnitine_100g", "sulphate_100g", "nitrate_100g", "cholesterol_100g",
                        "added-sugars_100g", "sucrose_100g", "glucose_100g", "fructose_100g",
                        "lactose_100g", "maltose_100g", "maltodextrins_100g", "starch_100g",
                        "polyols_100g", "erythritol_100g", "fiber_100g", "soluble-fiber_100g",
                        "insoluble-fiber_100g", "casein_100g", "serum-proteins_100g", "nucleotides_100g",
                        "added-salt_100g", "alcohol_100g", "vitamin-a_100g", "beta-carotene_100g",
                        "vitamin-d_100g", "vitamin-e_100g", "vitamin-k_100g", "vitamin-c_100g",
                        "vitamin-b1_100g", "vitamin-b2_100g", "vitamin-pp_100g", "vitamin-b6_100g",
                        "vitamin-b9_100g", "folates_100g", "vitamin-b12_100g", "biotin_100g",
                        "pantothenic-acid_100g", "silica_100g", "bicarbonate_100g",
                        "potassium_100g", "chloride_100g", "calcium_100g", "phosphorus_100g",
                        "iron_100g", "magnesium_100g", "zinc_100g", "copper_100g",
                        "manganese_100g", "fluoride_100g", "selenium_100g", "chromium_100g",
                        "molybdenum_100g", "iodine_100g", "caffeine_100g", "taurine_100g",
                        "ph_100g", "fruits-vegetables-nuts_100g", "fruits-vegetables-nuts-dried_100g",
                        "fruits-vegetables-nuts-estimate_100g", "fruits-vegetables-nuts-estimate-from-ingredients_100g",
                        "collagen-meat-protein-ratio_100g", "saturated-fat_100g",
                        "unsaturated-fat_100g", "monounsaturated-fat_100g", "polyunsaturated-fat_100g",
                        "omega-3-fat_100g", "alpha-linolenic-acid_100g", "eicosapentaenoic-acid_100g",
                        "docosahexaenoic-acid_100g", "omega-6-fat_100g", "linoleic-acid_100g",
                        "arachidonic-acid_100g", "gamma-linolenic-acid_100g",
                        "dihomo-gamma-linolenic-acid_100g", "omega-9-fat_100g",
                        "oleic-acid_100g", "elaidic-acid_100g", "gondoic-acid_100g",
                        "mead-acid_100g", "erucic-acid_100g", "nervonic-acid_100g",
                        "trans-fat_100g", "energy-kj_100g", "energy-kcal_100g", "energy_100g",
                        "energy-from-fat_100g", "no_nutrition_data", "created_datetime"],
              errors="ignore")

In [8]:
# Drop rows with null values for fat, protein, or carbs
# how="any" removes a row as soon as one of the three macro columns is missing,
# so every product that remains has a complete fat/protein/carbohydrate profile.
# The rows are renumbered afterwards so later cells can index by position.
df6 = df2.dropna(
    subset=["fat_100g", "proteins_100g", "carbohydrates_100g"],
    how="any",
).reset_index(drop=True)

In [13]:
# Keep only the products whose `generic_name` is filled in (that column is what
# serves as the product name here), then renumber the remaining rows from 0.
df7 = df6.loc[df6["generic_name"].notna()].reset_index(drop=True)

In [54]:
# Drop rows with no ingredients
def _has_ingredients(tags):
    """Return True when `tags` is a list containing at least one non-blank tag.

    The tag column holds Python lists (an empty source cell is parsed to
    ``[""]``), so a null check alone never matches; anything that is not a
    list, e.g. NaN, is treated as "no ingredients" as well.
    """
    if not isinstance(tags, list):
        return False
    return any(isinstance(tag, str) and tag.strip() for tag in tags)


nutrients = df7[df7["ingredients_tags"].map(_has_ingredients)].reset_index(drop=True)

In [None]:
# Remove all non-English ingredients
# Every tag list is rebuilt from the tags that start with "en:". Calling
# list.remove() inside a loop over the same list skips the element that follows
# each removal, which leaves foreign tags behind; a rebuilt list cannot.
nutrients["ingredients_tags"] = pd.Series(
    [
        [tag for tag in tags if tag.startswith("en:")]
        for tags in tqdm(nutrients["ingredients_tags"])
    ],
    index=nutrients.index,
    dtype=object,  # keep one Python list per row
)

In [None]:
# Filter again to find any stragglers
# This pass rebuilds each tag list from the tags that start with "en:", so no
# foreign tag can survive it. (Removing items from a list while looping over
# that same list skips elements, so a run of several foreign tags in a row
# would otherwise never be cleared completely.)
nutrients["ingredients_tags"] = pd.Series(
    [
        [tag for tag in tags if tag.startswith("en:")]
        for tags in tqdm(nutrients["ingredients_tags"])
    ],
    index=nutrients.index,
    dtype=object,  # keep one Python list per row
)

In [None]:
# Remove all products that now no longer have any ingredients
# A single boolean mask replaces dropping the rows one by one: each individual
# drop rebuilds the frame, which makes the row-by-row version quadratic.
has_tags = nutrients["ingredients_tags"].map(len) > 0
nutrients = nutrients[has_tags].reset_index(drop=True)

In [None]:
# Load the sensory (taste profile) database from its CSV export.
df_taste = pd.read_csv(filepath_or_buffer="20170202 Sensory database v004.csv")

In [None]:
# Remove unnecessary columns
# The result is bound back to `df_taste` instead of dropping in place, and
# errors="ignore" drops only the labels that are still present, so re-running
# this cell on the already-trimmed frame does not raise KeyError.
df_taste = df_taste.drop(
    columns=[
        # identifiers, Dutch-language fields and measurement metadata
        "Food_code", "NEVO_code", "Product_description_NL",
        "Food_group_code", "Food_group_NL", "Date",
        "Serving_methods", "Preparation_method", "Reference_control_foods",
        # the no_/sd_/se_ columns that accompany each taste score
        "no_sweet", "sd_sweet", "se_sweet",
        "no_salt", "sd_salt", "se_salt",
        "no_fat", "sd_fat", "se_fat",
        "no_umami", "sd_umami", "se_umami",
        "no_sour", "sd_sour", "se_sour",
        "no_bitter", "sd_bitter", "se_bitter",
    ],
    errors="ignore",
)

In [None]:
# The "(non) alcoholic beverages" food group is irrelevant here: keep every row
# whose Food_group_EN differs from it and renumber the result from 0.
df_taste2 = df_taste.loc[
    df_taste["Food_group_EN"].ne("(non) alcoholic beverages")
].reset_index(drop=True)

In [None]:
# Export the filtered taste profiles so they can be labeled by hand.
df_taste2.to_csv(path_or_buf="taste_profiles_unlabeled.csv")

# Manual step outside this notebook: the exported file was labeled by hand and
# unnecessary data was filtered out manually; the result was saved as
# "taste_profiles_labeled.csv".

# Read the hand-labeled file back in.
df_taste_labeled = pd.read_csv(filepath_or_buffer="taste_profiles_labeled.csv")

In [None]:
# Tidy the labeled taste profiles: drop the "Unnamed: 0" and "Product_brand"
# columns, then give the English description / food group columns shorter names.
df_taste_labeled = df_taste_labeled.drop(
    columns=["Unnamed: 0", "Product_brand"]
).rename(
    columns={
        "Product_description_EN": "Product_description",
        "Food_group_EN": "Food_group",
    }
)

In [None]:
# Work on an independent copy: a plain assignment would only create a second
# name for the same DataFrame, so columns added to `taste` later on would also
# appear in `df_taste_labeled`.
taste = df_taste_labeled.copy()

In [None]:
# Map ingredients in the nutrients data to ingredients in the taste profile data
def _simplest_matching_product(key, tag_lists):
    """Return the position of the shortest tag list that mentions `key`.

    A product matches when `key` is a substring of at least one of its
    ingredient tags. Among all matches the product with the fewest tags wins;
    ties go to the earliest product. Returns -1 when no product matches.
    """
    best_pos = -1
    best_len = None
    for pos, tags in enumerate(tag_lists):
        # Cheap length test first: only a strictly shorter list can replace the
        # current best, so longer lists never need the substring scan.
        if best_len is not None and len(tags) >= best_len:
            continue
        if any(key in tag for tag in tags):
            best_pos, best_len = pos, len(tags)
    return best_pos


# taste column -> nutrients column it is filled from
macro_columns = {
    "fat_100g": "fat_100g",
    "carbs_100g": "carbohydrates_100g",
    "proteins_100g": "proteins_100g",
}

# Start from float zeros so the per-100g values fit without a dtype change;
# rows that find no match keep 0 in all three columns.
for taste_col in macro_columns:
    taste[taste_col] = 0.0

# Pull the needed columns out once: indexing the DataFrame inside the nested
# loops is far slower than working on plain lists/arrays.
tag_lists = nutrients["ingredients_tags"].tolist()
macro_values = {col: nutrients[col].to_numpy() for col in macro_columns.values()}

for row_label, key in tqdm(taste["Key_ingredient"].items(), total=len(taste)):
    match = _simplest_matching_product(key, tag_lists)
    if match == -1:
        continue  # nothing matched; position 0 is a valid match and is kept
    for taste_col, nutrient_col in macro_columns.items():
        # .at writes straight into `taste`; chained `taste[col][i] = ...`
        # may write to a temporary object instead.
        taste.at[row_label, taste_col] = macro_values[nutrient_col][match]

In [None]:
# Keep a row when at least one of fat, protein or carbs differs from 0, i.e.
# discard the rows where all three are 0, then renumber the rows from 0.
# (A NaN compares as "not equal to 0", so rows holding NaN are kept.)
ingredients = taste.loc[
    (taste[["fat_100g", "proteins_100g", "carbs_100g"]] != 0).any(axis=1)
].reset_index(drop=True)

In [None]:
# Print every group of rows that shares the same Key_ingredient so that
# duplicates can be inspected by eye.
for key in ingredients["Key_ingredient"].unique():
    same_key = ingredients[ingredients["Key_ingredient"] == key]
    if len(same_key) > 1:
        print(same_key)

# Duplicate deletion
# NOTE(review): 32, 33 and 72 are hard-coded row labels, presumably picked from
# the printout above; they need re-checking whenever the upstream data changes.
ingredients = ingredients.drop([32, 33, 72]).reset_index(drop=True)

In [None]:
# Write the final table (taste profile plus fat/carbs/protein per ingredient) to disk.
ingredients.to_csv(path_or_buf="ingredients_with_taste_profiles_and_nutrients.csv")