In [29]:
import shutil
import sys
import uuid
from pathlib import Path
import pandas as pd

In [30]:
from ipynb.fs.full.ingredient_embeddings_similarity import exported as Ingredients

In [31]:
# Read Dataset Files
with open("./Food Datasets/Recipes5k/annotations/train_images.txt", "r") as file:
    train_images_txt = file.readlines()

with open("./Food Datasets/Recipes5k/annotations/train_labels.txt", "r") as file:
    train_labels_txt = file.readlines()

with open("./Food Datasets/Recipes5k/annotations/test_images.txt", "r") as file:
    test_images_txt = file.readlines()

with open("./Food Datasets/Recipes5k/annotations/test_labels.txt", "r") as file:
    test_labels_txt = file.readlines()

with open("./Food Datasets/Recipes5k/annotations/val_images.txt", "r") as file:
    val_images_txt = file.readlines()

with open("./Food Datasets/Recipes5k/annotations/val_labels.txt", "r") as file:
    val_labels_txt = file.readlines()

with open("./Food Datasets/Recipes5k/annotations/ingredients_simplified_Recipes5k.txt", "r") as file:
    ingredients_simplified_txt = file.readlines()

In [32]:
# Define Filtering and Mapping (defined to standardize ingredient names.)
filtered = set(
    [
        "& half",
        "asian",
        "baking",
        "balls",
        "barbecue",
        "bbq",
        "blackening",
        "bulb",
        "crisps",
        "chop",
        "coarse",
        "concentrate",
        "double",
        "dogs",
        "dress russian",
        "fat",
        "fri",
        "fry",
        "heart",
        "italian",
        "jack",
        "kahl/u00faa",
        "ling",
        "meal",
        "mexican",
        "mie",
        "min",
        "nonstick spray",
        "non stick spray",
        "organic",
        "pan drippings",
        "p/u00e2t\\u00e9",
        "port",
        "protein",
        "preserves" "regular",
        "riso",
        "rocket",
        "rounds",
        "rub",
        "squash",
        "smoke",
        "spring",
        "sponge",
        "squirt",
        "stew",
        "stout",
        "v8",
        "vitamin",
        "well",
        "whipped",
        "whipped topping",
        "whipping",
    ]
)
mapped = {
    "abura age": "tofu",
    "aburage": "tofu",
    "ahi": "ahi tuna",
    "aioli": "garlic",
    "angel hair": "spaghetti",
    "allspice": "pepper",
    "asiago": "asiago cheese",
    "ancho": "anchovies",
    "anchovy": "anchovies",
    "bay": "bay leaf",
    "ball park franks": "frankfurters",
    "bawang goreng": "onion",
    "beet": "beets",
    "beans": "bean",
    "biscuit": "biscuits",
    "brewed espresso": "espresso",
    "bottom round": "bottom round steak",
    "bulk italian sausag": "italian sausag",
    "ch/u00e8vre": "chevre cheese",
    "chevre": "chevre cheese",
    "chilli": "chili",
    "chuck": "chuck roast",
    "clam": "clams",
    "clove": "cloves",
    "cola": "coke",
    "country crock/u00ae spread": "butter",
    "cr/u00e8me de menthe": "creme de menthe",
    "cr\\u00e8me fra\\u00eeche": "creme fraiche",
    "crackers": "crackers",
    "cracker meal": "crackers",
    "corn-on-the-cob": "corncobs",
    "cornflake": "cornflakes",
    "cold water": "water",
    "dijon": "dijon mustard",
    "deveined shrimp": "shrimp",
    "dutch process cocoa": "cocoa",
    "dutch-processed cocoa": "cocoa",
    "fettucine": "fettuccine",
    "filo dough": "phyllo dough",
    "fillo dough": "phyllo dough",
    "flank": "flank steak",
    "flax meal": "flaxseed",
    "flax": "flaxseed",
    "frankfurter": "frankfurters",
    "gew\\u00fcrztraminer": "gewurztraminer",
    "gelatine": "gelatin",
    "green chile": "green chilli",
    "green chiles": "green chilli",
    "green chilies": "green chilli",
    "grit": "grits",
    "gyoza skins": "gyoza wrappers",
    "hellmann' best food mayonnais": "mayonnaise",
    "hellmann' best food real mayonnais": "mayonnaise",
    "hoagi rolls": "hoagi roll",
    "kampyo": "dried gourd strips",
    "kanpyo": "dried gourd strips",
    "kecap manis": "indonesian sweet soy sauce",
    "kernel corn": "corn kernel",
    "kim chee": "kimchi",
    "lady fingers": "ladyfingers",
    "liqueur": "liquor",
    "luke warm water": "water",
    "mccormick\\u00ae vanilla": "vanilla extract",
    "mirin": "rice wine",
    "mission\\u00ae gluten tortillas": "tortillas",
    "neufch\\u00e2tel": "neufchatel cheese",
    "mussels": "mussel",
    "old el paso\\u2122 green chiles": "green chiles",
    "oreo\\u00ae cookies": "oreo cookies",
    "pillsbury\\u2122 crescent dinner rolls": "dinner rolls",
    "phyllo": "phyllo dough",
    "poblano": "poblano chiles",
    "pretzels": "pretzel",
    "quickcooking grits": "grits",
    "rose water": "rosewater",
    "reese's": "reese's peanut butter cups",
    "sandwiches": "sandwich",
    "sushi grade tuna": "ahi tuna",
    "serrano chilies": "serrano chile",
    "shell": "shells",
    "shell-on shrimp": "shrimp",
    "snickers": "snickers bars",
    "stouffer''s lasagna": "lasagna",
    "store-bought ravioli": "ravioli",
    "tonkatsu": "pork cutlet",
    "top ramen": "ramen",
    "thai chile": "thai chili",
    "turbinado": "brown sugar",
    "uncook shrimp": "shrimp",
    "waffles": "waffle",
    "warm water": "water",
    "won ton wrappers": "wonton wrappers",
    "wonton skins": "wonton wrappers",
    "yoghurt": "yogurt",
    "wish chunki blue chees dress": "blue cheese dressing",
    "yellow food coloring": "food colouring",
}


def get_unique_ingredients(ingredient_list):
    unique_ingredient = set()
    for x in ingredient_list:
        unique_ingredient.update(x.strip().split(","))
    return [*unique_ingredient]  # return a list by unpacking all values in set

# Filters and maps ingredient names.
def preprocess_ingredients(ingredient, filtered=filtered, mapped=mapped):
    if ingredient in filtered:
        return None
    mapped_name = mapped.get(ingredient)
    if mapped_name is None:
        return ingredient
    return mapped_name

# Preprocesses all ingredients in a list.
def preprocess_all_ingredients(ingredients_list):
    unique_ingredient = set()
    for ingredient in ingredients_list:
        new_name = preprocess_ingredients(ingredient)
        if new_name is not None:
            unique_ingredient.add(new_name)
    return [*unique_ingredient]

# Builds a nutrition table by embedding ingredients and finding their nutritional values.
def build_nutrition_table(ingredients_list):
    rows = []
    for ingredient in ingredients_list:
        embedding = Ingredients.embed([ingredient])
        (
            fndds_similarity,
            fndds_index,
            fndds_category,
        ) = Ingredients.get_most_similar_from_fndds(embedding)
        (
            nutrition5k_similarity,
            nutrition5k_index,
        ) = Ingredients.get_most_similar_from_nutrition5k(embedding)
        row = []
        if fndds_similarity > 0.7 and fndds_similarity > nutrition5k_similarity:
            ingredient_nutrition = Ingredients.get_ingredient_nutrient_from_fndds(
                fndds_category, fndds_index
            )
            row.extend(
                [
                    ingredient,
                    ingredient_nutrition["Energy (kcal)"],
                    ingredient_nutrition["Carbohydrate (g)"],
                    ingredient_nutrition["Protein (g)"],
                    ingredient_nutrition["Total Fat (g)"],
                ]
            )
        elif nutrition5k_similarity > 0.7:
            ingredient_nutrition = Ingredients.get_ingredient_nutrient_from_nutrition5k(
                nutrition5k_index
            )
            row.extend(
                [
                    ingredient,
                    ingredient_nutrition["cal/g"],
                    ingredient_nutrition["carb(g)"],
                    ingredient_nutrition["protein(g)"],
                    ingredient_nutrition["fat(g)"],
                ]
            )
        else:
            row.extend([ingredient, None, None, None, None])
        rows.append(row)
    return pd.DataFrame(
        rows,
        columns=["Ingredient", "Calorie (kcal)", "Carbs (g)", "Protein (g)", "Fat (g)"],
    )

In [33]:
# Preprocess Ingredients and Build Nutrition Table
unique_ingredients = get_unique_ingredients(ingredients_simplified_txt)
unique_ingredients = preprocess_all_ingredients(unique_ingredients)

In [34]:
nutrition_table = build_nutrition_table(unique_ingredients)

In [35]:
# filter out those without nutrition values
notnull_mask = pd.notnull(nutrition_table["Calorie (kcal)"])
nutrition_table = nutrition_table[notnull_mask]

In [36]:
nutrition_table

Unnamed: 0,Ingredient,Calorie (kcal),Carbs (g),Protein (g),Fat (g)
0,kielbasa,3.330,0.0285,0.1372,0.2918
3,oregano,2.700,0.6900,0.0900,0.0400
7,calamari,1.751,0.0810,0.1780,0.0700
10,prime rib,3.513,0.0000,0.2270,0.2810
13,lemon,0.290,0.0900,0.0110,0.0030
...,...,...,...,...,...
885,turnip,0.217,0.0510,0.0070,0.0010
886,sugar,3.880,1.0000,0.0000,0.0000
887,grapes,0.690,0.1800,0.0070,0.0020
890,goji berries,0.570,0.1400,0.0070,0.0030


In [37]:
# Moves images to a destination directory with a unique filename.
def move_image(path):
    src_dir = Path("./Food Datasets/Recipes5k/images")
    parent_dest_dir = Path("./Food Datasets/final-dataset/images")
    file_name = str(int(uuid.uuid4()))
    src_path = src_dir / path.strip()
    dir_name = src_path.parent.name
    # check if dir containing file exists in destination
    dest_dir = parent_dest_dir / dir_name
    if not dest_dir.exists():
        dest_dir.mkdir()
    dest_path = dest_dir / (file_name + src_path.suffix)
    shutil.copy(src_path, dest_path)
    return (dir_name, file_name)

# Calculates average nutrition values for a dish.
def get_dish_nutrition(index, ingredients_lookup, nutrition_table):
    total_carbs = 0
    total_protein = 0
    total_fat = 0
    total_calorie = 0
    total_ingredient = 0
    ingredients = ingredients_lookup[index].strip()
    ingredients_list = ingredients.split(",")
    final_ingredients = []
    for ingredient in ingredients_list:
        ingredient_name = preprocess_ingredients(ingredient)
        if ingredient_name is None:
            continue
        final_ingredients.append(ingredient_name)
        row_in_nutrition_table = nutrition_table[
            nutrition_table["Ingredient"] == ingredient_name
        ]
        if len(row_in_nutrition_table) == 0:
            continue
        total_ingredient += 1
        total_calorie += row_in_nutrition_table["Calorie (kcal)"].values[0]
        total_carbs += row_in_nutrition_table["Carbs (g)"].values[0]
        total_protein += row_in_nutrition_table["Protein (g)"].values[0]
        total_fat += row_in_nutrition_table["Fat (g)"].values[0]
    return [
        total_calorie / total_ingredient,
        total_carbs / total_ingredient,
        total_protein / total_ingredient,
        total_fat / total_ingredient,
        ",".join(final_ingredients),
    ]

# Processes images and their metadata.
def preprocess(image_path, labels, ingredients_lookup, nutrition_table):
    rows = []
    for path, label in zip(image_path, labels):
        label = int(label.strip())
        row = []
        dir_name, file_name = move_image(path)
        nutrient_data = get_dish_nutrition(label, ingredients_lookup, nutrition_table)
        row.append(file_name)
        row.append(dir_name)
        row.extend(nutrient_data)
        rows.append(row)
    return pd.DataFrame(
        rows,
        columns=[
            "ID/File Name",
            "Category",
            "Calorie(kcal)",
            "Carbohydrate(g)",
            "Protein(g)",
            "Fat(g)",
            "Ingredients",
        ],
    )

In [39]:
cleaned_metadata = preprocess(
    train_images_txt, train_labels_txt, ingredients_simplified_txt, nutrition_table
)

In [40]:
cleaned_metadata_2 = preprocess(
    test_images_txt, test_labels_txt, ingredients_simplified_txt, nutrition_table
)

In [41]:
cleaned_metadata_3 = preprocess(
    val_images_txt, val_labels_txt, ingredients_simplified_txt, nutrition_table
)

In [42]:
cleaned_metadata = pd.concat([cleaned_metadata, cleaned_metadata_2, cleaned_metadata_3])

In [43]:
cleaned_metadata["ID/File Name"] = cleaned_metadata["ID/File Name"].apply(
    lambda x: x + ".jpg"
)

In [44]:
cleaned_metadata

Unnamed: 0,ID/File Name,Category,Calorie(kcal),Carbohydrate(g),Protein(g),Fat(g),Ingredients
0,12790897753363414658195480905645925506.jpg,apple_pie,3.439286,0.271571,0.016143,0.260286,"flour,salt,oil,water,apple,sugar,cinnamon,butter"
1,130391530038831659140352973468628851662.jpg,apple_pie,2.844429,0.359086,0.029300,0.150071,"shells,pie,sugar,flour,cinnamon,apple,lemon,bu..."
2,289041187288056903905112987392991381035.jpg,apple_pie,3.281222,0.265778,0.036889,0.241889,"apple,lemon,sugar,flour,cinnamon,nut,butter,sa..."
3,12803641515665126416334742068395405336.jpg,apple_pie,2.639375,0.328700,0.024512,0.140875,"pie,apple,sugar,corn starch,cinnamon,lemon,but..."
4,238029147288329584733917031427917384382.jpg,apple_pie,3.643333,0.316100,0.046350,0.260083,"apple,brown sugar,butter,cinnamon,pepper,nut,pie"
...,...,...,...,...,...,...,...
629,28003108364530691269219973631342675366.jpg,chocolate_ice_cream,2.273833,0.373667,0.067833,0.095500,"sugar,milk,salt,cocoa,egg,chocolate,vanilla"
630,71991914496439841621598060354510261682.jpg,chocolate_ice_cream,2.209000,0.541000,0.077667,0.051333,"milk,sugar,cocoa,vanilla"
631,307275286815086233143550949257836595528.jpg,chocolate_ice_cream,2.833000,0.562150,0.099200,0.080225,"cocoa,egg,sugar,vanilla,oreo cookies"
632,174244636396989441835723233953681522019.jpg,chocolate_ice_cream,1.947400,0.329400,0.096600,0.072800,"sugar,egg,egg,cocoa,milk,vanilla"


### Writing relevant metadata into the final-dataset/metadata folder

In [45]:
# write to csv
# This code reads and preprocesses the Recipes5k dataset, standardizes ingredient names, calculates nutritional values, moves images to a new directory, and saves the cleaned metadata to CSV files.
cleaned_metadata.to_csv(
    "./Food Datasets/final-dataset/metadata/recipes5k_metadata.csv",
    sep="\t",
    index=False,
    float_format="%.4f",
)

In [46]:
nutrition_table.to_csv(
    "./Food Datasets/final-dataset/metadata/recipes5k_ingredients_nutrition_table.csv",
    sep="\t",
    index=False,
    float_format="%.4f",
)