In [1]:
import random
import re
import pandas as pd
import numpy as np
import json
import requests
from google.colab import files

# Config

In [2]:
originating_restaurant = "subway"
restaurant_path = f"https://github.com/IsaacFigNewton/SandoMatch/raw/refs/heads/main/sandwich-dataset/restaurant-sandos/{originating_restaurant}/{originating_restaurant}"

vegetarian_bases = {"egg", "veggie patty"}
vegan_bases = {"veggie patty"}

# Define the set of words to remove
filter_words = {"ground", "chopped", "sliced", "freshly", "fresh"}

In [3]:
# use this document to determine which ingredients are valid
#   and thus, which will be classified as subtypes of others
json_data = requests.get("https://github.com/IsaacFigNewton/SandoMatch/raw/refs/heads/main/sandwich-dataset/db-entry-requirements/valid_ingredient_types.json").json()

# create lists of equal length for dataframe columns
max_len = max(len(v) for v in json_data.values())

processed_data = {
    k: v + [np.nan] * (max_len - len(v))
    for k, v in json_data.items()
}

valid_ingredients = pd.DataFrame(processed_data)

In [4]:
valid_ingredients[:50]

Unnamed: 0,breads,meats,cheeses,vegetables,condiments,spices
0,baguette,beef,cheddar,lettuce,mayonnaise,salt
1,ciabatta,chicken,mozzarella,tomato,mustard,black pepper
2,pita,turkey,swiss,onion,oil,cumin
3,naan,pork,provolone,pepper,butter,paprika
4,tortilla,ham,brie,spinach,ketchup,coriander
5,roll,bacon,gorgonzola,cucumber,relish,oregano
6,sourdough,sausage,feta,basil,vinegar,basil
7,rye,bratwurst,parmesan,sprouts,dressing,thyme
8,multigrain,salami,gouda,avocado,salsa,rosemary
9,whole wheat,prosciutto,ricotta,kale,sauce,cayenne pepper


# Read JSON into Pandas Tables

In [5]:
sandwiches_table = pd.read_json(f"{restaurant_path}_menu.json")
ingredients_table = pd.read_csv(f"{restaurant_path}_ingredients.csv")

## Clean the ingredients table

In [6]:
ingredients_table['category'] = ingredients_table['category']\
                                                              .str\
                                                              .lower()
ingredients_table['ingredient'] = ingredients_table['ingredient']\
                                                              .str\
                                                              .lower()

In [7]:
try:
  ingredients_table['cost'] = ingredients_table['cost'].astype(float)
except ValueError as e:
  print(f"Bad cost value: {e}")
  ingredients_table['cost'] = np.nan
try:
  ingredients_table['calories'] = ingredients_table['calories'].astype(int)
except ValueError as e:
  print(f"Bad calorie value: {e}")
  ingredients_table['calories'] = np.nan

Bad calorie value: cannot convert float NaN to integer


In [8]:
ingredients_table.fillna(0, inplace=True)
ingredients_table["ingredient"] = ingredients_table["ingredient"].apply(lambda x: x.replace(" ", "_"))

In [9]:
ingredients_table.head()

Unnamed: 0,category,ingredient,cost,calories
0,meats,black_forest_ham,3.75,0.0
1,meats,honey-smoked_ham,3.75,0.0
2,meats,steak,3.75,0.0
3,meats,turkey_breast,3.75,0.0
4,meats,chicken_breast,3.75,0.0


## Process ingredients table

In [10]:
ingredients_table_json = {
  "breads": dict(),
  "meats": dict(),
  "vegetables": dict(),
  "cheeses": dict(),
  "condiments": dict(),
  "spices": dict()
}

for _, row in ingredients_table.iterrows():
    category = row['category']\
                .lower()\
                .replace('_', ' ')
    ingredient = row['ingredient']\
                .lower()\
                .replace('_', ' ')

    # print(f"Category: {category}\
    #       \nIngredient: {ingredient}")

    # if the ingredient doesn't exist
    if ingredient not in ingredients_table_json[category].keys():
        ingredients_table_json[category][ingredient] = dict()

    if 'cost' not in ingredients_table_json[category][ingredient].keys():
        ingredients_table_json[category][ingredient]['cost'] = row["cost"]

    # defaults to null if no calorie count provided
    if 'calories' not in ingredients_table_json[category][ingredient].keys():
        ingredients_table_json[category][ingredient]['calories'] = row["calories"]

In [11]:
ingredients_table = pd\
                      .DataFrame({originating_restaurant: ingredients_table_json})\
                      .transpose()\
                      .reset_index()\
                      .rename(columns={"index": "_id"})
ingredients_table.head()

Unnamed: 0,_id,breads,meats,vegetables,cheeses,condiments,spices
0,subway,"{'white': {'cost': 4.0, 'calories': 0.0}, '9-g...","{'black forest ham': {'cost': 3.75, 'calories'...","{'avocado': {'cost': 0.0, 'calories': 0.0}, 'l...","{'smoked cheddar': {'cost': 3.75, 'calories': ...","{'caesar dressing': {'cost': 0.0, 'calories': ...",{}


## Clean sandwich/menu table

In [12]:
# # only use the default ingredients list, to make ingredient set more generalizable
# ingredients_table = pd\
#                       .read_json("https://github.com/IsaacFigNewton/SandoMatch/raw/refs/heads/main/sandwich-dataset/db-tables/restaurant_ingredients.json")
# ingredients_table = ingredients_table.loc[ingredients_table["_id"] == "default"].drop(columns=["_id"])
# ingredients_table.head()

In [13]:
# basically reset the indices
sandwiches_table.drop(columns=["id"], inplace=True)

# reset the index
ingredients_table.set_index("_id", inplace=True)

In [14]:
sandwiches_table.head()

Unnamed: 0,name,ingredients,cuisine,cost,calories
0,Elite Chicken & Bacon Ranch,"[Chicken Breast, Bacon, Ranch Dressing, Cheese]",,15.28,580
1,The Philly,"[Steak, Cheese]",American,13.78,505
2,Subway Club,"[Turkey Breast, Black Forest Ham, Roast Beef]",,14.78,500
3,All-American Club,"[Turkey Breast, Black Forest Ham, Bacon]",American,14.28,540
4,The Hotshot Italiano,"[Genoa Salami, Pepperoni, Ham]",Italian,12.28,630


In [15]:
ingredients_table.head()

Unnamed: 0_level_0,breads,meats,vegetables,cheeses,condiments,spices
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
subway,"{'white': {'cost': 4.0, 'calories': 0.0}, '9-g...","{'black forest ham': {'cost': 3.75, 'calories'...","{'avocado': {'cost': 0.0, 'calories': 0.0}, 'l...","{'smoked cheddar': {'cost': 3.75, 'calories': ...","{'caesar dressing': {'cost': 0.0, 'calories': ...",{}


# Helper Functions

In [16]:
ingredients_table.head()

Unnamed: 0_level_0,breads,meats,vegetables,cheeses,condiments,spices
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
subway,"{'white': {'cost': 4.0, 'calories': 0.0}, '9-g...","{'black forest ham': {'cost': 3.75, 'calories'...","{'avocado': {'cost': 0.0, 'calories': 0.0}, 'l...","{'smoked cheddar': {'cost': 3.75, 'calories': ...","{'caesar dressing': {'cost': 0.0, 'calories': ...",{}


In [29]:
def add_ingredient_to_entry(entry,
                            ingredient,
                            ingredient_category,
                            exclusion_set1="",
                            exclusion_set2="",
                            exclusion_words=set()):
    # print(f"Ingredient Category: {ingredient_category}")
    for ingred_type in valid_ingredients[ingredient_category]:

      # print(f"\t\tIngredient Type: {ingred_type}")
      if not ingred_type is np.nan and ingred_type in ingredient:
          # print(f"\t\tIngredient Type Found in Ingredient: {ingredient}")

          # Build exclusion set
          exclusion_set = exclusion_words
          if exclusion_set1 and exclusion_set2:
              exclusion_set = exclusion_set.union(set(ingredients_table[exclusion_set1].keys()))
              exclusion_set = exclusion_set.union(set(ingredients_table[exclusion_set2].keys()))

          print(f"Exclusion Set: {exclusion_set}")
          # Check for exclusions
          if not any(ex_word in ingredient for ex_word in exclusion_set):

              if ingred_type not in entry["ingredients"][ingredient_category]:
                  entry["ingredients"][ingredient_category][ingred_type] = []

              entry["ingredients"][ingredient_category][ingred_type].append(ingredient)
              break

In [18]:
def categorize_ingredients(ingredients_dict):
    entry = {
        "ingredients": {
            "breads": dict(),
            "meats": dict(),
            "cheeses": dict(),
            "vegetables": dict(),
            "condiments": dict(),
            "spices": dict()
        },
        "cost": 0,
        "calories": 0,
        "rating": random.randint(1, 5),
        "dietary_tags": [],
        # store reviews as list of {rating, text} objects or maybe object ref ids?
        "reviews": []
    }

    # for each ingredient in a sandwich's recipe
    for ingredient in ingredients_dict:
        # Remove filter words
        pattern = r"\b(" + "|".join(map(re.escape, filter_words)) + r")\b"
        ingredient = re.sub(pattern, "", ingredient.lower()).strip()

        # Simple plural handling
        if ingredient.endswith('s'):
            ingredient = ingredient[:-1]

        print(f"Ingredient: '{ingredient}'")

        add_ingredient_to_entry(entry, ingredient, "breads")
        add_ingredient_to_entry(entry, ingredient, "meats")
        add_ingredient_to_entry(entry, ingredient, "cheeses")
        add_ingredient_to_entry(entry, ingredient, "vegetables",
                                exclusion_set1="condiments",
                                exclusion_set2="spices",
                                exclusion_words={"oil", "pepper jack", "pepperoni"})
        add_ingredient_to_entry(entry, ingredient, "condiments")
        add_ingredient_to_entry(entry, ingredient, "spices")

    # Compute cost and calories
    for category in entry["ingredients"]:
        print(f"Category: {category}")
        for ingredient_type in entry["ingredients"][category]:
            print(f"\tIngredient Type: {ingredient_type}")
            lookup_key = ingredient_type
            print(f"\tIngredient being searched: '{lookup_key}'")

            try:
                ingredient_data = ingredients_table[category][lookup_key]
                print(ingredient_data)

                # Estimate the sandwich's total cost
                entry["cost"] += ingredient_data["cost"]
                # Estimate the sandwich's total calorie count
                entry["calories"] += ingredient_data["calories"]

            except Exception as e:
                print(f"Exception: {e}")
                continue

    # Round the cost and calories to the nearest hundredth
    entry["cost"] = round(entry["cost"], 2)
    entry["calories"] = round(entry["calories"], 2)


    # Add dietary tags based on ingredients
    # If the base is a veggie patty, egg, or vegetable
    sandwich_bases = set(entry["ingredients"]["meats"].keys())
    if (not sandwich_bases.difference(vegan_bases)\
        or not sandwich_bases) and not entry["ingredients"]["cheeses"]:
        entry["dietary_tags"].append("vegan")
    elif not sandwich_bases.difference(vegetarian_bases)\
        or not sandwich_bases:
        entry["dietary_tags"].append("vegetarian")

    print(f"Total Cost: {entry['cost']}")
    print(f"Total Calories: {entry['calories']}")
    print(f"Rating: {entry['rating']}")
    print(entry)
    return entry

#Main

In [19]:
def get_sando_name(sandwich, formatted_entry):
  name = ""
  # if a cuisine type was provided, prefix the name with it
  if sandwich.get("cuisine"):
      name = sandwich["cuisine"] + " "

  ingredients = formatted_entry["ingredients"]
  # if a name isn't provided, name it after the first meat in the list
  if len(ingredients["meats"]) > 0:
      name += (list(ingredients["meats"].keys()))[0]
  # or the first cheese
  elif len(ingredients["cheeses"]) > 0:
      name += (list(ingredients["cheeses"].keys()))[0]
  # or the first vegetable
  elif len(ingredients["vegetables"]) > 0:
      name += (list(ingredients["vegetables"].keys()))[0]

  return name

In [20]:
def format_entry(sandwich):
  formatted_entry = {
      "id_": i,
      "cuisine": sandwich.get("cuisine"),
      "restaurant": None
  }

  # If it's indicated that the sandwich came from a specific restaurant
  if originating_restaurant:
      print(f"Restaurant: {originating_restaurant}")
      formatted_entry["restaurant"] = originating_restaurant

  # If the sandwich entry comes with a sandwich name
  if sandwich.get("name"):
      print(f"Name: {sandwich['name']}")
      formatted_entry["name"] = sandwich["name"]


  formatted_entry.update(categorize_ingredients(sandwich["ingredients"]))

  # if a name was not provided for the sandwich
  if not sandwich.get("name"):
      name = get_sando_name(sandwich, formatted_entry)

      # set the name
      formatted_entry["name"] = (name + " sandwich")\
                                  .replace("_", " ")\
                                  .title()

  # If the sandwich entry comes with a cost, overwrite the estimate
  #   do this after getting the estimate so that future regression models
  #   can be more easily applied to this data
  if sandwich.get("cost"):
      print(f"Cost: {sandwich['cost']}")
      formatted_entry["cost"] = sandwich["cost"]
  if sandwich.get("calories"):
      print(f"Calories: {sandwich['calories']}")
      formatted_entry["calories"] = sandwich["calories"]

  return formatted_entry

In [30]:
# Reformat every sandwich entry
formatted_sandwiches = []
i = 0
for sandwich_name, sandwich in sandwiches_table.iterrows():
    print("Parsing sandwich...")
    print(f"ID: {sandwich_name}")
    print(f"Cuisine: {sandwich.get('cuisine')}")

    formatted_sandwiches.append(format_entry(sandwich))
    print("\n\n\n")

    i += 1

Parsing sandwich...
ID: 0
Cuisine: None
Restaurant: subway
Name: Elite Chicken & Bacon Ranch
Ingredient: 'chicken breast'
Exclusion Set: set()
Ingredient: 'bacon'
Exclusion Set: set()
Ingredient: 'ranch dressing'
Exclusion Set: set()
Ingredient: 'cheese'
Category: breads
Category: meats
	Ingredient Type: chicken
	Ingredient being searched: 'chicken'
Exception: 'chicken'
	Ingredient Type: bacon
	Ingredient being searched: 'bacon'
Exception: 'bacon'
Category: cheeses
Category: vegetables
Category: condiments
	Ingredient Type: dressing
	Ingredient being searched: 'dressing'
Exception: 'dressing'
Category: spices
Total Cost: 0
Total Calories: 0
Rating: 2
{'ingredients': {'breads': {}, 'meats': {'chicken': ['chicken breast'], 'bacon': ['bacon']}, 'cheeses': {}, 'vegetables': {}, 'condiments': {'dressing': ['ranch dressing']}, 'spices': {}}, 'cost': 0, 'calories': 0, 'rating': 2, 'dietary_tags': [], 'reviews': []}
Cost: 15.28
Calories: 580




Parsing sandwich...
ID: 1
Cuisine: American
Rest

In [31]:
formatted_sandwiches

[{'id_': 0,
  'cuisine': None,
  'restaurant': 'subway',
  'name': 'Elite Chicken & Bacon Ranch',
  'ingredients': {'breads': {},
   'meats': {'chicken': ['chicken breast'], 'bacon': ['bacon']},
   'cheeses': {},
   'vegetables': {},
   'condiments': {'dressing': ['ranch dressing']},
   'spices': {}},
  'cost': 15.28,
  'calories': 580,
  'rating': 2,
  'dietary_tags': [],
  'reviews': []},
 {'id_': 1,
  'cuisine': 'American',
  'restaurant': 'subway',
  'name': 'The Philly',
  'ingredients': {'breads': {},
   'meats': {},
   'cheeses': {},
   'vegetables': {},
   'condiments': {},
   'spices': {}},
  'cost': 13.78,
  'calories': 505,
  'rating': 1,
  'dietary_tags': ['vegan'],
  'reviews': []},
 {'id_': 2,
  'cuisine': None,
  'restaurant': 'subway',
  'name': 'Subway Club',
  'ingredients': {'breads': {},
   'meats': {'turkey': ['turkey breast'],
    'ham': ['black forest ham'],
    'beef': ['roast beef']},
   'cheeses': {},
   'vegetables': {},
   'condiments': {},
   'spices': {}},

# Download restructured files

In [32]:
def download_restructured_data(json_data, file_label="menu"):
  output_file = f'restructured_{originating_restaurant}_{file_label}.json'
  with open(output_file, 'w') as outfile:
      json.dump(json_data, outfile, indent=4)

  # Download the reformatted JSON file
  files.download(output_file)

In [33]:
download_restructured_data(ingredients_table_json, file_label="ingredients")
download_restructured_data(formatted_sandwiches, file_label="menu")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>