In [28]:
import pandas as pd
import json
import requests
from google.colab import files

# Step 1: Fetch JSON data from GitHub URL
url = "https://raw.githubusercontent.com/IsaacFigNewton/SandoMatch/refs/heads/main/sandwich-dataset/generated_sandwiches.json"
# requests applies no timeout by default; set one so a stalled connection
# cannot hang the cell indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error (e.g. 404) instead of trying to parse an
# error page as JSON on the next line.
response.raise_for_status()
sandwiches_data = response.json()

In [29]:
# Base-ingredient vocabularies, one list per output category.
# categorize_ingredients() tests each entry as a substring of the raw
# ingredient text and stops at the first hit within a list, so the order of
# entries is significant and must be preserved.

# Bread / carrier names.
breads = [
    "baguette", "ciabatta", "pita", "naan", "tortilla", "roll",
    "sourdough", "rye", "multigrain", "whole wheat", "white bread", "brioche",
    "focaccia", "panini", "english muffin", "bagel", "croissant", "milk bread",
    "flatbread", "pumpernickel", "pretzel roll", "potato bread", "bolillo", "torta",
    "lavash", "cornbread", "chapati", "paratha", "wrap", "bun",
    "steamed bun",
]

# Meat and seafood names.
meats = [
    "beef", "chicken", "turkey", "pork", "ham", "bacon",
    "sausage", "bratwurst", "salami", "prosciutto", "pepperoni", "pastrami",
    "mortadella", "lamb", "duck", "roast beef", "veal", "venison",
    "chorizo", "capicola", "bresaola", "pancetta", "smoked fish", "tuna",
    "shrimp", "crab", "lobster", "meatballs", "pulled pork", "carnitas",
    "corned beef", "brisket", "gyro", "char siu", "shawarma", "jerky",
]

# Vegetable and fresh-produce names.
vegetables = [
    "lettuce", "tomato", "onion", "pepper", "spinach", "cucumber",
    "basil", "sprouts", "avocado", "kale", "zucchini", "carrot",
    "radish", "celery", "mushrooms", "pickle", "scallion", "watercress",
    "eggplant", "beet", "fennel", "artichoke", "garlic", "olive",
    "sun-dried tomato", "kimchi", "cabbage", "greens", "herbs",
]

# Condiment, spread and sauce names.
condiments = [
    "mayonnaise", "mustard", "oil", "butter", "ketchup", "relish",
    "vinegar", "dressing", "salsa", "sauce", "pesto", "chimichurri",
    "aioli", "tahini", "honey", "horseradish", "hummus", "guacamole",
    "tartar sauce", "yogurt sauce", "dijon", "marinara", "glaze", "chutney",
]

# Cheese names.
cheeses = [
    "cheddar", "mozzarella", "swiss", "provolone", "brie", "gorgonzola",
    "feta", "parmesan", "gouda", "ricotta", "blue cheese", "goat cheese",
    "monterey jack", "cream cheese", "american cheese", "colby", "pepper jack", "gruyere",
    "camembert", "asiago", "romano", "havarti", "burrata", "stracciatella",
    "queso fresco", "cotija", "paneer", "halloumi",
]

# Spice, dried-herb and seasoning names.
spices = [
    "salt", "black pepper", "cumin", "paprika", "coriander", "oregano",
    "basil", "thyme", "rosemary", "cayenne pepper", "chili powder", "turmeric",
    "garlic powder", "onion powder", "parsley", "dill", "cinnamon", "nutmeg",
    "allspice", "cloves", "cardamom", "bay leaf", "saffron", "sage",
    "mustard seed", "fenugreek", "anise", "five-spice", "curry powder", "creole seasoning",
    "italian seasoning", "za'atar", "sumac",
]

In [30]:
def get_ingredient_subtypes(ingredient):
    """Return the qualifier words that precede the last word of an ingredient.

    The ingredient text is split on whitespace and every token except the
    final one is returned, e.g. ``"smoked turkey"`` -> ``["smoked"]``.

    Args:
        ingredient: Ingredient description as a string.

    Returns:
        A list of the leading tokens; empty when the text has zero or one
        token.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace. split(" ") would instead emit empty-string
    # tokens for doubled spaces and treat a trailing space as the "last word".
    ingredient_tags = ingredient.split()
    return ingredient_tags[:-1]

In [31]:
# Define categories for organizing ingredients
def categorize_ingredients(ingredients_list):
    """Sort a sandwich's ingredient strings into per-category lookup tables.

    Each category table starts with every known base ingredient mapped to an
    empty list. For every ingredient string, each category's vocabulary is
    scanned in order and the first base name found as a substring of the
    ingredient is treated as the match; that base's entry is set to the
    result of ``get_ingredient_subtypes(ingredient)``.

    Notes:
        * Matching is case-sensitive substring containment.
        * Categories are checked independently, so one ingredient string can
          be recorded under several categories.
        * If two ingredients match the same base, the later one replaces the
          earlier one's subtypes.

    Args:
        ingredients_list: Iterable of ingredient description strings.

    Returns:
        dict mapping ``"bread"``, ``"meats"``, ``"cheeses"``, ``"vegetables"``,
        ``"condiments"`` and ``"spices"`` to a dict of
        ``{base ingredient: list of subtype words}``.
    """
    # (output key, vocabulary) pairs. Driving both the table construction and
    # the matching from one sequence keeps the output key and the vocabulary
    # in lock-step; previously meat matches were written into the
    # "vegetables" table by mistake.
    category_vocabularies = (
        ("bread", breads),
        ("meats", meats),
        ("cheeses", cheeses),
        ("vegetables", vegetables),
        ("condiments", condiments),
        ("spices", spices),
    )

    categorized = {
        category: {base: [] for base in vocabulary}
        for category, vocabulary in category_vocabularies
    }

    for ingredient in ingredients_list:
        for category, vocabulary in category_vocabularies:
            # Only the first vocabulary entry contained in the ingredient
            # counts for this category.
            matched_base = next(
                (base for base in vocabulary if base in ingredient), None
            )
            if matched_base is not None:
                categorized[category][matched_base] = get_ingredient_subtypes(ingredient)

    return categorized

In [32]:
# Build the output records: each keeps the sandwich's id and cuisine and
# replaces the flat ingredient list with its categorized breakdown.
formatted_sandwiches = []
for sandwich in sandwiches_data:
    formatted_sandwiches.append({
        "id": sandwich["id"],
        "ingredients": categorize_ingredients(sandwich["ingredients"]),
        "cuisine": sandwich.get("cuisine"),  # None when the key is absent
    })

# Serialize the records to a local JSON file.
output_file = 'formatted_sandwich_entries.json'
with open(output_file, 'w') as json_out:
    json.dump(formatted_sandwiches, json_out, indent=4)

# Trigger a download of the written file through Colab's `files` helper.
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>