In [1]:
import os
import requests
import json
import gzip
import tarfile

In [9]:
# Function to download files
def download_file(url, dest_path, timeout=30):
    """Download ``url`` to ``dest_path`` unless ``dest_path`` already exists.

    The response body is streamed into a sibling ``<dest_path>.part`` file and
    only renamed to ``dest_path`` after the transfer has finished. An
    interrupted download therefore never leaves a truncated ``dest_path``
    behind that a later run would mistake for a finished file and skip.

    Args:
        url: Address of the file to fetch.
        dest_path: Local path where the finished download is stored.
        timeout: Seconds ``requests`` waits for the server before giving up
            (default 30). Without it a stalled connection can hang forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the file cannot be written or renamed.
    """
    if os.path.exists(dest_path):
        print(f"{dest_path} already exists. Skipping download.")
        return

    print(f"Downloading {url}...")
    part_path = f"{dest_path}.part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(part_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Same-directory rename: dest_path appears only once it is complete.
        os.replace(part_path, dest_path)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt) so no partial file
        # survives; the original error is re-raised unchanged.
        if os.path.exists(part_path):
            os.remove(part_path)
        raise
    print(f"Saved to {dest_path}")

# Function to preview first n JSON entries
def preview_json(file_path, n=3):
    """Print the first ``n`` items of the JSON array stored in ``file_path``.

    The whole file is parsed into memory first; every selected item is then
    pretty-printed with a two-space indent under a numbered header.
    """
    print(f"\n📄 Previewing first {n} entries from {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Number the entries from 1 for display.
    for position, record in enumerate(records[:n], start=1):
        print(f"\n--- Entry {position} ---")
        print(json.dumps(record, indent=2))

In [None]:
import os
import requests
import tarfile
import json

# URLs from the dataset email
# NOTE: these are plain-HTTP links, so the downloaded content is not
# authenticated in transit and is treated as untrusted below.
url_map = {
    "recipe1M_layers.tar.gz": "http://wednesday.csail.mit.edu/temporal/release/recipe1M_layers.tar.gz",
    "det_ingrs.json": "http://wednesday.csail.mit.edu/temporal/release/det_ingrs.json",
    "recipes_with_nutritional_info.json": "http://wednesday.csail.mit.edu/temporal/release/recipes_with_nutritional_info.json",
}

# Setup directories
download_dir = "recipe1m_downloads"
extract_dir = os.path.join(download_dir, "extracted")
os.makedirs(download_dir, exist_ok=True)
os.makedirs(extract_dir, exist_ok=True)


# Download all files (download_file skips anything already on disk)
for filename, url in url_map.items():
    dest_path = os.path.join(download_dir, filename)
    download_file(url, dest_path)

# Extract tar.gz file
tar_path = os.path.join(download_dir, "recipe1M_layers.tar.gz")
print(f"\n🗂️ Extracting {tar_path}...")
with tarfile.open(tar_path, "r:gz") as tar:
    # The "data" extraction filter refuses members that would land outside
    # extract_dir (absolute paths, "..", links pointing elsewhere). Interpreters
    # without extraction filters have no tarfile.data_filter, so fall back to
    # the plain call there instead of failing on the unknown keyword.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path=extract_dir, filter="data")
    else:
        tar.extractall(path=extract_dir)
print(f"✅ Extracted to {extract_dir}")

# Preview layer1.json (extracted)
layer1_path = os.path.join(extract_dir, "layer1.json")
preview_json(layer1_path)

# Preview det_ingrs.json
det_ingrs_path = os.path.join(download_dir, "det_ingrs.json")
preview_json(det_ingrs_path)

# Preview nutrition info if present
nutrition_path = os.path.join(download_dir, "recipes_with_nutritional_info.json")
if os.path.exists(nutrition_path):
    preview_json(nutrition_path)
else:
    print("\n⚠️ Nutrition file not found. Skipping preview.")


In [14]:
import json
import pandas as pd

layer1_path = "recipe1m_downloads/extracted/layer1.json"

# Parse the complete layer1 file so the number of recipes can be reported.
with open(layer1_path, encoding="utf-8") as f:
    data = json.load(f)

recipe_count = len(data)
print(f"Total number of recipes: {recipe_count}")

def load_and_flatten_layer1(path, n=None):
    """
    Load and flatten the Recipe1M layer1 JSON structure into a pandas DataFrame.

    Args:
        path: Path to a JSON file holding a list of recipe objects. Each
            recipe may carry "id", "title", "partition", "url", plus
            "ingredients" and "instructions" lists of ``{"text": ...}`` items.
        n: If given, only the first ``n`` recipes are flattened. ``n=0`` yields
            an empty frame; ``None`` (default) flattens every recipe.

    Returns:
        DataFrame with columns id, title, partition, url, ingredients
        (texts joined with ", ") and instructions (texts joined with " ").
        The columns are present even when no recipe is selected.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Compare against None instead of testing truthiness: `if n:` treated
    # n=0 as "no limit" and flattened the whole file.
    if n is not None:
        data = data[:n]

    # Flatten entries
    flattened = [
        {
            "id": entry.get("id"),
            "title": entry.get("title"),
            "partition": entry.get("partition"),
            "url": entry.get("url"),
            "ingredients": ", ".join(
                ing["text"] for ing in entry.get("ingredients", [])
            ),
            "instructions": " ".join(
                step["text"] for step in entry.get("instructions", [])
            ),
        }
        for entry in data
    ]

    # Explicit columns keep the schema stable for an empty selection.
    return pd.DataFrame(
        flattened,
        columns=["id", "title", "partition", "url", "ingredients", "instructions"],
    )

# Flatten just the first ten recipes; that is plenty for a quick look.
layer1_path = "recipe1m_downloads/extracted/layer1.json"
df = load_and_flatten_layer1(path=layer1_path, n=10)

# Lift the column-width cap so long ingredient/instruction text is not cut off.
pd.options.display.max_colwidth = None
df.head(n=3)

Total number of recipes: 1029720


Unnamed: 0,id,title,partition,url,ingredients,instructions
0,000018c8a5,Worlds Best Mac and Cheese,train,http://www.epicurious.com/recipes/food/views/-world-s-best-mac-and-cheese-387747,"6 ounces penne, 2 cups Beechers Flagship Cheese Sauce (recipe follows), 1 ounce Cheddar, grated (1/4 cup), 1 ounce Gruyere cheese, grated (1/4 cup), 1/4 to 1/2 teaspoon chipotle chili powder (see Note), 1/4 cup (1/2 stick) unsalted butter, 1/3 cup all-purpose flour, 3 cups milk, 14 ounces semihard cheese (page 23), grated (about 3 1/2 cups), 2 ounces semisoft cheese (page 23), grated (1/2 cup), 1/2 teaspoon kosher salt, 1/4 to 1/2 teaspoon chipotle chili powder, 1/8 teaspoon garlic powder, (makes about 4 cups)","Preheat the oven to 350 F. Butter or oil an 8-inch baking dish. Cook the penne 2 minutes less than package directions. (It will finish cooking in the oven.) Rinse the pasta in cold water and set aside. Combine the cooked pasta and the sauce in a medium bowl and mix carefully but thoroughly. Scrape the pasta into the prepared baking dish. Sprinkle the top with the cheeses and then the chili powder. Bake, uncovered, for 20 minutes. Let the mac and cheese sit for 5 minutes before serving. Melt the butter in a heavy-bottomed saucepan over medium heat and whisk in the flour. Continue whisking and cooking for 2 minutes. Slowly add the milk, whisking constantly. Cook until the sauce thickens, about 10 minutes, stirring frequently. Remove from the heat. Add the cheeses, salt, chili powder, and garlic powder. Stir until the cheese is melted and all ingredients are incorporated, about 3 minutes. Use immediately, or refrigerate for up to 3 days. This sauce reheats nicely on the stove in a saucepan over low heat. Stir frequently so the sauce doesnt scorch. This recipe can be assembled before baking and frozen for up to 3 monthsjust be sure to use a freezer-to-oven pan and increase the baking time to 50 minutes. One-half teaspoon of chipotle chili powder makes a spicy mac, so make sure your family and friends can handle it! 
The proportion of pasta to cheese sauce is crucial to the success of the dish. It will look like a lot of sauce for the pasta, but some of the liquid will be absorbed."
1,000033e39b,Dilly Macaroni Salad Recipe,train,http://cookeatshare.com/recipes/dilly-macaroni-salad-49166,"1 c. elbow macaroni, 1 c. cubed American cheese (4 ounce.), 1/2 c. sliced celery, 1/2 c. minced green pepper, 3 tbsp. minced pimento, 1/2 c. mayonnaise or possibly salad dressing, 1 tbsp. vinegar, 3/4 teaspoon salt, 1/2 teaspoon dry dill weed","Cook macaroni according to package directions; drain well. Cold. Combine macaroni, cheese cubes, celery, green pepper and pimento. Blend together mayonnaise or possibly salad dressing, vinegar, salt and dill weed; add in to macaroni mix. Toss lightly. Cover and refrigeratewell. Serve salad in lettuce lined bowl if you like. Makes 6 servings."
2,000035f7ed,Gazpacho,train,http://www.foodnetwork.com/recipes/gazpacho1.html,"8 tomatoes, quartered, Kosher salt, 1 red onion, cut into small dice, 1 green bell pepper, cut into small dice, 1 red bell pepper, cut into small dice, 1 yellow bell pepper, cut into small dice, 1/2 cucumber, cut into small dice, Extra-virgin olive oil, for drizzling, 3 leaves fresh basil, finely chopped","Add the tomatoes to a food processor with a pinch of salt and puree until smooth. Combine the onions, bell peppers and cucumbers with the tomato puree in a large bowl. Chill at least 1 hour. Drizzle with olive oil, garnish with chopped basil and serve."


In [10]:
# Preview layer2.json (extracted); per the output below, each entry pairs a
# recipe id with a list of image ids and URLs.
# Relies on `extract_dir` and `preview_json` defined in earlier cells.
layer2_path = os.path.join(extract_dir, "layer2.json")
preview_json(layer2_path)


📄 Previewing first 3 entries from recipe1m_downloads/extracted/layer2.json...

--- Entry 1 ---
{
  "id": "00003a70b1",
  "images": [
    {
      "id": "3e233001e2.jpg",
      "url": "http://img.sndimg.com/food/image/upload/w_512,h_512,c_fit,fl_progressive,q_95/v1/img/recipes/47/91/49/picaYYmb9.jpg"
    },
    {
      "id": "7f749987f9.jpg",
      "url": "http://img.sndimg.com/food/image/upload/w_512,h_512,c_fit,fl_progressive,q_95/v1/img/recipes/47/91/49/picpy37SW.jpg"
    },
    {
      "id": "aaf6b2dcd3.jpg",
      "url": "http://img.sndimg.com/food/image/upload/w_512,h_512,c_fit,fl_progressive,q_95/v1/img/recipes/47/91/49/picX9CNE2.jpg"
    }
  ]
}

--- Entry 2 ---
{
  "id": "000075604a",
  "images": [
    {
      "id": "6bdca6e490.jpg",
      "url": "https://img-global.cpcdn.com/001_recipes/5806945844854784/0x0/photo.jpg"
    }
  ]
}

--- Entry 3 ---
{
  "id": "00007bfd16",
  "images": [
    {
      "id": "6409eab844.jpg",
      "url": "http://img.sndimg.com/food/image/upload/w_51

In [13]:
# Preview det_ingrs.json: print only the first entry (n=1).
# Relies on `download_dir` and `preview_json` defined in earlier cells.
det_ingrs_path = os.path.join(download_dir, "det_ingrs.json")
preview_json(det_ingrs_path, n=1)


📄 Previewing first 1 entries from recipe1m_downloads/det_ingrs.json...

--- Entry 1 ---
{
  "valid": [
    true,
    true,
    true,
    true,
    true,
    true,
    true,
    true,
    false,
    false,
    true,
    true,
    true,
    false
  ],
  "id": "000018c8a5",
  "ingredients": [
    {
      "text": "penne"
    },
    {
      "text": "cheese sauce"
    },
    {
      "text": "cheddar cheese"
    },
    {
      "text": "gruyere cheese"
    },
    {
      "text": "dried chipotle powder"
    },
    {
      "text": "unsalted butter"
    },
    {
      "text": "all - purpose flour"
    },
    {
      "text": "milk"
    },
    {
      "text": "14 ounces semihard cheese (page 23), grated (about 3 1/2 cups)"
    },
    {
      "text": "2 ounces semisoft cheese (page 23), grated (1/2 cup)"
    },
    {
      "text": "kosher salt"
    },
    {
      "text": "dried chipotle powder"
    },
    {
      "text": "garlic powder"
    },
    {
      "text": "(makes about 4 cups)"
    }
  ]
}


In [12]:
def load_clean_ingredients(file_path, n=None):
    """Map recipe ids to their valid ingredient texts from a det_ingrs-style file.

    Args:
        file_path: Path to a JSON file holding a list of objects, each with an
            "id", an "ingredients" list of ``{"text": ...}`` items and a
            parallel "valid" list of booleans.
        n: If given, only the first ``n`` entries are processed. ``n=0`` yields
            an empty dict; ``None`` (default) processes every entry.

    Returns:
        dict mapping each recipe id to the list of ingredient texts whose
        matching "valid" flag is truthy, in their original order.

    Raises:
        KeyError: If an entry lacks "id", "ingredients" or "valid".
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Compare against None instead of testing truthiness: `if n:` treated
    # n=0 as "no limit" and processed the whole file.
    if n is not None:
        data = data[:n]

    return {
        entry["id"]: [
            ing["text"]
            for ing, is_valid in zip(entry["ingredients"], entry["valid"])
            if is_valid
        ]
        for entry in data
    }

# Demo: clean the first three recipes and list what survived the validity filter.
det_ingrs_path = "recipe1m_downloads/det_ingrs.json"
clean_ingredients = load_clean_ingredients(file_path=det_ingrs_path, n=3)

for rid in clean_ingredients:
    ing_list = clean_ingredients[rid]
    print(f"\n🧾 Recipe ID: {rid}")
    print("✅ Cleaned Ingredients:")
    # One bullet per retained ingredient
    for ing in ing_list:
        print(f" - {ing}")


🧾 Recipe ID: 000018c8a5
✅ Cleaned Ingredients:
 - penne
 - cheese sauce
 - cheddar cheese
 - gruyere cheese
 - dried chipotle powder
 - unsalted butter
 - all - purpose flour
 - milk
 - kosher salt
 - dried chipotle powder
 - garlic powder

🧾 Recipe ID: 000033e39b
✅ Cleaned Ingredients:
 - elbow macaroni
 - American cheese
 - celery
 - green peppers
 - pimentos
 - mayonnaise
 - vinegar
 - salt
 - dry dill weed

🧾 Recipe ID: 000035f7ed
✅ Cleaned Ingredients:
 - tomatoes
 - kosher salt
 - red onion
 - green bell pepper
 - red bell pepper
 - yellow bell pepper
 - cucumber
 - olive oil
 - fresh basil


In [15]:
# Show the nutrition file's first entries, or say so when it was not downloaded.
nutrition_path = os.path.join(download_dir, "recipes_with_nutritional_info.json")
if not os.path.exists(nutrition_path):
    print("\n⚠️ Nutrition file not found. Skipping preview.")
else:
    preview_json(nutrition_path)


📄 Previewing first 3 entries from recipe1m_downloads/recipes_with_nutritional_info.json...

--- Entry 1 ---
{
  "fsa_lights_per100g": {
    "fat": "green",
    "salt": "green",
    "saturates": "green",
    "sugars": "orange"
  },
  "id": "000095fc1d",
  "ingredients": [
    {
      "text": "yogurt, greek, plain, nonfat"
    },
    {
      "text": "strawberries, raw"
    },
    {
      "text": "cereals ready-to-eat, granola, homemade"
    }
  ],
  "instructions": [
    {
      "text": "Layer all ingredients in a serving dish."
    }
  ],
  "nutr_per_ingredient": [
    {
      "fat": 0.8845044000000001,
      "nrg": 133.80964,
      "pro": 23.110512399999998,
      "sat": 0.26535132,
      "sod": 81.64656,
      "sug": 7.348190400000001
    },
    {
      "fat": 0.46,
      "nrg": 49.0,
      "pro": 1.02,
      "sat": 0.023,
      "sod": 2.0,
      "sug": 7.43
    },
    {
      "fat": 7.415,
      "nrg": 149.25,
      "pro": 4.17,
      "sat": 1.207,
      "sod": 8.0,
      "sug": 6.0

In [16]:
def parse_nutrition_entry(entry):
    """Reshape one nutrition-file recipe into a dict with per-ingredient records.

    The parallel "quantity", "unit", "weight_per_ingr" and
    "nutr_per_ingredient" arrays are zipped onto "ingredients" by position;
    a slot becomes ``None`` when its array is shorter than the ingredient list.

    Raises:
        KeyError: If any of the keys read from ``entry`` is missing.
    """
    names = entry["ingredients"]
    amounts = entry["quantity"]
    measures = entry["unit"]
    grams = entry["weight_per_ingr"]
    per_ingredient = entry["nutr_per_ingredient"]

    # One record per ingredient, aligned by index with the parallel arrays.
    combined_ingredients = [
        {
            "ingredient": name["text"],
            "quantity": amounts[pos]["text"] if pos < len(amounts) else None,
            "unit": measures[pos]["text"] if pos < len(measures) else None,
            "weight_g": grams[pos] if pos < len(grams) else None,
            "nutrition": per_ingredient[pos] if pos < len(per_ingredient) else None,
        }
        for pos, name in enumerate(names)
    ]

    return {
        "id": entry["id"],
        "title": entry["title"],
        "partition": entry["partition"],
        "ingredients": combined_ingredients,
        "nutrition_per_100g": entry["nutr_values_per100g"],
        "fsa_lights": entry["fsa_lights_per100g"],
        "instructions": " ".join(step["text"] for step in entry["instructions"]),
        "url": entry["url"],
    }

# Demo: restructure the first recipe of the nutrition file and pretty-print it.
import json
import pprint

with open("recipe1m_downloads/recipes_with_nutritional_info.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Only the first entry is transformed here.
first_entry = data[0]
clean_entry = parse_nutrition_entry(first_entry)

pprint.pprint(clean_entry)

{'fsa_lights': {'fat': 'green',
                'salt': 'green',
                'saturates': 'green',
                'sugars': 'orange'},
 'id': '000095fc1d',
 'ingredients': [{'ingredient': 'yogurt, greek, plain, nonfat',
                  'nutrition': {'fat': 0.8845044000000001,
                                'nrg': 133.80964,
                                'pro': 23.110512399999998,
                                'sat': 0.26535132,
                                'sod': 81.64656,
                                'sug': 7.348190400000001},
                  'quantity': '8',
                  'unit': 'ounce',
                  'weight_g': 226.796},
                 {'ingredient': 'strawberries, raw',
                  'nutrition': {'fat': 0.46,
                                'nrg': 49.0,
                                'pro': 1.02,
                                'sat': 0.023,
                                'sod': 2.0,
                                'sug': 7.43},
              

In [21]:
import json
import pandas as pd

def parse_full_nutrition_entry(entry):
    """Reshape one nutrition-file recipe into a flat record for a DataFrame row.

    Args:
        entry: dict for one recipe. "ingredients", "quantity" and "unit" are
            read as lists of ``{"text": ...}`` items; "weight_per_ingr" and
            "nutr_per_ingredient" as lists aligned with "ingredients" by index.

    Returns:
        dict with keys id, title, partition, url, instructions (step texts
        joined with " "), ingredients_structured (one dict per ingredient with
        ingredient / quantity / unit / weight_g / nutrition),
        nutrition_per_100g and fsa_lights_per_100g. Missing top-level fields
        fall back to ``None`` / ``""`` / ``{}``; a per-ingredient value is
        ``None`` when its array is missing or shorter than the ingredient list.
    """
    def column(key):
        # A missing (or null) array is treated like an empty one, matching the
        # .get()-with-default handling of every other field in this function.
        return entry.get(key) or []

    def text_at(items, index):
        return items[index]["text"] if index < len(items) else None

    def value_at(items, index):
        return items[index] if index < len(items) else None

    quantities = column("quantity")
    units = column("unit")
    weights = column("weight_per_ingr")
    nutrition = column("nutr_per_ingredient")

    ingredients_structured = [
        {
            "ingredient": ingredient["text"],
            "quantity": text_at(quantities, i),
            "unit": text_at(units, i),
            "weight_g": value_at(weights, i),
            "nutrition": value_at(nutrition, i),
        }
        for i, ingredient in enumerate(column("ingredients"))
    ]

    return {
        "id": entry.get("id"),
        "title": entry.get("title"),
        "partition": entry.get("partition"),
        "url": entry.get("url"),
        "instructions": " ".join(step["text"] for step in entry.get("instructions", [])),
        "ingredients_structured": ingredients_structured,
        "nutrition_per_100g": entry.get("nutr_values_per100g", {}),
        "fsa_lights_per_100g": entry.get("fsa_lights_per100g", {})
    }

# Read every recipe of the nutrition subset into memory
with open("recipe1m_downloads/recipes_with_nutritional_info.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Restructure all entries (the whole file, not just the first one)
parsed = list(map(parse_full_nutrition_entry, data))

# One row per recipe
df_nutrition_full = pd.DataFrame(parsed)

# Remove pandas' display limits so the nested columns are shown in full
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
df_nutrition_full.head(n=1)

Unnamed: 0,id,title,partition,url,instructions,ingredients_structured,nutrition_per_100g,fsa_lights_per_100g
0,000095fc1d,Yogurt Parfaits,train,http://tastykitchen.com/recipes/breakfastbrunch/yogurt-parfaits/,Layer all ingredients in a serving dish.,"[{'ingredient': 'yogurt, greek, plain, nonfat', 'quantity': '8', 'unit': 'ounce', 'weight_g': 226.796, 'nutrition': {'fat': 0.8845044000000001, 'nrg': 133.80964, 'pro': 23.110512399999998, 'sat': 0.26535132, 'sod': 81.64656, 'sug': 7.348190400000001}}, {'ingredient': 'strawberries, raw', 'quantity': '1', 'unit': 'cup', 'weight_g': 152.0, 'nutrition': {'fat': 0.46, 'nrg': 49.0, 'pro': 1.02, 'sat': 0.023, 'sod': 2.0, 'sug': 7.43}}, {'ingredient': 'cereals ready-to-eat, granola, homemade', 'quantity': '1/4', 'unit': 'cup', 'weight_g': 30.5, 'nutrition': {'fat': 7.415, 'nrg': 149.25, 'pro': 4.17, 'sat': 1.207, 'sod': 8.0, 'sug': 6.04}}]","{'energy': 81.12946131894766, 'fat': 2.140139263515891, 'protein': 6.914436593565536, 'salt': 0.05597816738985967, 'saturates': 0.36534716195613937, 'sugars': 5.08634103436144}","{'fat': 'green', 'salt': 'green', 'saturates': 'green', 'sugars': 'orange'}"


In [22]:
import json
import pandas as pd

layer1_path = "recipe1m_downloads/extracted/layer1.json"

# Parse the complete layer1 file so the number of recipes can be reported.
with open(layer1_path, encoding="utf-8") as f:
    data = json.load(f)

recipe_count = len(data)
print(f"Total number of recipes: {recipe_count}")

def load_and_flatten_layer1(path, n=None):
    """
    Load and flatten the Recipe1M layer1 JSON structure into a pandas DataFrame.

    Args:
        path: Path to a JSON file holding a list of recipe objects. Each
            recipe may carry "id", "title", "partition", "url", plus
            "ingredients" and "instructions" lists of ``{"text": ...}`` items.
        n: If given, only the first ``n`` recipes are flattened. ``n=0`` yields
            an empty frame; ``None`` (default) flattens every recipe.

    Returns:
        DataFrame with columns id, title, partition, url, ingredients
        (texts joined with ", ") and instructions (texts joined with " ").
        The columns are present even when no recipe is selected.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Compare against None instead of testing truthiness: `if n:` treated
    # n=0 as "no limit" and flattened the whole file.
    if n is not None:
        data = data[:n]

    # Flatten entries
    flattened = [
        {
            "id": entry.get("id"),
            "title": entry.get("title"),
            "partition": entry.get("partition"),
            "url": entry.get("url"),
            "ingredients": ", ".join(
                ing["text"] for ing in entry.get("ingredients", [])
            ),
            "instructions": " ".join(
                step["text"] for step in entry.get("instructions", [])
            ),
        }
        for entry in data
    ]

    # Explicit columns keep the schema stable for an empty selection.
    return pd.DataFrame(
        flattened,
        columns=["id", "title", "partition", "url", "ingredients", "instructions"],
    )

# Load the DataFrame (first ten recipes only)
layer1_path = "recipe1m_downloads/extracted/layer1.json"
df = load_and_flatten_layer1(layer1_path, n=10)

# Display options: show every column and never truncate long text.
# Each option is set exactly once.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
df.head(1)

Total number of recipes: 1029720


Unnamed: 0,id,title,partition,url,ingredients,instructions
0,000018c8a5,Worlds Best Mac and Cheese,train,http://www.epicurious.com/recipes/food/views/-world-s-best-mac-and-cheese-387747,"6 ounces penne, 2 cups Beechers Flagship Cheese Sauce (recipe follows), 1 ounce Cheddar, grated (1/4 cup), 1 ounce Gruyere cheese, grated (1/4 cup), 1/4 to 1/2 teaspoon chipotle chili powder (see Note), 1/4 cup (1/2 stick) unsalted butter, 1/3 cup all-purpose flour, 3 cups milk, 14 ounces semihard cheese (page 23), grated (about 3 1/2 cups), 2 ounces semisoft cheese (page 23), grated (1/2 cup), 1/2 teaspoon kosher salt, 1/4 to 1/2 teaspoon chipotle chili powder, 1/8 teaspoon garlic powder, (makes about 4 cups)","Preheat the oven to 350 F. Butter or oil an 8-inch baking dish. Cook the penne 2 minutes less than package directions. (It will finish cooking in the oven.) Rinse the pasta in cold water and set aside. Combine the cooked pasta and the sauce in a medium bowl and mix carefully but thoroughly. Scrape the pasta into the prepared baking dish. Sprinkle the top with the cheeses and then the chili powder. Bake, uncovered, for 20 minutes. Let the mac and cheese sit for 5 minutes before serving. Melt the butter in a heavy-bottomed saucepan over medium heat and whisk in the flour. Continue whisking and cooking for 2 minutes. Slowly add the milk, whisking constantly. Cook until the sauce thickens, about 10 minutes, stirring frequently. Remove from the heat. Add the cheeses, salt, chili powder, and garlic powder. Stir until the cheese is melted and all ingredients are incorporated, about 3 minutes. Use immediately, or refrigerate for up to 3 days. This sauce reheats nicely on the stove in a saucepan over low heat. Stir frequently so the sauce doesnt scorch. This recipe can be assembled before baking and frozen for up to 3 monthsjust be sure to use a freezer-to-oven pan and increase the baking time to 50 minutes. One-half teaspoon of chipotle chili powder makes a spicy mac, so make sure your family and friends can handle it! 
The proportion of pasta to cheese sauce is crucial to the success of the dish. It will look like a lot of sauce for the pasta, but some of the liquid will be absorbed."


In [None]:
import json
import pandas as pd

# === Load JSON files ===
def _read_json(json_path):
    """Parse the UTF-8 JSON file at ``json_path`` and return its content."""
    with open(json_path, "r", encoding="utf-8") as handle:
        return json.load(handle)


layer1_data = _read_json("recipe1m_downloads/extracted/layer1.json")
det_ingrs_data = _read_json("recipe1m_downloads/det_ingrs.json")
nutrition_data = _read_json("recipe1m_downloads/recipes_with_nutritional_info.json")

# === Convert to dicts for fast lookup ===
# recipe id -> ingredient texts whose "valid" flag is truthy
det_ingrs_dict = {
    r["id"]: [
        ing["text"]
        for ing, valid in zip(r["ingredients"], r["valid"])
        if valid
    ]
    for r in det_ingrs_data
}


def _nutrition_summary(r):
    """Pick the nutrition fields that are merged into the combined table."""
    return {
        "ingredients_normalized": [ing["text"] for ing in r["ingredients"]],
        # "<quantity> <unit>" strings, paired by position
        "ingredient_units": [
            f'{q["text"]} {u["text"]}' for q, u in zip(r["quantity"], r["unit"])
        ],
        "nutrition_per_100g": r.get("nutr_values_per100g", {}),
        "fsa_lights": r.get("fsa_lights_per100g", {}),
    }


nutrition_dict = {r["id"]: _nutrition_summary(r) for r in nutrition_data}


# === Merge all three ===
def _merged_row(entry):
    """Combine one layer1 recipe with its det_ingrs and nutrition lookups."""
    rid = entry["id"]
    # Empty dict when the recipe has no nutrition record, so each field below
    # falls back to its own empty default.
    nutrition = nutrition_dict.get(rid, {})
    return {
        "id": rid,
        "title": entry.get("title"),
        "partition": entry.get("partition"),
        "url": entry.get("url"),
        "ingredients_raw": [i["text"] for i in entry.get("ingredients", [])],
        "ingredients_clean": det_ingrs_dict.get(rid, []),
        "ingredients_normalized": nutrition.get("ingredients_normalized", []),
        "ingredient_units": nutrition.get("ingredient_units", []),
        "instructions": " ".join(s["text"] for s in entry.get("instructions", [])),
        "nutrition_per_100g": nutrition.get("nutrition_per_100g", {}),
        "fsa_lights": nutrition.get("fsa_lights", {}),
    }


rows = [_merged_row(entry) for entry in layer1_data]

# === Create DataFrame ===
df = pd.DataFrame(rows)

# Preview
pd.options.display.max_colwidth = None
df.head(n=1)


In [19]:
# Keep recipes that have at least one cleaned ingredient AND at least one
# normalized (nutrition-file) ingredient; both columns hold lists.
has_clean = df["ingredients_clean"].str.len() > 0
has_normalized = df["ingredients_normalized"].str.len() > 0
df_valid = df[has_clean & has_normalized]

In [None]:
# Show the first recipe that has both cleaned and normalized ingredients.
df_valid.head(1)

In [26]:
# -- df_layer1: one row per layer1.json recipe --
with open("recipe1m_downloads/extracted/layer1.json", "r", encoding="utf-8") as f:
    layer1_data = json.load(f)


def _layer1_record(r):
    """Flatten one layer1 recipe; ingredients stay a list, steps become one string."""
    return dict(
        id=r["id"],
        title=r["title"],
        partition=r["partition"],
        url=r.get("url"),
        ingredients_raw=[i["text"] for i in r.get("ingredients", [])],
        instructions_full=" ".join(s["text"] for s in r.get("instructions", [])),
    )


df_layer1 = pd.DataFrame([_layer1_record(r) for r in layer1_data])

# -- df_det_ingrs: one row per det_ingrs.json recipe, valid ingredients only --
with open("recipe1m_downloads/det_ingrs.json", "r", encoding="utf-8") as f:
    det_ingrs_data = json.load(f)


def _det_ingrs_record(r):
    """Keep the id plus the ingredient texts whose "valid" flag is truthy."""
    kept = [ing["text"] for ing, valid in zip(r["ingredients"], r["valid"]) if valid]
    return dict(id=r["id"], ingredients_clean=kept)


df_det_ingrs = pd.DataFrame([_det_ingrs_record(r) for r in det_ingrs_data])

# -- df_nutrition_full: must already exist (built in an earlier cell) --

# Step 1: outer-join layer1 with det_ingrs on the recipe id
df_combined = df_layer1.merge(df_det_ingrs, on="id", how="outer")

# Step 2: outer-join the nutrition frame; only column names present on both
# sides (title, partition, url) receive the "_nutrition" suffix.
df_final = df_combined.merge(
    df_nutrition_full, on="id", how="outer", suffixes=("", "_nutrition")
)

# NOTE: the layer1 text lives in "instructions_full" and the nutrition text in
# "instructions" (no suffix, since the names do not collide). Coalescing them
# would therefore read df_final["instructions_full"].fillna(df_final["instructions"]).

# Display settings for previews of df_final
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None


In [31]:
# Keep every row that has at least one of the four content fields populated.
# NOTE(review): this is an OR, whereas df_valid combines its conditions with
# AND — confirm that "any field present" is the intended rule here.
has_raw = df_final["ingredients_raw"].notna()
has_clean = df_final["ingredients_clean"].notna()
has_structured = df_final["ingredients_structured"].notna()
has_instructions = df_final["instructions"].notna()

keep_row = has_raw | has_clean | has_structured | has_instructions
df_final_filtered = df_final[keep_row]


In [32]:
# Show the first row of the filtered layer1 + det_ingrs + nutrition merge.
df_final_filtered.head(1)

Unnamed: 0,id,title,partition,url,ingredients_raw,instructions_full,ingredients_clean,title_nutrition,partition_nutrition,url_nutrition,instructions,ingredients_structured,nutrition_per_100g,fsa_lights_per_100g
0,000018c8a5,Worlds Best Mac and Cheese,train,http://www.epicurious.com/recipes/food/views/-world-s-best-mac-and-cheese-387747,"[6 ounces penne, 2 cups Beechers Flagship Cheese Sauce (recipe follows), 1 ounce Cheddar, grated (1/4 cup), 1 ounce Gruyere cheese, grated (1/4 cup), 1/4 to 1/2 teaspoon chipotle chili powder (see Note), 1/4 cup (1/2 stick) unsalted butter, 1/3 cup all-purpose flour, 3 cups milk, 14 ounces semihard cheese (page 23), grated (about 3 1/2 cups), 2 ounces semisoft cheese (page 23), grated (1/2 cup), 1/2 teaspoon kosher salt, 1/4 to 1/2 teaspoon chipotle chili powder, 1/8 teaspoon garlic powder, (makes about 4 cups)]","Preheat the oven to 350 F. Butter or oil an 8-inch baking dish. Cook the penne 2 minutes less than package directions. (It will finish cooking in the oven.) Rinse the pasta in cold water and set aside. Combine the cooked pasta and the sauce in a medium bowl and mix carefully but thoroughly. Scrape the pasta into the prepared baking dish. Sprinkle the top with the cheeses and then the chili powder. Bake, uncovered, for 20 minutes. Let the mac and cheese sit for 5 minutes before serving. Melt the butter in a heavy-bottomed saucepan over medium heat and whisk in the flour. Continue whisking and cooking for 2 minutes. Slowly add the milk, whisking constantly. Cook until the sauce thickens, about 10 minutes, stirring frequently. Remove from the heat. Add the cheeses, salt, chili powder, and garlic powder. Stir until the cheese is melted and all ingredients are incorporated, about 3 minutes. Use immediately, or refrigerate for up to 3 days. This sauce reheats nicely on the stove in a saucepan over low heat. Stir frequently so the sauce doesnt scorch. This recipe can be assembled before baking and frozen for up to 3 monthsjust be sure to use a freezer-to-oven pan and increase the baking time to 50 minutes. One-half teaspoon of chipotle chili powder makes a spicy mac, so make sure your family and friends can handle it! 
The proportion of pasta to cheese sauce is crucial to the success of the dish. It will look like a lot of sauce for the pasta, but some of the liquid will be absorbed.","[penne, cheese sauce, cheddar cheese, gruyere cheese, dried chipotle powder, unsalted butter, all - purpose flour, milk, kosher salt, dried chipotle powder, garlic powder]",,,,,,,
