In [None]:
import os
import requests
import tarfile
import json

# URLs from the dataset email.
# NOTE(review): these are plain-HTTP links, so the payload is not integrity-protected in transit.
url_map = {
    "recipe1M_layers.tar.gz": "http://wednesday.csail.mit.edu/temporal/release/recipe1M_layers.tar.gz",
    "det_ingrs.json": "http://wednesday.csail.mit.edu/temporal/release/det_ingrs.json",
    "recipes_with_nutritional_info.json": "http://wednesday.csail.mit.edu/temporal/release/recipes_with_nutritional_info.json",
}

# Setup directories
download_dir = "recipe1m_downloads"
extract_dir = os.path.join(download_dir, "extracted")
os.makedirs(download_dir, exist_ok=True)
os.makedirs(extract_dir, exist_ok=True)


def download_file(url, dest_path, chunk_size=1024 * 1024, timeout=60):
    """Stream ``url`` to ``dest_path`` and return ``dest_path``.

    The helper was called below without ever being defined, so the cell
    failed with ``NameError`` on a fresh kernel.

    Args:
        url: Address to fetch with an HTTP GET.
        dest_path: Local file path to write.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        ``dest_path``, whether the file was downloaded or already present.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Re-running the cell should not re-fetch files that are already complete.
    if os.path.exists(dest_path):
        print(f"⏭️ Skipping {dest_path} (already downloaded)")
        return dest_path

    print(f"⬇️ Downloading {url}...")
    # Write to a temporary name first so an interrupted transfer never leaves
    # a truncated file at dest_path (which the skip check above would trust).
    partial_path = dest_path + ".part"
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(partial_path, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip keep-alive chunks
                    out_file.write(chunk)
    os.replace(partial_path, dest_path)
    print(f"✅ Saved to {dest_path}")
    return dest_path


# Download all files
for filename, url in url_map.items():
    dest_path = os.path.join(download_dir, filename)
    download_file(url, dest_path)

# Extract tar.gz file
tar_path = os.path.join(download_dir, "recipe1M_layers.tar.gz")
print(f"\n🗂️ Extracting {tar_path}...")
with tarfile.open(tar_path, "r:gz") as tar:
    # The archive comes from the network: where the interpreter supports
    # extraction filters, refuse members that would escape extract_dir.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path=extract_dir, filter="data")
    else:
        tar.extractall(path=extract_dir)
print(f"✅ Extracted to {extract_dir}")

In [2]:
import pandas as pd
import json
# -- df_layer1: one row per recipe in layer1.json --
def _texts(items):
    """Return the "text" field of every item, in order."""
    return [item["text"] for item in items]


def _layer1_record(recipe):
    """Flatten one layer1 recipe dict into a single DataFrame row."""
    return {
        "id": recipe["id"],
        "title": recipe["title"],
        "partition": recipe["partition"],
        "url": recipe.get("url"),
        # Ingredient lines are kept as a list of strings.
        "ingredients_raw": _texts(recipe.get("ingredients", [])),
        # Instruction steps are collapsed into one space-separated string.
        "instructions_full": " ".join(_texts(recipe.get("instructions", []))),
    }


with open("recipe1m_downloads/extracted/layer1.json", "r", encoding="utf-8") as f:
    layer1_data = json.load(f)

df_layer1 = pd.DataFrame(list(map(_layer1_record, layer1_data)))
df_layer1.head(1)

Unnamed: 0,id,title,partition,url,ingredients_raw,instructions_full
0,000018c8a5,Worlds Best Mac and Cheese,train,http://www.epicurious.com/recipes/food/views/-...,"[6 ounces penne, 2 cups Beechers Flagship Chee...",Preheat the oven to 350 F. Butter or oil an 8-...


In [3]:
# -- df_det_ingrs: one row per recipe in det_ingrs.json --
def _valid_ingredient_texts(recipe):
    """Return the text of each ingredient whose paired "valid" flag is truthy."""
    kept = []
    for ingredient, is_valid in zip(recipe["ingredients"], recipe["valid"]):
        if is_valid:
            kept.append(ingredient["text"])
    return kept


with open("recipe1m_downloads/det_ingrs.json", "r", encoding="utf-8") as f:
    det_ingrs_data = json.load(f)

det_ingrs_rows = []
for recipe in det_ingrs_data:
    det_ingrs_rows.append({
        "id": recipe["id"],
        "ingredients_clean": _valid_ingredient_texts(recipe),
    })

df_det_ingrs = pd.DataFrame(det_ingrs_rows)
df_det_ingrs.head(1)

Unnamed: 0,id,ingredients_clean
0,000018c8a5,"[penne, cheese sauce, cheddar cheese, gruyere ..."


In [4]:
import json
import pandas as pd

def parse_full_nutrition_entry(entry):
    """Flatten one recipe-with-nutrition entry into a row-shaped dict.

    ``entry["ingredients"]`` drives the number of structured ingredient
    records. The parallel lists (``quantity``, ``unit``, ``weight_per_ingr``,
    ``nutr_per_ingredient``) are read position by position; when one of them
    is shorter than the ingredient list, missing entirely, or ``None``, the
    corresponding field is ``None`` instead of raising.

    Args:
        entry: Mapping for a single recipe. Must contain ``"ingredients"``
            (a list of dicts with a ``"text"`` key); every other key is
            optional.

    Returns:
        Dict with keys ``id``, ``title``, ``partition``, ``url``,
        ``instructions`` (step texts joined by single spaces),
        ``ingredients_structured`` (list of per-ingredient dicts),
        ``nutrition_per_100g`` and ``fsa_lights_per_100g``.

    Raises:
        KeyError: If ``entry`` has no ``"ingredients"`` key.
    """
    ingredients = entry["ingredients"]
    # Absent or None parallel lists behave like empty ones, matching the
    # existing padding of lists that are merely too short.
    quantities = entry.get("quantity") or []
    units = entry.get("unit") or []
    weights = entry.get("weight_per_ingr") or []
    nutrition = entry.get("nutr_per_ingredient") or []

    def _item(values, index):
        """Return values[index], or None when the list is too short."""
        return values[index] if index < len(values) else None

    def _text(item):
        """Return item["text"], or None when there is no item."""
        return item["text"] if item is not None else None

    ingredients_structured = [
        {
            "ingredient": ingredient["text"],
            "quantity": _text(_item(quantities, i)),
            "unit": _text(_item(units, i)),
            "weight_g": _item(weights, i),
            "nutrition": _item(nutrition, i),
        }
        for i, ingredient in enumerate(ingredients)
    ]

    return {
        "id": entry.get("id"),
        "title": entry.get("title"),
        "partition": entry.get("partition"),
        "url": entry.get("url"),
        "instructions": " ".join(step["text"] for step in (entry.get("instructions") or [])),
        "ingredients_structured": ingredients_structured,
        "nutrition_per_100g": entry.get("nutr_values_per100g", {}),
        "fsa_lights_per_100g": entry.get("fsa_lights_per100g", {}),
    }

# Read the full list of recipes that carry nutritional information.
with open("recipe1m_downloads/recipes_with_nutritional_info.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Flatten every entry (not just the first) into a row-shaped dict.
parsed = list(map(parse_full_nutrition_entry, data))

# One DataFrame row per parsed recipe.
df_nutrition_full = pd.DataFrame(parsed)

# Lift pandas' display limits so nested columns are shown untruncated.
# This is a global setting and affects every later cell's output as well.
for display_option in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)
df_nutrition_full.head(1)

Unnamed: 0,id,title,partition,url,instructions,ingredients_structured,nutrition_per_100g,fsa_lights_per_100g
0,000095fc1d,Yogurt Parfaits,train,http://tastykitchen.com/recipes/breakfastbrunch/yogurt-parfaits/,Layer all ingredients in a serving dish.,"[{'ingredient': 'yogurt, greek, plain, nonfat', 'quantity': '8', 'unit': 'ounce', 'weight_g': 226.796, 'nutrition': {'fat': 0.8845044000000001, 'nrg': 133.80964, 'pro': 23.110512399999998, 'sat': 0.26535132, 'sod': 81.64656, 'sug': 7.348190400000001}}, {'ingredient': 'strawberries, raw', 'quantity': '1', 'unit': 'cup', 'weight_g': 152.0, 'nutrition': {'fat': 0.46, 'nrg': 49.0, 'pro': 1.02, 'sat': 0.023, 'sod': 2.0, 'sug': 7.43}}, {'ingredient': 'cereals ready-to-eat, granola, homemade', 'quantity': '1/4', 'unit': 'cup', 'weight_g': 30.5, 'nutrition': {'fat': 7.415, 'nrg': 149.25, 'pro': 4.17, 'sat': 1.207, 'sod': 8.0, 'sug': 6.04}}]","{'energy': 81.12946131894766, 'fat': 2.140139263515891, 'protein': 6.914436593565536, 'salt': 0.05597816738985967, 'saturates': 0.36534716195613937, 'sugars': 5.08634103436144}","{'fat': 'green', 'salt': 'green', 'saturates': 'green', 'sugars': 'orange'}"


In [5]:
# Step 1: keep only recipes present in both layer1 and det_ingrs (inner join on id).
df_combined = df_layer1.merge(df_det_ingrs, on="id", how="inner")

# Step 2: inner-join the nutrition frame; its overlapping column names
# (title, partition, url) receive the "_nutrition" suffix.
df_final = df_combined.merge(
    df_nutrition_full,
    on="id",
    how="inner",
    suffixes=("", "_nutrition"),
)
df_final.head(1)

Unnamed: 0,id,title,partition,url,ingredients_raw,instructions_full,ingredients_clean,title_nutrition,partition_nutrition,url_nutrition,instructions,ingredients_structured,nutrition_per_100g,fsa_lights_per_100g
0,000095fc1d,Yogurt Parfaits,train,http://tastykitchen.com/recipes/breakfastbrunch/yogurt-parfaits/,"[8 ounces, weight Light Fat Free Vanilla Yogurt (I Used Activia), 1 cup Fresh Sliced Strawberries, 1/4 cups Low-fat Granola]",Layer all ingredients in a serving dish.,"[non - fat vanilla yogurt, strawberries, low - fat granola]",Yogurt Parfaits,train,http://tastykitchen.com/recipes/breakfastbrunch/yogurt-parfaits/,Layer all ingredients in a serving dish.,"[{'ingredient': 'yogurt, greek, plain, nonfat', 'quantity': '8', 'unit': 'ounce', 'weight_g': 226.796, 'nutrition': {'fat': 0.8845044000000001, 'nrg': 133.80964, 'pro': 23.110512399999998, 'sat': 0.26535132, 'sod': 81.64656, 'sug': 7.348190400000001}}, {'ingredient': 'strawberries, raw', 'quantity': '1', 'unit': 'cup', 'weight_g': 152.0, 'nutrition': {'fat': 0.46, 'nrg': 49.0, 'pro': 1.02, 'sat': 0.023, 'sod': 2.0, 'sug': 7.43}}, {'ingredient': 'cereals ready-to-eat, granola, homemade', 'quantity': '1/4', 'unit': 'cup', 'weight_g': 30.5, 'nutrition': {'fat': 7.415, 'nrg': 149.25, 'pro': 4.17, 'sat': 1.207, 'sod': 8.0, 'sug': 6.04}}]","{'energy': 81.12946131894766, 'fat': 2.140139263515891, 'protein': 6.914436593565536, 'salt': 0.05597816738985967, 'saturates': 0.36534716195613937, 'sugars': 5.08634103436144}","{'fat': 'green', 'salt': 'green', 'saturates': 'green', 'sugars': 'orange'}"


In [6]:
# Number of recipes that survived both inner joins.
df_final.shape[0]

51235

In [9]:
# Columns removed from the merged frame; a new frame is returned and
# df_final itself is left unchanged.
drop_columns = ["partition", "url", "partition_nutrition"]
df_final_filtered = df_final.drop(columns=drop_columns)
df_final_filtered.head(1)

Unnamed: 0,id,title,ingredients_raw,instructions_full,ingredients_clean,title_nutrition,url_nutrition,instructions,ingredients_structured,nutrition_per_100g,fsa_lights_per_100g
0,000095fc1d,Yogurt Parfaits,"[8 ounces, weight Light Fat Free Vanilla Yogurt (I Used Activia), 1 cup Fresh Sliced Strawberries, 1/4 cups Low-fat Granola]",Layer all ingredients in a serving dish.,"[non - fat vanilla yogurt, strawberries, low - fat granola]",Yogurt Parfaits,http://tastykitchen.com/recipes/breakfastbrunch/yogurt-parfaits/,Layer all ingredients in a serving dish.,"[{'ingredient': 'yogurt, greek, plain, nonfat', 'quantity': '8', 'unit': 'ounce', 'weight_g': 226.796, 'nutrition': {'fat': 0.8845044000000001, 'nrg': 133.80964, 'pro': 23.110512399999998, 'sat': 0.26535132, 'sod': 81.64656, 'sug': 7.348190400000001}}, {'ingredient': 'strawberries, raw', 'quantity': '1', 'unit': 'cup', 'weight_g': 152.0, 'nutrition': {'fat': 0.46, 'nrg': 49.0, 'pro': 1.02, 'sat': 0.023, 'sod': 2.0, 'sug': 7.43}}, {'ingredient': 'cereals ready-to-eat, granola, homemade', 'quantity': '1/4', 'unit': 'cup', 'weight_g': 30.5, 'nutrition': {'fat': 7.415, 'nrg': 149.25, 'pro': 4.17, 'sat': 1.207, 'sod': 8.0, 'sug': 6.04}}]","{'energy': 81.12946131894766, 'fat': 2.140139263515891, 'protein': 6.914436593565536, 'salt': 0.05597816738985967, 'saturates': 0.36534716195613937, 'sugars': 5.08634103436144}","{'fat': 'green', 'salt': 'green', 'saturates': 'green', 'sugars': 'orange'}"
