In [1]:
import requests
import pandas as pd

# ----- STEP 1: Nutrition lookup table (you can expand this) -----
NUTRITION_TABLE = {
    'chicken breast': {'calories': 165, 'fat': 3.6, 'protein': 31, 'fiber': 0},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'fiber': 2.1},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'fiber': 0.4},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'fiber': 0},
    'pepper': {'calories': 251, 'fat': 3.3, 'protein': 10.4, 'fiber': 25.3},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'fiber': 1.2},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'fiber': 2.8}
    # Add more ingredients as needed
}

# ----- STEP 2: API access functions -----
def get_meals_by_category(category='Chicken'):
    url = f"https://www.themealdb.com/api/json/v1/1/filter.php?c={category}"
    response = requests.get(url)
    return response.json().get('meals', [])

def get_meal_details(meal_id):
    url = f"https://www.themealdb.com/api/json/v1/1/lookup.php?i={meal_id}"
    response = requests.get(url)
    meals = response.json().get('meals', [])
    return meals[0] if meals else {}

# ----- STEP 3: Ingredient extraction -----
def extract_ingredients(meal):
    ingredients = {}
    for i in range(1, 21):
        ing = meal.get(f"strIngredient{i}")
        meas = meal.get(f"strMeasure{i}")
        if ing and ing.strip():
            ingredients[ing.strip().lower()] = meas.strip() if meas else ''
    return ingredients

# ----- STEP 4: Nutrition estimation -----
def estimate_nutrition(ingredients):
    totals = {'calories': 0, 'fat': 0, 'protein': 0, 'fiber': 0}
    for ing in ingredients:
        for known in NUTRITION_TABLE:
            if known in ing:  # fuzzy match
                nut = NUTRITION_TABLE[known]
                for key in totals:
                    totals[key] += nut.get(key, 0)
    return totals

# ----- STEP 5: Main scraping + saving -----
def scrape_meals(category='Chicken', limit=5):
    meals = get_meals_by_category(category)
    meal_data = []

    for meal in meals[:limit]:
        details = get_meal_details(meal['idMeal'])
        ingredients = extract_ingredients(details)
        nutrition = estimate_nutrition(ingredients.keys())

        data = {
            'MealID': meal['idMeal'],
            'Meal': details.get('strMeal'),
            'Category': details.get('strCategory'),
            'Area': details.get('strArea'),
            'Instructions': details.get('strInstructions'),
            'Tags': details.get('strTags'),
            'YouTube': details.get('strYoutube'),
            'Ingredients': ingredients,
            'Calories': nutrition['calories'],
            'Fat': nutrition['fat'],
            'Protein': nutrition['protein'],
            'Fiber': nutrition['fiber'],
        }
        meal_data.append(data)

    return pd.DataFrame(meal_data)

# ----- STEP 6: Run and save -----
if __name__ == "__main__":
    df_recipes = scrape_meals(category='Chicken', limit=5)
    df_recipes.to_csv("themealdb_recipes.csv", index=False)
    print("✅ Scraped and saved recipes with estimated nutrition to 'themealdb_recipes.csv'")


✅ Scraped and saved recipes with estimated nutrition to 'themealdb_recipes.csv'


In [3]:
import requests
import pandas as pd
import time

# Nutrition lookup table (simplified per 100g; customize/expand as needed)
NUTRITION_TABLE = {
    'chicken':     {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice':        {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion':       {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic':      {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato':      {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk':        {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter':      {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'carrot':      {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8},
    'egg':         {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt':        {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
}

def get_all_meal_ids():
    meal_ids = set()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        url = f'https://www.themealdb.com/api/json/v1/1/search.php?f={letter}'
        res = requests.get(url)
        meals = res.json().get('meals', [])
        for meal in meals:
            meal_ids.add(meal['idMeal'])
        time.sleep(0.5)  # gentle delay
    return list(meal_ids)

def get_meal_details(meal_id):
    url = f"https://www.themealdb.com/api/json/v1/1/lookup.php?i={meal_id}"
    response = requests.get(url)
    meals = response.json().get('meals', [])
    return meals[0] if meals else None

def extract_ingredients(meal):
    ingredients = []
    for i in range(1, 21):
        ing = meal.get(f"strIngredient{i}")
        if ing and ing.strip():
            ingredients.append(ing.strip().lower())
    return ingredients

def estimate_nutrition(ingredients):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    for ing in ingredients:
        for known in NUTRITION_TABLE:
            if known in ing:
                for key in total:
                    total[key] += NUTRITION_TABLE[known][key]
    return total

def scrape_meals(limit=500):
    all_ids = get_all_meal_ids()
    recipes = []
    for meal_id in all_ids[:limit]:
        meal = get_meal_details(meal_id)
        if meal:
            ingredients = extract_ingredients(meal)
            nutrition = estimate_nutrition(ingredients)
            recipes.append({
                'MealID': meal_id,
                'Meal': meal.get('strMeal'),
                'Category': meal.get('strCategory'),
                'Area': meal.get('strArea'),
                'Instructions': meal.get('strInstructions'),
                'Ingredients': ', '.join(ingredients),
                'Calories': nutrition['calories'],
                'Fat': nutrition['fat'],
                'Protein': nutrition['protein'],
                'Sugar': nutrition['sugar'],
                'Fiber': nutrition['fiber'],
                'Carbohydrates': nutrition['carbs']
            })
        time.sleep(0.2)
    return pd.DataFrame(recipes)

# Run the scraper
df = scrape_meals(limit=500)
df.to_csv("themealdb_500_recipes.csv", index=False)
print("✅ Scraped 500 recipes from TheMealDB and saved with  nutrients.")


TypeError: 'NoneType' object is not iterable

In [4]:
import requests
import pandas as pd
import time

# Simplified nutrient lookup table (approximate per 100g)
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
}

def get_all_meal_ids():
    meal_ids = set()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        url = f'https://www.themealdb.com/api/json/v1/1/search.php?f={letter}'
        res = requests.get(url)
        data = res.json()
        meals = data.get('meals', [])
        if meals:  # ✅ Prevents TypeError when meals is None
            for meal in meals:
                meal_ids.add(meal['idMeal'])
        time.sleep(0.3)
    return list(meal_ids)

def get_meal_details(meal_id):
    url = f"https://www.themealdb.com/api/json/v1/1/lookup.php?i={meal_id}"
    response = requests.get(url)
    meals = response.json().get('meals', [])
    return meals[0] if meals else None

def extract_ingredients(meal):
    ingredients = []
    for i in range(1, 21):
        ing = meal.get(f"strIngredient{i}")
        if ing and ing.strip():
            ingredients.append(ing.strip().lower())
    return ingredients

def estimate_nutrition(ingredients):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    for ing in ingredients:
        for known in NUTRITION_TABLE:
            if known in ing:
                for key in total:
                    total[key] += NUTRITION_TABLE[known][key]
    return total

def scrape_meals(limit=500):
    all_ids = get_all_meal_ids()
    recipes = []
    for meal_id in all_ids[:limit]:
        meal = get_meal_details(meal_id)
        if meal:
            ingredients = extract_ingredients(meal)
            nutrition = estimate_nutrition(ingredients)
            recipes.append({
                'MealID': meal_id,
                'Meal': meal.get('strMeal'),
                'Category': meal.get('strCategory'),
                'Area': meal.get('strArea'),
                'Instructions': meal.get('strInstructions'),
                'Ingredients': ', '.join(ingredients),
                'Calories': nutrition['calories'],
                'Fat': nutrition['fat'],
                'Protein': nutrition['protein'],
                'Sugar': nutrition['sugar'],
                'Fiber': nutrition['fiber'],
                'Carbohydrates': nutrition['carbs']
            })
        time.sleep(0.2)
    return pd.DataFrame(recipes)

# Run the scraper
df = scrape_meals(limit=500)
df.to_csv("themealdb_500_recipes.csv", index=False)
print("✅ Done! Scraped 500 recipes from TheMealDB with estimated nutrition.")


✅ Done! Scraped 500 recipes from TheMealDB with estimated nutrition.


In [16]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Nutritional lookup (approx. per 100g)
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
}

# Headless browser setup for scraping
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--user-data-dir=/tmp/bbc-profile')  # avoids SessionNotCreatedException

driver = webdriver.Chrome(options=options)

def get_bbc_recipe_links(pages=50):
    links = set()
    for page in range(1, pages + 1):
        print(f"🔍 Fetching page {page}")
        url = f"https://www.bbcgoodfood.com/recipes/category/all?page={page}"
        driver.get(url)
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for a in soup.select('h2.heading-4 a'):
            href = a.get('href')
            if href and '/recipes/' in href:
                links.add("https://www.bbcgoodfood.com" + href)
    return list(links)

def extract_ingredients_from_bbc(soup):
    return [li.text.strip().lower() for li in soup.select('.ingredients-section li') if li.text.strip()]

def estimate_nutrition(ingredients):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    for ing in ingredients:
        for known in NUTRITION_TABLE:
            if known in ing:
                for key in total:
                    total[key] += NUTRITION_TABLE[known][key]
    return total

def get_bbc_recipe_details(url):
    try:
        driver.get(url)
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        title = soup.select_one('h1').text.strip()
        ingredients = extract_ingredients_from_bbc(soup)
        method = ' '.join([step.text.strip() for step in soup.select('.method__item')])
        nutrition = estimate_nutrition(ingredients)
        return {
            'Meal': title,
            'URL': url,
            'Ingredients': ', '.join(ingredients),
            'Instructions': method,
            'Calories': nutrition['calories'],
            'Fat': nutrition['fat'],
            'Protein': nutrition['protein'],
            'Sugar': nutrition['sugar'],
            'Fiber': nutrition['fiber'],
            'Carbohydrates': nutrition['carbs']
        }
    except Exception as e:
        print(f"⚠️ Error scraping {url}: {e}")
        return None

def scrape_bbc_meals(limit=500):
    links = get_bbc_recipe_links(pages=50)
    print(f"Found {len(links)} recipes.")
    recipes = []
    for i, url in enumerate(links[:limit]):
        print(f"⏳ Scraping {i+1}/{limit}")
        meal = get_bbc_recipe_details(url)
        if meal:
            recipes.append(meal)
        time.sleep(1)
    return pd.DataFrame(recipes)

# Run scraper
df = scrape_bbc_meals(limit=500)
df.to_csv("bbc_good_food_500_recipes.csv", index=False)
print("✅ Done! Scraped 500 BBC Good Food recipes with estimated nutrition.")

driver.quit()



🔍 Fetching page 1
🔍 Fetching page 2
🔍 Fetching page 3
🔍 Fetching page 4
🔍 Fetching page 5
🔍 Fetching page 6
🔍 Fetching page 7
🔍 Fetching page 8
🔍 Fetching page 9
🔍 Fetching page 10
🔍 Fetching page 11
🔍 Fetching page 12
🔍 Fetching page 13
🔍 Fetching page 14
🔍 Fetching page 15
🔍 Fetching page 16
🔍 Fetching page 17
🔍 Fetching page 18
🔍 Fetching page 19
🔍 Fetching page 20
🔍 Fetching page 21
🔍 Fetching page 22
🔍 Fetching page 23
🔍 Fetching page 24
🔍 Fetching page 25
🔍 Fetching page 26
🔍 Fetching page 27
🔍 Fetching page 28
🔍 Fetching page 29
🔍 Fetching page 30
🔍 Fetching page 31
🔍 Fetching page 32
🔍 Fetching page 33
🔍 Fetching page 34
🔍 Fetching page 35
🔍 Fetching page 36
🔍 Fetching page 37
🔍 Fetching page 38
🔍 Fetching page 39
🔍 Fetching page 40
🔍 Fetching page 41
🔍 Fetching page 42
🔍 Fetching page 43
🔍 Fetching page 44
🔍 Fetching page 45
🔍 Fetching page 46
🔍 Fetching page 47
🔍 Fetching page 48
🔍 Fetching page 49
🔍 Fetching page 50
Found 0 recipes.
✅ Done! Scraped 500 BBC Good Food recipes 

In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Simple nutrition lookup table (expand as needed)
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
}

def get_recipe_links(pages=25):
    links = []
    for i in range(1, pages + 1):
        url = f"https://www.allrecipes.com/recipes/?page={i}"
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'html.parser')
        for a in soup.select('a.card__titleLink'):
            link = a.get('href')
            if link and 'https://www.allrecipes.com/recipe/' in link:
                links.append(link)
        time.sleep(0.5)
    return list(set(links))

def extract_recipe(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')

    name = soup.find('h1', class_='headline').text.strip() if soup.find('h1', class_='headline') else 'N/A'
    ingredients_tags = soup.select('span.ingredients-item-name')
    ingredients = [i.text.strip().lower() for i in ingredients_tags if i.text.strip()]
    instructions_tag = soup.select('ul.instructions-section li p')
    instructions = ' '.join(i.text.strip() for i in instructions_tag)

    nutrition = estimate_nutrition(ingredients)
    return {
        'Meal': name,
        'Ingredients': ', '.join(ingredients),
        'Instructions': instructions,
        'Calories': nutrition['calories'],
        'Fat': nutrition['fat'],
        'Protein': nutrition['protein'],
        'Sugar': nutrition['sugar'],
        'Fiber': nutrition['fiber'],
        'Carbohydrates': nutrition['carbs']
    }

def estimate_nutrition(ingredients):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    for ing in ingredients:
        for known in NUTRITION_TABLE:
            if known in ing:
                for key in total:
                    total[key] += NUTRITION_TABLE[known][key]
    return total

def scrape_allrecipes(limit=500):
    links = get_recipe_links(pages=40)
    data = []
    for i, link in enumerate(links[:limit]):
        try:
            print(f"Scraping ({i+1}/{limit}): {link}")
            recipe = extract_recipe(link)
            data.append(recipe)
            time.sleep(0.4)
        except Exception as e:
            print(f"❌ Error scraping {link}: {e}")
    return pd.DataFrame(data)

# Scrape and Save
df = scrape_allrecipes(limit=500)
df.to_csv("allrecipes_500.csv", index=False)
print("✅ Done! Saved 500 recipes from AllRecipes.com.")


✅ Done! Saved 500 recipes from AllRecipes.com.


In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Nutrition lookup table (can be expanded)
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

def get_recipe_links(pages=20):
    links = set()
    for page in range(1, pages + 1):
        url = f'https://www.allrecipes.com/recipes/?page={page}'
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        for tag in soup.find_all('a', href=True):
            href = tag['href']
            if href.startswith("https://www.allrecipes.com/recipe/") and href.count('/') >= 5:
                links.add(href.split("?")[0])  # remove tracking params
        time.sleep(0.5)
    return list(links)

def extract_recipe(url):
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'html.parser')

    try:
        name = soup.find('h1').text.strip()
    except:
        return None

    ingredients = [i.text.strip().lower() for i in soup.select('span.ingredients-item-name') if i.text.strip()]
    instructions = ' '.join([i.text.strip() for i in soup.select('ul.instructions-section li p')])

    nutrition = estimate_nutrition(ingredients)

    return {
        'Meal': name,
        'Ingredients': ', '.join(ingredients),
        'Instructions': instructions,
        'Calories': nutrition['calories'],
        'Fat': nutrition['fat'],
        'Protein': nutrition['protein'],
        'Sugar': nutrition['sugar'],
        'Fiber': nutrition['fiber'],
        'Carbohydrates': nutrition['carbs']
    }

def estimate_nutrition(ingredients):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    for ing in ingredients:
        for known in NUTRITION_TABLE:
            if known in ing:
                for key in total:
                    total[key] += NUTRITION_TABLE[known][key]
    return total

def scrape_allrecipes(limit=500):
    links = get_recipe_links(pages=30)
    print(f"🔗 Found {len(links)} recipe links")
    data = []
    for i, link in enumerate(links[:limit]):
        print(f"Scraping {i+1}/{limit}: {link}")
        recipe = extract_recipe(link)
        if recipe:
            data.append(recipe)
        time.sleep(0.4)
    return pd.DataFrame(data)

# Run scraper
df = scrape_allrecipes(limit=500)
df.to_csv("allrecipes_500_recipes.csv", index=False)
print("✅ Done! Scraped", len(df), "recipes.")


🔗 Found 0 recipe links
✅ Done! Scraped 0 recipes.


In [19]:
!pip install recipe-scrapers pandas


Collecting recipe-scrapers
  Downloading recipe_scrapers-15.7.1-py3-none-any.whl.metadata (6.2 kB)
Collecting extruct>=0.17.0 (from recipe-scrapers)
  Downloading extruct-0.18.0-py2.py3-none-any.whl.metadata (36 kB)
Collecting isodate>=0.6.1 (from recipe-scrapers)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting lxml-html-clean (from extruct>=0.17.0->recipe-scrapers)
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Collecting rdflib>=6.0.0 (from extruct>=0.17.0->recipe-scrapers)
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Collecting pyrdfa3 (from extruct>=0.17.0->recipe-scrapers)
  Downloading pyRdfa3-3.6.4-py3-none-any.whl.metadata (3.4 kB)
Collecting mf2py (from extruct>=0.17.0->recipe-scrapers)
  Downloading mf2py-2.0.1-py3-none-any.whl.metadata (5.4 kB)
Collecting w3lib (from extruct>=0.17.0->recipe-scrapers)
  Downloading w3lib-2.3.1-py3-none-any.whl.metadata (2.3 kB)
Collecting html-text>=0.5.1 (from extruct>=0.17.0->

In [20]:
import requests
import pandas as pd
import time
from recipe_scrapers import scrape_me

# Simplified nutrient lookup table (approximate per 100g)
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
}

def estimate_nutrition(ingredients):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    for ing in ingredients:
        for known in NUTRITION_TABLE:
            if known in ing.lower():
                for key in total:
                    total[key] += NUTRITION_TABLE[known][key]
    return total

def scrape_recipes(urls):
    recipes = []
    for url in urls:
        try:
            scraper = scrape_me(url)
            title = scraper.title()
            ingredients = scraper.ingredients()
            instructions = scraper.instructions()
            nutrition = estimate_nutrition(ingredients)
            recipes.append({
                'Title': title,
                'Ingredients': ', '.join(ingredients),
                'Instructions': instructions,
                'Calories': nutrition['calories'],
                'Fat': nutrition['fat'],
                'Protein': nutrition['protein'],
                'Sugar': nutrition['sugar'],
                'Fiber': nutrition['fiber'],
                'Carbohydrates': nutrition['carbs']
            })
            time.sleep(0.2)
        except Exception as e:
            print(f"Error scraping {url}: {e}")
    return pd.DataFrame(recipes)

# Example list of recipe URLs
recipe_urls = [
    'https://www.allrecipes.com/recipe/158968/spinach-and-feta-turkey-burgers/',
    'https://www.allrecipes.com/recipe/24074/alysias-basic-meat-lasagna/',
    'https://www.allrecipes.com/recipe/229960/chef-johns-chicken-parmesan/',
    # Add more URLs as needed
]

# Run the scraper
df = scrape_recipes(recipe_urls)
df.to_csv("recipes_dataset.csv", index=False)
print(f"✅ Done! Scraped {len(df)} recipes with estimated nutrition.")


Error scraping https://www.allrecipes.com/recipe/229960/chef-johns-chicken-parmesan/: HTTP Error 404: Not Found
✅ Done! Scraped 2 recipes with estimated nutrition.


In [21]:
import json
import pandas as pd

# Load the Recipe1M dataset
with open('recipes.json', 'r') as f:
    data = json.load(f)

# Simplified nutrient lookup table (approximate per 100g)
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
}

def estimate_nutrition(ingredients):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    for ing in ingredients:
        for known in NUTRITION_TABLE:
            if known in ing.lower():
                for key in total:
                    total[key] += NUTRITION_TABLE[known][key]
    return total

# Process the first 500 recipes
recipes = []
for recipe in data[:500]:
    title = recipe.get('title', '')
    ingredients = recipe.get('ingredients', [])
    instructions = recipe.get('instructions', '')
    nutrition = estimate_nutrition(ingredients)
    recipes.append({
        'Title': title,
        'Ingredients': ', '.join(ingredients),
        'Instructions': instructions,
        'Calories': nutrition['calories'],
        'Fat': nutrition['fat'],
        'Protein': nutrition['protein'],
        'Sugar': nutrition['sugar'],
        'Fiber': nutrition['fiber'],
        'Carbohydrates': nutrition['carbs']
    })

# Save to CSV
df = pd.DataFrame(recipes)
df.to_csv("recipe1m_500_recipes.csv", index=False)
print(f"✅ Done! Processed {len(df)} recipes with estimated nutrition.")


FileNotFoundError: [Errno 2] No such file or directory: 'recipes.json'

In [22]:
import json
import pandas as pd
import os

# Simplified nutrition table (per 100g approx)
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
}

def estimate_nutrition(ingredients):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    for ing in ingredients:
        ing_lower = ing.lower()
        for known in NUTRITION_TABLE:
            if known in ing_lower:
                for key in total:
                    total[key] += NUTRITION_TABLE[known][key]
    return total

def load_and_process_json(filename='layer1.json', limit=500):
    if not os.path.exists(filename):
        print(f"❌ File '{filename}' not found. Please upload or download it first.")
        return None

    with open(filename, 'r') as f:
        data = json.load(f)

    recipes = []
    for item in data[:limit]:
        title = item.get('title', '').strip()
        instructions = item.get('instructions', '')
        ingredients = [i.get('text', '').strip() for i in item.get('ingredients', [])]
        nutrition = estimate_nutrition(ingredients)

        recipes.append({
            'Title': title,
            'Instructions': instructions,
            'Ingredients': ', '.join(ingredients),
            'Calories': nutrition['calories'],
            'Fat': nutrition['fat'],
            'Protein': nutrition['protein'],
            'Sugar': nutrition['sugar'],
            'Fiber': nutrition['fiber'],
            'Carbohydrates': nutrition['carbs']
        })

    df = pd.DataFrame(recipes)
    df.to_csv("recipe1m_500_recipes.csv", index=False)
    print(f"✅ Saved {len(df)} recipes to 'recipe1m_500_recipes.csv'")
    return df

# Run the loader
df = load_and_process_json('layer1.json')  # Change filename if needed


❌ File 'layer1.json' not found. Please upload or download it first.


In [23]:
# Step 1: Download sample layer1.json (~500 recipes)
!wget -O layer1.json https://huggingface.co/datasets/ashraq/recipe-nlg/resolve/main/sample_layer1.json

# Step 2: Import required libraries
import json
import pandas as pd

# Step 3: Define nutrition table (per 100g, simplified)
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
}

# Step 4: Load layer1.json and extract recipes
with open("layer1.json", "r") as f:
    data = json.load(f)

recipes = []
for item in data[:500]:  # limit to 500 recipes
    title = item.get("title", "")
    instructions = item.get("instructions", "")
    ingredients = [ing["text"].lower() for ing in item.get("ingredients", [])]

    # Estimate nutrition
    nutrition = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    for ing in ingredients:
        for known in NUTRITION_TABLE:
            if known in ing:
                for key in nutrition:
                    nutrition[key] += NUTRITION_TABLE[known][key]

    recipes.append({
        "Title": title,
        "Instructions": instructions,
        "Ingredients": ", ".join(ingredients),
        "Calories": nutrition['calories'],
        "Fat": nutrition['fat'],
        "Protein": nutrition['protein'],
        "Sugar": nutrition['sugar'],
        "Fiber": nutrition['fiber'],
        "Carbohydrates": nutrition['carbs']
    })

# Step 5: Save to CSV
df = pd.DataFrame(recipes)
df.to_csv("recipe1m_sample_500.csv", index=False)
print("✅ Done! Saved 500 recipes with estimated nutrition to 'recipe1m_sample_500.csv'")


--2025-06-03 01:50:51--  https://huggingface.co/datasets/ashraq/recipe-nlg/resolve/main/sample_layer1.json
Resolving huggingface.co (huggingface.co)... 18.172.134.4, 18.172.134.88, 18.172.134.124, ...
Connecting to huggingface.co (huggingface.co)|18.172.134.4|:443... connected.
HTTP request sent, awaiting response... 401 Unauthorized

Username/Password Authentication Failed.


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
import pandas as pd

# Load your dataset
df = pd.read_csv('/content/TheMealDB_with_nutrition.csv')

# Simple nutrition lookup table
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8}
}

# Helper function to estimate nutrition
def estimate_nutrition(ingredient_text):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    if isinstance(ingredient_text, str):
        ingredients = [i.strip().lower() for i in ingredient_text.split(',')]
        for ing in ingredients:
            for known in NUTRITION_TABLE:
                if known in ing:
                    for key in total:
                        total[key] += NUTRITION_TABLE[known][key]
    return pd.Series(total)

# Apply estimation to each row
nutrition_df = df['Ingredients'].apply(estimate_nutrition)

# Merge with original DataFrame
df_updated = pd.concat([df, nutrition_df], axis=1)

# Save new dataset
df_updated.to_csv("/content/TheMealDB_with_nutrition.csv", index=False)
print("✅ Estimated nutrition added and saved as 'TheMealDB_with_estimated_nutrition.csv'")


In [25]:
import pandas as pd

# Load your dataset
df = pd.read_csv('/content/TheMealDB_with_nutrition.csv')

# Simple nutrition lookup table
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8}
}

# Helper function to estimate nutrition
def estimate_nutrition(ingredient_text):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    if isinstance(ingredient_text, str):
        ingredients = [i.strip().lower() for i in ingredient_text.split(',')]
        for ing in ingredients:
            for known in NUTRITION_TABLE:
                if known in ing:
                    for key in total:
                        total[key] += NUTRITION_TABLE[known][key]
    return pd.Series(total)

# Apply estimation to each row
nutrition_df = df['Ingredients'].apply(estimate_nutrition)

# Merge with original DataFrame
df_updated = pd.concat([df, nutrition_df], axis=1)

# Save new dataset
df_updated.to_csv("/content/TheMealDB_with_nutrition.csv", index=False)
print("✅ Estimated nutrition added and saved as 'TheMealDB_with_estimated_nutrition.csv'")

✅ Estimated nutrition added and saved as 'TheMealDB_with_estimated_nutrition.csv'


In [26]:
import pandas as pd

# Load your dataset
df = pd.read_csv('/content/all_mealdb_recipes_120+.csv')

# Simple nutrition lookup table
NUTRITION_TABLE = {
    'chicken': {'calories': 239, 'fat': 14, 'protein': 27, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'rice': {'calories': 130, 'fat': 0.3, 'protein': 2.7, 'carbs': 28, 'sugar': 0.1, 'fiber': 0.4},
    'onion': {'calories': 40, 'fat': 0.1, 'protein': 1.1, 'carbs': 9.3, 'sugar': 4.2, 'fiber': 1.7},
    'garlic': {'calories': 149, 'fat': 0.5, 'protein': 6.4, 'carbs': 33, 'sugar': 1, 'fiber': 2.1},
    'tomato': {'calories': 18, 'fat': 0.2, 'protein': 0.9, 'carbs': 3.9, 'sugar': 2.6, 'fiber': 1.2},
    'milk': {'calories': 42, 'fat': 1, 'protein': 3.4, 'carbs': 5, 'sugar': 5, 'fiber': 0},
    'butter': {'calories': 717, 'fat': 81, 'protein': 0.9, 'carbs': 0.1, 'sugar': 0.1, 'fiber': 0},
    'egg': {'calories': 155, 'fat': 11, 'protein': 13, 'carbs': 1.1, 'sugar': 1.1, 'fiber': 0},
    'salt': {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0},
    'carrot': {'calories': 41, 'fat': 0.2, 'protein': 0.9, 'carbs': 10, 'sugar': 4.7, 'fiber': 2.8}
}

# Function to estimate nutrition from ingredients
def estimate_nutrition(ingredient_text):
    total = {'calories': 0, 'fat': 0, 'protein': 0, 'carbs': 0, 'sugar': 0, 'fiber': 0}
    if isinstance(ingredient_text, str):
        ingredients = [i.strip().lower() for i in ingredient_text.split(',')]
        for ing in ingredients:
            for known in NUTRITION_TABLE:
                if known in ing:
                    for key in total:
                        total[key] += NUTRITION_TABLE[known][key]
    return pd.Series(total)

# Apply nutrition estimation
nutrition_df = df['Ingredients'].apply(estimate_nutrition)

# Combine nutrition info with original data
df_updated = pd.concat([df, nutrition_df], axis=1)

# Save updated DataFrame to CSV
output_path = "/content/all_mealdb_recipes_120+.csv"
df_updated.to_csv(output_path, index=False)

print(f"✅ Nutrition estimation complete and saved to: {output_path}")


✅ Nutrition estimation complete and saved to: /content/all_mealdb_recipes_120+.csv


In [28]:
import pandas as pd

# Load dataset
file_path = '/content/themealdb_recipes.csv'
df = pd.read_csv(file_path)

# Show basic info before cleaning
print("Before cleaning:")
print(df.info())
print("\nMissing values per column:\n", df.isnull().sum())

# Strip whitespace and standardize column names
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace(' ', '_').str.lower()

# Remove duplicate rows
df = df.drop_duplicates()

# Remove leading/trailing spaces in string columns
for col in df.select_dtypes(include='object'):
    df[col] = df[col].str.strip()

# Handle missing values
# Option 1: Drop rows where essential columns (e.g., name, ingredients) are missing
essential_columns = ['meal', 'ingredients']  # Update if necessary
df = df.dropna(subset=[col for col in essential_columns if col in df.columns])

# Option 2 (optional): Fill missing values in non-essential columns
df = df.fillna('Unknown')  # Or use specific values depending on column

# Normalize text: lowercase for relevant fields
for col in ['meal', 'category', 'area', 'ingredients']:
    if col in df.columns:
        df[col] = df[col].str.lower()

# Save cleaned dataset
cleaned_path = '/content/themealdb_recipes.csv'
df.to_csv(cleaned_path, index=False)

# Summary
print("\n✅ Dataset cleaned and saved to:", cleaned_path)
print(df.head())


Before cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MealID        5 non-null      int64  
 1   Meal          5 non-null      object 
 2   Category      5 non-null      object 
 3   Area          5 non-null      object 
 4   Instructions  5 non-null      object 
 5   Tags          1 non-null      object 
 6   YouTube       5 non-null      object 
 7   Ingredients   5 non-null      object 
 8   Calories      5 non-null      int64  
 9   Fat           5 non-null      float64
 10  Protein       5 non-null      float64
 11  Fiber         5 non-null      float64
dtypes: float64(3), int64(2), object(7)
memory usage: 612.0+ bytes
None

Missing values per column:
 MealID          0
Meal            0
Category        0
Area            0
Instructions    0
Tags            4
YouTube         0
Ingredients     0
Calories        0
Fat             0

In [29]:
import pandas as pd

# Load dataset
file_path = '/content/TheMealDB_with_nutrition.csv'
df = pd.read_csv(file_path)

# Show basic info before cleaning
print("Before cleaning:")
print(df.info())
print("\nMissing values per column:\n", df.isnull().sum())

# Strip whitespace and standardize column names
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace(' ', '_').str.lower()

# Remove duplicate rows
df = df.drop_duplicates()

# Remove leading/trailing spaces in string columns
for col in df.select_dtypes(include='object'):
    df[col] = df[col].str.strip()

# Handle missing values
# Option 1: Drop rows where essential columns (e.g., name, ingredients) are missing
essential_columns = ['meal', 'ingredients']  # Update if necessary
df = df.dropna(subset=[col for col in essential_columns if col in df.columns])

# Option 2 (optional): Fill missing values in non-essential columns
df = df.fillna('Unknown')  # Or use specific values depending on column

# Normalize text: lowercase for relevant fields
for col in ['meal', 'category', 'area', 'ingredients']:
    if col in df.columns:
        df[col] = df[col].str.lower()

# Save cleaned dataset
cleaned_path = '/content/TheMealDB_with_nutrition.csv'
df.to_csv(cleaned_path, index=False)

# Summary
print("\n✅ Dataset cleaned and saved to:", cleaned_path)
print(df.head())

Before cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MealID                  304 non-null    int64  
 1   Meal                    304 non-null    object 
 2   Category                304 non-null    object 
 3   Area                    304 non-null    object 
 4   Instructions            304 non-null    object 
 5   Tags                    183 non-null    object 
 6   YouTube                 288 non-null    object 
 7   Ingredients             304 non-null    object 
 8   EstimatedCalories_kcal  304 non-null    float64
 9   EstimatedProtein_g      304 non-null    float64
 10  EstimatedFat_g          304 non-null    float64
 11  EstimatedCarbs_g        304 non-null    float64
 12  calories                304 non-null    float64
 13  fat                     304 non-null    float64
 14  protein                 3

In [30]:
import pandas as pd

# Load dataset
file_path = '/content/all_mealdb_recipes_120+.csv'
df = pd.read_csv(file_path)

# Show basic info before cleaning
print("Before cleaning:")
print(df.info())
print("\nMissing values per column:\n", df.isnull().sum())

# Strip whitespace and standardize column names
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace(' ', '_').str.lower()

# Remove duplicate rows
df = df.drop_duplicates()

# Remove leading/trailing spaces in string columns
for col in df.select_dtypes(include='object'):
    df[col] = df[col].str.strip()

# Handle missing values
# Option 1: Drop rows where essential columns (e.g., name, ingredients) are missing
essential_columns = ['meal', 'ingredients']  # Update if necessary
df = df.dropna(subset=[col for col in essential_columns if col in df.columns])

# Option 2 (optional): Fill missing values in non-essential columns
df = df.fillna('Unknown')  # Or use specific values depending on column

# Normalize text: lowercase for relevant fields
for col in ['meal', 'category', 'area', 'ingredients']:
    if col in df.columns:
        df[col] = df[col].str.lower()

# Save cleaned dataset
cleaned_path = '/content/all_mealdb_recipes_120+.csv'
df.to_csv(cleaned_path, index=False)

# Summary
print("\n✅ Dataset cleaned and saved to:", cleaned_path)
print(df.head())

Before cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MealID        120 non-null    int64  
 1   Meal          120 non-null    object 
 2   Category      120 non-null    object 
 3   Area          120 non-null    object 
 4   Instructions  120 non-null    object 
 5   Tags          72 non-null     object 
 6   YouTube       111 non-null    object 
 7   Ingredients   120 non-null    object 
 8   calories      120 non-null    float64
 9   fat           120 non-null    float64
 10  protein       120 non-null    float64
 11  carbs         120 non-null    float64
 12  sugar         120 non-null    float64
 13  fiber         120 non-null    float64
dtypes: float64(6), int64(1), object(7)
memory usage: 13.3+ KB
None

Missing values per column:
 MealID           0
Meal             0
Category         0
Area             0
Instructions     

In [31]:
import pandas as pd

# Load dataset
file_path = '/content/themealdb_500_recipes.csv'
df = pd.read_csv(file_path)

# Show basic info before cleaning
print("Before cleaning:")
print(df.info())
print("\nMissing values per column:\n", df.isnull().sum())

# Strip whitespace and standardize column names
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace(' ', '_').str.lower()

# Remove duplicate rows
df = df.drop_duplicates()

# Remove leading/trailing spaces in string columns
for col in df.select_dtypes(include='object'):
    df[col] = df[col].str.strip()

# Handle missing values
# Option 1: Drop rows where essential columns (e.g., name, ingredients) are missing
essential_columns = ['meal', 'ingredients']  # Update if necessary
df = df.dropna(subset=[col for col in essential_columns if col in df.columns])

# Option 2 (optional): Fill missing values in non-essential columns
df = df.fillna('Unknown')  # Or use specific values depending on column

# Normalize text: lowercase for relevant fields
for col in ['meal', 'category', 'area', 'ingredients']:
    if col in df.columns:
        df[col] = df[col].str.lower()

# Save cleaned dataset
cleaned_path = '/content/themealdb_500_recipes.csv'
df.to_csv(cleaned_path, index=False)

# Summary
print("\n✅ Dataset cleaned and saved to:", cleaned_path)
print(df.head())

Before cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MealID         302 non-null    int64  
 1   Meal           302 non-null    object 
 2   Category       302 non-null    object 
 3   Area           302 non-null    object 
 4   Instructions   302 non-null    object 
 5   Ingredients    302 non-null    object 
 6   Calories       302 non-null    int64  
 7   Fat            302 non-null    float64
 8   Protein        302 non-null    float64
 9   Sugar          302 non-null    float64
 10  Fiber          302 non-null    float64
 11  Carbohydrates  302 non-null    float64
dtypes: float64(5), int64(2), object(5)
memory usage: 28.4+ KB
None

Missing values per column:
 MealID           0
Meal             0
Category         0
Area             0
Instructions     0
Ingredients      0
Calories         0
Fat              0
Protein      

In [33]:
import pandas as pd

# Load the CSV files
df1 = pd.read_csv('/content/TheMealDB_with_nutrition.csv')
df2 = pd.read_csv('/content/all_mealdb_recipes_120+.csv')
df3 = pd.read_csv('/content/themealdb_500_recipes.csv')
df4 = pd.read_csv('/content/themealdb_recipes.csv')

# Concatenate all dataframes
merged_df = pd.concat([df1, df2, df3, df4], ignore_index=True)


# merged_df = merged_df.drop_duplicates(subset='idMeal')

# Save the merged dataframe to a new CSV
merged_df.to_csv('/content/merged dataset.csv', index=False)

print("Merge complete. Saved as 'merged_mealdb_recipes.csv'")


Merge complete. Saved as 'merged_mealdb_recipes.csv'
