In [None]:
import requests
import pandas as pd

# ----- STEP 1: Nutrition lookup table (you can expand this) -----
# Nutrient figures keyed by lower-case ingredient name. Each row below is
# (name, calories, fat, protein, fiber); append rows to extend the table.
# NOTE(review): units are not stated here; the numbers look like per-100 g
# values -- confirm before relying on them.
NUTRITION_TABLE = {
    name: {'calories': calories, 'fat': fat, 'protein': protein, 'fiber': fiber}
    for name, calories, fat, protein, fiber in (
        ('chicken breast', 165, 3.6, 31, 0),
        ('onion', 40, 0.1, 1.1, 1.7),
        ('garlic', 149, 0.5, 6.4, 2.1),
        ('butter', 717, 81, 0.9, 0),
        ('rice', 130, 0.3, 2.7, 0.4),
        ('milk', 42, 1, 3.4, 0),
        ('salt', 0, 0, 0, 0),
        ('pepper', 251, 3.3, 10.4, 25.3),
        ('tomato', 18, 0.2, 0.9, 1.2),
        ('carrot', 41, 0.2, 0.9, 2.8),
        # Add more ingredients as needed
    )
}

# ----- STEP 2: API access functions -----
def get_meals_by_category(category='Chicken'):
    """Return the meal summaries TheMealDB lists under a category.

    Args:
        category: Category name sent as the ``c`` query parameter of
            ``filter.php``.

    Returns:
        The list stored under the ``meals`` key of the JSON response, or an
        empty list when that key is missing or holds a JSON null.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(
        "https://www.themealdb.com/api/json/v1/1/filter.php",
        params={'c': category},  # let requests URL-encode the category name
        timeout=10,  # without a timeout a stalled connection blocks forever
    )
    response.raise_for_status()
    # .get()'s default only covers a missing key; a JSON null under "meals"
    # would come back as None and break callers that slice the result.
    return response.json().get('meals') or []

def get_meal_details(meal_id):
    """Look up the full record for one meal on TheMealDB.

    Args:
        meal_id: Meal identifier sent as the ``i`` query parameter of
            ``lookup.php``.

    Returns:
        The first entry of the response's ``meals`` list, or an empty dict
        when the list is missing, null, or empty.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    url = f"https://www.themealdb.com/api/json/v1/1/lookup.php?i={meal_id}"
    # A timeout keeps one stalled request from hanging the whole scrape.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    meals = response.json().get('meals') or []
    return meals[0] if meals else {}

# ----- STEP 3: Ingredient extraction -----
def extract_ingredients(meal):
    """Map each ingredient of a meal record to its measure.

    Reads the numbered ``strIngredient1``..``strIngredient20`` fields and
    their ``strMeasure`` counterparts from ``meal``.

    Args:
        meal: Mapping that supports ``.get`` for the numbered fields.

    Returns:
        Dict from stripped, lower-cased ingredient name to stripped measure
        (``''`` when the measure is missing or empty). Slots whose
        ingredient is missing or blank are skipped; a repeated name keeps
        the measure of its last occurrence.
    """
    pairs = {}
    for slot in range(1, 21):
        name = meal.get(f"strIngredient{slot}")
        if not name or not name.strip():
            continue
        measure = meal.get(f"strMeasure{slot}")
        pairs[name.strip().lower()] = measure.strip() if measure else ''
    return pairs

# ----- STEP 4: Nutrition estimation -----
def estimate_nutrition(ingredients):
    """Add up table values for every known name found in the ingredients.

    A table key counts as a match when it occurs as a substring of an
    ingredient name, so one ingredient may match several keys and each
    match contributes that key's values once. Quantities are not taken
    into account.

    Args:
        ingredients: Iterable of ingredient-name strings.

    Returns:
        Dict with the summed ``calories``, ``fat``, ``protein`` and
        ``fiber``; all zero when nothing matches.
    """
    totals = dict.fromkeys(('calories', 'fat', 'protein', 'fiber'), 0)
    for ingredient in ingredients:
        for name, facts in NUTRITION_TABLE.items():
            if name not in ingredient:  # fuzzy (substring) match only
                continue
            for nutrient in totals:
                totals[nutrient] += facts.get(nutrient, 0)
    return totals

# ----- STEP 5: Main scraping + saving -----
def scrape_meals(category='Chicken', limit=5):
    """Collect details and estimated nutrition for meals in one category.

    Args:
        category: Category name passed to ``get_meals_by_category``.
        limit: Maximum number of meals to fetch details for.

    Returns:
        A pandas DataFrame with one row per meal. The ``Ingredients``
        column holds the dict returned by ``extract_ingredients``; the
        nutrition columns come from ``estimate_nutrition``. The frame is
        empty when the category has no meals.
    """
    # The category lookup can yield None (JSON null) instead of a list;
    # slicing None would raise TypeError, so fall back to an empty list.
    meals = get_meals_by_category(category) or []
    meal_data = []

    for meal in meals[:limit]:
        details = get_meal_details(meal['idMeal'])
        ingredients = extract_ingredients(details)
        # Iterating the dict yields the ingredient names only.
        nutrition = estimate_nutrition(ingredients.keys())

        data = {
            'MealID': meal['idMeal'],
            'Meal': details.get('strMeal'),
            'Category': details.get('strCategory'),
            'Area': details.get('strArea'),
            'Instructions': details.get('strInstructions'),
            'Tags': details.get('strTags'),
            'YouTube': details.get('strYoutube'),
            'Ingredients': ingredients,
            'Calories': nutrition['calories'],
            'Fat': nutrition['fat'],
            'Protein': nutrition['protein'],
            'Fiber': nutrition['fiber'],
        }
        meal_data.append(data)

    return pd.DataFrame(meal_data)

# ----- STEP 6: Run and save -----
if __name__ == "__main__":
    # Scrape a small sample of chicken recipes and write them to a CSV file
    # in the current working directory.
    output_csv = "themealdb_recipes.csv"
    df_recipes = scrape_meals(limit=5, category='Chicken')
    df_recipes.to_csv(output_csv, index=False)
    print("✅ Scraped and saved recipes with estimated nutrition to 'themealdb_recipes.csv'")

In [None]:
import requests
import pandas as pd
import time

# Simplified nutrient lookup table (approximate per 100g), keyed by lower-case
# ingredient name. Row values follow the order of the field tuple below.
NUTRITION_TABLE = {
    name: dict(zip(('calories', 'fat', 'protein', 'carbs', 'sugar', 'fiber'), values))
    for name, values in (
        ('chicken', (239, 14, 27, 0, 0, 0)),
        ('rice', (130, 0.3, 2.7, 28, 0.1, 0.4)),
        ('onion', (40, 0.1, 1.1, 9.3, 4.2, 1.7)),
        ('garlic', (149, 0.5, 6.4, 33, 1, 2.1)),
        ('tomato', (18, 0.2, 0.9, 3.9, 2.6, 1.2)),
        ('milk', (42, 1, 3.4, 5, 5, 0)),
        ('butter', (717, 81, 0.9, 0.1, 0.1, 0)),
        ('carrot', (41, 0.2, 0.9, 10, 4.7, 2.8)),
        ('egg', (155, 11, 13, 1.1, 1.1, 0)),
        ('salt', (0, 0, 0, 0, 0, 0)),
    )
}

def get_all_meal_ids():
    """Collect the IDs of all meals returned by first-letter searches a-z.

    Issues one ``search.php?f=<letter>`` request per letter, pausing 0.3 s
    after each request.

    Returns:
        Sorted list of unique ``idMeal`` values. Sorting makes the order
        stable between runs, so a caller that slices the list selects the
        same meals each time (plain set order is not guaranteed).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    meal_ids = set()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        url = f'https://www.themealdb.com/api/json/v1/1/search.php?f={letter}'
        # A timeout keeps one stalled request from hanging the whole crawl.
        res = requests.get(url, timeout=10)
        res.raise_for_status()
        # "meals" may be null (None) rather than a list; treat that as empty.
        for meal in res.json().get('meals') or []:
            meal_ids.add(meal['idMeal'])
        time.sleep(0.3)  # throttle between requests
    return sorted(meal_ids)

def get_meal_details(meal_id):
    """Look up the full record for one meal on TheMealDB.

    Args:
        meal_id: Meal identifier sent as the ``i`` query parameter of
            ``lookup.php``.

    Returns:
        The first entry of the response's ``meals`` list, or ``None`` when
        the list is missing, null, or empty.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    url = f"https://www.themealdb.com/api/json/v1/1/lookup.php?i={meal_id}"
    # A timeout keeps one stalled request from hanging the whole scrape.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    meals = response.json().get('meals') or []
    return meals[0] if meals else None

def extract_ingredients(meal):
    """List the ingredient names of a meal record.

    Reads the numbered ``strIngredient1``..``strIngredient20`` fields.

    Args:
        meal: Mapping that supports ``.get`` for the numbered fields.

    Returns:
        Stripped, lower-cased ingredient names in slot order; missing or
        blank slots are skipped and duplicates are kept.
    """
    raw_values = (meal.get(f"strIngredient{slot}") for slot in range(1, 21))
    return [value.strip().lower() for value in raw_values if value and value.strip()]

def estimate_nutrition(ingredients):
    """Add up table values for every known name found in the ingredients.

    A table key counts as a match when it occurs as a substring of an
    ingredient string; each match contributes that key's values once.
    Quantities are not taken into account.

    Args:
        ingredients: Iterable of ingredient strings.

    Returns:
        Dict with the summed ``calories``, ``fat``, ``protein``, ``carbs``,
        ``sugar`` and ``fiber``; all zero when nothing matches.
    """
    totals = dict.fromkeys(('calories', 'fat', 'protein', 'carbs', 'sugar', 'fiber'), 0)
    for ingredient in ingredients:
        for name, facts in NUTRITION_TABLE.items():
            if name not in ingredient:
                continue
            for nutrient in totals:
                totals[nutrient] += facts[nutrient]
    return totals

def scrape_meals(limit=500):
    """Fetch up to ``limit`` meals and estimate their nutrition.

    Takes the first ``limit`` IDs from ``get_all_meal_ids``, looks each one
    up, and pauses 0.2 s after every lookup whether or not it returned a
    record.

    Args:
        limit: Maximum number of meal IDs to look up.

    Returns:
        A pandas DataFrame with one row per meal that was found;
        ``Ingredients`` is a comma-separated string of ingredient names.
    """
    rows = []
    for meal_id in get_all_meal_ids()[:limit]:
        details = get_meal_details(meal_id)
        if details:
            names = extract_ingredients(details)
            totals = estimate_nutrition(names)
            rows.append(dict(
                MealID=meal_id,
                Meal=details.get('strMeal'),
                Category=details.get('strCategory'),
                Area=details.get('strArea'),
                Instructions=details.get('strInstructions'),
                Ingredients=', '.join(names),
                Calories=totals['calories'],
                Fat=totals['fat'],
                Protein=totals['protein'],
                Sugar=totals['sugar'],
                Fiber=totals['fiber'],
                Carbohydrates=totals['carbs'],
            ))
        time.sleep(0.2)  # throttle between lookups
    return pd.DataFrame(rows)

# Run the scraper and save the result as CSV in the working directory.
df = scrape_meals(limit=500)
df.to_csv("themealdb_500_recipes.csv", index=False)
# Report the number of rows actually collected; it can be lower than the
# requested limit when fewer meals exist or some lookups return nothing.
print(f"✅ Done! Scraped {len(df)} recipes from TheMealDB with estimated nutrition.")

In [None]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Nutritional lookup (approx. per 100g), keyed by lower-case ingredient name.
# Each row is (name, calories, fat, protein, carbs, sugar, fiber).
NUTRITION_TABLE = {
    name: {
        'calories': calories,
        'fat': fat,
        'protein': protein,
        'carbs': carbs,
        'sugar': sugar,
        'fiber': fiber,
    }
    for name, calories, fat, protein, carbs, sugar, fiber in (
        ('chicken', 239, 14, 27, 0, 0, 0),
        ('rice', 130, 0.3, 2.7, 28, 0.1, 0.4),
        ('onion', 40, 0.1, 1.1, 9.3, 4.2, 1.7),
        ('garlic', 149, 0.5, 6.4, 33, 1, 2.1),
        ('tomato', 18, 0.2, 0.9, 3.9, 2.6, 1.2),
        ('milk', 42, 1, 3.4, 5, 5, 0),
        ('butter', 717, 81, 0.9, 0.1, 0.1, 0),
        ('carrot', 41, 0.2, 0.9, 10, 4.7, 2.8),
        ('egg', 155, 11, 13, 1.1, 1.1, 0),
        ('salt', 0, 0, 0, 0, 0, 0),
    )
}

# Headless browser setup for scraping: one Chrome session, created once at
# module level and used by the functions that read `driver`.
options = Options()
for flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--user-data-dir=/tmp/bbc-profile',  # avoids SessionNotCreatedException
):
    options.add_argument(flag)

driver = webdriver.Chrome(options=options)

def get_bbc_recipe_links(pages=50):
    """Collect recipe URLs from the BBC Good Food listing pages.

    Loads listing pages 1..``pages`` in the module-level ``driver``, waits
    2 s after each load, and gathers the ``href`` of every
    ``h2.heading-4 a`` anchor that contains ``/recipes/``.

    Args:
        pages: Number of listing pages to visit.

    Returns:
        Sorted list of unique absolute URLs. Sorting keeps the order stable
        between runs, so a caller that slices the list gets the same
        recipes each time (plain set order is not guaranteed).
    """
    from urllib.parse import urljoin  # local import: only needed here

    base_url = "https://www.bbcgoodfood.com"
    links = set()
    for page in range(1, pages + 1):
        print(f"🔍 Fetching page {page}")
        url = f"{base_url}/recipes/category/all?page={page}"
        driver.get(url)
        time.sleep(2)  # give the page time to render before reading it
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for a in soup.select('h2.heading-4 a'):
            href = a.get('href')
            if href and '/recipes/' in href:
                # urljoin resolves site-relative paths against the base and
                # leaves absolute hrefs untouched; plain concatenation would
                # produce "https://www.bbcgoodfood.comhttps://..." for them.
                links.add(urljoin(base_url, href))
    return sorted(links)

def extract_ingredients_from_bbc(soup):
    """Return the ingredient lines found in a parsed recipe page.

    Args:
        soup: Parsed page exposing ``select``; elements matching
            ``.ingredients-section li`` are read via their ``text``.

    Returns:
        Stripped, lower-cased text of each matching element, in document
        order; elements whose text is blank are dropped.
    """
    cleaned = []
    for item in soup.select('.ingredients-section li'):
        text = item.text.strip()
        if text:
            cleaned.append(text.lower())
    return cleaned

def estimate_nutrition(ingredients):
    """Add up table values for every known name found in the ingredients.

    A table key counts as a match when it occurs as a substring of an
    ingredient line; each match contributes that key's values once.
    Quantities in the line are not taken into account.

    Args:
        ingredients: Iterable of ingredient strings.

    Returns:
        Dict with the summed ``calories``, ``fat``, ``protein``, ``carbs``,
        ``sugar`` and ``fiber``; all zero when nothing matches.
    """
    totals = {nutrient: 0 for nutrient in ('calories', 'fat', 'protein', 'carbs', 'sugar', 'fiber')}
    for line in ingredients:
        matched_rows = [facts for name, facts in NUTRITION_TABLE.items() if name in line]
        for facts in matched_rows:
            for nutrient in totals:
                totals[nutrient] += facts[nutrient]
    return totals

def get_bbc_recipe_details(url):
    """Scrape one BBC Good Food recipe page into a flat record.

    Loads ``url`` in the module-level ``driver``, waits 2 s, then reads the
    first ``h1`` as the title, the ingredient list, and the
    ``.method__item`` steps from the rendered page.

    Args:
        url: Address of the recipe page.

    Returns:
        Dict with the title, URL, comma-separated ingredients, joined
        method text and estimated nutrition, or ``None`` if anything raised
        while loading or parsing (the error is printed).
    """
    try:
        driver.get(url)
        time.sleep(2)
        page = BeautifulSoup(driver.page_source, 'html.parser')
        # Raises AttributeError when the page has no <h1>; handled below.
        heading = page.select_one('h1').text.strip()
        items = extract_ingredients_from_bbc(page)
        steps = [node.text.strip() for node in page.select('.method__item')]
        totals = estimate_nutrition(items)
        record = dict(
            Meal=heading,
            URL=url,
            Ingredients=', '.join(items),
            Instructions=' '.join(steps),
            Calories=totals['calories'],
            Fat=totals['fat'],
            Protein=totals['protein'],
            Sugar=totals['sugar'],
            Fiber=totals['fiber'],
            Carbohydrates=totals['carbs'],
        )
        return record
    except Exception as e:
        print(f"⚠️ Error scraping {url}: {e}")
        return None

def scrape_bbc_meals(limit=500):
    """Scrape up to ``limit`` BBC Good Food recipes into a DataFrame.

    Gathers links from 50 listing pages, then scrapes each selected recipe
    with a 1 s pause after each one.

    Args:
        limit: Maximum number of recipe pages to scrape.

    Returns:
        A pandas DataFrame with one row per recipe that scraped
        successfully; failed pages are skipped.
    """
    links = get_bbc_recipe_links(pages=50)
    print(f"Found {len(links)} recipes.")
    selected = links[:limit]
    total = len(selected)  # may be smaller than limit when fewer links exist
    recipes = []
    for i, url in enumerate(selected, start=1):
        # Show progress against the real number of pages being scraped
        # rather than the requested limit.
        print(f"⏳ Scraping {i}/{total}")
        meal = get_bbc_recipe_details(url)
        if meal:
            recipes.append(meal)
        time.sleep(1)  # throttle between recipe pages
    return pd.DataFrame(recipes)

# Run scraper
try:
    df = scrape_bbc_meals(limit=500)
    df.to_csv("bbc_good_food_500_recipes.csv", index=False)
    # Report the number of rows actually collected, which can be lower than
    # the requested limit.
    print(f"✅ Done! Scraped {len(df)} BBC Good Food recipes with estimated nutrition.")
finally:
    # Always shut the browser down, even if scraping or saving fails, so no
    # headless Chrome process is left running.
    driver.quit()


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Simple nutrition lookup table (expand as needed), keyed by lower-case
# ingredient name. Row values follow the order of the field tuple below.
# NOTE(review): units are not stated here; the numbers look like per-100 g
# values -- confirm before relying on them.
NUTRITION_TABLE = {
    name: dict(zip(('calories', 'fat', 'protein', 'carbs', 'sugar', 'fiber'), values))
    for name, values in (
        ('chicken', (239, 14, 27, 0, 0, 0)),
        ('rice', (130, 0.3, 2.7, 28, 0.1, 0.4)),
        ('onion', (40, 0.1, 1.1, 9.3, 4.2, 1.7)),
        ('garlic', (149, 0.5, 6.4, 33, 1, 2.1)),
        ('tomato', (18, 0.2, 0.9, 3.9, 2.6, 1.2)),
        ('milk', (42, 1, 3.4, 5, 5, 0)),
        ('butter', (717, 81, 0.9, 0.1, 0.1, 0)),
        ('egg', (155, 11, 13, 1.1, 1.1, 0)),
    )
}

def get_recipe_links(pages=25):
    """Collect recipe URLs from the AllRecipes listing pages.

    Requests listing pages 1..``pages``, pausing 0.5 s after each, and
    keeps the ``href`` of every ``a.card__titleLink`` anchor that contains
    ``https://www.allrecipes.com/recipe/``.

    Args:
        pages: Number of listing pages to request.

    Returns:
        List of unique URLs in first-seen order, so a caller that slices
        the list gets the same recipes on every run.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    links = []
    for i in range(1, pages + 1):
        url = f"https://www.allrecipes.com/recipes/?page={i}"
        # A timeout keeps one stalled request from hanging the whole crawl.
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.content, 'html.parser')
        for a in soup.select('a.card__titleLink'):
            link = a.get('href')
            if link and 'https://www.allrecipes.com/recipe/' in link:
                links.append(link)
        time.sleep(0.5)  # throttle between listing pages
    # dict.fromkeys drops duplicates while preserving insertion order;
    # list(set(...)) would return them in an order that varies between runs.
    return list(dict.fromkeys(links))

def extract_recipe(url):
    """Download one AllRecipes page and turn it into a flat record.

    Args:
        url: Address of the recipe page.

    Returns:
        Dict with the recipe name (``'N/A'`` when no ``h1.headline`` is
        present), comma-separated ingredients, joined instruction text and
        the nutrition estimated from the ingredient names.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status. Failing here keeps error pages from being
            parsed into empty 'N/A' records.
    """
    # A timeout keeps one stalled request from hanging the whole scrape.
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    # Look the headline up once instead of searching the tree twice.
    headline = soup.find('h1', class_='headline')
    name = headline.text.strip() if headline else 'N/A'
    ingredients_tags = soup.select('span.ingredients-item-name')
    ingredients = [i.text.strip().lower() for i in ingredients_tags if i.text.strip()]
    instructions_tag = soup.select('ul.instructions-section li p')
    instructions = ' '.join(i.text.strip() for i in instructions_tag)

    nutrition = estimate_nutrition(ingredients)
    return {
        'Meal': name,
        'Ingredients': ', '.join(ingredients),
        'Instructions': instructions,
        'Calories': nutrition['calories'],
        'Fat': nutrition['fat'],
        'Protein': nutrition['protein'],
        'Sugar': nutrition['sugar'],
        'Fiber': nutrition['fiber'],
        'Carbohydrates': nutrition['carbs']
    }

def estimate_nutrition(ingredients):
    """Add up table values for every known name found in the ingredients.

    A table key counts as a match when it occurs as a substring of an
    ingredient string; each match contributes that key's values once.
    Quantities are not taken into account.

    Args:
        ingredients: Iterable of ingredient strings.

    Returns:
        Dict with the summed ``calories``, ``fat``, ``protein``, ``carbs``,
        ``sugar`` and ``fiber``; all zero when nothing matches.
    """
    nutrients = ('calories', 'fat', 'protein', 'carbs', 'sugar', 'fiber')
    totals = dict.fromkeys(nutrients, 0)
    for ingredient in ingredients:
        for name in NUTRITION_TABLE:
            if name not in ingredient:
                continue
            facts = NUTRITION_TABLE[name]
            for nutrient in totals:
                totals[nutrient] += facts[nutrient]
    return totals

def scrape_allrecipes(limit=500):
    """Scrape up to ``limit`` AllRecipes recipes into a DataFrame.

    Gathers links from 40 listing pages, then extracts each selected
    recipe, pausing 0.4 s after every successful extraction.

    Args:
        limit: Maximum number of recipe pages to scrape.

    Returns:
        A pandas DataFrame with one row per recipe that was extracted;
        recipes that raise are reported on stdout and skipped.
    """
    links = get_recipe_links(pages=40)
    selected = links[:limit]
    total = len(selected)  # may be smaller than limit when fewer links exist
    data = []
    for i, link in enumerate(selected, start=1):
        try:
            # Show progress against the real number of pages being scraped
            # rather than the requested limit.
            print(f"Scraping ({i}/{total}): {link}")
            recipe = extract_recipe(link)
            data.append(recipe)
            time.sleep(0.4)
        except Exception as e:
            # Best effort: one broken page must not abort the whole run.
            print(f"❌ Error scraping {link}: {e}")
    return pd.DataFrame(data)

# Scrape and Save
df = scrape_allrecipes(limit=500)
df.to_csv("allrecipes_500.csv", index=False)
# Report the number of rows actually collected; it can be lower than the
# requested limit when fewer links are found or some pages fail.
print(f"✅ Done! Saved {len(df)} recipes from AllRecipes.com.")