In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Function to fetch and parse a recipe page
def get_recipe_details(url, timeout=10):
    """Download one recipe page and extract its title, ingredients and steps.

    Args:
        url: Address of the recipe page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        dict with the keys ``'title'`` (str, ``'N/A'`` when the page has no
        ``<h1>``), ``'ingredients'`` (list of str) and ``'instructions'``
        (list of non-empty str).

    Raises:
        requests.RequestException: on connection failures, timeouts or an
            HTTP error status.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a recipe.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    heading = soup.find('h1')
    title = heading.get_text(strip=True) if heading is not None else 'N/A'

    ingredients = [
        li.get_text(strip=True)
        for li in soup.select("ul.ingredient-list li")
    ]

    instructions = []
    for step in soup.select("ol.recipe-procedures li"):
        step_text = step.get_text(strip=True)
        if step_text:  # skip list items that contain no text
            instructions.append(step_text)

    return {
        'title': title,
        'ingredients': ingredients,
        'instructions': instructions
    }

# Function to scrape multiple recipes from the Serious Eats listing pages
def scrape_serious_eats_recipes(base_url, num_pages=1):
    """Walk the paginated listing at ``base_url`` and scrape every recipe found.

    Args:
        base_url: Listing URL; each page is requested with a ``page`` query
            parameter (1-based).
        num_pages: Number of listing pages to visit, starting at page 1.

    Returns:
        list of the dicts returned by ``get_recipe_details``, in the order
        the recipes were fetched. Listing pages or recipes that fail to
        download are reported on stdout and skipped.
    """
    recipes = []

    for page in range(1, num_pages + 1):
        print(f"Scraping page {page}...")
        try:
            # `params` lets requests build the query string, so a base_url
            # that already carries a query is extended instead of corrupted.
            response = requests.get(base_url, params={'page': page}, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # One bad listing page must not discard recipes already collected.
            print(f"Failed to fetch listing page {page}: {e}")
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find recipe links; dict.fromkeys drops duplicates in O(n) while
        # keeping the order in which the links appear on the page.
        hrefs = (
            a['href']
            for a in soup.select('a[href^="https://www.seriouseats.com/"][data-vars-gtm-click-type="recipe"]')
        )
        recipe_links = list(dict.fromkeys(hrefs))

        print(f"Found {len(recipe_links)} recipes on page {page}.")

        for link in recipe_links:
            try:
                print(f"Fetching recipe: {link}")
                recipes.append(get_recipe_details(link))
            except Exception as e:
                print(f"Failed to fetch {link}: {e}")
            finally:
                # Be polite to the server after failed requests as well.
                time.sleep(1)

    return recipes

# Save recipes to CSV
def save_recipes_to_csv(recipes, filename='serious_eats_recipes.csv'):
    """Write scraped recipes to a UTF-8 CSV file, one recipe per row.

    Args:
        recipes: Sized iterable of dicts with the keys ``'title'`` (str),
            ``'ingredients'`` (iterable of str) and ``'instructions'``
            (iterable of str).
        filename: Destination path; an existing file is overwritten.

    Ingredients are joined with ``'; '`` and instructions with a single
    space so that each recipe fits on one CSV row. A summary line is
    printed once the file has been written.
    """
    header = ['Title', 'Ingredients', 'Instructions']
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(header)
        # The generator keeps row construction lazy, one recipe at a time.
        csv_out.writerows(
            (
                entry['title'],
                '; '.join(entry['ingredients']),
                ' '.join(entry['instructions']),
            )
            for entry in recipes
        )
    print(f"Saved {len(recipes)} recipes to {filename}")

# Example usage
if __name__ == "__main__":
    # Listing URL handed to scrape_serious_eats_recipes, which requests it
    # once per page number.
    BASE_URL = "https://www.seriouseats.com/recipes"
    # Scrape the first two listing pages, then write whatever was collected
    # to the default CSV file name of save_recipes_to_csv.
    recipes = scrape_serious_eats_recipes(BASE_URL, num_pages=2)
    save_recipes_to_csv(recipes)


Scraping page 1...
Found 0 recipes on page 1.
Scraping page 2...
Found 0 recipes on page 2.
Saved 0 recipes to serious_eats_recipes.csv


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Sample recipe page URLs. The main runner at the bottom of this cell
# scrapes each entry of this list, in order.
recipe_urls = [
    "https://www.seriouseats.com/best-chocolate-chip-cookies-recipe",
    "https://www.seriouseats.com/classic-macaroni-and-cheese-recipe",
    "https://www.seriouseats.com/easy-chicken-curry-recipe"
]

# Function to get recipe data
def get_recipe_data(url, timeout=10):
    """Fetch a recipe page and pull out its title, ingredients and steps.

    Args:
        url: Address of the recipe page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        dict with the keys ``'title'`` (str, ``"No Title"`` when the page
        has no ``<h1>``), ``'ingredients'`` (list of str) and
        ``'instructions'`` (list of non-empty str).

    Raises:
        requests.RequestException: on connection failures, timeouts or an
            HTTP error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Without a timeout a stalled connection would block the scrape forever.
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of recording an error page as a recipe.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    # Title
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No Title"

    # Ingredients: try the structured list first, then the older markup.
    ingredient_tags = (
        soup.select("ul.structured-ingredients__list li")
        or soup.select("li.ingredient")
    )
    ingredients = [li.get_text(strip=True) for li in ingredient_tags]

    # Instructions: ordered-list steps first, then plain paragraphs.
    step_tags = (
        soup.select("ol.comp.mntl-sc-block-group--OL li")
        or soup.select("div.section-body p")
    )
    instructions = []
    for step in step_tags:
        text = step.get_text(strip=True)
        if text:  # drop elements that contain no text
            instructions.append(text)

    return {
        'title': title,
        'ingredients': ingredients,
        'instructions': instructions
    }

# Save to CSV
def save_to_csv(data, filename="serious_eats_recipes.csv"):
    """Dump scraped recipes into a UTF-8 CSV file and print a summary line.

    Args:
        data: Sized iterable of dicts with the keys ``'title'`` (str),
            ``'ingredients'`` (iterable of str) and ``'instructions'``
            (iterable of str).
        filename: Destination path; an existing file is overwritten.
    """
    def as_row(item):
        # Flatten the two lists so that one recipe fits on one CSV row.
        return [
            item['title'],
            "; ".join(item['ingredients']),
            " ".join(item['instructions']),
        ]

    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(["Title", "Ingredients", "Instructions"])
        # map() is lazy, so rows are built and written one at a time.
        sheet.writerows(map(as_row, data))
    print(f"✅ Saved {len(data)} recipes to {filename}")

# Main runner
if __name__ == "__main__":
    # Scrape every URL in recipe_urls; a failure on one page is reported
    # and the loop moves on to the next URL.
    all_recipes = []
    for url in recipe_urls:
        print(f"🔎 Scraping: {url}")
        try:
            recipe = get_recipe_data(url)
            all_recipes.append(recipe)
            # Pause between requests; note this runs only after a success.
            time.sleep(1)
        except Exception as e:
            print(f"❌ Error scraping {url}: {e}")

    # Written even when all_recipes is empty (header-only file).
    save_to_csv(all_recipes)


🔎 Scraping: https://www.seriouseats.com/best-chocolate-chip-cookies-recipe
🔎 Scraping: https://www.seriouseats.com/classic-macaroni-and-cheese-recipe
🔎 Scraping: https://www.seriouseats.com/easy-chicken-curry-recipe
✅ Saved 3 recipes to serious_eats_recipes.csv


In [None]:
pip install selenium beautifulsoup4 pandas


Collecting selenium
  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.33.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

# --- Setup Selenium ---
def get_driver():
    """Create a Chrome WebDriver started with the flags listed below.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_flags = (
        "--headless",  # run browser in background
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )
    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

# --- Step 1: Collect Recipe URLs from listing pages ---
def get_recipe_urls(max_links=500):
    """Collect recipe URLs from the Serious Eats listing page by scrolling it.

    The page is scrolled to the bottom repeatedly (at most 100 times) and
    re-parsed after each scroll until ``max_links`` distinct recipe links
    have been seen.

    Args:
        max_links: Maximum number of URLs to return.

    Returns:
        list of at most ``max_links`` unique URLs, in the order they were
        first discovered.
    """
    print("🔍 Collecting recipe URLs...")
    driver = get_driver()
    url = "https://www.seriouseats.com/recipes"

    # Seconds to wait after each scroll so dynamically loaded content can appear.
    SCROLL_PAUSE_TIME = 2
    # A dict is used as an insertion-ordered set, so the final slice keeps
    # the first-discovered links instead of an arbitrary subset.
    links = {}
    scrolls = 0

    try:
        driver.get(url)

        while len(links) < max_links and scrolls < 100:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            recipe_cards = soup.select('a[data-vars-gtm-click-type="recipe"]')

            for card in recipe_cards:
                href = card.get("href")
                if href and href.startswith("https://www.seriouseats.com/") and "/recipe" in href:
                    links[href] = None

            scrolls += 1
            print(f"Collected {len(links)} links...")
    finally:
        # Always shut the browser down, even if loading or parsing raised;
        # otherwise the headless Chrome process is left running.
        driver.quit()

    return list(links)[:max_links]

# --- Step 2: Scrape Recipe Content ---
def scrape_recipe(url):
    """Fetch one recipe page and flatten it into a CSV-ready record.

    Args:
        url: Address of the recipe page to fetch.

    Returns:
        dict with the keys ``"title"``, ``"ingredients"`` (items joined
        with ``"; "``), ``"instructions"`` (steps joined with a space) and
        ``"url"``; or ``None`` when the download or parsing fails (the
        error is printed).
    """
    # Imported locally because the import lines at the top of this cell do
    # not include requests; the function must not depend on another cell or
    # on the __main__ guard having imported it first.
    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        res = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx as a failure rather than scraping the error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'html.parser')

        # A page without <h1> raises AttributeError here, which the handler
        # below reports before returning None.
        title = soup.find("h1").get_text(strip=True)

        ingredients = [li.get_text(strip=True) for li in soup.select("ul.structured-ingredients__list li")]
        if not ingredients:  # fallback for the older page structure
            ingredients = [li.get_text(strip=True) for li in soup.select("li.ingredient")]

        steps = [step.get_text(strip=True) for step in soup.select("ol.comp.mntl-sc-block-group--OL li")]
        if not steps:  # fallback: plain paragraphs of the section body
            steps = [p.get_text(strip=True) for p in soup.select("div.section-body p")]

        return {
            "title": title,
            "ingredients": "; ".join(ingredients),
            "instructions": " ".join(steps),
            "url": url
        }

    except Exception as e:
        print(f"❌ Error scraping {url}: {e}")
        return None

# --- Step 3: Save to CSV ---
def save_to_csv(recipes, filename="serious_eats_500_recipes.csv"):
    """Write the recipe records to ``filename`` as CSV via pandas.

    Args:
        recipes: Sized collection of records accepted by ``pd.DataFrame``
            (here: the list of dicts built by the main runner).
        filename: Destination path; an existing file is overwritten.

    The DataFrame index is not written. A summary line is printed after
    the file has been saved.
    """
    pd.DataFrame(recipes).to_csv(filename, index=False)
    saved_count = len(recipes)
    print(f"✅ Saved {saved_count} recipes to {filename}")

# --- Main Runner ---
if __name__ == "__main__":
    # requests is imported here (it is missing from the import lines at the
    # top of this cell), which makes it a module-level name before
    # scrape_recipe is first called.
    import requests
    all_urls = get_recipe_urls(max_links=500)
    all_recipes = []

    for i, url in enumerate(all_urls):
        print(f"📄 Scraping {i+1}/{len(all_urls)}: {url}")
        data = scrape_recipe(url)
        # scrape_recipe returns None on failure; such pages are skipped.
        if data:
            all_recipes.append(data)
        time.sleep(1)  # polite delay between requests, after failures too

    # Written even when nothing was collected.
    save_to_csv(all_recipes)


🔍 Collecting recipe URLs...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collected 0 links...
Collec

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests
import csv
import time

# --- Set up Selenium headless browser ---
def get_driver():
    """Return a Chrome WebDriver launched with the three flags set below.

    The caller owns the returned driver and must call ``quit()`` on it.
    """
    browser_options = Options()
    for argument in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        browser_options.add_argument(argument)
    return webdriver.Chrome(options=browser_options)

# --- Step 1: Scrape recipe URLs from listing page ---
def get_recipe_urls(target_count=200):
    """Collect recipe URLs from the Serious Eats listing page by scrolling it.

    After an initial 3-second wait the page is scrolled to the bottom
    repeatedly (at most 100 times) and re-parsed after each scroll until
    ``target_count`` distinct recipe links have been seen.

    Args:
        target_count: Maximum number of URLs to return.

    Returns:
        list of at most ``target_count`` unique URLs, in the order they
        were first discovered.
    """
    print("🔍 Collecting recipe URLs...")
    driver = get_driver()

    # A dict is used as an insertion-ordered set, so the final slice keeps
    # the first-discovered links instead of an arbitrary subset.
    recipe_links = {}
    scrolls = 0
    max_scrolls = 100

    try:
        driver.get("https://www.seriouseats.com/recipes")
        time.sleep(3)  # give the initial page time to render

        while len(recipe_links) < target_count and scrolls < max_scrolls:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # wait for dynamically loaded content
            soup = BeautifulSoup(driver.page_source, "html.parser")
            anchors = soup.select('a[data-vars-gtm-click-type="recipe"]')

            for a in anchors:
                href = a.get("href")
                if href and href.startswith("https://www.seriouseats.com/") and "/recipe" in href:
                    recipe_links[href] = None

            scrolls += 1
            print(f"📦 {len(recipe_links)} recipes found...")
    finally:
        # Always shut the browser down, even if loading or parsing raised;
        # otherwise the headless Chrome process is left running.
        driver.quit()

    return list(recipe_links)[:target_count]

# --- Step 2: Scrape individual recipe data ---
def get_recipe_data(url, timeout=10):
    """Fetch a recipe page and pull out its title, ingredients and steps.

    Args:
        url: Address of the recipe page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        dict with the keys ``'title'`` (str, ``"No Title"`` when the page
        has no ``<h1>``), ``'ingredients'`` (list of str) and
        ``'instructions'`` (list of non-empty str).

    Raises:
        requests.RequestException: on connection failures, timeouts or an
            HTTP error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Without a timeout a stalled connection would block the scrape forever.
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of recording an error page as a recipe.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    # Title
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No Title"

    # Ingredients: structured list first, then the older markup.
    ingredient_tags = soup.select("ul.structured-ingredients__list li")
    if not ingredient_tags:
        ingredient_tags = soup.select("li.ingredient")
    ingredients = [li.get_text(strip=True) for li in ingredient_tags]

    # Instructions: ordered-list steps first, then plain paragraphs.
    step_tags = soup.select("ol.comp.mntl-sc-block-group--OL li")
    if not step_tags:
        step_tags = soup.select("div.section-body p")
    step_texts = (step.get_text(strip=True) for step in step_tags)
    instructions = [text for text in step_texts if text]  # drop empty steps

    return {
        'title': title,
        'ingredients': ingredients,
        'instructions': instructions
    }

# --- Step 3: Save all recipes to CSV ---
def save_to_csv(data, filename="serious_eats_200_recipes.csv"):
    """Store the scraped recipes in a UTF-8 CSV file, one row per recipe.

    Args:
        data: Sized iterable of dicts with the keys ``'title'`` (str),
            ``'ingredients'`` (iterable of str) and ``'instructions'``
            (iterable of str).
        filename: Destination path; an existing file is overwritten.

    Prints a summary line once the file has been written.
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(["Title", "Ingredients", "Instructions"])
        for item in data:
            name = item['title']
            # Collapse each list into a single cell.
            ingredient_cell = "; ".join(item['ingredients'])
            instruction_cell = " ".join(item['instructions'])
            out.writerow((name, ingredient_cell, instruction_cell))
    print(f"✅ Saved {len(data)} recipes to {filename}")

# --- Main function ---
if __name__ == "__main__":
    # Step 1: gather up to 200 recipe URLs from the listing page.
    recipe_urls = get_recipe_urls(target_count=200)

    # Step 2: scrape each URL; a failure on one page is reported and the
    # loop moves on to the next URL.
    all_recipes = []
    for i, url in enumerate(recipe_urls):
        print(f"🔎 Scraping {i+1}/{len(recipe_urls)}: {url}")
        try:
            recipe = get_recipe_data(url)
            all_recipes.append(recipe)
            # Pause between requests; note this runs only after a success.
            time.sleep(1)
        except Exception as e:
            print(f"❌ Error scraping {url}: {e}")

    # Step 3: written even when all_recipes is empty (header-only file).
    save_to_csv(all_recipes)


🔍 Collecting recipe URLs...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 recipes found...
📦 0 re

In [None]:
pip install requests beautifulsoup4 pandas




In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_recipe_links(page_url, timeout=10):
    """Return the unique Allrecipes recipe URLs linked from a listing page.

    Args:
        page_url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        list of distinct URLs that start with
        ``https://www.allrecipes.com/recipe/``, in the order they appear
        on the page.

    Raises:
        requests.RequestException: on connection failures or timeouts.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    res = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(res.content, 'html.parser')
    hrefs = (a.get('href') for a in soup.select('a.card__titleLink'))
    recipe_hrefs = (
        href for href in hrefs
        if href and href.startswith("https://www.allrecipes.com/recipe/")
    )
    # dict.fromkeys de-duplicates while keeping page order, so repeated
    # runs visit the recipes in the same sequence (a set would not).
    return list(dict.fromkeys(recipe_hrefs))

def get_recipe_data(recipe_url, timeout=10):
    """Fetch an Allrecipes recipe page and extract title, ingredients and nutrition.

    Args:
        recipe_url: Address of the recipe page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        dict with the keys ``'Title'`` (``'N/A'`` when the heading is not
        found), ``'Ingredients'`` (names joined with ``', '``; empty
        string when none are found), ``'Nutrition'`` (text of the
        nutrition section or ``'N/A'``) and ``'URL'``.

    Raises:
        requests.RequestException: on connection failures, timeouts or an
            HTTP error status.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    res = requests.get(recipe_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of recording an error page as a recipe.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    title = soup.find('h1', class_='headline heading-content')
    title = title.text.strip() if title else 'N/A'

    ingredients = [i.text.strip() for i in soup.select('span.ingredients-item-name')]
    ingredients = ', '.join(ingredients)

    nutrition = soup.find('div', class_='partial recipe-nutrition-section')
    nutrition = nutrition.get_text(strip=True) if nutrition else 'N/A'

    return {
        'Title': title,
        'Ingredients': ingredients,
        'Nutrition': nutrition,
        'URL': recipe_url
    }

# Scrape multiple pages
# Records accumulated across all listing pages (one dict per recipe).
all_data = []
# The page number is appended to this prefix to form each listing URL.
base_url = 'https://www.allrecipes.com/recipes/?page='
for page in range(1, 3):  # Scrape first 2 pages (increase if needed)
    print(f"Scraping page {page}...")
    # Called outside the try block below: an exception raised while
    # fetching a listing page stops the whole run.
    recipe_links = get_recipe_links(base_url + str(page))
    for link in recipe_links:
        try:
            data = get_recipe_data(link)
            all_data.append(data)
            print(f"Scraped: {data['Title']}")
            time.sleep(1)  # Be polite; note this runs only after a success
        except Exception as e:
            print(f"Error scraping {link}: {e}")

# Save to CSV (the file is written even when all_data is empty)
df = pd.DataFrame(all_data)
df.to_csv('nutrifusion_recipes.csv', index=False)
print("✅ Data saved to nutrifusion_recipes.csv")


Scraping page 1...
Scraping page 2...
✅ Data saved to nutrifusion_recipes.csv


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from tqdm import tqdm
from requests.exceptions import ChunkedEncodingError, ConnectionError, Timeout

# Function to get recipe links from a search results page
def get_recipe_links(page_url):
    """Return the unique Allrecipes recipe URLs linked from a listing page.

    Args:
        page_url: Address of the listing page to fetch.

    Returns:
        list of distinct URLs that start with
        ``https://www.allrecipes.com/recipe/``, in the order they appear
        on the page; an empty list when the page cannot be fetched (the
        error is printed).
    """
    try:
        response = requests.get(page_url, timeout=10)
        response.raise_for_status()  # raises HTTPError on 4xx/5xx
        soup = BeautifulSoup(response.content, 'html.parser')
        links = []
        for a_tag in soup.select('a.card__titleLink'):
            link = a_tag.get('href')
            if link and link.startswith("https://www.allrecipes.com/recipe/"):
                links.append(link)
        # dict.fromkeys de-duplicates while keeping page order.
        return list(dict.fromkeys(links))
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of every requests error,
        # including the HTTPError raised by raise_for_status(); catching only
        # connection/timeout errors let a 4xx/5xx response crash the caller.
        print(f"⚠️ Error fetching page {page_url}: {e}")
        return []  # Return empty list if there's an error

# Function to extract data from each recipe page
def get_recipe_data(url):
    """Fetch an Allrecipes recipe page and extract title, ingredients and nutrition.

    Args:
        url: Address of the recipe page to fetch.

    Returns:
        dict with the keys ``"Title"``, ``"Ingredients"`` (names joined
        with ``', '``), ``"Nutrition"`` (section text with the phrase
        "Full Nutrition" removed) and ``"URL"``; each of the first three is
        ``"N/A"`` when the corresponding element is not found. Returns
        ``None`` when the request or the parsing raises (the error is
        printed).
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()  # treat 4xx/5xx responses as failures
        soup = BeautifulSoup(page.content, 'html.parser')

        heading = soup.find('h1', class_='headline heading-content')
        if heading:
            title = heading.text.strip()
        else:
            title = "N/A"

        names = [span.text.strip() for span in soup.select('span.ingredients-item-name')]
        if names:
            ingredients = ', '.join(names)
        else:
            ingredients = "N/A"

        nutrition_section = soup.find('div', class_='partial recipe-nutrition-section')
        if nutrition_section:
            nutrition_text = nutrition_section.get_text(strip=True)
            nutrition = nutrition_text.replace("Full Nutrition", "")
        else:
            nutrition = "N/A"

        record = {
            "Title": title,
            "Ingredients": ingredients,
            "Nutrition": nutrition,
            "URL": url
        }
        return record
    except (ChunkedEncodingError, ConnectionError, Timeout) as e:
        # Network-level problems get their own message.
        print(f"⚠️ Error scraping recipe {url}: {e}")
        return None
    except Exception as e:
        # Anything else (HTTP error status, parsing problems, ...).
        print(f"⚠️ An unexpected error occurred while scraping {url}: {e}")
        return None


# Main loop to collect at least 300 recipes
# Complete recipe records collected so far.
all_recipes = []
# Every recipe link that has already been attempted, successful or not.
seen_links = set()
page = 1

print("⏳ Starting to scrape recipes...")

# Number of the listing page to request next; starts at `page` and is
# advanced by one after each listing page has been processed.
start_page = page

while len(all_recipes) < 300:
    page_url = f'https://www.allrecipes.com/recipes/?page={start_page}'
    print(f"🔍 Scraping page {start_page}...")
    recipe_links = get_recipe_links(page_url)

    # Only links that were not attempted before count as progress. Checking
    # just for an empty page would loop forever if the site returned the
    # same links for every page number.
    new_links = [link for link in recipe_links if link not in seen_links]
    if not new_links:
        print(f"🛑 No new links found on page {start_page}. Ending scraping.")
        break

    for link in tqdm(new_links, desc=f"Processing page {start_page}", leave=False):
        # Mark the link as attempted up front so a page that keeps failing
        # or is incomplete is not downloaded again from a later listing page.
        seen_links.add(link)
        try:
            recipe = get_recipe_data(link)
            # Keep only records in which every field was actually found.
            if recipe and recipe['Title'] != "N/A" and recipe['Ingredients'] != "N/A" and recipe['Nutrition'] != "N/A":
                all_recipes.append(recipe)
        except Exception as e:
            print(f"⚠️ Error with {link}: {e}")
        time.sleep(2)  # Increased delay to be more polite

        if len(all_recipes) >= 300:
            break
    start_page += 1  # Move on to the next listing page

# Save to CSV (written even when no recipe was collected)
df = pd.DataFrame(all_recipes)
df.to_csv("nutrifusion_300_recipes.csv", index=False)
print(f"✅ Scraping complete! {len(all_recipes)} recipes saved to 'nutrifusion_300_recipes.csv'")

⏳ Starting to scrape recipes...
🔍 Scraping page 1...
🛑 No new links found on page 1. Ending scraping.
✅ Scraping complete! 0 recipes saved to 'nutrifusion_300_recipes.csv'
