In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Request headers passed to requests.get(); identifies the client with a
# browser-style User-Agent string.
headers = {"User-Agent": "Mozilla/5.0"}

# Paginated Tasty listing URL; the "{}" placeholder is filled with the page number.
base_url = "https://tasty.co/topic/all?page={}"

def get_recipe_links(page, timeout=10):
    """Return the absolute recipe URLs found on one Tasty listing page.

    Args:
        page: Page number substituted into ``base_url``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of unique ``https://tasty.co/recipe/...`` URLs in the order
        they appear in the HTML. The list is empty when the server answers
        with an HTTP error status or when no anchor matches the selector.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    url = base_url.format(page)
    print(f"Fetching recipe links from: {url}")
    res = requests.get(url, headers=headers, timeout=timeout)
    if not res.ok:
        # An error page contains no recipe cards; report the status rather
        # than parsing it and silently returning nothing.
        print(f"Request for {url} failed with HTTP status {res.status_code}")
        return []

    soup = BeautifulSoup(res.text, "html.parser")

    links = []
    seen = set()
    # Recipes are linked in article tags with <a href="/recipe/..."
    for a in soup.select("a.recipe-feed-item__link"):
        href = a.get("href")
        if href and href.startswith("/recipe/"):
            full_url = "https://tasty.co" + href
            # The same recipe can be linked more than once; keep first occurrence.
            if full_url not in seen:
                seen.add(full_url)
                links.append(full_url)

    if not links:
        # NOTE(review): an empty result on a 200 response suggests the markup
        # changed or the cards are rendered client-side — confirm in a browser.
        print(f"No links matched 'a.recipe-feed-item__link' in the HTML of {url}")
    return links

def scrape_recipe(url, timeout=10):
    """Download one recipe page and extract its title, ingredients and steps.

    Args:
        url: Absolute URL of the recipe page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict with the keys ``"Title"``, ``"URL"``, ``"Ingredients"``
        (items joined with ``"; "``) and ``"Instructions"`` (steps joined
        with a space), or ``None`` when the request or parsing fails.
        ``"Title"`` is ``"N/A"`` when the page has no matching ``<h1>``.
    """
    print(f"Scraping recipe: {url}")
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx as a failure so an error page is not stored as an
        # empty "N/A" recipe; the exception is handled below.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        # Title
        title_tag = soup.find("h1", class_="recipe-name")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        # Ingredients
        ingredients = [i.get_text(strip=True) for i in soup.select("li.ingredients__section-item")]

        # Instructions
        instructions = [step.get_text(strip=True) for step in soup.select("li.instructions__item")]

        return {
            "Title": title,
            "URL": url,
            "Ingredients": "; ".join(ingredients),
            "Instructions": " ".join(instructions)
        }
    except Exception as e:
        # Best-effort scraping: one bad page must not abort the whole run.
        print(f"Failed to scrape {url}: {e}")
        return None

# Collect one record per successfully scraped recipe across all listing pages.
all_recipes = []
pages_to_scrape = 5  # Change as needed (each page has ~10-12 recipes)

for page in range(1, pages_to_scrape + 1):
    recipe_links = get_recipe_links(page)
    print(f"Found {len(recipe_links)} recipes on page {page}")
    # Pause after every listing request as well; otherwise pages that yield
    # no links would be requested back-to-back with no delay at all.
    time.sleep(1)

    for link in recipe_links:
        data = scrape_recipe(link)
        if data:
            all_recipes.append(data)
        time.sleep(1)  # Be polite and avoid hammering the server

# Save to CSV
# Explicit columns keep the header row even when nothing was scraped, so the
# file stays readable by pd.read_csv instead of containing only a blank line.
df = pd.DataFrame(all_recipes, columns=["Title", "URL", "Ingredients", "Instructions"])
df.to_csv("tasty_recipes.csv", index=False, encoding="utf-8")
print(f"Saved {len(all_recipes)} recipes to tasty_recipes.csv")


Fetching recipe links from: https://tasty.co/topic/all?page=1
Found 0 recipes on page 1
Fetching recipe links from: https://tasty.co/topic/all?page=2
Found 0 recipes on page 2
Fetching recipe links from: https://tasty.co/topic/all?page=3
Found 0 recipes on page 3
Fetching recipe links from: https://tasty.co/topic/all?page=4
Found 0 recipes on page 4
Fetching recipe links from: https://tasty.co/topic/all?page=5
Found 0 recipes on page 5
Saved 0 recipes to tasty_recipes.csv


In [2]:
# Install the selenium library
!pip install selenium

# Install openpyxl for saving to .xlsx
!pip install openpyxl

# Install chromium-browser (needed for headless Chrome)
!sudo apt-get update
!sudo apt-get install -y chromium-browser

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import pandas as pd

options = Options()
# Configure Chrome options for headless execution with the correct path
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.binary_location = '/usr/bin/chromium-browser' # Specify the path to chromium-browser

driver = webdriver.Chrome(options=options)

url = "https://www.eatthismuch.com/recipes/"

recipe_titles = []
recipe_links = []

# Always shut the browser down, even if loading or querying the page raises;
# otherwise the headless Chromium process is left running.
try:
    driver.get(url)
    time.sleep(5)  # wait for page to load JS content

    # Find recipe cards
    cards = driver.find_elements(By.CSS_SELECTOR, "a.recipe-card__link")

    # Read text and href while the driver session is still open.
    for card in cards:
        title = card.text
        link = card.get_attribute("href")
        recipe_titles.append(title)
        recipe_links.append(link)
finally:
    driver.quit()

# Save to CSV
df = pd.DataFrame({"Title": recipe_titles, "URL": recipe_links})
df.to_csv("eatthismuch_recipes.csv", index=False)
print(f"Saved {len(recipe_titles)} recipes to eatthismuch_recipes.csv")

Collecting selenium
  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.33.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post