In [1]:
import requests
import pandas as pd
import time

from bs4 import BeautifulSoup
from parsel import Selector
from random import randrange

## Allrecipes Link Scraping

In [15]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}

def get_links(url, selector_css):
    response = requests.get(url, headers=headers)
    selector = Selector(text=response.text)
    return selector.css(selector_css).getall()

def get_category_links(homepage_url):
    print(f'Extracting category links from: {homepage_url}')
    return get_links(homepage_url, 'a[href*="/recipes/"]::attr(href)')

def get_subcategory_links(category_url):
    print(f'Extracting subcategory links from: {category_url}')
    return get_links(category_url, 'a[href*="/recipes/"]::attr(href)')

def get_recipe_links(category_url):
    links = set()  # Use a set to avoid duplicates
    page = 1
    while True:
        print(f'Extracting links from: {category_url} (Page {page})')
        
        html = requests.get(category_url, headers=headers, timeout=30)
        selector = Selector(text=html.text)

        links.update(selector.css('.card::attr(href)').getall())  # Add links to the set
        
        next_page = selector.css('a[aria-label="Next"]::attr(href)').get()
        if not next_page:
            break
        
        category_url = next_page
        page += 1
        
        time.sleep(randrange(3, 6))  # Random sleep to avoid overloading the server

    return links

# Main script
homepage_url = 'https://www.allrecipes.com/'
all_recipe_links = set()  # Use a set to store all recipe links

# Step 1: Get all category links from the homepage
category_links = get_category_links(homepage_url)

# Step 2: Get links from each category and explore subcategories
for category in category_links:
    all_recipe_links.update(get_recipe_links(category))  # Update the set with new links
    subcategory_links = get_subcategory_links(category)
    
    for subcategory in subcategory_links:
        all_recipe_links.update(get_recipe_links(subcategory))  # Update the set with new links

# Save the links to a JSON file
pd.Series(list(all_recipe_links)).to_json('links/links_allrecipes.json', index=False)


Extracting category links from: https://www.allrecipes.com/
Extracting links from: https://www.allrecipes.com/recipes/17562/dinner/ (Page 1)
Extracting subcategory links from: https://www.allrecipes.com/recipes/17562/dinner/
Extracting links from: https://www.allrecipes.com/recipes/17562/dinner/ (Page 1)
Extracting links from: https://www.allrecipes.com/recipes/17057/everyday-cooking/more-meal-ideas/5-ingredients/main-dishes/ (Page 1)
Extracting links from: https://www.allrecipes.com/recipes/15436/everyday-cooking/one-pot-meals/ (Page 1)
Extracting links from: https://www.allrecipes.com/recipes/1947/everyday-cooking/quick-and-easy/ (Page 1)
Extracting links from: https://www.allrecipes.com/recipes/455/everyday-cooking/more-meal-ideas/30-minute-meals/ (Page 1)
Extracting links from: https://www.allrecipes.com/recipes/94/soups-stews-and-chili/ (Page 1)
Extracting links from: https://www.allrecipes.com/recipes/16099/everyday-cooking/comfort-food/ (Page 1)
Extracting links from: https://ww

## Bon Appetit Link Scraping

In [41]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}

BASE_URL = 'https://www.bonappetit.com'

def get_links_from_recipes_page(initial_url):
    links = set()
    page_number = 1

    while True:
        # Construct the URL for the current page
        recipes_url = f'{initial_url}?page={page_number}'
        print(f'Fetching: {recipes_url}')  # Debugging

        try:
            response = requests.get(recipes_url, headers=headers)
            response.raise_for_status()
            selector = Selector(text=response.text)

            # Fetch recipe links from the current recipes page
            relative_links = selector.css('a[href*="/recipe/"]::attr(href)').getall()
            absolute_links = {BASE_URL + link for link in relative_links if link.startswith('/')}
            links.update(absolute_links)

            # Check if there are any recipe links; if not, break the loop
            if not relative_links:
                break

            # Increment page number for the next iteration
            page_number += 1
            
            # Sleep to avoid overwhelming the server
            time.sleep(2)  # Sleep for 2 seconds

        except requests.RequestException as e:
            print(f"Error fetching {recipes_url}: {e}")
            break

    return links

# Main script
initial_recipes_url = 'https://www.bonappetit.com/recipes'
all_recipe_links = get_links_from_recipes_page(initial_recipes_url)

# Debugging: Print the number of links found
print(f'Total links found: {len(all_recipe_links)}')

# Save the links to a JSON file
if all_recipe_links:
    pd.Series(list(all_recipe_links)).to_json('links/links_bon_appetit.json', index=False)
    print(f'Saved {len(all_recipe_links)} links to recipes-links-bon-appetit.json')
else:
    print('No links were found.')

Fetching: https://www.bonappetit.com/recipes?page=1
Fetching: https://www.bonappetit.com/recipes?page=2
Fetching: https://www.bonappetit.com/recipes?page=3
Fetching: https://www.bonappetit.com/recipes?page=4
Fetching: https://www.bonappetit.com/recipes?page=5
Fetching: https://www.bonappetit.com/recipes?page=6
Fetching: https://www.bonappetit.com/recipes?page=7
Fetching: https://www.bonappetit.com/recipes?page=8
Fetching: https://www.bonappetit.com/recipes?page=9
Fetching: https://www.bonappetit.com/recipes?page=10
Fetching: https://www.bonappetit.com/recipes?page=11
Fetching: https://www.bonappetit.com/recipes?page=12
Fetching: https://www.bonappetit.com/recipes?page=13
Fetching: https://www.bonappetit.com/recipes?page=14
Fetching: https://www.bonappetit.com/recipes?page=15
Fetching: https://www.bonappetit.com/recipes?page=16
Fetching: https://www.bonappetit.com/recipes?page=17
Fetching: https://www.bonappetit.com/recipes?page=18
Fetching: https://www.bonappetit.com/recipes?page=19
Fe