In [5]:
import requests
from bs4 import BeautifulSoup
import os
import time

# Configuration
BASE_URL = "https://www.narcis.nl/search?q="  # Replace with the target website
QUERY = "klimaatverandering"  # Replace with your search query
DOWNLOAD_FOLDER = "test\pdf's"  # Folder to save downloaded PDFs
DELAY = 2  # Delay between requests (in seconds)
PAGES_TO_SCRAPE = 3  # Number of pages to scrape
MAX_PDFS = 5  # Maximum number of PDFs to download

# Create download folder if it doesn't exist
if not os.path.exists(DOWNLOAD_FOLDER):
    os.makedirs(DOWNLOAD_FOLDER)

def download_pdf(pdf_url, folder):
    """Download a PDF file from a given URL and save it to the specified folder."""
    try:
        pdf_response = requests.get(pdf_url)
        pdf_name = os.path.basename(pdf_url)  # Extract the PDF file name
        save_path = os.path.join(folder, pdf_name)
        with open(save_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)
        print(f"Downloaded: {pdf_name}")
        return True  # Return True if download is successful
    except Exception as e:
        print(f"Failed to download {pdf_url}: {e}")
        return False  # Return False if download fails

def scrape_pdf_links(url):
    """Scrape PDF links from a given URL."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.content, "html.parser")

        # Find all links ending with ".pdf"
        pdf_links = soup.find_all("a", href=lambda href: (href and href.endswith(".pdf")))
        return [link.get("href") for link in pdf_links]
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return []

def main():
    pdf_count = 0  # Counter for downloaded PDFs

    # Scrape multiple pages
    for page in range(1, PAGES_TO_SCRAPE + 1):
        if pdf_count >= MAX_PDFS:
            break  # Stop if the maximum number of PDFs is reached

        print(f"Scraping page {page}...")
        url = f"{BASE_URL}{QUERY}&page={page}"  # Construct the URL for the current page
        pdf_links = scrape_pdf_links(url)

        # Download each PDF
        for link in pdf_links:
            if pdf_count >= MAX_PDFS:
                break  # Stop if the maximum number of PDFs is reached

            if not link.startswith("http"):  # Handle relative URLs
                link = BASE_URL + link

            if download_pdf(link, DOWNLOAD_FOLDER):
                pdf_count += 1  # Increment the counter if download is successful

            # Add a delay between requests
            time.sleep(DELAY)

    print(f"Scraping and downloading complete! Downloaded {pdf_count} PDFs.")

if __name__ == "__main__":
    main()

  DOWNLOAD_FOLDER = "test\pdf's"  # Folder to save downloaded PDFs


Scraping page 1...
Downloaded: DANS-Data-Stations-Policy.pdf
Scraping page 2...
Downloaded: DANS-Data-Stations-Policy.pdf
Scraping page 3...
Downloaded: DANS-Data-Stations-Policy.pdf
Scraping and downloading complete! Downloaded 3 PDFs.
