In [3]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Page whose anchor tags are scanned for links to PDF files (passed to
# get_pdf_links below).
BASE_URL = "https://othm.org.uk/qualification/othm-level-7-diploma-in-data-science"
# Directory the downloaded PDFs are written to; it is created on demand before
# downloading starts.
# NOTE(review): absolute, user-specific Windows path — adjust before running
# this notebook on another machine.
DOWNLOAD_FOLDER = r"C:\Users\amith\Kenpath\OFQUAL\othm_pdfs"

def get_pdf_links(url):
    """Fetch all PDF links from the given URL.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of absolute URLs, in page order and without duplicates, for
        every ``<a href=...>`` whose URL *path* ends in ``.pdf``. The check is
        case-insensitive and ignores any query string or fragment, so
        ``Spec.PDF`` and ``spec.pdf?v=2`` are both recognised. An empty list
        is returned when the page cannot be fetched.
    """
    # Local import: urlsplit is not among the notebook's top-level imports.
    from urllib.parse import urlsplit

    try:
        # A timeout keeps a stalled server from hanging the cell forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Treat network errors like a bad status: report and return nothing.
        print(f"Failed to fetch {url} ({exc})")
        return []
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    pdf_links = []
    seen = set()

    # Inspect every anchor tag that carries an href attribute.
    for link in soup.find_all("a", href=True):
        # Resolve relative hrefs against the page URL before testing them.
        full_url = urljoin(url, link["href"].strip())
        # Test only the path component so "?query" / "#fragment" suffixes
        # do not hide the ".pdf" extension.
        if not urlsplit(full_url).path.lower().endswith(".pdf"):
            continue
        if full_url not in seen:
            seen.add(full_url)
            pdf_links.append(full_url)

    return pdf_links

def download_pdf(url, folder):
    """Download a PDF file and save it locally.

    The file name is taken from the last component of the URL's path
    (percent-decoded, without query string or fragment). If the URL yields no
    usable name, ``download.pdf`` is used instead.

    Args:
        url: Absolute URL of the PDF to fetch.
        folder: Existing directory the file is written into.

    Returns:
        None. Success or failure is reported on stdout; HTTP and network
        errors do not raise, so one bad link does not abort a batch.
    """
    # Local import: these helpers are not among the notebook's top-level imports.
    from urllib.parse import unquote, urlsplit

    # Decode first, then take the basename, so an encoded "/" cannot smuggle
    # a directory component into the saved path.
    name = os.path.basename(unquote(urlsplit(url).path))
    if name in ("", ".", ".."):
        name = "download.pdf"
    filename = os.path.join(folder, name)

    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(f"Failed to download: {url}")
                return
            with open(filename, "wb") as pdf_file:
                for chunk in response.iter_content(1024):
                    pdf_file.write(chunk)
    except requests.RequestException as exc:
        print(f"Failed to download: {url} ({exc})")
        return

    print(f"Downloaded: {filename}")


# Make sure the target directory exists before any file is written.
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

pdf_links = get_pdf_links(BASE_URL)
if not pdf_links:
    print("No PDF files found.")
else:
    # Only announce and start downloading when there is something to fetch;
    # previously "Found 0 PDFs. Downloading..." was printed after the
    # "No PDF files found." message.
    print(f"Found {len(pdf_links)} PDFs. Downloading...")
    for pdf_url in pdf_links:
        download_pdf(pdf_url, DOWNLOAD_FOLDER)



Found 3 PDFs. Downloading...
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\OTHM-Customer-Service-Statement-2023-09-11.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\othm-guidance-statement-to-centres-on-the-risk-of-ai.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\othm_level_7_diploma_in_data_science_specification_february_2023_2023-07-26_10-33.pdf


In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# URL of the webpage containing PDF links
page_url = "https://www.highfieldqualifications.com/products/qualifications/quality-assurance/external-level-4"

# Folder to save PDFs
# NOTE(review): absolute, user-specific Windows path — adjust before running
# this notebook on another machine.
download_folder = r"C:\Users\amith\Kenpath\OFQUAL\highfield_pdfs"
os.makedirs(download_folder, exist_ok=True)

# Seconds before a single HTTP request is abandoned, so a stalled server
# cannot hang the cell indefinitely.
REQUEST_TIMEOUT = 30

# Fetch the webpage
response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
if response.status_code != 200:
    # Report and skip the rest of the cell. exit() is deliberately avoided:
    # inside Jupyter it shuts the kernel down instead of just ending the cell.
    print(f"Failed to fetch the webpage. Status code: {response.status_code}")
else:
    # Parse the webpage
    soup = BeautifulSoup(response.text, "html.parser")

    # Collect the raw href of every <a> tag; PDF detection happens below
    # because these links do not necessarily end in ".pdf".
    pdf_links = [a["href"] for a in soup.find_all("a", href=True)]

    # First number used in the sequential "ofqual<N>.pdf" file names.
    # NOTE(review): presumably continues numbering from an earlier batch — confirm.
    count = 515
    checked_urls = set()  # absolute URLs already probed, to avoid duplicates

    for link in pdf_links:
        full_url = urljoin(page_url, link)  # Convert relative URLs to absolute

        # Skip repeats and non-HTTP targets (mailto:, tel:, javascript:, ...),
        # which requests cannot open and would otherwise raise InvalidSchema.
        if full_url in checked_urls:
            continue
        checked_urls.add(full_url)
        if not full_url.lower().startswith(("http://", "https://")):
            continue

        try:
            # A HEAD request reveals the Content-Type without fetching the body.
            head_response = requests.head(
                full_url, allow_redirects=True, timeout=REQUEST_TIMEOUT
            )
            content_type = head_response.headers.get("Content-Type", "")
            if "pdf" not in content_type.lower():
                continue  # Not a PDF

            pdf_name = os.path.join(download_folder, f"ofqual{count}.pdf")  # Sequential naming

            # Download the PDF; the context manager releases the streamed
            # connection even if writing fails part-way.
            with requests.get(full_url, stream=True, timeout=REQUEST_TIMEOUT) as pdf_response:
                if pdf_response.status_code == 200:
                    with open(pdf_name, "wb") as file:
                        for chunk in pdf_response.iter_content(chunk_size=1024):
                            file.write(chunk)
                    print(f"Downloaded: {pdf_name}")
                    count += 1  # Advance only after a successful download
                else:
                    print(f"Failed to download {full_url}. Status code: {pdf_response.status_code}")
        except requests.RequestException as exc:
            # One unreachable link must not abort the remaining downloads.
            print(f"Failed to download {full_url}. Error: {exc}")

    print("Download complete!")


Download complete!
