In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

BASE_URL = "https://focusawards.org.uk/qualifications/"

def get_absolute_links(page_url):
    """ Get all qualification links from the main page and filter out invalid links. """
    response = requests.get(page_url)
    if response.status_code != 200:
        print(f"Failed to fetch {page_url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    links = set()  # Use a set to remove duplicates

    for a in soup.find_all("a", href=True):
        abs_link = urljoin(BASE_URL, a["href"])
        parsed_link = urlparse(abs_link)

        # Ensure the link is from the same domain and is not a fragment/mailto link
        if parsed_link.netloc == urlparse(BASE_URL).netloc and not parsed_link.fragment and not parsed_link.scheme.startswith("mailto"):
            links.add(abs_link)

    print(f"Valid qualification links found: {len(links)}")
    return list(links)

def scrape_pdf_links(links):
    """ Visit each link and extract the PDF link. """
    pdf_links = []
    for link in links:
        response = requests.get(link)
        if response.status_code != 200:
            print(f"Failed to fetch {link}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        pdf_tag = soup.select_one("#content > div > section > div.container-fluid > div > div > div.dddd > div > a")

        if pdf_tag and pdf_tag.get("href", "").endswith(".pdf"):
            pdf_links.append(urljoin(link, pdf_tag["href"]))

    return pdf_links

# Step 1: Get all valid qualification links
qualification_links = get_absolute_links(BASE_URL)

# Step 2: Scrape PDF links from each qualification page
pdf_links = scrape_pdf_links(qualification_links)

# Print the extracted PDF links
print(pdf_links)


Valid qualification links found: 282
['https://focusawards.org.uk/wp-content/uploads/2024/05/Focus-Awards-Level-2-Certificate-in-Supporting-Learning-in-Physical-Education-and-Learning-School-Sport-CYM.pdf', 'https://focusawards.org.uk/wp-content/uploads/2024/09/WS-Focus-Awards-Level-5-Diploma-in-Leading-and-Managing-an-Adult-Care-Service.pdf', 'https://focusawards.org.uk/wp-content/uploads/2025/02/WS-Focus-Awards-Level-3-Award-in-AI-in-Education-RQF.pdf', 'https://focusawards.org.uk/wp-content/uploads/2025/01/Focus20Awards20Level20320Diploma20in20Exercise20Referral2028RQF29_New_MR2028129-compressed.pdf', 'https://focusawards.org.uk/wp-content/uploads/2024/05/Focus-Award-Level-3-Award-in-Employment-Awareness-in-Active-Leisure-and-Learning-RQF.pdf', 'https://focusawards.org.uk/wp-content/uploads/2025/01/Focus-Awards-Level-4-Certificate-in-Leading-the-Internal-Quality-Cymraeg-compressed.pdf', 'https://focusawards.org.uk/wp-content/uploads/2025/01/Focus-Awards-Level-5-Certificate-in-Gut-Mi

In [3]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor

# Directory to save PDFs
SAVE_DIR = r"C:\Users\amith\Kenpath\OFQUAL\focusawards_pdfs"
os.makedirs(SAVE_DIR, exist_ok=True)

def download_pdf(pdf_url):
    """ Download a single PDF file. """
    pdf_name = os.path.join(SAVE_DIR, pdf_url.split("/")[-1])  # Extract filename
    try:
        response = requests.get(pdf_url, stream=True)
        if response.status_code == 200:
            with open(pdf_name, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=1024):
                    pdf_file.write(chunk)
        else:
            print(f"Failed to download: {pdf_url}")
    except Exception as e:
        print(f"Error downloading {pdf_url}: {e}")

def download_pdfs_concurrently(pdf_links, max_threads=5):
    """ Download PDFs using multithreading. """
    with ThreadPoolExecutor(max_threads) as executor:
        executor.map(download_pdf, pdf_links)


# Download PDFs concurrently
download_pdfs_concurrently(pdf_links, max_threads=15)


Failed to download: https://qualitas.focusawards.org.uk/qfilex/hopfile1686849521WS%20-%20Focus%20Awards%20Level%203%20Certificate%20in%20Leisure%20Management%20(RQF)%20D22.pdf
Failed to download: https://qualitas.focusawards.org.uk/qfilex/hopfile1686846997WS%20-%20Focus%20Awards%20Level%202%20Certificate%20in%20Coaching%20Weight%20Lifting%20(RQF)%20-%20J23.pdf
Failed to download: https://qualitas.focusawards.org.uk/qfilex/hopfile1686935024WS%20-%20Focus%20Awards%20Level%201%20Award%20in%20Employability%20Skills%20(RQF)M23.pdf
Failed to download: https://qualitas.focusawards.org.uk/qfilex/hopfile1686844877WS%20-%20%20%20Focus%20Awards%20Level%202%20NVQ%20Certificate%20in%20Active%20Leisure,%20Learning%20and%20Well-Being%20Operational%20Services%20(RQF)%20%20.pdf
Failed to download: https://www.focusawards.org.uk/wp-content/uploads/2019/05/WS-Focus-Awards-Level-2-Certificate-in-Customer-Service-RQF.pdf
Failed to download: https://qualitas.focusawards.org.uk/qfilex/hopfile1686857011WS%20-