In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up Selenium WebDriver
driver = webdriver.Chrome()
driver.get("https://othm.org.uk/qualification")

wait = WebDriverWait(driver, 10)

# Find and click all buttons
buttons = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="othm_type-headingOne"]/button')))

for button in buttons:
    try:
        # Scroll into view
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
        # Click using JavaScript
        driver.execute_script("arguments[0].click();", button)
        print("Clicked a button.")
    except Exception as e:
        print(f"Could not click button: {e}")

# Wait for content to load inside the target div
wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="othm_type_accordion"]')))

# Extract all URLs inside the target div
url_list = []
accordion_div = driver.find_element(By.XPATH, '//*[@id="othm_type_accordion"]')
links = accordion_div.find_elements(By.TAG_NAME, "a")

for link in links:
    url = link.get_attribute("href")
    if url:
        url_list.append(url)

# Print the list of URLs
print(url_list)
print(len(url_list))

# Close the browser
driver.quit()


Clicked a button.
Clicked a button.
Clicked a button.
Clicked a button.
Clicked a button.
['https://www.ucas.com/ucas/tariff-calculator', 'https://othm.org.uk/qualification/othm-level-3-award-in-assessing-vocationally-related-achievement', 'https://othm.org.uk/qualification/othm-level-3-certificate-in-python', 'https://othm.org.uk/qualification/othm-level-3-diploma-in-business-management', 'https://othm.org.uk/qualification/othm-level-3-diploma-in-business-studies', 'https://othm.org.uk/qualification/othm-level-3-diploma-in-fashion-and-textiles', 'https://othm.org.uk/qualification/othm-level-3-diploma-in-law', 'https://othm.org.uk/qualification/othm-level-3-foundation-diploma-for-higher-education-studies', 'https://othm.org.uk/qualification/othm-level-3-foundation-diploma-in-accountancy', 'https://othm.org.uk/qualification/othm-level-3-foundation-diploma-in-employability-and-workplace-skills', 'https://othm.org.uk/qualification/othm-level-3-foundation-diploma-in-engineering', 'https://

In [7]:
import os
import requests
from bs4 import BeautifulSoup

# Folder to save PDFs
download_folder = r"C:\Users\amith\Kenpath\OFQUAL\othm_pdfs"
os.makedirs(download_folder, exist_ok=True)



# Function to download a PDF
def download_pdf(pdf_url):
    pdf_filename = os.path.join(download_folder, pdf_url.split("/")[-1])
    try:
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)

        with open(pdf_filename, "wb") as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded: {pdf_filename}")

    except requests.exceptions.RequestException as e:
        print(f"Failed to download {pdf_url}: {e}")

# Visit each link and extract all PDF links
for link in url_list:
    try:
        response = requests.get(link, timeout=10)
        response.raise_for_status()  # Check if the page is accessible
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all 'a' tags that link to PDF files
        pdf_links = soup.find_all("a", href=lambda href: href and href.endswith(".pdf"))

        if not pdf_links:
            print(f"No PDF links found for {link}")
            continue  # Skip to the next link

        # Download each PDF found on the page
        for pdf_link in pdf_links:
            pdf_url = pdf_link.get("href")
            if not pdf_url:
                print(f"Invalid PDF link found on {link}")
                continue
            
            if pdf_url.startswith("/"):  # Convert relative URL to absolute
                pdf_url = "https://othm.org.uk" + pdf_url

            # Download the PDF
            download_pdf(pdf_url)

    except requests.exceptions.RequestException as e:
        print(f"Failed to access {link}: {e}")

print("Process completed!")


No PDF links found for https://www.ucas.com/ucas/tariff-calculator
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\OTHM-Customer-Service-Statement-2023-09-11.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\othm-guidance-statement-to-centres-on-the-risk-of-ai.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\7549OTHM_L3_Award_AVRA_spec_May_2023.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\othm_education_and_training_additional_resources_jan_2025_2025-01-15_11-17.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\OTHM-Customer-Service-Statement-2023-09-11.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\othm-guidance-statement-to-centres-on-the-risk-of-ai.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\othm_level_3_certificate_in_python_specification_finaldocx_2025-02-03_14-17.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\OTHM-Customer-Service-Statement-2023-09-11.pdf
Downloaded: C:\Users\amith\Kenpath\OFQUAL\othm_pdfs\othm-guida