In [4]:
import os
import requests
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_qualification_links(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    qualification_links = []
    
    textarea = soup.find('textarea', class_='react-component--props')
    if textarea:
        try:
            data = json.loads(textarea.text)
            qualifications = data.get('qualifications', [])
            for q in qualifications:
                full_url = urljoin(base_url, q['link'])
                qualification_links.append(full_url)
        except json.JSONDecodeError:
            print("Failed to parse JSON from textarea")
    
    return qualification_links

def download_pdfs(qualification_url, download_folder):
    response = requests.get(qualification_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    key_documents_div = soup.find('div', class_='key-documents')
    if key_documents_div:
        for a_tag in key_documents_div.find_all('a', href=True):
            file_url = urljoin(qualification_url, a_tag['href'])
            if file_url.endswith('.pdf'):
                download_pdf(file_url, download_folder)

def download_pdf(pdf_url, folder):
    filename = pdf_url.split('/')[-1]
    filepath = os.path.join(folder, filename)
    
    response = requests.get(pdf_url, stream=True)
    if response.status_code == 200:
        with open(filepath, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download: {filename}")

def main():
    base_url = "https://www.wjec.co.uk/qualifications/"
    download_folder = r"C:\Users\amith\Kenpath\OFQUAL\wjec_pdfs"
    os.makedirs(download_folder, exist_ok=True)
    
    qualification_links = get_qualification_links(base_url)
    print(f"Found {len(qualification_links)} qualification pages.")
    
    for link in qualification_links:
        print(f"Visiting: {link}")
        download_pdfs(link, download_folder)

if __name__ == "__main__":
    main()


Found 175 qualification pages.
Visiting: https://www.wjec.co.uk/qualifications/criminology-level-3/
Visiting: https://www.wjec.co.uk/qualifications/science-today-entry-pathways/
Visiting: https://www.wjec.co.uk/qualifications/gcse-built-environment/
Visiting: https://www.wjec.co.uk/qualifications/built-environment-asa-level/
Visiting: https://www.wjec.co.uk/qualifications/essential-skills-for-work-and-life/
Visiting: https://www.wjec.co.uk/qualifications/welsh-for-adults-qualification-suite/
Visiting: https://www.wjec.co.uk/qualifications/essential-application-of-number-skills-eaons/
Visiting: https://www.wjec.co.uk/qualifications/essential-communication-skills-ecomms/
Visiting: https://www.wjec.co.uk/qualifications/essential-digital-literacy-skills-edls/
Visiting: https://www.wjec.co.uk/qualifications/essential-employability-skills-ees/
Visiting: https://www.wjec.co.uk/qualifications/art-and-design-gcse/
Visiting: https://www.wjec.co.uk/qualifications/additional-english-entry-pathways

In [20]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_qualification_links(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    qualification_links = []
    
    textarea = soup.find('textarea', class_='react-component--props')
    if textarea:
        try:
            data = json.loads(textarea.text)
            qualifications = data.get('qualifications', [])
            for q in qualifications:
                full_url = urljoin(base_url, q['link'])
                qualification_links.append(full_url)
        except json.JSONDecodeError:
            print("Failed to parse JSON from textarea")
    
    return qualification_links



base_url = "https://www.wjec.co.uk/qualifications/"
filename = "qualification_urls.txt"

qualification_links = get_qualification_links(base_url)
print(f"Found {len(qualification_links)} qualification pages.")

print(qualification_links)


Found 175 qualification pages.
['https://www.wjec.co.uk/qualifications/criminology-level-3/', 'https://www.wjec.co.uk/qualifications/science-today-entry-pathways/', 'https://www.wjec.co.uk/qualifications/gcse-built-environment/', 'https://www.wjec.co.uk/qualifications/built-environment-asa-level/', 'https://www.wjec.co.uk/qualifications/essential-skills-for-work-and-life/', 'https://www.wjec.co.uk/qualifications/welsh-for-adults-qualification-suite/', 'https://www.wjec.co.uk/qualifications/essential-application-of-number-skills-eaons/', 'https://www.wjec.co.uk/qualifications/essential-communication-skills-ecomms/', 'https://www.wjec.co.uk/qualifications/essential-digital-literacy-skills-edls/', 'https://www.wjec.co.uk/qualifications/essential-employability-skills-ees/', 'https://www.wjec.co.uk/qualifications/art-and-design-gcse/', 'https://www.wjec.co.uk/qualifications/additional-english-entry-pathways/', 'https://www.wjec.co.uk/qualifications/art-and-design-asa-level/', 'https://www.w

In [22]:
import os
import time
import requests
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from concurrent.futures import ThreadPoolExecutor

def get_pdf_links(qualification_url):
    # Set up the Selenium WebDriver (headless mode)
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run browser in the background
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(qualification_url)
        
        # Wait for the page to load (you can adjust the wait time if necessary)
        time.sleep(2)
        
        # Accept cookies if the button is present
        try:
            accept_cookies_button = driver.find_element(By.XPATH, '//*[@id="accept-cookies"]')
            accept_cookies_button.click()
            print("Cookies accepted.")
            time.sleep(1)  # Wait a bit after accepting cookies
        except:
            print("No cookies prompt found or already accepted.")
        
        # Wait for the page to load (in case there are AJAX requests)
        time.sleep(5)

        # Find the element with id 'keydocuments_content' and extract PDF links
        pdf_elements = driver.find_elements(By.XPATH, '//*[@id="keydocuments_content"]/div[1]/div/div//a[contains(@href, ".pdf")]')
        pdf_links = [element.get_attribute('href') for element in pdf_elements]

        return pdf_links
    finally:
        driver.quit()

def download_pdf(pdf_url, folder):
    filename = pdf_url.split('/')[-1]
    filepath = os.path.join(folder, filename)

    response = requests.get(pdf_url, stream=True)
    if response.status_code == 200:
        with open(filepath, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download: {filename}")

def download_pdfs(qualification_url, download_folder):
    # Get the list of PDF links from the URL
    pdf_links = get_pdf_links(qualification_url)
    
    if not pdf_links:
        print(f"No PDFs found inside the specified container on: {qualification_url}")
        return

    # Use ThreadPoolExecutor to download PDFs concurrently
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(lambda url: download_pdf(url, download_folder), pdf_links)

download_folder = r"C:\Users\amith\Kenpath\OFQUAL\wjec_pdfs"
os.makedirs(download_folder, exist_ok=True)


# Visiting each link and appending '#tab_keydocuments' before calling the function
for link in qualification_links:
    link_with_tab = link + "#tab_keydocuments"
    print(f"Visiting: {link_with_tab}")
    download_pdfs(link_with_tab, download_folder)


Visiting: https://www.wjec.co.uk/qualifications/criminology-level-3/#tab_keydocuments
Cookies accepted.
Downloaded: wjec-applied-certificate-in-criminology-spec-e-22-06-22.pdf
Downloaded: wjec-applied-diploma-in-criminology-spec-e-22-06-22-1.pdf
Downloaded: wjec-applied-certificate-in-criminology-spec-e-22-06-22.pdf
Visiting: https://www.wjec.co.uk/qualifications/science-today-entry-pathways/#tab_keydocuments
Cookies accepted.
Downloaded: e-entry-pathways-specification-main-spec.pdf
Downloaded: e-entry-pathways-specification-main-spec.pdf
Visiting: https://www.wjec.co.uk/qualifications/gcse-built-environment/#tab_keydocuments
Cookies accepted.
Downloaded: wjec-gcse-built-environment-specification-e-20-10-2021.pdf
Downloaded: wjec-gcse-built-environment-specification-e-20-10-2021.pdf
Visiting: https://www.wjec.co.uk/qualifications/built-environment-asa-level/#tab_keydocuments
Cookies accepted.
Downloaded: wjec-gce-built-environment-specification-e-23-08-2021.pdf
Downloaded: wjec-gce-bui