In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL to scrape
url = "https://www.proqualab.com/qualification-specifications/"

# Fetch the listing page. Any status other than 200 is treated as a failure and
# raised as an error instead of calling exit(), which in a notebook ends the
# kernel session rather than just stopping this cell.
response = requests.get(url, timeout=15)
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve the page. Status code: {response.status_code}")

# Parse the page content
soup = BeautifulSoup(response.text, "html.parser")

# Find all div elements with class "qualification"
qualification_divs = soup.find_all("div", class_="qualification")

# Collect (qualification ID, absolute PDF URL) pairs.
data = []
for div in qualification_divs:
    id_span = div.find("span")  # Qualification ID is taken from the first <span> in the div
    pdf_link = div.find("a", class_="drlink-pdf")  # Link carrying the "drlink-pdf" class

    if not (id_span and pdf_link):
        continue

    # An <a> without an href yields None from .get(); skip it instead of crashing on .strip().
    href = (pdf_link.get("href") or "").strip()
    if not href:
        continue

    qualification_id = id_span.text.strip()
    # urljoin leaves absolute URLs untouched and resolves root-relative,
    # page-relative and protocol-relative links against the page URL.
    pdf_url = urljoin(url, href)

    data.append((qualification_id, pdf_url))

# Save to Excel
df = pd.DataFrame(data, columns=["Ofqual ID", "PDF URL"])
df.to_excel("proqual_links.xlsx", index=False)

print(f"Extraction complete. {len(data)} records saved to 'proqual_links.xlsx'.")


Extraction complete. 395 records saved to 'proqual_links.xlsx'.


In [4]:
import os
import requests
import hashlib
import pandas as pd
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_pdf(pdf_url, save_dir, seen_urls, seen_files, timeout=30):
    """Download one PDF into ``save_dir`` unless its URL or content was already handled.

    Args:
        pdf_url: Address of the PDF to fetch.
        save_dir: Directory the file is written into.
        seen_urls: Set of URLs already attempted. ``pdf_url`` is added before
            the request is made, so a URL that fails is not attempted again.
        seen_files: Set of MD5 hex digests of content already saved. The new
            digest is added when the content has not been seen before.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        A status string: ``"Downloaded: <filename>"``, ``"Skipped (...)"`` or
        ``"Failed to download <url>: <error>"``. Errors are reported in the
        returned string instead of being raised.

    Note:
        The membership test and the ``add`` on the shared sets are separate
        steps with no lock, so when this is called from several threads two
        calls for the same URL can both get past the check.
    """
    if pdf_url in seen_urls:
        return f"Skipped (duplicate URL): {pdf_url}"

    seen_urls.add(pdf_url)

    try:
        # The whole body is needed in memory for hashing, so the response is
        # not streamed. The timeout keeps a stalled server from blocking the
        # calling thread indefinitely.
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()

        pdf_content = response.content

        # MD5 serves only as a content fingerprint for duplicate detection.
        pdf_hash = hashlib.md5(pdf_content).hexdigest()
        if pdf_hash in seen_files:
            return f"Skipped (duplicate content): {pdf_url}"

        seen_files.add(pdf_hash)

        # The filename is the last path segment of the URL (query string excluded).
        filename = os.path.basename(urllib.parse.urlparse(pdf_url).path)
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'

        save_path = os.path.join(save_dir, filename)

        # Save the file
        with open(save_path, 'wb') as f:
            f.write(pdf_content)

        return f"Downloaded: {filename}"

    except Exception as e:
        # Best-effort: a single bad link must not abort the whole batch.
        return f"Failed to download {pdf_url}: {str(e)}"


def download_pdfs(df, url_column, save_dir=r"C:\Users\amith\Kenpath\OFQUAL\qualifi.net_pdfs", max_workers=15):
    """Fetch every PDF listed in ``df[url_column]`` using a pool of worker threads.

    The target directory is created when missing. One pair of "already seen"
    sets (URLs and content hashes) is shared by all workers so duplicates are
    skipped, and each worker's status message is printed as soon as it is
    available.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Shared by every submitted task for duplicate detection.
    urls_done, hashes_done = set(), set()

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for _, record in df.iterrows():
            task = pool.submit(download_pdf, record[url_column], save_dir, urls_done, hashes_done)
            pending.append(task)

        # Report results in completion order, not submission order.
        for finished in as_completed(pending):
            print(finished.result())

# Load the spreadsheet of links; its 'PDF URL' column drives the download below.
pdf_df = pd.read_excel(io=r"C:\Users\amith\Kenpath\OFQUAL\qualifi.net_pdfs.xlsx")

# Fetch every linked PDF. max_workers is the thread count; adjust it to the
# available internet speed.
download_pdfs(df=pdf_df, url_column='PDF URL', max_workers=15)


Skipped (duplicate URL): https://qualifi.net/wp-content/uploads/2023/09/Qualifi-Diplomas-in-Business-Specifications-September-2023.pdf
Downloaded: CASS_Level-4-Diploma-in-Business-Management_Final.pdf
Downloaded: Level-3-Introduction-Management-vApril-2019.pdf
Downloaded: Level-4-Diploma-in-Entrepreneurship-Specification-March-2019.pdf
Downloaded: Qualifi-Level-4-Diploma-in-Innovation-and-Future-Foresight-Qualification-Specification-January-2023.pdf
Downloaded: Qualifi-Level-6-Diploma-in-Business-Management-Leadership-and-Innovation-Qualification-Specification-April-2021.pdf
Downloaded: Qualifi-Level-7-Diploma-in-Ethical-Leadership-Specification-October-2023.pdf
Downloaded: Qualifi-Level-7-Diploma-in-Executive-Management-Specification-October-2020.pdf
Downloaded: Qualifi-Level-3-Integrated-Diploma-in-Business-and-Management-October-2023.pdf
Downloaded: Qualifi-Level-3-Diploma-in-Business-Management-Specification-Feb-2020.pdf
Downloaded: Qualifi-Level-3-Diploma-in-Business-Innovation-an

In [5]:
# Display the loaded link table to check its columns and rows.
pdf_df

Unnamed: 0,Ofqual ID,PDF URL
0,603/1102/2,https://qualifi.net/wp-content/uploads/2020/02...
1,603/4373/4,https://qualifi.net/wp-content/uploads/2019/08...
2,603/4730/2,https://qualifi.net/wp-content/uploads/2023/09...
3,603/1051/0,https://qualifi.net/wp-content/uploads/2020/12...
4,603/7380/5,https://qualifi.net/wp-content/uploads/2023/10...
...,...,...
152,603/5650/9,https://qualifi.net/wp-content/uploads/2022/01...
153,603/6792/1,https://qualifi.net/wp-content/uploads/2022/01...
154,603/5651/0,https://qualifi.net/wp-content/uploads/2022/01...
155,603/6878/0,https://qualifi.net/wp-content/uploads/2022/01...


In [3]:
import os

# Set the directory where the files are located
folder_path = r"C:\Users\amith\Kenpath\OFQUAL\tquk_pdfs"

# Replace the URL-encoded space ("%20") in each filename with a hyphen.
for filename in os.listdir(folder_path):
    if "%20" not in filename:
        continue

    new_filename = filename.replace("%20", "-")
    old_file = os.path.join(folder_path, filename)
    new_file = os.path.join(folder_path, new_filename)

    # os.rename raises on Windows and silently replaces the target on POSIX
    # when the destination already exists, so report the collision and move on
    # instead of aborting half-way or overwriting another file.
    if os.path.exists(new_file):
        print(f"Skipped (target already exists): {filename} -> {new_filename}")
        continue

    os.rename(old_file, new_file)

print("Renaming completed.")


Renaming completed.


In [6]:
import os
import requests
import hashlib
import pandas as pd
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_pdf(pdf_url, save_dir, timeout=30):
    """Download one PDF into ``save_dir`` without any duplicate checking.

    Args:
        pdf_url: Address of the PDF to fetch.
        save_dir: Directory the file is written into.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        ``"Downloaded: <filename>"`` on success, otherwise
        ``"Failed to download <url>: <error>"``. Errors are reported in the
        returned string instead of being raised.

    Note:
        The file is named after the last path segment of the URL and opened
        in ``'wb'`` mode, so two URLs ending in the same name overwrite each
        other.
    """
    try:
        # The full body is written in one go, so the response is not streamed.
        # The timeout keeps a stalled server from blocking the calling thread
        # indefinitely.
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()

        pdf_content = response.content

        # Extract filename from URL (query string excluded)
        filename = os.path.basename(urllib.parse.urlparse(pdf_url).path)
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'

        save_path = os.path.join(save_dir, filename)

        # Save the file
        with open(save_path, 'wb') as f:
            f.write(pdf_content)

        return f"Downloaded: {filename}"

    except Exception as e:
        # Best-effort: a single bad link must not abort the whole batch.
        return f"Failed to download {pdf_url}: {str(e)}"


def download_pdfs(df, url_column, save_dir=r"C:\Users\amith\Kenpath\OFQUAL\qualifi.net_pdfs", max_workers=15):
    """Fetch every PDF listed in ``df[url_column]`` with a thread pool, no de-duplication.

    The target directory is created when missing, one download task is
    submitted per DataFrame row, and each task's status message is printed as
    soon as that task finishes.
    """
    os.makedirs(save_dir, exist_ok=True)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for _, record in df.iterrows():
            pending.append(pool.submit(download_pdf, record[url_column], save_dir))

        # Report results in completion order, not submission order.
        for finished in as_completed(pending):
            print(finished.result())

# Load the spreadsheet of links; its 'PDF URL' column drives the download below.
pdf_df = pd.read_excel(io=r"C:\Users\amith\Kenpath\OFQUAL\qualifi.net_pdfs.xlsx")

# Start the threaded download. max_workers is the thread count; adjust it to
# the available internet speed.
download_pdfs(df=pdf_df, url_column='PDF URL', max_workers=15)


Downloaded: CASS_Level-4-Diploma-in-Business-Management_Final.pdf
Downloaded: Level-3-Introduction-Management-vApril-2019.pdf
Downloaded: Level-4-Diploma-in-Entrepreneurship-Specification-March-2019.pdf
Downloaded: Qualifi-Level-4-Diploma-in-Innovation-and-Future-Foresight-Qualification-Specification-January-2023.pdf
Downloaded: Qualifi-Level-6-Diploma-in-Business-Management-Leadership-and-Innovation-Qualification-Specification-April-2021.pdf
Downloaded: Qualifi-Level-6-Diploma-in-Business-Administration-Specification-February-2020.pdf
Downloaded: Qualifi-Level-7-Diploma-in-Ethical-Leadership-Specification-October-2023.pdf
Downloaded: Qualifi-Level-3-Diploma-in-Business-Management-Specification-Feb-2020.pdf
Downloaded: Qualifi-Level-3-Diploma-in-Business-Innovation-and-Entrepreneurship-Feb-2020.pdf
Downloaded: Qualifi-Level-3-Integrated-Diploma-in-Business-and-Management-October-2023.pdf
Downloaded: Qualifi-Level-3-Extended-Diploma-in-Business-Management-and-Study-Skills-November-2022.