In [12]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

class DownloadError(Exception):
    """Signals that a document download could not be completed."""

# Function to clean file names
def clean_filename(filename):
    """Return *filename* with unsafe characters stripped and spaces underscored.

    The characters ``\\ / * ? : " < > |`` are removed outright; every space
    is then replaced by an underscore.
    """
    forbidden_chars = re.compile(r'[\\/*?:"<>|]')
    without_forbidden = forbidden_chars.sub("", filename)
    return without_forbidden.replace(" ", "_")

def download_from_weblink(url, output_path, retries=3, delay=5):
    """Fetch a detail page, find its download link, and save the linked file.

    The page at *url* is requested and parsed; the first element matching one
    of the known CSS selectors supplies the link (``href`` for ``<a>``
    elements, ``data-url`` otherwise). The link is resolved against *url* and
    the response body is written to *output_path* as bytes.

    Args:
        url: Address of the page that contains the download link.
        output_path: File path the downloaded content is written to.
        retries: Maximum number of attempts; only timeouts are retried.
        delay: Seconds to sleep between attempts after a timeout.

    Returns:
        True when the file was written. False when ``retries`` is less than
        one, in which case no attempt is made.

    Raises:
        DownloadError: If the URL is empty or missing, no download link is
            found on the page, every attempt timed out, or any other request
            or unexpected error occurs. The underlying exception, when there
            is one, is attached as ``__cause__``.
    """
    if not url or pd.isna(url):
        raise DownloadError("Invalid or empty URL")

    # Loop-invariant, so built once rather than on every attempt.
    possible_selectors = [
        'body > main > section.nosheader-container > div > div.nosdetailpage-container > div > div:nth-child(2) > a',
    ]

    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            for selector in possible_selectors:
                download_elem = soup.select_one(selector)
                if not download_elem:
                    continue

                if download_elem.name == 'a':
                    pdf_url = download_elem.get('href')
                else:
                    pdf_url = download_elem.get('data-url')
                if not pdf_url:
                    continue

                # The link may be relative; resolve it against the page URL.
                pdf_url = urljoin(url, pdf_url)
                pdf_response = requests.get(pdf_url, timeout=30)
                pdf_response.raise_for_status()

                with open(output_path, 'wb') as f:
                    f.write(pdf_response.content)
                return True

            raise DownloadError("No download button found on the page")

        except DownloadError:
            # Already the right type and message; without this clause the
            # generic handler below would re-wrap it as "Unexpected error".
            raise

        except requests.Timeout as e:
            print(f"Timeout on attempt {attempt + 1}/{retries} for {url}")
            if attempt < retries - 1:
                time.sleep(delay)  # Wait before retrying
            else:
                raise DownloadError("Request timed out after multiple attempts") from e

        except requests.RequestException as e:
            raise DownloadError(f"Request failed: {e}") from e
        except Exception as e:
            raise DownloadError(f"Unexpected error: {e}") from e

    # Only reachable when retries <= 0: report failure explicitly instead of
    # falling off the end of the function.
    return False
        
def process_row(index, row, output_folder):
    """Download the PDF for one row that has a web link but no PDF link.

    Args:
        index: Row label, used in messages and as the fallback file name.
        row: Mapping-like row exposing 'pdf_link', 'web_link' and 'nos_title'.
        output_folder: Directory the PDF is saved into.

    Returns:
        None when the row is skipped or the download succeeds; otherwise a
        dict ``{'index': ..., 'url': ...}`` identifying the failed row.
    """
    # Only rows lacking a direct PDF link but having a web link are handled.
    if pd.notna(row['pdf_link']) or pd.isna(row['web_link']):
        return None

    cleaned_title = ""
    if pd.notna(row['nos_title']):
        # A spreadsheet cell may hold a number rather than text; coerce so
        # clean_filename does not raise outside the try block below.
        cleaned_title = clean_filename(str(row['nos_title']))
    if not cleaned_title:
        # Missing title, or one made up entirely of stripped characters.
        cleaned_title = f"document_{index}"

    filename = f"{cleaned_title}.pdf"
    output_path = os.path.join(output_folder, filename)
    failure = {'index': index, 'url': row['web_link']}

    try:
        downloaded = download_from_weblink(row['web_link'], output_path)
    except DownloadError as e:
        print(f"Error downloading from row {index}: {str(e)}")
        return failure

    if not downloaded:
        print(f"Failed to download PDF for row {index} ({filename})")
        return failure
    return None

def process_excel(excel_path, output_folder, max_workers=20, start_row=99):
    """Download PDFs for the rows of an Excel file in parallel.

    Each row from position *start_row* onward is handed to ``process_row`` on
    a thread pool. Rows that fail are collected and, if there are any,
    written to ``failed_urls.xlsx`` inside *output_folder*.

    Args:
        excel_path: Path of the Excel workbook to read.
        output_folder: Directory for downloaded files; created if missing.
        max_workers: Number of worker threads.
        start_row: Zero-based position of the first row to process; earlier
            rows are skipped. Defaults to 99.

    Returns:
        None. A message is printed and nothing is downloaded if the workbook
        cannot be read.
    """
    os.makedirs(output_folder, exist_ok=True)
    failed_urls = []

    try:
        df = pd.read_excel(excel_path)
    except Exception as e:
        print(f"Error reading Excel file: {str(e)}")
        return

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which row each future belongs to so a worker that raises
        # can still be reported as a failure.
        future_to_row = {}
        for index, row in df.iloc[start_row:].iterrows():
            future = executor.submit(process_row, index, row, output_folder)
            future_to_row[future] = (index, row.get('web_link'))

        for future in as_completed(future_to_row):
            index, url = future_to_row[future]
            try:
                result = future.result()
            except Exception as e:
                # One bad row must not abort the batch or lose the log of
                # failures gathered so far.
                print(f"Unexpected error processing row {index}: {e}")
                result = {'index': index, 'url': url}
            if result:
                failed_urls.append(result)

    if failed_urls:
        failed_urls_file = os.path.join(output_folder, "failed_urls.xlsx")
        pd.DataFrame(failed_urls).to_excel(failed_urls_file, index=False)
        print(f"Failed URLs saved to {failed_urls_file}")

# Usage
# Example run: read the workbook at excel_path and save the downloaded PDFs
# (plus failed_urls.xlsx, if any row fails) into output_folder.
# NOTE(review): both paths are absolute Windows paths under one user's home
# directory; adjust them before running elsewhere.
excel_path = r"C:\Users\amith\Kenpath\latest_nos_data.xlsx"
output_folder = r"C:\Users\amith\Kenpath\NOS from web"
# Executes immediately at module level, using the default max_workers=20.
process_excel(excel_path, output_folder)


Error downloading from row 353: Request failed: 500 Server Error: Internal Server Error for url: https://www.ukstandards.org.uk/en/nos-finder/IMPQI238/provide-coaching-and-mentoring-for-improvement-in-food-operations
Timeout on attempt 1/3 for https://www.ukstandards.org.uk/en/nos-finder/INSML002/develop-your-knowledge%2C-skills-and-competence-to-meet-the-requirements-of-your-work
Timeout on attempt 1/3 for https://www.ukstandards.org.uk/en/nos-finder/SEMAE3016/carrying-out-routine-servicing-of-aircraft
Error downloading from row 1383: Request failed: 500 Server Error: Internal Server Error for url: https://www.ukstandards.org.uk/umbraco/surface/nosdocumentpdfsurface/DownloadPDF?nosId=42878&language=English
Error downloading from row 1384: Request failed: 500 Server Error: Internal Server Error for url: https://www.ukstandards.org.uk/umbraco/surface/nosdocumentpdfsurface/DownloadPDF?nosId=42878&language=English
Error downloading from row 1466: Request failed: 500 Server Error: Internal