In [2]:
import pandas as pd
import requests
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

# Paths
original_file = r"C:\Users\amith\Downloads\latest_nos_data.xlsx"  # Original Excel with URLs & titles
status_file = r"C:\Users\amith\Kenpath\NOS\download_status.xlsx"  # Status file
download_directory = r"C:\Users\amith\Kenpath\retry"

# Load the original Excel file
df_original = pd.read_excel(original_file)

# Load the download status file
df_status = pd.read_excel(status_file)

# Filter failed downloads
df_failed = df_status[df_status["Status"] == "Failed"]

if df_failed.empty:
    print("No failed downloads to retry.")
    exit()

# Function to clean file names
def clean_filename(filename):
    """Clean a filename to remove invalid characters."""
    return re.sub(r'[\\/*?:"<>|]', "", filename).replace(" ", "_")

# Function to retry downloading a file
def retry_download(row):
    pdf_link = row["PDF_Link"]

    if pd.notna(pdf_link):
        try:
            response = requests.get(pdf_link, stream=True, timeout=15)
            response.raise_for_status()

            # Find the corresponding 'nos_title' from the original dataset
            matching_row = df_original[df_original["pdf_link"] == pdf_link]

            if matching_row.empty:
                print(f"Warning: No matching 'nos_title' found for {pdf_link}")
                return (pdf_link, "Failed - No Title")

            nos_title = matching_row["nos_title"].values[0]
            cleaned_title = clean_filename(nos_title)
            filename = os.path.join(download_directory, f"{cleaned_title}.pdf")

            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            print(f"Retried: {pdf_link} -> {filename}")
            return (pdf_link, "Success")

        except requests.exceptions.RequestException as e:
            print(f"Retry failed for {pdf_link}: {e}")
            return (pdf_link, "Failed")

    return (None, "Skipped")

# Set the number of threads (adjust based on system performance)
num_threads = 10

# Use ThreadPoolExecutor to retry failed downloads
results = []
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    future_to_url = {executor.submit(retry_download, row): row for _, row in df_failed.iterrows()}
    
    for future in as_completed(future_to_url):
        results.append(future.result())

# Update the status file with retry results
df_retry_status = pd.DataFrame(results, columns=["PDF_Link", "Retry_Status"])
df_updated = df_status.merge(df_retry_status, on="PDF_Link", how="left")
df_updated.to_excel(status_file, index=False)

print("Retry process completed. Updated status saved in 'download_status.xlsx'.")


Retry failed for https://files.ukstandards.org.uk/pdfs/LANCS59.pdf: 404 Client Error: The specified blob does not exist. for url: https://files.ukstandards.org.uk/pdfs/LANCS59.pdf
Retry failed for https://files.ukstandards.org.uk/pdfs/IMIARBG11.pdf: 404 Client Error: The specified blob does not exist. for url: https://files.ukstandards.org.uk/pdfs/IMIARBG11.pdf
Retry failed for https://files.ukstandards.org.uk/pdfs/LANCS4L.pdf: 404 Client Error: The specified blob does not exist. for url: https://files.ukstandards.org.uk/pdfs/LANCS4L.pdf
Retry failed for https://files.ukstandards.org.uk/pdfs/LANCS25L.pdf: 404 Client Error: The specified blob does not exist. for url: https://files.ukstandards.org.uk/pdfs/LANCS25L.pdf
Retry failed for https://files.ukstandards.org.uk/pdfs/LANCS25L.pdf: 404 Client Error: The specified blob does not exist. for url: https://files.ukstandards.org.uk/pdfs/LANCS25L.pdf
Retry failed for https://files.ukstandards.org.uk/pdfs/LANCS25L.pdf: 404 Client Error: The s

In [2]:
import pandas as pd

df=pd.read_excel(r"C:\Users\amith\Kenpath\download_status.xlsx")
df

Unnamed: 0,PDF_Link,Status,Retry_Status_x,Retry_Status_y
0,,Skipped,,
1,https://files.ukstandards.org.uk/pdfs/SFL244Te...,Success,,
2,,Skipped,,
3,https://files.ukstandards.org.uk/pdfs/SFL247Te...,Success,,
4,,Skipped,,
...,...,...,...,...
23985,https://files.ukstandards.org.uk/pdfs/LSI YW31...,Success,,
23986,https://files.ukstandards.org.uk/pdfs/LSIWWP30...,Success,,
23987,https://files.ukstandards.org.uk/pdfs/COSVR17 ...,Success,,
23988,https://files.ukstandards.org.uk/pdfs/LSIWWP31...,Success,,


In [14]:
df["Retry_Status_y"].value_counts()







Retry_Status_y
Failed     224
Success     33
Name: count, dtype: int64