In [None]:
# @title 1. Setup Environment
import requests
import json
import time
import os
import re # For cleaning filenames
from IPython.display import display, HTML # For displaying output in Colab

print("Setting up environment...")

# Folder that receives every downloaded PDF; it is created on first run.
pdf_output_dir = "downloaded_pdfs"
if os.path.exists(pdf_output_dir):
    print(f"Directory '{pdf_output_dir}' already exists.")
else:
    os.makedirs(pdf_output_dir)
    print(f"Created directory: '{pdf_output_dir}' for PDFs.")

print("Environment setup complete.")


# @title 2. Scraping All Document Metadata
print("Starting document metadata scraping...")

# Search endpoint queried for document metadata.
base_api_url = "https://ncar.gov.sa/api/index.php/api/documents/document-search"
# Every document record returned by the API is collected in this list.
all_scraped_data = []

# --- Configuration for Scraping ---
# Page size, sort field and sort direction used when building request URLs.
items_per_page, sort_by, sort_order = 10, "approveDate", "DESC"

# Search filters posted with each request. Every filter is left unset
# (None or an empty list) except "identical", which is sent as 1.
base_payload = {
    "approveTool_id": None, "documentCategory_id": None, "name": None,
    "ApproveDate": None, "PublishingStatus": None,
    "alphabeticalCategory_id": None, "alphabeticalSubCategory_id": [],
    "alphabeticalTopic_id": None,
    "approveDate_from": None, "approveDate_to": None,
    "generalCategory_id": [],
    "governmentalAgency_childId": [], "governmentalAgency_id": None,
    "identical": 1,
    "is_printed": None, "is_translated": None, "is_valid": None,
    "number": None,
    "omAlQourah_date": None, "omAlQourah_version": None,
    "particularCategory_id": [],
    "releaseOrgId": None,
}

# --- Function to fetch a single page of documents ---
def fetch_page(page_num, current_payload, timeout=30):
    """
    Fetch one page of document search results.

    The request is a POST to
    ``{base_api_url}/{page_num}/{items_per_page}/{sort_by}/{sort_order}``
    with ``current_payload`` sent as the JSON body.

    Args:
        page_num: Page number placed in the URL path.
        current_payload: Dict of search filters sent as the JSON body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON response, or ``None`` if the request failed, timed
        out, returned an HTTP error status, or the body was not valid JSON.
    """
    url = f"{base_api_url}/{page_num}/{items_per_page}/{sort_by}/{sort_order}"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Referer": "https://ncar.gov.sa/rules-regulations"
    }

    print(f"  Requesting page {page_num} from: {url}")
    try:
        response = requests.post(url, headers=headers, json=current_payload, timeout=timeout)
        response.raise_for_status() # Raise an HTTPError for bad responses
        return response.json() # Parse the JSON response
    except requests.exceptions.RequestException as e:
        print(f"  Error fetching page {page_num}: {e}")
    except ValueError as e:
        # Older Requests versions raise a plain ValueError for a non-JSON body.
        print(f"  Error fetching page {page_num}: response was not valid JSON ({e})")
    return None

# --- Main Scraping Loop ---
# Walk the result pages in order until the API returns a page with no "data".
current_page = 1
search_payload = base_payload.copy()
max_attempts_per_page = 3  # a single transient error should not end the crawl
scrape_aborted = False     # set when a page still fails after all attempts

while True:
    page_data_response = None
    for attempt in range(1, max_attempts_per_page + 1):
        page_data_response = fetch_page(current_page, search_payload)
        if page_data_response is not None:
            break
        if attempt < max_attempts_per_page:
            print(f"  Retrying page {current_page} (attempt {attempt + 1}/{max_attempts_per_page})...")
            time.sleep(2 * attempt)  # wait a little longer before each retry

    if page_data_response is None:
        print("  Failed to retrieve data for this page. Stopping scraping.")
        scrape_aborted = True
        break

    documents_on_page = page_data_response.get("data")

    if not documents_on_page:
        print(f"  No more data found on page {current_page}. Stopping pagination.")
        break

    all_scraped_data.extend(documents_on_page) # Add all documents from the current page

    current_page += 1
    time.sleep(0.5) # Be polite, a short delay between API requests

print(f"Scraping complete! Total documents scraped: {len(all_scraped_data)}.")
if scrape_aborted:
    # Make it obvious that the total above is only a partial result.
    print(f"Warning: scraping stopped early at page {current_page}; the metadata is incomplete.")

# Optional: Save the scraped metadata to a JSON file (useful for debugging or later use)
# with open("ncar_documents.json", "w", encoding="utf-8") as f:
#     json.dump(all_scraped_data, f, ensure_ascii=False, indent=4)
# print("Scraped metadata saved to 'ncar_documents.json'.")


# @title 3. Downloading All PDF Files
print("\nStarting PDF download process...")

# A PDF URL is built as: prefix + the document's 'id' value + suffix.
base_pdf_url_suffix = "/Documents/OriginalAttachPath"
base_pdf_url_prefix = "https://ncar.gov.sa/api/index.php/resource/"

# --- Function to sanitize filenames ---
def sanitize_filename(name):
    """
    Clean a value so it can be used as part of a filename.

    Args:
        name: Text to clean. ``None`` is treated as an empty string and any
            other non-string value (for example a numeric document number)
            is converted with ``str()`` instead of raising ``AttributeError``.

    Returns:
        The cleaned text, truncated to at most 100 characters. Characters
        other than word characters, whitespace and hyphens are removed, and
        each run of whitespace becomes a single hyphen. The result may be an
        empty string.
    """
    if name is None:
        return ""
    if not isinstance(name, str):
        name = str(name)

    # Decode literal backslash escapes (such as a spelled-out \uXXXX sequence).
    # Text that already holds characters outside Latin-1 cannot be encoded as
    # latin1, so it deliberately falls through unchanged.
    try:
        name = name.encode('latin1').decode('unicode_escape')
    except (UnicodeEncodeError, UnicodeDecodeError):
        pass # If not a unicode escape sequence, keep as is

    # Remove characters that are not letters, numbers, hyphens, underscores, or spaces
    s = re.sub(r'[^\w\s-]', '', name).strip()
    # Collapse every run of whitespace into a single hyphen
    s = re.sub(r'\s+', '-', s)
    # Truncate to a reasonable length (100 characters) to avoid OS limits
    return s[:100]

# Download one PDF per scraped document into pdf_output_dir.
downloaded_count = 0
total_documents = len(all_scraped_data)
request_timeout = 60          # seconds; stops a stalled download from hanging the run
used_filename_bases = set()   # names already claimed in this run, to prevent overwrites

# The same browser-like headers are sent with every download request.
download_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Referer": "https://ncar.gov.sa/rules-regulations"
}

if total_documents == 0:
    print("No documents were scraped. Cannot proceed with PDF downloads.")
else:
    for i, doc in enumerate(all_scraped_data):
        # The 'id' field contains the encrypted part needed for the PDF URL
        encrypted_part = doc.get("id")

        # Skip if the 'id' (encrypted part) is missing
        if not encrypted_part:
            print(f"  Skipping document {i+1}: No 'id' (encrypted part) found.")
            continue

        # Construct the full PDF download URL
        pdf_url = f"{base_pdf_url_prefix}{encrypted_part}{base_pdf_url_suffix}"

        # Build a descriptive filename from the English title and document number.
        # `or` also covers a 'title_en' key that is present but null or empty.
        doc_title_en = doc.get("title_en") or f"document_{i+1}"
        doc_number = doc.get("number", "") # Can be empty

        # str() guards against non-string values coming out of the JSON.
        sanitized_title = sanitize_filename(str(doc_title_en)) or f"document_{i+1}"
        sanitized_number = sanitize_filename(str(doc_number)) if doc_number else ""

        if sanitized_number:
            filename_base = f"{sanitized_title}_{sanitized_number}"
        else:
            filename_base = sanitized_title

        # Different documents can sanitize to the same name. Add a numeric
        # suffix so a later download never overwrites an earlier one.
        unique_base = filename_base
        duplicate_index = 2
        while unique_base in used_filename_bases:
            unique_base = f"{filename_base}_{duplicate_index}"
            duplicate_index += 1
        filename_base = unique_base
        used_filename_bases.add(filename_base)

        pdf_filename = os.path.join(pdf_output_dir, f"{filename_base}.pdf")

        print(f"  Downloading ({i+1}/{total_documents}): '{filename_base}'")

        try:
            # Use stream=True for efficient downloading of potentially large files
            with requests.get(pdf_url, headers=download_headers, stream=True, timeout=request_timeout) as r:
                r.raise_for_status() # Check for HTTP errors

                # Perform a basic check for PDF content type
                content_type = r.headers.get('Content-Type', '')
                if 'application/pdf' in content_type or 'octet-stream' in content_type:
                    with open(pdf_filename, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192): # Download in chunks
                            f.write(chunk)
                    print(f"    Successfully saved: '{pdf_filename}'")
                    downloaded_count += 1
                else:
                    print(f"    Warning: Expected PDF for '{filename_base}', but got Content-Type: '{content_type}'. Skipping.")

        except requests.exceptions.RequestException as e:
            print(f"    Error downloading '{filename_base}': {e}")
        except Exception as e:
            # Best effort: one bad document must not stop the remaining downloads.
            print(f"    An unexpected error occurred for '{filename_base}': {e}")

        time.sleep(0.5) # Polite delay between downloads (also after a skipped response)

    print(f"\nPDF download complete! Total PDFs downloaded: {downloaded_count} out of {total_documents} found.")

# Optional: Provide a way to download the entire 'downloaded_pdfs' folder from Colab
print("\n---")
print("All PDFs are saved in the 'downloaded_pdfs' folder in your Colab environment.")
print("To download them to your local computer, run the following in a new Colab cell:")
display(HTML("""
<pre>
from google.colab import files
!zip -r /content/downloaded_pdfs.zip /content/downloaded_pdfs
files.download('/content/downloaded_pdfs.zip')
</pre>
"""))

Setting up environment...
Created directory: 'downloaded_pdfs' for PDFs.
Environment setup complete.
Starting document metadata scraping...
  Requesting page 1 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/1/10/approveDate/DESC
  Requesting page 2 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/2/10/approveDate/DESC
  Requesting page 3 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/3/10/approveDate/DESC
  Requesting page 4 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/4/10/approveDate/DESC
  Requesting page 5 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/5/10/approveDate/DESC
  Requesting page 6 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/6/10/approveDate/DESC
  Requesting page 7 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/7/10/approveDate/DESC
  Requesting page 8 from: https://ncar.gov.sa/api/index.php/api/documents/do

In [16]:
# @title 1. Scraping All Document Metadata (MUST RUN THIS FIRST if your session reset)

import requests
import json
import time
import os
import re # For cleaning filenames
from IPython.display import display, HTML # For displaying output in Colab

print("Starting document metadata scraping...")

# Ensure the PDF directory exists for consistency, though not directly used in this scrape cell.
# exist_ok=True replaces the check-then-create pair with a single call.
pdf_output_dir = "downloaded_pdfs"
os.makedirs(pdf_output_dir, exist_ok=True)

base_api_url = "https://ncar.gov.sa/api/index.php/api/documents/document-search"
# Names assigned at the top level of a notebook cell are already global, so
# other cells can read this list without a `global` statement.
all_scraped_data = []

# --- Configuration for Scraping ---
items_per_page = 10
sort_by = "approveDate"
sort_order = "DESC"

# Search filters posted with each request. Every filter is left unset
# (None or an empty list) except "identical", which is sent as 1.
base_payload = {
    "approveTool_id": None, "documentCategory_id": None, "name": None, "ApproveDate": None,
    "PublishingStatus": None, "alphabeticalCategory_id": None, "alphabeticalSubCategory_id": [],
    "alphabeticalTopic_id": None, "approveDate_from": None, "approveDate_to": None,
    "generalCategory_id": [], "governmentalAgency_childId": [], "governmentalAgency_id": None,
    "identical": 1, "is_printed": None, "is_translated": None, "is_valid": None,
    "number": None, "omAlQourah_date": None, "omAlQourah_version": None, "particularCategory_id": [],
    "releaseOrgId": None
}

def fetch_page(page_num, current_payload, timeout=30):
    """
    Fetch one page of document search results.

    The request is a POST to
    ``{base_api_url}/{page_num}/{items_per_page}/{sort_by}/{sort_order}``
    with ``current_payload`` sent as the JSON body.

    Args:
        page_num: Page number placed in the URL path.
        current_payload: Dict of search filters sent as the JSON body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON response, or ``None`` if the request failed, timed
        out, returned an HTTP error status, or the body was not valid JSON.
    """
    url = f"{base_api_url}/{page_num}/{items_per_page}/{sort_by}/{sort_order}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Referer": "https://ncar.gov.sa/rules-regulations"
    }
    print(f"  Requesting page {page_num} from: {url}")
    try:
        response = requests.post(url, headers=headers, json=current_payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"  Error fetching page {page_num}: {e}")
    except ValueError as e:
        # Older Requests versions raise a plain ValueError for a non-JSON body.
        print(f"  Error fetching page {page_num}: response was not valid JSON ({e})")
    return None

# Walk the result pages in order until the API returns a page with no "data".
current_page = 1
search_payload = base_payload.copy()
max_attempts_per_page = 3  # a single transient error should not end the crawl
scrape_aborted = False     # set when a page still fails after all attempts

while True:
    page_data_response = None
    for attempt in range(1, max_attempts_per_page + 1):
        page_data_response = fetch_page(current_page, search_payload)
        if page_data_response is not None:
            break
        if attempt < max_attempts_per_page:
            print(f"  Retrying page {current_page} (attempt {attempt + 1}/{max_attempts_per_page})...")
            time.sleep(2 * attempt)  # wait a little longer before each retry

    if page_data_response is None:
        print("  Failed to retrieve data for this page. Stopping scraping.")
        scrape_aborted = True
        break
    documents_on_page = page_data_response.get("data")
    if not documents_on_page:
        print(f"  No more data found on page {current_page}. Stopping pagination.")
        break
    all_scraped_data.extend(documents_on_page)
    current_page += 1
    time.sleep(0.5)

print(f"Scraping complete! Total documents scraped: {len(all_scraped_data)}.")
if scrape_aborted:
    # The file written below then holds only a partial result; say so, because
    # the collision analysis in the next step relies on complete metadata.
    print(f"Warning: scraping stopped early at page {current_page}; the saved metadata is incomplete.")

# Save the scraped metadata to a JSON file (essential for the next step)
with open("ncar_documents.json", "w", encoding="utf-8") as f:
    json.dump(all_scraped_data, f, ensure_ascii=False, indent=4)
print("Scraped metadata saved to 'ncar_documents.json'.")
print("Proceed to the next step to identify the missing files.")

Starting document metadata scraping...
  Requesting page 1 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/1/10/approveDate/DESC
  Requesting page 2 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/2/10/approveDate/DESC
  Requesting page 3 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/3/10/approveDate/DESC
  Requesting page 4 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/4/10/approveDate/DESC
  Requesting page 5 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/5/10/approveDate/DESC
  Requesting page 6 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/6/10/approveDate/DESC
  Requesting page 7 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/7/10/approveDate/DESC
  Requesting page 8 from: https://ncar.gov.sa/api/index.php/api/documents/document-search/8/10/approveDate/DESC
  Requesting page 9 from: https://ncar.gov.sa/api/index.php/api/d

In [17]:
# @title 2. Identify and List Overwritten Files (Run AFTER Step 1 Completes)
import requests
import json
import time
import os
import re
import hashlib
from IPython.display import display, HTML

print("Preparing for collision analysis...")

# Ensure pdf_output_dir exists
pdf_output_dir = "downloaded_pdfs"
os.makedirs(pdf_output_dir, exist_ok=True)

# --- Load Scraped Data (CRUCIAL for this analysis) ---
# Reuse the list left in memory by the scraping cell; fall back to the JSON
# file it saved when the session was reset or the list is empty.
if 'all_scraped_data' not in globals() or not all_scraped_data:
    try:
        if os.path.exists("ncar_documents.json"):
            with open("ncar_documents.json", "r", encoding="utf-8") as f:
                all_scraped_data = json.load(f)
            print(f"Loaded {len(all_scraped_data)} documents from 'ncar_documents.json'.")
        else:
            print("Error: 'ncar_documents.json' not found after scraping. Please ensure Step 1 completed successfully.")
            all_scraped_data = [] # Reset to empty to prevent errors
    except (OSError, ValueError) as e:
        # OSError: the file could not be read; ValueError: it is not valid JSON/UTF-8.
        print(f"An unexpected error occurred while loading data: {e}")
        all_scraped_data = []

# Quick lookup of documents by their ID. Records without an 'id' are left out,
# matching the later loops, which skip such records instead of failing.
document_by_id = {doc['id']: doc for doc in all_scraped_data if doc.get('id')}

# Group documents by the filename the first download run would have produced
# and report every name shared by more than one document.

# Defined before the branch so the redownload cell finds an (empty) list even
# when the analysis is aborted.
ids_to_redownload = []

if not all_scraped_data:
    print("No document metadata available. Aborting collision analysis.")
else:
    print(f"Data loaded successfully with {len(all_scraped_data)} documents.")
    print("\nIdentifying documents with potentially duplicate original filenames using the exact observed logic...")

    def simulate_original_filename_logic_from_log(name_part):
        """
        Reproduce the filename sanitization used for the earlier download.

        Used ONLY to detect name collisions, never to save files. ``None``
        yields an empty string and other non-string values are converted with
        ``str()`` so a null or numeric field cannot crash the analysis.
        """
        if name_part is None:
            return ""
        if not isinstance(name_part, str):
            name_part = str(name_part)
        try:
            name_part = name_part.encode('latin1').decode('unicode_escape')
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass
        s = re.sub(r'[^\w\s-]', '', name_part).strip()
        s = re.sub(r'[\s]+', '-', s)
        return s

    # filename -> ids of the documents that would all be saved under it
    original_filename_map = {}

    for doc in all_scraped_data:
        doc_id = doc.get("id")
        if not doc_id:
            continue

        # `or` also covers keys that are present but null or empty.
        doc_title_en = doc.get("title_en")
        if not doc_title_en:
            doc_title_en = doc.get("title_ar") or f"document_no_title_found_{doc_id}"

        doc_number = doc.get("number") or ""

        sanitized_title = simulate_original_filename_logic_from_log(doc_title_en)
        sanitized_number = simulate_original_filename_logic_from_log(doc_number)

        if sanitized_number:
            original_filename_base_no_id = f"{sanitized_title}_{sanitized_number}"
        else:
            original_filename_base_no_id = sanitized_title

        final_original_filename_key = f"{original_filename_base_no_id}.pdf"

        ids_for_name = original_filename_map.setdefault(final_original_filename_key, [])
        # The same document scraped twice is not a collision with itself.
        if doc_id not in ids_for_name:
            ids_for_name.append(doc_id)

    ids_to_redownload_set = set()
    collision_info = []

    for filename_key, doc_ids_list in original_filename_map.items():
        if len(doc_ids_list) > 1:
            ids_to_redownload_set.update(doc_ids_list)
            collision_info.append({
                "filename_key": filename_key,
                "colliding_doc_ids": doc_ids_list,
                "num_collisions": len(doc_ids_list)
            })
            print(f"  Filename '{filename_key}' would have caused {len(doc_ids_list)} collisions.")

    # Keep first-seen order so repeated runs process the documents identically.
    ids_to_redownload = list(dict.fromkeys(
        doc_id
        for collision in collision_info
        for doc_id in collision["colliding_doc_ids"]
    ))
    num_unique_filenames_lost = sum(len(ids) - 1 for ids in original_filename_map.values() if len(ids) > 1)

    print(f"\nAnalysis complete. Identified {len(ids_to_redownload)} documents involved in filename collisions.")
    print(f"This indicates approximately {num_unique_filenames_lost} files were overwritten due to shared filenames.")
    print(f"These {len(ids_to_redownload)} documents will be redownloaded with unique filenames.")

    print("\n--- Details of Potentially Overwritten (Missing) Files ---")

    lost_files_count_tracker = 0 # Use a different name to avoid confusion with num_unique_filenames_lost
    if num_unique_filenames_lost == 0:
        print("No filename collisions detected, so no files were likely overwritten due to naming conflicts.")
        print("If you still believe files are missing, the cause might be different (e.g., download errors, API issues).")
    else:
        for collision in collision_info:
            filename = collision["filename_key"]
            colliding_ids = collision["colliding_doc_ids"]

            print(f"\nFilename '{filename}' would have been generated by {len(colliding_ids)} documents:")

            last_index = len(colliding_ids) - 1
            for i, doc_id in enumerate(colliding_ids):
                doc = document_by_id.get(doc_id, {})
                title_en = doc.get('title_en', 'N/A')
                title_ar = doc.get('title_ar', 'N/A')
                number = doc.get('number', 'N/A')

                # Files are written one after another, so each later document
                # replaces the earlier file and only the LAST writer survives.
                # NOTE(review): assumes the download ran in this metadata order.
                if i == last_index:
                    status_text = "This document would have saved a file with this name (likely the one that remained)."
                else:
                    status_text = "--> This document's file would have been OVERWRITTEN by a subsequent file with the same name. This is a 'missing' file."
                    lost_files_count_tracker += 1

                print(f"  - Document ID: {doc_id}")
                print(f"    Title (EN): {title_en}")
                print(f"    Title (AR): {title_ar}")
                print(f"    Number: {number}")
                print(f"    {status_text}")
                print("-" * 40)

        print(f"\nTotal estimated files overwritten (net loss of unique filenames): {lost_files_count_tracker}")

    print("\n--- To ensure you have all files with unique names ---")
    print("3. Run the '3. Targeted PDF Redownload with Unique Filenames' cell from the *previous full script response* (the one with the `sanitize_filename_for_unique_save` function).")
    print("4. Then, run the following to download the entire 'downloaded_pdfs' folder:")
    display(HTML("""
    <pre>
    from google.colab import files
    !zip -r /content/downloaded_pdfs.zip /content/downloaded_pdfs
    files.download('/content/downloaded_pdfs.zip')
    </pre>
    """))

Preparing for collision analysis...
Data loaded successfully with 4750 documents.

Identifying documents with potentially duplicate original filenames using the exact observed logic...
  Filename 'تعديل-اسم-برنامج-دعم-التدريب-المرتبط-بالتوظيف-في-المعاهد-غير-الربحية-ليكون-برنامج-دعم-معاهد-الشراكات-الاستراتيجية_1ت21442.pdf' would have caused 2 collisions.
  Filename 'Circular-regarding-the-implementation-of-IMO-Resolution-96-406-MSC-and-the-amendments-to-the-International-Maritime-Dangerous-Goods-Code_131702579.pdf' would have caused 2 collisions.
  Filename 'انضمام-وزارة-الشؤون-الإسلامية-والأوقاف-والدعوة-والإرشاد-إلى-اللجنة-الدائمة-لمكافحة-غسل-الأموال_340.pdf' would have caused 2 collisions.
  Filename 'ترتيبات-تنظيمية-بشأن-بعض-المناطق-التي-قد-تتخذ-للسكنى-لوجود-إشعاعات-اليورانيوم-وغاز-الكبريت-أو-لقربها-من-المنشآت-البترولية_285.pdf' would have caused 2 collisions.

Analysis complete. Identified 8 documents involved in filename collisions.
This indicates approximately 4 files were overwri

In [15]:
# @title Identify and List Overwritten Files (Revised with imports)
import requests
import json
import time
import os
import re
import hashlib
from IPython.display import display, HTML

print("Setting up environment and loading data for collision analysis...")

# Ensure pdf_output_dir exists, as expected by the sanitize function later
pdf_output_dir = "downloaded_pdfs"
os.makedirs(pdf_output_dir, exist_ok=True)

# --- Load Scraped Data (CRUCIAL for this analysis) ---
# This cell always reloads from disk; both names start empty so the code
# below can run even when loading fails.
all_scraped_data = []
document_by_id = {}
try:
    if os.path.exists("ncar_documents.json"):
        with open("ncar_documents.json", "r", encoding="utf-8") as f:
            all_scraped_data = json.load(f)
        print(f"Loaded {len(all_scraped_data)} documents from 'ncar_documents.json'.")
    else:
        print("Error: 'ncar_documents.json' not found. Please ensure you have run the '2. Scraping All Document Metadata' cell from the full script in your Colab session, which saves this file. This analysis cannot proceed without the full metadata.")
        # If the file isn't found, this script won't be able to perform the analysis.
        # We'll proceed with empty data, but the user needs to know.
except (OSError, ValueError) as e:
    # OSError: the file could not be read; ValueError: it is not valid JSON/UTF-8.
    print(f"An unexpected error occurred while loading data: {e}")
    print("Please ensure 'ncar_documents.json' is correctly saved and accessible.")

# Built outside the try block and limited to records that have an 'id', so a
# single record without one can no longer be misreported as a load failure
# that leaves the lookup empty.
document_by_id = {doc['id']: doc for doc in all_scraped_data if doc.get('id')}

# Group documents by the filename the first download run would have produced
# and report every name shared by more than one document.

# Defined before the branch so the redownload cell finds an (empty) list even
# when the analysis is aborted.
ids_to_redownload = []

if not all_scraped_data:
    print("No document metadata available. Aborting collision analysis.")
else:
    print(f"Data loaded successfully with {len(all_scraped_data)} documents.")
    print("\nIdentifying documents with potentially duplicate original filenames using the exact observed logic...")

    def simulate_original_filename_logic_from_log(name_part):
        """
        Reproduce the filename sanitization used for the earlier download.

        Used ONLY to detect name collisions, never to save files. ``None``
        yields an empty string and other non-string values are converted with
        ``str()`` so a null or numeric field cannot crash the analysis.
        """
        if name_part is None:
            return ""
        if not isinstance(name_part, str):
            name_part = str(name_part)

        # Decode literal backslash escapes; text that cannot be encoded as
        # latin1 (e.g. Arabic) is deliberately kept as is.
        try:
            name_part = name_part.encode('latin1').decode('unicode_escape')
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass # If not a unicode escape sequence, keep as is

        # Remove characters that are not letters, numbers, hyphens, underscores, or spaces
        s = re.sub(r'[^\w\s-]', '', name_part).strip()
        # Collapse every run of whitespace into a single hyphen
        s = re.sub(r'[\s]+', '-', s)
        return s

    # filename -> ids of the documents that would all be saved under it
    original_filename_map = {}

    for doc in all_scraped_data:
        doc_id = doc.get("id")
        if not doc_id:
            continue

        # `or` also covers keys that are present but null or empty.
        doc_title_en = doc.get("title_en")
        if not doc_title_en:
            doc_title_en = doc.get("title_ar") or f"document_no_title_found_{doc_id}"

        doc_number = doc.get("number") or ""

        sanitized_title = simulate_original_filename_logic_from_log(doc_title_en)
        sanitized_number = simulate_original_filename_logic_from_log(doc_number)

        if sanitized_number:
            original_filename_base_no_id = f"{sanitized_title}_{sanitized_number}"
        else:
            original_filename_base_no_id = sanitized_title

        final_original_filename_key = f"{original_filename_base_no_id}.pdf"

        ids_for_name = original_filename_map.setdefault(final_original_filename_key, [])
        # The same document scraped twice is not a collision with itself.
        if doc_id not in ids_for_name:
            ids_for_name.append(doc_id)

    ids_to_redownload_set = set()
    collision_info = []

    for filename_key, doc_ids_list in original_filename_map.items():
        if len(doc_ids_list) > 1:
            ids_to_redownload_set.update(doc_ids_list)
            collision_info.append({
                "filename_key": filename_key,
                "colliding_doc_ids": doc_ids_list,
                "num_collisions": len(doc_ids_list)
            })
            print(f"  Filename '{filename_key}' would have caused {len(doc_ids_list)} collisions.")

    # Keep first-seen order so repeated runs process the documents identically.
    ids_to_redownload = list(dict.fromkeys(
        doc_id
        for collision in collision_info
        for doc_id in collision["colliding_doc_ids"]
    ))
    num_unique_filenames_lost = sum(len(ids) - 1 for ids in original_filename_map.values() if len(ids) > 1)

    print(f"\nAnalysis complete. Identified {len(ids_to_redownload)} documents involved in filename collisions.")
    print(f"This indicates approximately {num_unique_filenames_lost} files were overwritten due to shared filenames.")
    print(f"These {len(ids_to_redownload)} documents will be redownloaded with unique filenames.")

    # Now, identify the specific files that were "lost" (all but one in each collision group)
    print("\n--- Details of Potentially Overwritten (Missing) Files ---")

    lost_files_count = 0
    if num_unique_filenames_lost == 0:
        print("No filename collisions detected, so no files were likely overwritten due to naming conflicts.")
        print("If you still believe files are missing, the cause might be different (e.g., download errors, API issues).")
    else:
        for collision in collision_info:
            filename = collision["filename_key"]
            colliding_ids = collision["colliding_doc_ids"]

            print(f"\nFilename '{filename}' would have been generated by {len(colliding_ids)} documents:")

            last_index = len(colliding_ids) - 1
            for i, doc_id in enumerate(colliding_ids):
                doc = document_by_id.get(doc_id, {})
                title_en = doc.get('title_en', 'N/A')
                title_ar = doc.get('title_ar', 'N/A')
                number = doc.get('number', 'N/A')

                # Files are written one after another, so each later document
                # replaces the earlier file and only the LAST writer survives.
                # NOTE(review): assumes the download ran in this metadata order.
                if i == last_index:
                    status_text = "This document would have saved a file with this name (likely the one that remained)."
                else:
                    status_text = "--> This document's file would have been OVERWRITTEN by a subsequent file with the same name. This is a 'missing' file."
                    lost_files_count += 1

                print(f"  - Document ID: {doc_id}")
                print(f"    Title (EN): {title_en}")
                print(f"    Title (AR): {title_ar}")
                print(f"    Number: {number}")
                print(f"    {status_text}")
                print("-" * 40)

        print(f"\nTotal estimated files overwritten (net loss of unique filenames): {lost_files_count}")

    # Final reminder about downloading the fixed files
    print("\n--- To ensure you have all files with unique names ---")
    print("1. Run the '3. Targeted PDF Redownload with Unique Filenames' cell from the previous response.")
    print("2. Then, run the following to download the entire 'downloaded_pdfs' folder:")
    display(HTML("""
    <pre>
    from google.colab import files
    !zip -r /content/downloaded_pdfs.zip /content/downloaded_pdfs
    files.download('/content/downloaded_pdfs.zip')
    </pre>
    """))

Setting up environment and loading data for collision analysis...
Loaded 530 documents from 'ncar_documents.json'.
Data loaded successfully with 530 documents.

Identifying documents with potentially duplicate original filenames using the exact observed logic...

Analysis complete. Identified 0 documents involved in filename collisions.
This indicates approximately 0 files were overwritten due to shared filenames.
These 0 documents will be redownloaded with unique filenames.

--- Details of Potentially Overwritten (Missing) Files ---
No filename collisions detected, so no files were likely overwritten due to naming conflicts.
If you still believe files are missing, the cause might be different (e.g., download errors, API issues).

--- To ensure you have all files with unique names ---
1. Run the '3. Targeted PDF Redownload with Unique Filenames' cell from the previous response.
2. Then, run the following to download the entire 'downloaded_pdfs' folder:


In [18]:
# @title 3. Targeted PDF Redownload with Unique Filenames

# Redownload every document listed in ids_to_redownload under a unique filename.
# NOTE(review): relies on names created by earlier cells (requests, os, time,
# display, HTML, pdf_output_dir, document_by_id, all_scraped_data) and on
# sanitize_filename_for_unique_save, which is not defined in this notebook
# excerpt - confirm that cell has been run first.

print("\nStarting targeted PDF redownload...")

# Base URL parts for PDF resources
base_pdf_url_prefix = "https://ncar.gov.sa/api/index.php/resource/"
base_pdf_url_suffix = "/Documents/OriginalAttachPath"
request_timeout = 60  # seconds; stops a stalled download from hanging the run

# The same browser-like headers are sent with every download request.
download_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Referer": "https://ncar.gov.sa/rules-regulations"
}

downloaded_this_run_count = 0
# The analysis cell only creates this list when metadata was available; fall
# back to an empty list instead of failing with a NameError.
ids_to_redownload = globals().get("ids_to_redownload", [])
total_to_redownload = len(ids_to_redownload)

if total_to_redownload == 0:
    print("No documents identified for targeted redownload based on collision analysis.")
    print("If you still believe files are missing, please double-check 'all_scraped_data' integrity.")
elif not all_scraped_data:
    print("Cannot proceed with download: 'all_scraped_data' is empty. Please run the scraping cell or load from JSON.")
else:
    for i, doc_id_to_redownload in enumerate(ids_to_redownload):
        doc = document_by_id.get(doc_id_to_redownload) # Get the full document object using the global lookup
        if not doc:
            print(f"  Warning: Document with ID '{doc_id_to_redownload}' not found in scraped data. Skipping.")
            continue

        encrypted_part = doc.get("id")
        if not encrypted_part:
            print(f"  Skipping document ID '{doc_id_to_redownload}': No encrypted part found.")
            continue

        pdf_url = f"{base_pdf_url_prefix}{encrypted_part}{base_pdf_url_suffix}"

        # Prefer the English title, then the Arabic one, then a generated placeholder.
        doc_title_en = doc.get("title_en")
        if not doc_title_en:
            doc_title_en = doc.get("title_ar", f"document_no_title_{doc_id_to_redownload}")

        doc_number = doc.get("number", "")

        # Combine title and number for the base part of the unique filename
        combined_name_for_unique = f"{doc_title_en}"
        if doc_number:
            combined_name_for_unique += f"_{doc_number}"

        filename_base = sanitize_filename_for_unique_save(combined_name_for_unique, doc_id_to_redownload)
        pdf_filename = os.path.join(pdf_output_dir, f"{filename_base}.pdf")

        # Check if a file with this *exact unique name* already exists before downloading
        if os.path.exists(pdf_filename):
            print(f"  File already exists with unique name: '{pdf_filename}'. Skipping download.")
            downloaded_this_run_count += 1 # Count it as "found"
            continue

        print(f"  Redownloading ({i+1}/{total_to_redownload}): '{filename_base}'")

        # Download into a temporary file and move it into place only once it is
        # complete. Otherwise an interrupted download would leave a truncated
        # PDF that the "already exists" check above accepts on the next run.
        partial_path = os.path.join(pdf_output_dir, f"_partial_{i}.tmp")

        try:
            with requests.get(pdf_url, headers=download_headers, stream=True, timeout=request_timeout) as r:
                r.raise_for_status()

                content_type = r.headers.get('Content-Type', '')
                if 'application/pdf' in content_type or 'octet-stream' in content_type:
                    with open(partial_path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                    os.replace(partial_path, pdf_filename)
                    print(f"    Successfully saved: '{pdf_filename}'")
                    downloaded_this_run_count += 1
                else:
                    print(f"    Warning: Expected PDF for '{filename_base}', but got Content-Type: '{content_type}'. Skipping.")

        except requests.exceptions.RequestException as e:
            print(f"    Error downloading '{filename_base}': {e}")
        except Exception as e:
            # Best effort: one bad document must not stop the remaining downloads.
            print(f"    An unexpected error occurred for '{filename_base}': {e}")
        finally:
            # Remove the leftovers of a failed or interrupted download.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        time.sleep(0.5) # Polite delay (also after a skipped response)

print(f"\nTargeted redownload complete! PDFs downloaded in this run: {downloaded_this_run_count}.")
print("The new files will have a unique suffix (e.g., '_abcdef12.pdf') to prevent future conflicts.")

# Optional: Download all your PDFs again (if you want the new ones + existing ones)
print("\n---")
print("To download ALL your PDFs (including newly added ones), run the following in a new Colab cell:")
display(HTML("""
<pre>
from google.colab import files
!zip -r /content/downloaded_pdfs.zip /content/downloaded_pdfs
files.download('/content/downloaded_pdfs.zip')
</pre>
"""))


Starting targeted PDF redownload...
  Redownloading (1/8): 'ترتيبات-تنظيمية-بشأن-بعض-المناطق-التي-قد-تتخذ-للسكنى-لوجود-إشعاعات-اليورانيوم-وغ_e2b5a250'
    Successfully saved: 'downloaded_pdfs/ترتيبات-تنظيمية-بشأن-بعض-المناطق-التي-قد-تتخذ-للسكنى-لوجود-إشعاعات-اليورانيوم-وغ_e2b5a250.pdf'
  Redownloading (2/8): 'Circular-regarding-the-implementation-of-IMO-Resolution-96-406-MSC-and-the-amend_ad6c5e8e'
    Successfully saved: 'downloaded_pdfs/Circular-regarding-the-implementation-of-IMO-Resolution-96-406-MSC-and-the-amend_ad6c5e8e.pdf'
  Redownloading (3/8): 'Circular-regarding-the-implementation-of-IMO-Resolution-96-406-MSC-and-the-amend_7a606064'
    Successfully saved: 'downloaded_pdfs/Circular-regarding-the-implementation-of-IMO-Resolution-96-406-MSC-and-the-amend_7a606064.pdf'
  Redownloading (4/8): 'انضمام-وزارة-الشؤون-الإسلامية-والأوقاف-والدعوة-والإرشاد-إلى-اللجنة-الدائمة-لمكاف_0fb17411'
    Successfully saved: 'downloaded_pdfs/انضمام-وزارة-الشؤون-الإسلامية-والأوقاف-والدعوة-والإرشا