In [1]:
import os
import re
import time
import json
import requests
from datetime import datetime

print("Starting document metadata scraping...")

# --- Configuration ---
# Search endpoint. fetch_page() appends
# /<page number>/<ITEMS_PER_PAGE>/<SORT_BY>/<SORT_ORDER> to this URL.
BASE_API_URL = "https://ncar.gov.sa/api/index.php/api/documents/document-search"
# download_pdf() builds a document's PDF URL as PREFIX + <document id> + SUFFIX.
BASE_PDF_URL_PREFIX = "https://ncar.gov.sa/api/index.php/resource/"
BASE_PDF_URL_SUFFIX = "/Documents/OriginalAttachPath"
# Directory (relative to the working directory) that receives downloaded PDFs.
PDF_OUTPUT_DIR = "NCAR_PDFs"

# Paging / ordering values inserted into the search URL path.
ITEMS_PER_PAGE = 10
SORT_BY = "approveDate"
SORT_ORDER = "DESC"
# Inclusive page range to crawl; edit these two values to change the range.
# NOTE(review): with the descending approveDate sort, page 1 is expected to
# hold the most recent documents -- confirm against the API.
START_PAGE = 1  # first page requested
END_PAGE = 3  # last page requested (inclusive)
REQUEST_DELAY = 0.5  # polite delay: seconds slept after each successful page fetch

# HTTP headers sent with both the search requests and the PDF downloads.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Referer": "https://ncar.gov.sa/rules-regulations"
}

# JSON body posted to the search endpoint; the same dict is reused for every
# page. Almost every field is left as None / an empty list.
# NOTE(review): presumably unset fields mean "no filter", and the meaning of
# "identical": 1 is not visible from this file -- verify against the API.
BASE_PAYLOAD = {
    "approveTool_id": None,
    "documentCategory_id": None,
    "name": None,
    "ApproveDate": None,
    "PublishingStatus": None,
    "alphabeticalCategory_id": None,
    "alphabeticalSubCategory_id": [],
    "alphabeticalTopic_id": None,
    "approveDate_from": None,
    "approveDate_to": None,
    "generalCategory_id": [],
    "governmentalAgency_childId": [],
    "governmentalAgency_id": None,
    "identical": 1,
    "is_printed": None,
    "is_translated": None,
    "is_valid": None,
    "number": None,
    "omAlQourah_date": None,
    "omAlQourah_version": None,
    "particularCategory_id": [],
    "releaseOrgId": None
}


# --- Helpers ---
def sanitize_filename(name: str, max_length: int = 100) -> str:
    """Return a version of *name* that is safe to use as a file name.

    Processing steps:
      1. ``None`` yields ``"untitled"``; any other non-string value (for
         example a numeric document number) is converted with ``str()``.
      2. Literal backslash escapes such as ``\\u0627`` are decoded when the
         text is Latin-1 encodable; otherwise the text is used as-is.
      3. Every character that is not a word character, whitespace or ``-``
         is removed, and surrounding whitespace is stripped.
      4. Each run of whitespace is replaced by a single ``-``.
      5. The result is truncated to ``max_length`` characters.

    Args:
        name: Text to clean. ``None`` and non-string values are tolerated.
        max_length: Maximum number of characters in the returned name.

    Returns:
        The cleaned name, or ``"untitled"`` if nothing usable remains.
    """
    if name is None:
        return "untitled"
    text = name if isinstance(name, str) else str(name)

    try:
        # Only succeeds for Latin-1 text; turns literal "\uXXXX" sequences
        # into the characters they denote.
        text = text.encode("latin1").decode("unicode_escape")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Deliberate best effort: text that is already real Unicode (e.g.
        # Arabic) or contains a malformed escape is kept unchanged.
        pass

    cleaned = re.sub(r"[^\w\s-]", "", text).strip()
    cleaned = re.sub(r"\s+", "-", cleaned)
    return cleaned[:max_length] or "untitled"


def fetch_page(page_num: int, payload: dict) -> dict | None:
    """Fetch a single page of document search results.

    POSTs ``payload`` as JSON to the search endpoint; the page number, page
    size, sort field and sort order are encoded in the URL path.

    Args:
        page_num: Page number to request.
        payload: Search filter body sent as JSON.

    Returns:
        The decoded JSON object, or ``None`` when the request fails, the
        server answers with an HTTP error status, the body is not valid
        JSON, or the decoded JSON is not an object.
    """
    url = f"{BASE_API_URL}/{page_num}/{ITEMS_PER_PAGE}/{SORT_BY}/{SORT_ORDER}"
    print(f"üîé Requesting page {page_num} ...")
    try:
        response = requests.post(url, headers=HEADERS, json=payload, timeout=15)
        response.raise_for_status()
        body = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # JSON error is not a RequestException subclass.
        print(f"‚ùå Error fetching page {page_num}: {e}")
        return None

    if not isinstance(body, dict):
        # Callers use .get() on the result, so anything but an object is
        # reported as a failed fetch instead of crashing later.
        print(f"‚ùå Error fetching page {page_num}: "
              f"unexpected response type {type(body).__name__}")
        return None
    return body


def save_json(data: list, start: int, end: int) -> str:
    """Write the scraped records to a JSON file under ``output/``.

    The file name encodes the page range and the current local date as
    ``ncar_crawl_page<start>-<end>_(DD.MM.YYYY).json``. The ``output``
    directory is created if it does not exist yet.

    Args:
        data: Records to serialise.
        start: First page number, used only in the file name.
        end: Last page number, used only in the file name.

    Returns:
        The path of the file that was written.
    """
    os.makedirs("output", exist_ok=True)

    # Day.month.year, zero-padded -- e.g. 09.12.2024
    date_stamp = datetime.now().strftime("%d.%m.%Y")
    filename = f"output/ncar_crawl_page{start}-{end}_({date_stamp}).json"

    with open(filename, "w", encoding="utf-8") as out_file:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(data, out_file, ensure_ascii=False, indent=2)

    return filename


def download_pdf(doc: dict, index: int, total: int) -> bool:
    """Download the PDF attached to one scraped document record.

    The PDF URL is built from the record's ``id``. The file name is derived
    from ``title_en`` (falling back to ``document_<n>`` when the title is
    missing, null or empty) plus ``number`` when present. The response is
    streamed to a temporary ``.part`` file and only renamed to the final
    ``.pdf`` path once the whole body has been written, so a failed
    download never leaves a truncated PDF behind.

    Args:
        doc: One document record from the search results.
        index: Zero-based position of the record, used for progress output
            and the fallback file name.
        total: Total number of records, used for progress output.

    Returns:
        ``True`` if the PDF was saved; ``False`` if the record has no id,
        the response is not a PDF, or a network or file-system error
        occurred.
    """
    encrypted_id = doc.get("id")
    if not encrypted_id:
        print(f"‚ö†Ô∏è  Skipping {index+1}/{total}: missing ID")
        return False

    pdf_url = f"{BASE_PDF_URL_PREFIX}{encrypted_id}{BASE_PDF_URL_SUFFIX}"

    # `or` instead of a .get() default: a key that is present with a null
    # value must also fall back, otherwise None reaches sanitize_filename.
    raw_title = doc.get("title_en") or f"document_{index+1}"
    title = sanitize_filename(str(raw_title))
    raw_number = doc.get("number")
    # str() tolerates numeric document numbers.
    number = sanitize_filename(str(raw_number)) if raw_number else ""
    filename_base = f"{title}_{number}" if number else title

    os.makedirs(PDF_OUTPUT_DIR, exist_ok=True)
    pdf_path = os.path.join(PDF_OUTPUT_DIR, f"{filename_base}.pdf")
    partial_path = f"{pdf_path}.part"

    print(f"‚¨áÔ∏è  Downloading ({index+1}/{total}): {filename_base}")

    try:
        with requests.get(pdf_url, headers=HEADERS, stream=True, timeout=30) as r:
            r.raise_for_status()
            # Media types are case-insensitive, so compare in lower case.
            if "application/pdf" not in r.headers.get("Content-Type", "").lower():
                print(f"‚ö†Ô∏è  Not a PDF ({r.headers.get('Content-Type')}). Skipping.")
                return False

            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        # Publish the file only after the full body has been written.
        os.replace(partial_path, pdf_path)

    except (requests.RequestException, OSError) as e:
        # OSError covers local failures (e.g. an over-long file name) so one
        # bad record does not abort the remaining downloads.
        print(f"‚ùå Error downloading '{filename_base}': {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing to clean up (or it cannot be removed); the failure has
            # already been reported above.
            pass
        return False

    print(f"‚úÖ Saved: {pdf_path}")
    return True


# --- Scraping Process ---
# Fetch pages START_PAGE..END_PAGE (inclusive) and collect their "data" items.
# The crawl stops at the first page that fails or comes back empty.
scraped_data = []
last_page_fetched = None  # last page that actually contributed records
for page in range(START_PAGE, END_PAGE + 1):
    page_data = fetch_page(page, BASE_PAYLOAD)
    if not page_data or not page_data.get("data"):
        print(f"‚ö†Ô∏è  No data on page {page}, stopping.")
        break
    scraped_data.extend(page_data["data"])
    last_page_fetched = page
    time.sleep(REQUEST_DELAY)

print(f"\nüìä Scraping complete. Total documents: {len(scraped_data)}")

if scraped_data:
    # Name the file after the range that was really crawled: if the loop
    # stopped early, END_PAGE would mislabel the saved data.
    json_file = save_json(scraped_data, START_PAGE, last_page_fetched)
    print(f"üíæ Data saved to: {json_file}")

    print("\nüöÄ Starting PDF downloads...")
    # download_pdf() returns a bool, so the sum is the number of successes.
    downloaded = sum(download_pdf(doc, i, len(scraped_data))
                     for i, doc in enumerate(scraped_data))
    print(f"\nüéâ PDF download complete: {downloaded}/{len(scraped_data)} successful.")
else:
    print("‚ö†Ô∏è  No data scraped, skipping PDF downloads.")


Starting document metadata scraping...
üîé Requesting page 1 ...
üîé Requesting page 2 ...
üîé Requesting page 3 ...

üìä Scraping complete. Total documents: 30
üíæ Data saved to: output/ncar_crawl_page1-3_(20.10.2025).json

üöÄ Starting PDF downloads...
‚¨áÔ∏è  Downloading (1/30): ÿ™ÿπÿØŸäŸÑ-ÿßŸÑŸÖÿßÿØÿ™ŸäŸÜ-8-11-ŸÖŸÜ-ÿ™ŸÜÿ∏ŸäŸÖ-ÿµŸÜÿØŸàŸÇ-ÿßŸÑÿ™ŸÜŸÖŸäÿ©-ÿßŸÑŸàÿ∑ŸÜŸä-ŸÑÿπÿßŸÖ-1439ŸáŸÄ_284
‚úÖ Saved: NCAR_PDFs\ÿ™ÿπÿØŸäŸÑ-ÿßŸÑŸÖÿßÿØÿ™ŸäŸÜ-8-11-ŸÖŸÜ-ÿ™ŸÜÿ∏ŸäŸÖ-ÿµŸÜÿØŸàŸÇ-ÿßŸÑÿ™ŸÜŸÖŸäÿ©-ÿßŸÑŸàÿ∑ŸÜŸä-ŸÑÿπÿßŸÖ-1439ŸáŸÄ_284.pdf
‚¨áÔ∏è  Downloading (2/30): ÿßŸÑŸÜÿ∏ÿßŸÖ-ÿßŸÑÿ£ÿ≥ÿßÿ≥-ŸÑŸÖÿ§ÿ≥ÿ≥ÿ©-ŸÖÿ±ŸÉÿ≤-ÿßŸÑÿ±Ÿäÿßÿ∂-ŸÑŸÑÿ™ŸÇŸÜŸäÿ©-ÿßŸÑÿ≠ŸäŸàŸäÿ©-ŸÑÿπÿßŸÖ-1447ŸáŸÄ_31971
‚úÖ Saved: NCAR_PDFs\ÿßŸÑŸÜÿ∏ÿßŸÖ-ÿßŸÑÿ£ÿ≥ÿßÿ≥-ŸÑŸÖÿ§ÿ≥ÿ≥ÿ©-ŸÖÿ±ŸÉÿ≤-ÿßŸÑÿ±Ÿäÿßÿ∂-ŸÑŸÑÿ™ŸÇŸÜŸäÿ©-ÿßŸÑÿ≠ŸäŸàŸäÿ©-ŸÑÿπÿßŸÖ-1447ŸáŸÄ_31971.pdf
‚¨áÔ∏è  Downloading (3/30): ÿ™ŸÜÿ∏ŸäŸÖ-ÿßŸÑŸÖÿπŸáÿØ-ÿßŸÑŸàÿ∑ŸÜŸä-ŸÑÿ£ÿ®ÿ≠ÿßÿ´-ÿßŸÑÿµÿ≠ÿ©-ŸÑÿπÿßŸÖ-1447ŸáŸÄ_266
‚úÖ Saved: NCAR_PDFs\ÿ™ŸÜÿ∏ŸäŸÖ-ÿßŸÑŸÖÿπŸáÿØ-ÿßŸÑŸàÿ∑ŸÜŸä-ŸÑÿ£ÿ®ÿ≠ÿßÿ´-ÿ