In [1]:
#Dependencies

!pip install -q playwright
!playwright install --with-deps chromium
!pip install pdf2docx

'pip' is not recognized as an internal or external command,
operable program or batch file.
'playwright' is not recognized as an internal or external command,
operable program or batch file.
'pip' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
# This section opens the site, captures its admin-ajax.php request (including the "_wpnonce" query parameter), fetches the entries' IDs, titles, and URLs, and saves them in a file "Nezams_IDs.{date}.json".

import asyncio, json, urllib.parse
from datetime import datetime
from playwright.async_api import async_playwright

# File-name pattern for the saved entry list; "{date}" is filled in with
# today's date formatted as %m.%d.%Y when the list is written.
OUTPUT_FILENAME_TEMPLATE = "Nezams_IDs.{date}.json"

async def fetch_and_save_ids():
    """Capture the site's admin-ajax.php call and save the entries it returns.

    Opens https://nezams.com/ in headless Chromium, waits for the first request
    whose URL matches ``**/admin-ajax.php**``, reads that request's JSON
    response, and writes one ``{"id", "name", "url"}`` record per entry that
    has an ``id`` to ``OUTPUT_FILENAME_TEMPLATE`` (dated with today's date).
    The ``_wpnonce`` query parameter of the captured request is printed when
    present.

    Raises:
        RuntimeError: if the captured request never received a response.
        ValueError: if the response body is not a JSON object.

    NOTE(review): the recorded cell output is ``NotImplementedError`` and the
    install cell's output looks like Windows ``cmd`` messages. Presumably the
    kernel's event loop could not spawn the browser subprocess on Windows;
    confirm and, if so, run this on a loop that supports subprocesses.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True, args=["--no-sandbox", "--disable-dev-shm-usage"])
        try:
            page = await browser.new_page()

            # Register the wait before navigating so the request cannot be missed.
            async with page.expect_request("**/admin-ajax.php**") as request_info:
                await page.goto("https://nezams.com/", wait_until="networkidle")
            request = await request_info.value

            # response() yields None when the request failed before a response arrived.
            response = await request.response()
            if response is None:
                raise RuntimeError(f"No response received for {request.url}")
            target_data = await response.json()
            request_url = request.url
        finally:
            # Release the browser even when navigation or parsing fails.
            await browser.close()

    if not isinstance(target_data, dict):
        raise ValueError(f"Unexpected admin-ajax.php payload of type {type(target_data).__name__}")

    parsed_qs = urllib.parse.parse_qs(urllib.parse.urlparse(request_url).query)
    target_nonce = parsed_qs.get("_wpnonce", [None])[0]

    # "data" may be missing or null; treat both as "no entries".
    entries = target_data.get("data") or []
    systems = [{"id": item.get("id"), "name": item.get("text"), "url": item.get("link")}
               for item in entries if item.get("id")]

    today = datetime.now().strftime("%m.%d.%Y")
    filename = OUTPUT_FILENAME_TEMPLATE.format(date=today)

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(systems, f, ensure_ascii=False, indent=2)

    print(f"Saved {len(systems)} entries to {filename}")
    if target_nonce:
        print(f"**_wpnonce cookie** = {target_nonce}")

# Top-level `await` is accepted in IPython/Jupyter cells; a plain script would
# need asyncio.run(fetch_and_save_ids()) instead.
await fetch_and_save_ids()

NotImplementedError: 

In [None]:
# This section loads the previously saved JSON file, visits each URL, and constructs a DOCX file with selected elements and proper formatting.

import asyncio
import os
import json
from datetime import datetime
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn

# Output folder
# NOTE(review): absolute "/content/..." path — looks like a Google Colab
# location; confirm it suits the machine this notebook runs on.
output_dir = "/content/Nezams_Docs"
os.makedirs(output_dir, exist_ok=True)  # created up front; no error if it already exists

# Selectors to remove
# CSS selectors for elements that are deleted from each page's content
# container before its text is extracted.
unwanted_selectors = [
    "div.fontsize.no-print",
    "span.share-icon",
    "span.total-readers",
    "div.subject-share",
    "span.numbe-s",
    "div#more-items",
    "ul#subject-nav-links"
]

# Save DOCX with right-to-left paragraphs
def _add_rtl_paragraph(doc, text):
    """Append a right-to-left paragraph holding ``text`` in one run and return it."""
    from docx.oxml import OxmlElement  # local import: not among the notebook's top-level imports

    paragraph = doc.add_paragraph()
    # python-docx has no paragraph-direction property, so the <w:bidi/> flag is
    # written straight into the paragraph properties.
    paragraph._p.get_or_add_pPr().append(OxmlElement('w:bidi'))
    # No explicit alignment is set: a bidi paragraph starts at its leading
    # (right) edge by default.
    # NOTE(review): Word is reported to interpret an explicit left/right value
    # relative to the paragraph direction once <w:bidi/> is present — confirm
    # before re-adding WD_ALIGN_PARAGRAPH.RIGHT here.
    run = paragraph.add_run(text)
    run.font.rtl = True  # mark the run itself as right-to-left text
    return paragraph


def save_docx(title, body, filename):
    """Write ``title`` and ``body`` as two right-to-left paragraphs to ``filename``.

    Args:
        title: text of the first paragraph.
        body: text of the second paragraph; it is added as a single run.
        filename: path of the .docx file to create or overwrite.

    The 'Normal' style is set to Arial 14 pt, including the complex-script font
    and size slots that apply to right-to-left runs.
    """
    from docx.oxml import OxmlElement  # local import: not among the notebook's top-level imports

    doc = Document()
    size_pt = 14

    style = doc.styles['Normal']
    style.font.name = 'Arial'
    style.font.size = Pt(size_pt)

    rPr = style._element.rPr
    rPr.rFonts.set(qn('w:eastAsia'), 'Arial')
    # Right-to-left runs take their font and size from the complex-script
    # slots, so mirror the Latin settings there.
    rPr.rFonts.set(qn('w:cs'), 'Arial')
    szCs = rPr.find(qn('w:szCs'))
    if szCs is None:
        szCs = OxmlElement('w:szCs')
        # Keep schema order: <w:szCs> directly follows <w:sz>.
        rPr.find(qn('w:sz')).addnext(szCs)
    szCs.set(qn('w:val'), str(size_pt * 2))  # the value is stored in half-points

    _add_rtl_paragraph(doc, title)
    _add_rtl_paragraph(doc, body)

    doc.save(filename)

# Async scraper
def _safe_docx_stem(title, fallback):
    """Turn a page title into a file-name stem; use ``fallback`` if nothing is left."""
    import re  # local import: not among the notebook's top-level imports

    def clean(text):
        # Same substitutions as before for "/" and ":"; the remaining
        # characters are also rejected in Windows file names.
        text = text.replace("/", "-").replace(":", "،")
        return re.sub(r'[\\*?"<>|\x00-\x1f]', "-", text).strip()

    return clean(title) or clean(str(fallback))


def _extract_title_and_body(html):
    """Parse one page and return ``(title, body_text)``.

    Returns None when the title or the content container is not found.
    """
    soup = BeautifulSoup(html, 'html.parser')

    title_tag = soup.select_one("body > div.page > h1")
    content_div = soup.select_one("body > div.page > div.post-page > div")
    if title_tag is None or content_div is None:
        return None

    for selector in unwanted_selectors:
        for tag in content_div.select(selector):
            tag.decompose()

    # Collapse each coloured span's nested spans into one space-joined string.
    for outer in content_div.select('span.selectionShareable[style="color: #993300;"]'):
        inner_spans = outer.select('span.selectionShareable')
        if not inner_spans:
            # Nothing to merge; overwriting would erase the span's own text.
            continue
        combined = ' '.join(s.get_text(strip=True) for s in inner_spans if s.get_text(strip=True))
        outer.string = combined
        for s in inner_spans:
            s.decompose()

    return title_tag.get_text(strip=True), content_div.get_text(separator="\n", strip=True)


async def scrape_and_save_all():
    """Visit every URL in today's ``Nezams_IDs.{date}.json`` and save each page as DOCX.

    Loads the entry list written for today's date (%m.%d.%Y) and opens each
    entry's URL in headless Chromium with the ``ar-SA`` locale. The page title
    and cleaned body text are passed to ``save_docx``, which writes
    ``<title>.docx`` inside ``output_dir``. Progress is printed per entry.
    A failure on one page is reported and does not stop the run. Returns early,
    after printing a message, when the JSON file is missing or invalid.

    NOTE: pages that share a title map to the same file name, so the later one
    overwrites the earlier one.
    """
    today = datetime.now().strftime("%m.%d.%Y")
    filename = f"Nezams_IDs.{today}.json"

    try:
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: JSON file '{filename}' not found. Please ensure the file exists.")
        return
    except json.JSONDecodeError:
        print("Error: JSON file is not valid.")
        return

    items_to_scrape = [item for item in data if item.get('url')]
    print(f"Loaded {len(items_to_scrape)} URLs from the JSON file: {filename}\n")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(locale="ar-SA")
            page = await context.new_page()

            for count, item in enumerate(items_to_scrape, 1):
                url = item['url']
                item_id = item.get('id', 'N/A')
                try:
                    await page.goto(url, timeout=20000)
                    # Fixed pause so late-loading content is present in the HTML.
                    await page.wait_for_timeout(2000)

                    extracted = _extract_title_and_body(await page.content())
                    if extracted is None:
                        # Report the skip so a missing document is visible in the log.
                        print(f"⚠️ Skipped {count}- ID: {item_id}: {url} — title or content container not found")
                        continue
                    title, body_text = extracted

                    safe_title = _safe_docx_stem(title, item_id)
                    output_filename = os.path.join(output_dir, f"{safe_title}.docx")
                    save_docx(title, body_text, output_filename)
                    print(f"✅ Saved {count}- ID: {item_id}: {safe_title}")
                except Exception as e:
                    print(f"❌ Failed {count}- ID: {item_id}: {url} — {str(e)}")
        finally:
            # Runs on errors and on cancellation (e.g. an interrupted cell) too.
            await browser.close()

# Run it
# Top-level `await` is accepted in IPython/Jupyter cells; a plain script would
# need asyncio.run(scrape_and_save_all()) instead.
await scrape_and_save_all()

Loaded 487 URLs from the JSON file: Nezams_IDs.09.17.2025.json

✅ Saved 1- ID: 5802: نظام النقل البري على الطرق
✅ Saved 2- ID: 5775: نظام المواد البترولية والبتروكيماوية
✅ Saved 3- ID: 5757: نظام القياس والمعايرة
✅ Saved 4- ID: 5749: نظام الاستثمار
✅ Saved 5- ID: 5744: النظام الأساس لمستشفى الملك خالد التخصصي للعيون ومركز الأبحاث (مؤسسة مستقلة ذات طبيعة خاصة وغير هادفة للربح)
✅ Saved 6- ID: 5709: نظام ضريبة التصرفات العقارية
✅ Saved 7- ID: 5706: نظام السجل التجاري


CancelledError: 