In [None]:
!pip install -q playwright nest_asyncio
!playwright install --with-deps chromium
!pip install playwright
!pip install pdf2docx


In [2]:
import requests
import time
import json

# Input file with one law ID per line.
file_name = "law IDs.txt"
# Output file that accumulates one collected link per line.
output_file_name = "collected_links.txt"

url = "https://nezams.com/wp-admin/admin-ajax.php"
headers = {
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",
    "Referer": "https://nezams.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
}

# Upper bound (seconds) per HTTP request, so a stalled connection cannot hang
# the cell indefinitely (requests has no timeout unless one is passed).
REQUEST_TIMEOUT_SECONDS = 30


def parse_link_response(response_text):
    """Extract the link from the JSON text of one AJAX reply.

    Args:
        response_text: Raw response body, expected to be a JSON object with a
            truthy "success" field and a string "data" field.

    Returns:
        The stripped link string, or None when the reply is not a JSON object,
        reports failure, or carries no non-empty string under "data".

    Raises:
        json.JSONDecodeError: If response_text is not valid JSON.
    """
    response_data = json.loads(response_text)
    # Valid JSON is not necessarily an object (e.g. a bare `0` or a list);
    # calling .get on those would raise AttributeError.
    if not isinstance(response_data, dict):
        return None
    if not response_data.get("success"):
        return None
    link = response_data.get("data")
    # Only a non-empty string can be written out as a URL line.
    if not isinstance(link, str) or not link.strip():
        return None
    return link.strip()


try:
    # utf-8-sig reads plain UTF-8 too, and drops a leading BOM that would
    # otherwise become part of the first ID.
    with open(file_name, 'r', encoding='utf-8-sig') as f:
        law_ids = [line.strip() for line in f if line.strip()]

    # Open the output file in append mode ('a'). Use 'w' to overwrite each time.
    with open(output_file_name, 'a', encoding='utf-8') as outfile:
        for law_id in law_ids:
            # Passing a dict lets requests URL-encode the values; the explicit
            # Content-Type header above is still the one that is sent.
            payload = {
                "action": "get_system_number",
                "id": law_id,
                "sysn": "1",
                "pid": law_id,
            }

            try:
                r = requests.post(
                    url,
                    headers=headers,
                    data=payload,
                    timeout=REQUEST_TIMEOUT_SECONDS,
                )

                if r.status_code == 200:
                    try:
                        extracted_link = parse_link_response(r.text)
                    except json.JSONDecodeError:
                        print(f"Error: Could not decode JSON response for ID {law_id}. Response: {r.text[:200]}")
                    else:
                        if extracted_link is not None:
                            outfile.write(extracted_link + '\n')  # one link per line
                        else:
                            print(f"No link found for ID {law_id}. Response indicates failure or missing 'data' field: {r.text[:200]}")
                else:
                    print(f"Request failed for ID {law_id} with status code {r.status_code}. Response: {r.text[:200]}")

            except requests.exceptions.RequestException as e:
                # Covers connection errors and the timeout set above; move on
                # to the next ID instead of aborting the run.
                print(f"Network error for ID {law_id}: {e}")

            # Small pause between requests to avoid hammering the server.
            time.sleep(0.1)

    print(f"\nAll collected links have been saved to '{output_file_name}'.")

except FileNotFoundError:
    print(f"Error: The file '{file_name}' was not found. Please upload 'law IDs.txt' to Colab.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


All collected links have been saved to 'collected_links.txt'.


In [8]:

import asyncio
import os
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn

# CSS selectors of elements that the scraper removes from the page content
# before the text is extracted.
unwanted_selectors = [
    "div.fontsize.no-print", "span.share-icon", "span.total-readers",
    "div.subject-share", "span.numbe-s", "div#more-items",
    "ul#subject-nav-links",
]

# Directory that receives the generated .docx files; created up front if it
# does not exist yet.
output_dir = "/content/laws_docs"
os.makedirs(output_dir, exist_ok=True)

# Write a two-paragraph, right-aligned Word document
def save_docx(title, body, filename):
    """Create a .docx holding `title` and `body` and save it to `filename`.

    The Normal style is set to 14 pt Arial (including the east-Asian font
    slot). The title and the body are each added as one right-aligned
    paragraph containing a single run.

    Args:
        title: Text of the first paragraph.
        body: Text of the second paragraph.
        filename: Destination path passed to Document.save.
    """
    document = Document()

    # NOTE(review): `right_to_left` is not among python-docx's documented
    # Section / ParagraphFormat properties — confirm that this assignment and
    # the per-paragraph one below actually affect the saved file.
    document.sections[0].right_to_left = True

    normal_style = document.styles['Normal']
    normal_font = normal_style.font
    normal_font.name = 'Arial'
    # font.name does not touch the east-Asian slot, so set it on the raw XML.
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), 'Arial')
    normal_font.size = Pt(14)

    # Title first, then body: same formatting for both paragraphs.
    for text in (title, body):
        paragraph = document.add_paragraph()
        paragraph.paragraph_format.right_to_left = True
        paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        paragraph.add_run(text)

    document.save(filename)

# Async scraper
def title_to_filename_stem(title, max_bytes=240):
    """Turn a page title into a filename stem (no extension).

    "/" becomes "-" and ":" becomes the Arabic comma, as before. In addition,
    every run of whitespace (including newlines) collapses to one space, the
    result is cut to at most `max_bytes` UTF-8 bytes without splitting a
    character, and an empty result falls back to "untitled".

    Args:
        title: Title text taken from the page.
        max_bytes: Byte budget for the stem. The default leaves room for the
            ".docx" suffix under the 255-byte name limit common on Linux
            filesystems.

    Returns:
        A non-empty string suitable as a single path component.
    """
    stem = title.replace("/", "-").replace(":", "،")
    stem = " ".join(stem.split())
    encoded = stem.encode("utf-8")
    if len(encoded) > max_bytes:
        # errors="ignore" drops a multi-byte character cut in half by the slice.
        stem = encoded[:max_bytes].decode("utf-8", errors="ignore").rstrip()
    return stem or "untitled"


def _collapse_colored_spans(content_div):
    """Flatten each brown-coloured span into a single space-separated line.

    Without this, the later get_text(separator="\\n") call would put every
    nested span on its own line. All text under the span is kept, including
    text that is not wrapped in a nested span, and spans with no text are
    left untouched rather than being overwritten with an empty string.
    """
    for outer in content_div.select('span.selectionShareable[style="color: #993300;"]'):
        combined = outer.get_text(" ", strip=True)
        if combined:
            # Assigning .string replaces all children with this one text node.
            outer.string = combined


async def scrape_and_save_all():
    """Render every URL in collected_links.txt and save each page as a .docx.

    For each distinct URL the page is loaded in headless Chromium, the title
    and content nodes are located, unwanted elements are removed, and the
    remaining text is written via save_docx into output_dir. Failures and
    skipped pages are reported per URL and do not stop the run.
    """
    with open("collected_links.txt", "r", encoding="utf-8") as f:
        # dict.fromkeys drops repeated links (the collector appends on every
        # run) while keeping first-seen order.
        urls = list(dict.fromkeys(line.strip() for line in f if line.strip()))

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(locale="ar-SA")
            page = await context.new_page()

            for url in urls:
                try:
                    await page.goto(url, timeout=20000)
                    # Fixed pause to let client-side rendering finish.
                    await page.wait_for_timeout(2000)
                    html = await page.content()
                    soup = BeautifulSoup(html, 'html.parser')

                    title_tag = soup.select_one("body > div.page > h1")
                    content_div = soup.select_one("body > div.page > div.post-page > div")
                    if title_tag is None or content_div is None:
                        # Report the skip so missing documents can be traced.
                        print(f"⚠️ Skipped (title or content not found): {url}")
                        continue
                    title = title_tag.get_text(strip=True)

                    # Remove unwanted parts
                    for selector in unwanted_selectors:
                        for tag in content_div.select(selector):
                            tag.decompose()

                    _collapse_colored_spans(content_div)

                    body_text = content_div.get_text(separator="\n", strip=True)

                    safe_title = title_to_filename_stem(title)
                    filename = os.path.join(output_dir, f"{safe_title}.docx")
                    save_docx(title, body_text, filename)
                    print(f"✅ Saved: {safe_title}")
                except Exception as e:
                    # Best effort: one bad page must not stop the others.
                    print(f"❌ Failed: {url} — {str(e)}")
        finally:
            # Close the browser even if context or page creation raised.
            await browser.close()

# Run the scraper. This relies on the notebook's support for top-level
# `await`; outside a notebook the coroutine would have to be driven by an
# event loop explicitly.
await scrape_and_save_all()


✅ Saved: نظام صندوق الاستثمارات العامة
✅ Saved: نظام مكافحة التستر
✅ Saved: نظام القياس والمعايرة
✅ Saved: نظام السجل التجاري
✅ Saved: نظام صندوق التنمية العقارية
✅ Saved: نظام صندوق التنمية الزراعية
✅ Saved: نظام السياحة
✅ Saved: نظام الكهرباء
✅ Saved: تنظيم المركز السعودي لكفاءة الطاقة
✅ Saved: تنظيم إعانة البحث عن عمل
