In [24]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
import time
from urllib.parse import urljoin
import chromedriver_autoinstaller

chromedriver_autoinstaller.install()

# Headless Chrome with a desktop-sized window.
options = Options()
options.add_argument("--headless=new")
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)

base_url = "https://mc.gov.sa/ar/Regulations/Pages/"
start_url = f"{base_url}default.aspx"

all_links = set()
visited_pages = set()

try:
    driver.get(start_url)

    # Wait for the ordering drop-down to exist, then choose the option
    # whose value is "1".
    dropdown_locator = (By.ID, "orderDropDownList")
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(dropdown_locator)
    )
    select = Select(driver.find_element(*dropdown_locator))
    select.select_by_value("1")
    time.sleep(2)  # fixed pause after changing the selection

    # Wait for the regulation anchors, then harvest their targets.
    anchor_locator = (By.CSS_SELECTOR, "#regulationsContainer a.rule-clickable-part")
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(anchor_locator)
    )
    links = driver.find_elements(*anchor_locator)

    # Anchors without an href are skipped; the rest are resolved against
    # base_url and de-duplicated by the set.
    hrefs = (link.get_attribute("href") for link in links)
    all_links.update(urljoin(base_url, href) for href in hrefs if href)

    print(f"‚úÖ Total links collected: {len(all_links)}")

except Exception as e:
    print(f"‚ùå Error: {e}")
finally:
    # Always shut the browser down, whether or not scraping succeeded.
    driver.quit()

# Rebind to a sorted list; the next notebook cell iterates over `all_links`.
all_links = sorted(all_links)


✅ Total links collected: 20


In [None]:
import os
import re
import json
from urllib.request import Request, urlopen
from urllib.parse import urlparse, parse_qs
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn

# ---------- Helpers ----------

def apply_rtl(paragraph):
    """Mark a python-docx paragraph as right-to-left.

    Any existing ``w:bidi`` / ``w:textDirection`` children of the paragraph
    properties are removed first, then fresh ones are inserted so that
    ``w:bidi`` (val="1") is the first child and ``w:textDirection``
    (val="rtl") the second.

    Args:
        paragraph: Object exposing ``_element.get_or_add_pPr()``.
    """
    props = paragraph._element.get_or_add_pPr()

    # Clear earlier markers so repeated calls do not stack duplicates.
    stale = [
        node
        for tag in ('w:bidi', 'w:textDirection')
        for node in props.findall(qn(tag))
    ]
    for node in stale:
        props.remove(node)

    # Inserted first at index 0, so it ends up after the bidi flag below.
    direction = OxmlElement('w:textDirection')
    direction.set(qn('w:val'), 'rtl')
    props.insert(0, direction)

    bidi_flag = OxmlElement('w:bidi')
    bidi_flag.set(qn('w:val'), '1')
    props.insert(0, bidi_flag)


def clean_meta_info(text: str) -> str:
    """Tidy a pipe-separated metadata string.

    Two pipes separated only by whitespace are merged into one (single
    left-to-right pass), every ``' | '`` is then tightened to ``'|'``, and
    surrounding whitespace is stripped.
    """
    merged = re.sub(r'\|\s*\|', '|', text)
    tightened = merged.replace(' | ', '|')
    return tightened.strip()


def add_articles_from_list(articles, content, is_arabic=True):
    """Append heading/body entries for each article to ``content`` in place.

    For every dict in ``articles``:

    * its display name (Arabic or English, per ``is_arabic``) is appended
      as ``('heading', title)`` when present;
    * its body text is appended as ``('article_body', text)`` followed by
      ``('empty', '')``, with ``\\r\\n`` / ``\\r`` normalised to ``\\n``.

    When ``originalArticleId`` is set and ``article2.articleContent`` is
    non-empty, that content is used for the body instead of the article's
    own ``articleContent``.

    Malformed pieces (non-dict article, non-dict ``article2`` or
    ``articleContent``, non-string text) are skipped rather than raising,
    so one bad record does not abort the export of the whole law.

    Args:
        articles: Iterable of article dicts, or a falsy value for "none".
        content: List that receives ``(kind, text)`` tuples.
        is_arabic: Select the ``*Ar`` fields when true, ``*En`` otherwise.
    """
    if not articles:
        return

    title_key = 'displayNameAr' if is_arabic else 'displayNameEn'
    text_key = 'textAr' if is_arabic else 'textEn'

    for art in articles:
        if not isinstance(art, dict):
            continue

        # Heading (article title)
        title = art.get(title_key)
        if title:
            content.append(('heading', title))

        # Body: start from the article's own content, then prefer the
        # article2 content when this record has an originalArticleId.
        body_obj = art.get('articleContent')
        if art.get('originalArticleId') is not None:
            article2 = art.get('article2')
            replacement = (
                article2.get('articleContent')
                if isinstance(article2, dict)
                else None
            )
            if replacement:
                body_obj = replacement

        if not isinstance(body_obj, dict):
            continue

        text = body_obj.get(text_key)
        if isinstance(text, str) and text:
            normalized = text.replace('\r\n', '\n').replace('\r', '\n')
            content.append(('article_body', normalized))
            content.append(('empty', ''))


def build_content_from_law(law_obj, is_arabic=True):
    """Flatten a law object into an ordered list of ``(kind, text)`` entries.

    Traversal order: for each part, its title, its articles, then each of
    its chapters (title, each extension's title and articles, then the
    chapter's own articles); finally the articles attached directly to the
    law. Non-dict parts, chapters and extensions are skipped. Per the
    original author's note, this mirrors the hierarchy the page's
    JavaScript builds from the JSON API.

    Args:
        law_obj: Dict that may hold ``parts`` and ``articles``.
        is_arabic: Use ``titleAr`` when true, ``titleEn`` otherwise; also
            forwarded to ``add_articles_from_list``.

    Returns:
        The list of entries, in traversal order.
    """
    title_key = 'titleAr' if is_arabic else 'titleEn'
    content = []

    def push_heading(node):
        # Emit a heading entry only when the node carries a title.
        heading = node.get(title_key)
        if heading:
            content.append(('heading', heading))

    for part in law_obj.get('parts') or []:
        if not isinstance(part, dict):
            continue
        push_heading(part)
        add_articles_from_list(part.get('articles'), content, is_arabic=is_arabic)

        for chapter in part.get('chapters') or []:
            if not isinstance(chapter, dict):
                continue
            push_heading(chapter)

            # Extensions come before the chapter's own articles.
            for extension in chapter.get('extensions') or []:
                if not isinstance(extension, dict):
                    continue
                push_heading(extension)
                add_articles_from_list(
                    extension.get('articles'), content, is_arabic=is_arabic
                )

            add_articles_from_list(
                chapter.get('articles'), content, is_arabic=is_arabic
            )

    # Articles attached directly to the law come last.
    add_articles_from_list(law_obj.get('articles'), content, is_arabic=is_arabic)
    return content


def save_docx(law_title: str, section_name: str, meta_info: str, content):
    """Render one law section to ``exported_docs/<law_title> - <section_name>.docx``.

    The document gets a "Heading 1" title line, an optional metadata
    paragraph followed by a blank paragraph, and then one paragraph per
    content entry. Every text paragraph is passed through ``apply_rtl``.

    Args:
        law_title: Title of the law; used in the heading and the file name.
        section_name: Section label; used in the heading and the file name.
        meta_info: Optional metadata text; skipped when empty.
        content: Iterable of ``(kind, text)`` tuples where ``kind`` is
            ``'heading'``, ``'article_body'`` or ``'empty'``. Other kinds
            are ignored.
    """
    doc = Document()
    style = doc.styles['Normal']
    style.font.name = 'Arial'
    style.font.size = Pt(12)

    p = doc.add_paragraph(f"{law_title} - {section_name}")
    p.style = doc.styles['Heading 1']
    apply_rtl(p)

    if meta_info:
        meta = doc.add_paragraph(meta_info)
        apply_rtl(meta)
        doc.add_paragraph()

    for kind, text in content:
        if kind == 'heading':
            p = doc.add_paragraph(text)
            p.style = doc.styles['Heading 2']
            apply_rtl(p)
        elif kind == 'article_body':
            # One paragraph per non-blank line of the article body.
            for line in text.split('\n'):
                line = line.strip()
                if not line:
                    continue
                p = doc.add_paragraph(line)
                apply_rtl(p)
        elif kind == 'empty':
            doc.add_paragraph()

    os.makedirs('exported_docs', exist_ok=True)
    # Build the name from section_name so different sections of the same law
    # do not overwrite each other. Titles come from remote data, so path
    # separators and other reserved or control characters are replaced to
    # keep the file inside exported_docs and the name valid.
    base_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', f"{law_title} - {section_name}")
    path = os.path.join('exported_docs', f"{base_name}.docx")
    doc.save(path)
    print(f"üíæ Saved: {path}")


def extract_law_id(url: str) -> str:
    """Return the first ``lawId`` query-string value of ``url``.

    The lookup is case-sensitive. An empty string is returned when the
    parameter is missing or has a blank value.
    """
    query_params = parse_qs(urlparse(url).query)
    values = query_params.get('lawId')
    return values[0] if values else ''


# ---------- MAIN: call Regulations JSON API directly ----------

# Browser-like User-Agent sent with every API request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36'
}

# Endpoint template; {law_id} is filled in per regulation.
api_template = (
    "https://mc.gov.sa/_layouts/15/MCI/RegulationsAPIs.ashx"
    "?siteURL=https://regulations.mc.gov.sa/&lng=ar&op=GetLowByID&Id={law_id}"
)

for url in all_links:
    law_id = extract_law_id(url)
    if not law_id:
        print(f"‚ö†Ô∏è Skipping URL without lawId: {url}")
        continue

    # Any failure for one law is reported and the loop moves on to the next.
    try:
        api_url = api_template.format(law_id=law_id)
        print(f"\nüåê Fetching JSON: {api_url}")
        with urlopen(Request(api_url, headers=headers), timeout=30) as resp:
            # Undecodable bytes are dropped rather than raising.
            raw = resp.read().decode('utf-8', errors='ignore')
        data = json.loads(raw)

        # The payload may be a list under 'value', a single object under
        # 'value', or the top-level object itself.
        value = data.get('value')
        if not isinstance(value, list):
            law_obj = value or data
        elif value:
            # First element is the main law object.
            law_obj = value[0]
        else:
            print("‚ö†Ô∏è Empty value list in API response")
            continue

        if not isinstance(law_obj, dict):
            print("‚ö†Ô∏è Unexpected law object structure, skipping")
            continue

        raw_title = law_obj.get('nameAr') or law_obj.get('nameEn') or 'ÿ®ÿØŸàŸÜ ÿπŸÜŸàÿßŸÜ'
        law_title = raw_title.strip()
        print(f"üìÑ Law title: {law_title}")

        # Meta info: use the summary when one is available.
        summary = law_obj.get('summaryAr') or law_obj.get('summaryEn') or ''
        meta_info = summary.strip()

        content = build_content_from_law(law_obj, is_arabic=True)
        if not content:
            print('‚ö†Ô∏è No article content built from JSON for this law')
            continue
        save_docx(law_title, 'ÿßŸÑŸÜÿ∏ÿßŸÖ', meta_info, content)
    except Exception as e:
        print(f"‚ùå Error processing {url}: {e}")

print('‚ú® Done (API-based).')



🌐 Fetching JSON: https://mc.gov.sa/_layouts/15/MCI/RegulationsAPIs.ashx?siteURL=https://regulations.mc.gov.sa/&lng=ar&op=GetLowByID&Id=07140004-6a05-48e3-bb04-a8250094bb85
📄 Law title: نظام الشركات
💾 Saved: exported_docs/نظام الشركات - النظام.docx

🌐 Fetching JSON: https://mc.gov.sa/_layouts/15/MCI/RegulationsAPIs.ashx?siteURL=https://regulations.mc.gov.sa/&lng=ar&op=GetLowByID&Id=1b7a7975-05b2-42f8-865d-a847007e63bf
📄 Law title: القانون (النظام) الموحد لمكافحة الإغراق والتدابير التعويضية والوقائية لدول مجلس التعاون لدول الخليج العربية
💾 Saved: exported_docs/القانون (النظام) الموحد لمكافحة الإغراق والتدابير التعويضية والوقائية لدول مجلس التعاون لدول الخليج العربية - النظام.docx

üåê Fetching JSON: https://mc.gov.sa/_layouts/15/MCI/RegulationsAPIs.ashx?