In [1]:
# file: boe_scraper_articles.py
import requests
from bs4 import BeautifulSoup
import csv
import re

# --------------------------
# Settings
# --------------------------
# Request headers sent with every fetch; the User-Agent names this scraper
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; BOE-ArticleScraper/1.0)"}

# LawDetails pages to scrape (one example page; append further URLs as needed)
LAW_PAGES = [
    "https://laws.boe.gov.sa/BoeLaws/Laws/LawDetails/08381293-6388-48e2-8ad2-a9a700f2aa94/1",
]

# Name of the CSV file the extracted articles are written to
OUTPUT_FILE = "boe_articles.csv"

# --------------------------
# Helpers
# --------------------------
def get_soup(url):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.

    Returns:
        BeautifulSoup: The page parsed with the built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the
            30-second timeout expires.
    """
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    # Parse the raw bytes rather than response.text: when the Content-Type
    # header carries no charset, requests falls back to ISO-8859-1 for text
    # responses, which would garble Arabic content. Only trust the header's
    # encoding when it was actually declared; otherwise let BeautifulSoup
    # detect it from the document itself (e.g. <meta charset>).
    content_type = response.headers.get("Content-Type", "")
    declared_encoding = response.encoding if "charset" in content_type.lower() else None
    return BeautifulSoup(response.content, "html.parser", from_encoding=declared_encoding)




In [2]:
# Fetch the example law page and show only the beginning of its markup;
# echoing the whole parsed document floods the notebook output.
page_preview = get_soup("https://laws.boe.gov.sa/BoeLaws/Laws/LawDetails/08381293-6388-48e2-8ad2-a9a700f2aa94/1")
print(str(page_preview)[:1500])





<!DOCTYPE html>

<html dir="rtl" lang="ar">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<meta content="width=device-width, initial-scale=0.7, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<link href="/Content/images/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="/Content/images/favicon.ico" rel="icon" type="image/ico">
<link href="/Content/css/bootstrap-rtl.min.css" rel="stylesheet" type="text/css"/>
<link href="/Content/new/css/bootstrap-grid.rtl.min.css" rel="stylesheet" type="text/css"/>
<link href="/Content/new/css/bootstrap-utilities.rtl.min.css" rel="stylesheet" type="text/css"/>
<link href="/Content/css/font-awesome.min.css" rel="stylesheet"/>
<link href="/Content/css/colorbox.css" rel="stylesheet"/>
<link href="/Content/css/main-rtl.css?d=06102025" id="sitecss" rel="stylesheet" type="text/css"/>
<link href="/Content/new/css/forntend-rtl.css

In [3]:
# Fetch the page and probe its markup to find out where the articles live
soup = get_soup("https://laws.boe.gov.sa/BoeLaws/Laws/LawDetails/08381293-6388-48e2-8ad2-a9a700f2aa94/1")

print("Looking for article structures...")

# Probe 1: <div> elements whose class mentions "article" or "مادة"
print("\n=== Checking for div with class containing 'article' or 'مادة' ===")
class_probe = re.compile(r"article|مادة", re.IGNORECASE)
article_divs = soup.find_all("div", class_=class_probe)
print(f"Found {len(article_divs)} divs")

# Probe 2: text nodes containing "مادة" (only the first three are previewed)
print("\n=== Checking for elements with 'مادة' text ===")
mada_elements = soup.find_all(string=re.compile(r"مادة"))
print(f"Found {len(mada_elements)} elements with 'مادة'")
for rank, node in enumerate(mada_elements[:3], start=1):
    owner = node.parent.name
    preview = str(node)[:100]
    print(f"\n{rank}. Parent: {owner}, Text preview: {preview}")

# Probe 3: first <div> whose class looks like a content area, then count
# "مادة (...)" runs inside its text
print("\n=== Checking main content area ===")
content_probe = re.compile(r"content|body|law", re.IGNORECASE)
main_content = soup.find("div", class_=content_probe)
if main_content:
    print(f"Found main content: {main_content.get('class')}")
    text = main_content.get_text()
    matches = re.findall(r'مادة\s+\([^)]+\).*?(?=مادة\s+\(|$)', text, re.DOTALL)
    print(f"Found {len(matches)} article patterns")

Looking for article structures...

=== Checking for div with class containing 'article' or 'مادة' ===
Found 719 divs

=== Checking for elements with 'مادة' text ===
Found 962 elements with 'مادة'

1. Parent: span, Text preview: مادة معدلة

2. Parent: span, Text preview: مادة ملغية

3. Parent: p, Text preview: 
بناء على المادة (السبعين) من النظام الأساسي للحكم، الصادر بالأمر الملكي رقم (أ/٩٠) وتاريخ ٢٧/ ٨/ ١

=== Checking main content area ===
Found main content: ['col-6', 'col-md-9', 'd-flex', 'flex-column', 'justify-content-center']
Found 0 article patterns


In [4]:
# Probe text nodes that contain an article reference such as مادة (الأولى) or مادة (١)
print("=== Looking for article numbers with Arabic numerals ===")
article_pattern = re.compile(r'مادة\s*\([^\)]+\)', re.IGNORECASE)
matches = soup.find_all(string=article_pattern)
print(f"Found {len(matches)} potential article headers")

# Inspect the first five hits (an empty result simply yields no iterations)
for ordinal, hit in enumerate(matches[:5], start=1):
    parent = hit.parent
    print(f"\n--- Article {ordinal} ---")
    print(f"Tag: {parent.name}, Class: {parent.get('class')}")
    print(f"Text: {str(hit).strip()[:100]}")

    # Gather short previews of the siblings following the parent element,
    # stopping at the first tag whose text begins with another article reference
    next_sibs = []
    for sib in parent.next_siblings:
        if isinstance(sib, str):
            snippet = sib.strip()
        else:
            snippet = sib.get_text(strip=True)
            if article_pattern.match(snippet):
                break
        if snippet:
            next_sibs.append(snippet[:50])
    print(f"Next content: {next_sibs[:2]}")

=== Looking for article numbers with Arabic numerals ===
Found 62 potential article headers

--- Article 1 ---
Tag: p, Class: None
Text: بناء على المادة (السبعين) من النظام الأساسي للحكم، الصادر بالأمر الملكي رقم (أ/٩٠) وتاريخ ٢٧/ ٨/ ١٤١
Next content: ['عبد الله بن عبد العزيز']

--- Article 2 ---
Tag: p, Class: None
Text: وبناء على المادة (العشرين) من نظام مجلس الوزراء، الصادر بالأمر الملكي رقم (أ/١٣) وتاريخ ٣/ ٣/ ١٤١٤هـ
Next content: ['عبد الله بن عبد العزيز']

--- Article 3 ---
Tag: p, Class: None
Text: وبناء على المادة (الثامنة عشرة) من نظام مجلس الشورى، الصادر بالأمر الملكي رقم (أ/٩١) وتاريخ ٢٧/ ٨/ ١
Next content: ['عبد الله بن عبد العزيز']

--- Article 4 ---
Tag: br, Class: None
Text: 2- يصدر الوزير -بالتنسيق مع الجهات المختصة- لائحة أو أكثر للفئات الواردة في الفقرات الفرعية (أ) و(ج)
Next content: []

--- Article 5 ---
Tag: div, Class: ['HTMLContainer']
Text: "1 - مع مراعاة ما ورد في المادة (السابعة والثلاثين) من هذا النظام، تضع الوزارة نموذجاً موحداً لعقد ا
Next content: []


In [5]:
# Check whether the HTMLContainer divs hold the law articles, i.e. whether
# their text starts with "مادة ("
print("=== Finding actual law articles ===")

html_containers = soup.find_all("div", class_="HTMLContainer")
print(f"Found {len(html_containers)} HTMLContainer divs")

# Preview the first three containers (no iterations when none were found)
for number, candidate in enumerate(html_containers[:3], start=1):
    text = candidate.get_text(" ", strip=True)
    print(f"\n--- Container {number} ---")
    print(f"Length: {len(text)} chars")
    print(f"Preview: {text[:200]}")

    # Report whether the container text opens with the article pattern
    verdict = "✓ This looks like an article!" if re.match(r'مادة\s*\(', text) else "✗ Not an article"
    print(verdict)

=== Finding actual law articles ===
Found 374 HTMLContainer divs

--- Container 1 ---
Length: 409 chars
Preview: يتضمن النظام : التعريفات والأحكام العامة - تنظيم عمليات التوظيف - توظيف غير السعوديين  - التدريب والتأهيل - علاقات العمل - شروط العمل وظروفه - العمل لبعض الوقت - الوقاية من مخاطر العمل والوقاية من الح
✗ Not an article

--- Container 2 ---
Length: 9 chars
Preview: Labor Law
✗ Not an article

--- Container 3 ---
Length: 9 chars
Preview: Labor Law
✗ Not an article


In [6]:
# Keep only the HTMLContainers whose text opens with "مادة (", allowing
# leading whitespace and left-to-right / right-to-left marks
leading_article = re.compile(r'^[\s\u200f\u200e]*مادة\s*\(')
article_containers = [
    candidate
    for candidate in html_containers
    if leading_article.match(candidate.get_text(" ", strip=True))
]

print(f"Found {len(article_containers)} actual article containers")

# Preview the first five articles
for number, candidate in enumerate(article_containers[:5], start=1):
    # Drop direction marks, then squeeze whitespace runs to single spaces
    cleaned = re.sub(r'[\u200f\u200e]+', '', candidate.get_text(" ", strip=True))
    cleaned = re.sub(r'\s+', ' ', cleaned)

    print(f"\n=== Article {number} ===")
    print(cleaned[:300])
    print("...")

Found 0 actual article containers


In [7]:
# Collect every HTMLContainer whose text mentions the word مادة anywhere
print("=== Checking all HTMLContainers for 'مادة' ===")
containers_with_mada = [
    candidate
    for candidate in html_containers
    if 'مادة' in candidate.get_text(" ", strip=True)
]

print(f"Found {len(containers_with_mada)} containers with 'مادة'")

# Preview the first five, with direction marks removed and whitespace squeezed
for number, candidate in enumerate(containers_with_mada[:5], start=1):
    cleaned = re.sub(r'[\u200f\u200e]+', '', candidate.get_text(" ", strip=True))
    cleaned = re.sub(r'\s+', ' ', cleaned)

    print(f"\n=== Container {number} ===")
    print(f"First 400 chars: {cleaned[:400]}")

=== Checking all HTMLContainers for 'مادة' ===
Found 136 containers with 'مادة'

=== Container 1 ===
First 400 chars: بعون الله تعالى نحن عبد الله بن عبد العزيز آل سعود ملك المملكة العربية السعودية بناء على المادة (السبعين) من النظام الأساسي للحكم، الصادر بالأمر الملكي رقم (أ/٩٠) وتاريخ ٢٧/ ٨/ ١٤١٢هـ. وبناء على المادة (العشرين) من نظام مجلس الوزراء، الصادر بالأمر الملكي رقم (أ/١٣) وتاريخ ٣/ ٣/ ١٤١٤هـ. وبناء على المادة (الثامنة عشرة) من نظام مجلس الشورى، الصادر بالأمر الملكي رقم (أ/٩١) وتاريخ ٢٧/ ٨/ ١٤١٢هـ. وبعد ا

=== Container 2 ===
First 400 chars: - صدر المرسوم الملكي رقم (م/44) وتاريخ 1446/2/8هـ، وذلك بالموافقة على تعديلات بعض مواد نظام العمل (ويعمل بها من تاريخ 1446/8/20هـ) وتم إضافة تعريفين إلى هذه المادة وتكون بالنص الآتي: 1- "الإسناد: خدمة توفير عامل للعمل لدى غير صاحب العمل وذلك من خلال منشأة مرخص لها لهذا الغرض. 2- الاستقالة: إفصاح العامل كتابة عن رغبته دون إكراه في إنهاء عقد عمل محدد المدة دون تعليق على قيد أو شرط، وقبول صاحب العمل 

=== Container 3 ===
First 400 chars: عُدل

In [8]:
# Search div/h2/h3/h4 elements whose string starts with "المادة (" or "مادة ("
print("=== Looking for article header structure ===")

header_pattern = re.compile(r'^\s*المادة\s+\(|^\s*مادة\s+\(')
article_headers = soup.find_all(["div", "h2", "h3", "h4"], string=header_pattern)
print(f"Found {len(article_headers)} article headers")

for number, header in enumerate(article_headers[:5], start=1):
    print(f"\n--- Header {number} ---")
    print(f"Tag: {header.name}, Class: {header.get('class')}")
    print(f"Text: {header.get_text(strip=True)}")

    # The article body may sit in the node right after the header
    following = header.next_sibling
    if following:
        print(f"Next sibling type: {type(following)}")
        if hasattr(following, 'get_text'):
            print(f"Next text: {following.get_text(strip=True)[:100]}")

    # Show the enclosing element as well
    wrapper = header.parent
    print(f"Parent: {wrapper.name}, Parent class: {wrapper.get('class')}")

=== Looking for article header structure ===
Found 0 article headers


In [9]:
# Alternative hypothesis: the article number is stored apart from its content
# (ordinals such as الأولى / الثانية, or parenthesised numerals such as (١), (٢))
print("=== Looking for article number elements ===")

# Divs whose class hints at an accordion / collapsible / article widget
widget_probe = re.compile(r"accordion|collapse|article", re.IGNORECASE)
accordions = soup.find_all(["div"], class_=widget_probe)
print(f"Found {len(accordions)} potential accordion/article containers")

# Divs carrying a data-article attribute
article_divs = soup.find_all("div", attrs={"data-article": True})
print(f"Found {len(article_divs)} divs with data-article attribute")

# Broader view: direct child divs of the "col-6 col-md-9" div
main_content = soup.find("div", class_=re.compile(r"col-6 col-md-9"))
if main_content:
    child_divs = main_content.find_all("div", recursive=False)
    print(f"\nFound {len(child_divs)} direct child divs in main content")

    # Preview the first three children
    for number, child in enumerate(child_divs[:3], start=1):
        print(f"\n--- Child {number} ---")
        print(f"Classes: {child.get('class')}")
        print(f"Text preview: {child.get_text(strip=True)[:200]}")

=== Looking for article number elements ===
Found 719 potential accordion/article containers
Found 0 divs with data-article attribute

Found 2 direct child divs in main content

--- Child 1 ---
Classes: ['row']
Text preview: البحث في الوثائق النظاميةتسجيل الدخولENحجم الخط+-اللون الرمادي

--- Child 2 ---
Classes: ['row']
Text preview: 


In [None]:
# Probe for accordion-style markup (class names containing accordion-item / card)
print("=== Looking for accordion items ===")

accordion_items = soup.find_all("div", class_=re.compile(r"accordion-item|card"))
print(f"Found {len(accordion_items)} accordion items")

# Buttons that might toggle an accordion
accordion_buttons = soup.find_all("button", class_=re.compile(r"accordion"))
print(f"Found {len(accordion_buttons)} accordion buttons")

# Different approach: heading tags whose text mentions an article.
# Testing for 'مادة' alone is enough because 'المادة' contains it.
headings = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
article_headings = [h for h in headings if 'مادة' in h.get_text()]
print(f"\nFound {len(article_headings)} headings with 'مادة'")

for number, heading in enumerate(article_headings[:5], start=1):
    print(f"\n--- Heading {number} ---")
    print(f"Tag: {heading.name}")
    print(f"Text: {heading.get_text(strip=True)}")
    print(f"Class: {heading.get('class')}")

In [None]:

def extract_articles(soup):
    """
    Extract all مواد (articles) from the page.

    Candidate blocks are the <p>, <div> and <span> elements that
    ``soup.find_all`` returns for a string filter on the word 'مادة'. Each
    candidate's text is cleaned (direction marks removed, whitespace runs
    collapsed) and kept only if it starts with 'مادة'.

    Args:
        soup: Parsed page; must offer ``find_all`` the way BeautifulSoup does.

    Returns:
        list[str]: Cleaned article texts in document order, without repeats.
    """
    articles = []
    seen = set()
    # Find all text blocks that contain the word مادة
    possible_blocks = soup.find_all(["p", "div", "span"], string=re.compile(r"مادة"))

    for block in possible_blocks:
        text = block.get_text(" ", strip=True)
        # Left-to-right / right-to-left marks are invisible but would make
        # the startswith() check below fail, so remove them first.
        text = re.sub(r"[\u200f\u200e]+", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        if not text.startswith("مادة"):
            continue
        # A wrapper element and its only child can report the same string,
        # so identical texts are emitted once only.
        if text in seen:
            continue
        seen.add(text)
        articles.append(text)
    return articles

# --------------------------
# Main
# --------------------------
def main():
    """Scrape every page in LAW_PAGES and write the articles to OUTPUT_FILE.

    Each extracted article becomes one CSV row with the columns ``law_url``
    and ``article_text``. Progress and the final row count are printed.
    """
    columns = ["law_url", "article_text"]
    rows = []

    for page_url in LAW_PAGES:
        print(f"Scraping {page_url} ...")
        found = extract_articles(get_soup(page_url))
        rows.extend({"law_url": page_url, "article_text": body} for body in found)
        print(f"  Found {len(found)} مواد")

    # newline="" lets the csv module control the line endings itself
    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out_file:
        writer = csv.DictWriter(out_file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

    print(f"\nDone! Saved {len(rows)} مواد in {OUTPUT_FILE}")

if __name__ == "__main__":
    main()


In [12]:
# Save the HTML content of a law page to a file for offline inspection.
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() prevents an HTTP error page from being saved as if it
# were the law page.
response = requests.get(
    "https://laws.boe.gov.sa/BoeLaws/Laws/LawDetails/83f450eb-7985-461f-b053-a9a700f2ba08/1",
    headers=HEADERS,
    timeout=30,
)
response.raise_for_status()
with open("law_page.html", "w", encoding="utf-8") as f:
    f.write(response.text)
print("HTML content saved to law_page.html")

HTML content saved to law_page.html
