In [None]:
# 1. Install necessary libraries
print("Installing required libraries...")
!pip install chromedriver_autoinstaller selenium beautifulsoup4 python-docx
print("Libraries installed.\n")
# 2. Install chromium-browser
print("Installing chromium-browser...")
!apt-get update
!apt-get install chromium-browser -y
print("chromium-browser installed.\n")

In [4]:
# 3. Import necessary modules
import os
import time
from urllib.parse import urljoin

import chromedriver_autoinstaller
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options



# Tunables for the listing crawl, kept together so they are easy to adjust.
BASE_URL = "https://laws.moj.gov.sa"
FIRST_PAGE = 1
LAST_PAGE = 9
PAGE_SIZE = 9
RENDER_WAIT_SECONDS = 2  # fixed pause that gives the page's JavaScript time to render

# 4. Set up headless Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.binary_location = "/usr/bin/chromium-browser"

# 5. Start the driver
print("Starting Selenium WebDriver...")
driver = webdriver.Chrome(options=chrome_options)
print("WebDriver started.\n")

# 6. Container for results (legislation links) - this will be accessible in Cell 2.
# Each entry is a dict with the keys "title" and "url".
legislation_results = []
# (title, url) pairs already collected; gives constant-time duplicate detection
# instead of scanning the whole result list for every anchor.
seen_links = set()

try:
    print(f"Starting to fetch legislation links (Pages {FIRST_PAGE}-{LAST_PAGE})...")
    # Loop through the listing pages
    for page_num in range(FIRST_PAGE, LAST_PAGE + 1):
        url = (
            f"{BASE_URL}/legislations-regulations"
            f"?pageNumber={page_num}&pageSize={PAGE_SIZE}&sortingBy=7"
        )
        driver.get(url)
        time.sleep(RENDER_WAIT_SECONDS)  # wait for JS rendering

        soup = BeautifulSoup(driver.page_source, "html.parser")
        anchors = soup.select("a.details.d-flex.text-dark")

        print(f"✅ Page {page_num}: {len(anchors)} items found.")

        for a in anchors:
            title_tag = a.select_one("h2.title")
            href = a.get("href")
            # Skip cards that lack either a title heading or a link target.
            if not title_tag or not href:
                continue

            title = title_tag.get_text(strip=True)
            # urljoin copes with relative hrefs (with or without a leading slash)
            # as well as already-absolute ones; plain concatenation does not.
            full_url = urljoin(BASE_URL, href)

            link_key = (title, full_url)
            if link_key not in seen_links:
                seen_links.add(link_key)
                legislation_results.append({"title": title, "url": full_url})
finally:
    # 7. Quit the driver after the first scraping task is done - also when a page
    # load or parse step raises, so no headless browser process is left behind.
    driver.quit()
    print("\nWebDriver for link scraping closed.")

print(f"\nCollected {len(legislation_results)} unique legislation links.")

Auto-installing chromedriver...
Chromedriver installed.

Starting Selenium WebDriver...
WebDriver started.

Starting to fetch legislation links (Pages 1-9)...
✅ Page 1: 9 items found.
✅ Page 2: 9 items found.
✅ Page 3: 9 items found.
✅ Page 4: 9 items found.
✅ Page 5: 9 items found.
✅ Page 6: 9 items found.
✅ Page 7: 9 items found.
✅ Page 8: 7 items found.
✅ Page 9: 0 items found.

WebDriver for link scraping closed.

Collected 61 unique legislation links. Ready for Cell 2.


In [None]:
# Colab Cell 2: Visit Individual Pages, Copy Text, and Save as .docx

# 1. Re-import necessary modules
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

# 2. Set up headless Chrome options again
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.binary_location = "/usr/bin/chromium-browser"

# NOTE(review): this feature flag is kept from the original setup; it appears to be
# unrelated to clipboard access - confirm whether it is needed before removing it.
chrome_options.add_argument("--enable-features=VaapiVideoDecode")
# Allow pages to use the clipboard so the copied text can be read back below.
chrome_options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.clipboard": 1 # 1 for allow, 2 for block
})

# 3. Start the driver for the second task
print("Starting Selenium WebDriver for document content scraping...")
driver = webdriver.Chrome(options=chrome_options)

try:
    driver.set_script_timeout(60) # Set script execution timeout to 60 seconds
    print("WebDriver started.\n")

    # 4. Create a directory to save the copied text files
    output_dir = "copied_legislation_docs"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Saving copied texts (as .docx) to: {output_dir}/\n")

    # 5. Loop through each collected legislation link to get its text content.
    # `legislation_results` is produced by the link-scraping cell: a list of dicts
    # with the keys "title" and "url".
    total = len(legislation_results)
    print(f"Processing {total} individual legislation pages...")

    wait = WebDriverWait(driver, 20)  # same timeout for every page, so build it once
    used_stems = set()                # filename stems already written in this run
    previous_text = None              # clipboard content saved for the previous page

    for i, item in enumerate(legislation_results):
        title = item["title"]
        url = item["url"]

        print(f"({i+1}/{total}) Visiting: {title}")
        try:
            driver.get(url)

            copy_button = wait.until(EC.element_to_be_clickable((By.ID, "copy-btn")))

            copy_button.click()
            time.sleep(3) # Give more time for the clipboard operation to complete

            # Attempt to read the clipboard content using JavaScript
            clipboard_text = driver.execute_script("return navigator.clipboard.readText();")

            if not clipboard_text:
                print(f"  --> Warning: Could not retrieve clipboard text for '{title}' (empty or permission denied).")
                continue

            # If the copy click silently failed, the clipboard still holds the text
            # of the previously processed page; saving it would file that text
            # under the wrong title.
            if clipboard_text == previous_text:
                print(f"  --> Warning: Clipboard unchanged after copying '{title}'; skipping to avoid saving the previous page's text.")
                continue
            previous_text = clipboard_text

            # Build a filesystem-friendly name: keep letters/digits/whitespace,
            # truncate to 100 chars, replace spaces with underscores.
            safe_title = "".join(c for c in title if c.isalnum() or c.isspace()).strip()
            stem = safe_title[:100].replace(' ', '_')
            if not stem:
                # Title consisted only of stripped characters; avoid a bare ".docx".
                stem = f"legislation_{i+1}"
            if stem in used_stems:
                # Two titles reduced to the same name; suffix the position so the
                # later document does not overwrite the earlier one.
                stem = f"{stem}_{i+1}"
            used_stems.add(stem)
            filename = f"{stem}.docx"
            filepath = os.path.join(output_dir, filename)

            document = Document()

            # NOTE(review): `paragraph_format.bidi` does not appear among python-docx's
            # documented ParagraphFormat properties, so the two `bidi = True`
            # assignments below may have no effect on the saved file - confirm
            # against the installed python-docx version.

            # --- Title Paragraph: Right-aligned, RTL ---
            title_paragraph = document.add_paragraph()
            title_paragraph.add_run(title)
            title_paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            title_paragraph.paragraph_format.bidi = True # Bi-directional text for RTL

            # --- Main Text Paragraph: Right-aligned, RTL ---
            # No spacer paragraph is inserted between the title and the body.
            main_text_paragraph = document.add_paragraph()
            main_text_paragraph.add_run(clipboard_text)
            main_text_paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT # Ensure main text is also right-aligned
            main_text_paragraph.paragraph_format.bidi = True # Bi-directional text for RTL

            document.save(filepath)
            print(f"  --> Successfully saved text to '{filename}'")

        except Exception as e:
            # Best effort: report the failure and carry on with the next page.
            print(f"  --> Error processing '{title}': {e}")
finally:
    # 6. Quit the driver after all individual pages are processed - also when
    # something outside the per-page handler raises, so no browser is leaked.
    driver.quit()
    print("\nWebDriver for document content scraping closed.")

print("\nScript finished. Check the 'copied_legislation_docs' folder in your Colab files.")

Starting Selenium WebDriver for document content scraping...
WebDriver started.

Saving copied texts (as .docx) to: copied_legislation_docs/

Processing 70 individual legislation pages...
(1/70) Visiting: اللائحة التنفيذية لإجراءات الاستئناف
  --> Successfully saved text to 'اللائحة_التنفيذية_لإجراءات_الاستئناف.docx'
(2/70) Visiting: نظام التكاليف القضائية
  --> Successfully saved text to 'نظام_التكاليف_القضائية.docx'
(3/70) Visiting: اللائحة التنفيذية لنظام المحاكم التجارية
  --> Successfully saved text to 'اللائحة_التنفيذية_لنظام_المحاكم_التجارية.docx'
(4/70) Visiting: قواعد العمل في مكاتب المصالحة وإجراءاته
  --> Successfully saved text to 'قواعد_العمل_في_مكاتب_المصالحة_وإجراءاته.docx'
(5/70) Visiting: ضوابط تسلم المؤجر الأصول المنقولة
  --> Successfully saved text to 'ضوابط_تسلم_المؤجر_الأصول_المنقولة.docx'
(6/70) Visiting: لائحة الوثائق القضائية
  --> Successfully saved text to 'لائحة_الوثائق_القضائية.docx'
(7/70) Visiting: آلية الاستعانة بمحام على نفقة الدولة للمتهم في الجرائم ال