In [4]:
#Dependencies
%pip install selenium python-docx beautifulsoup4 webdriver-manager

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import os
import time
from bs4 import BeautifulSoup
from docx import Document
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

OUTPUT_FOLDER = "GOSI_DOCX"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
MAIN_URL = "https://www.gosi.gov.sa/ar/SystemsAndRegulations"

def make_rtl(paragraph):
    """Set paragraph to right-to-left direction."""
    p = paragraph._p
    pPr = p.get_or_add_pPr()
    bidi = OxmlElement('w:bidi')
    bidi.set(qn('w:val'), 'true')
    pPr.append(bidi)

def scrape_and_save_docx_rtl():
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    # Initialize webdriver with automatic driver management
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    wait = WebDriverWait(driver, 20)
    
    try:
        print("üåê Navigating to GOSI website...")
        driver.get(MAIN_URL)
        
        # Wait for the list to load
        print("‚è≥ Waiting for page to load...")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mediaCenterElements li")))
        items_all = driver.find_elements(By.CSS_SELECTOR, "#mediaCenterElements li")

        # Collect titles except "ŸÉÿ™Ÿäÿ®ÿßÿ™ ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©"
        titles = []
        for item in items_all:
            text = item.text.strip()
            if text and text != "ŸÉÿ™Ÿäÿ®ÿßÿ™ ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©":
                titles.append(text)

        print(f"‚úÖ Total items to visit: {len(titles)}\n")

        for idx, title in enumerate(titles, start=1):
            print(f"‚û° Visiting ({idx}/{len(titles)}): {title}")

            # Go back to main page for each new item
            driver.get(MAIN_URL)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mediaCenterElements li")))

            # Find the <li> by title
            items = driver.find_elements(By.CSS_SELECTOR, "#mediaCenterElements li")
            item_to_click = None
            for item in items:
                if item.text.strip() == title:
                    item_to_click = item
                    break

            if not item_to_click:
                print(f"‚ö† Could not find item: {title}")
                continue

            # Click the item
            try:
                driver.execute_script("arguments[0].click();", item_to_click)
                
                # Wait for content to load
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#systemsAndRegulationsPageContent")))
                time.sleep(2)  # allow Angular to render fully

                # Get main content div
                content_div = driver.find_element(By.CSS_SELECTOR, "#systemsAndRegulationsPageContent")
                html_content = content_div.get_attribute('innerHTML')

                if not html_content.strip():
                    print(f"‚ö† No content found for: {title}")
                    continue

                # Parse HTML to plain text
                plain_text = BeautifulSoup(html_content, "html.parser").get_text(separator="\n", strip=True)

                if not plain_text.strip():
                    print(f"‚ö† No text content found for: {title}")
                    continue

                # Save as DOCX with RTL
                doc = Document()
                para = doc.add_paragraph(plain_text)
                make_rtl(para)
                safe_name = "".join(c for c in title if c.isalnum() or c in " _-")
                output_path = os.path.join(OUTPUT_FOLDER, f"{safe_name}.docx")
                doc.save(output_path)
                print(f"‚úÖ Saved DOCX (RTL): {output_path}\n")
                
            except Exception as e:
                print(f"‚ö† Error processing {title}: {str(e)}")
                continue

    except Exception as e:
        print(f"‚ùå Fatal error: {str(e)}")
    finally:
        driver.quit()
        
    print(f"\n‚úÖ All pages processed and saved as DOCX in '{OUTPUT_FOLDER}/' folder.")

# Run the scraping function
scrape_and_save_docx_rtl()

üåê Navigating to GOSI website...
‚è≥ Waiting for page to load...
‚úÖ Total items to visit: 9

‚û° Visiting (1/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1421 Ÿá
‚è≥ Waiting for page to load...
‚úÖ Total items to visit: 9

‚û° Visiting (1/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1421 Ÿá
‚ö† Error processing ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1421 Ÿá: Message: 
Stacktrace:
	GetHandleVerifier [0x0x690c13+66051]
	GetHandleVerifier [0x0x690c54+66116]
	(No symbol) [0x0x46db33]
	(No symbol) [0x0x4b78a8]
	(No symbol) [0x0x4b7c4b]
	(No symbol) [0x0x4fe0a2]
	(No symbol) [0x0x4d9fd4]
	(No symbol) [0x0x4fb7f1]
	(No symbol) [0x0x4d9d86]
	(No symbol) [0x0x4ab53e]
	(No symbol) [0x0x4ac414]
	GetHandleVerifier [0x0x8d8a13+2457603]
	GetHandleVerifier [0x0x8d39d2+2437058]
	GetHandleVerifier [0x0x6b97f2+232930]
	GetHandleVerifier [0x0x6a9a18+167944]
	GetHandleVerifier [0x0x6b092d+196381]
	GetHandleVerifier [0x0x698ee8+99544]
	GetHandleVerifier [0x0x699082+99954]
	GetH

In [6]:
# Diagnostic version to debug the scraping process
import os
import time
from bs4 import BeautifulSoup
from docx import Document
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

OUTPUT_FOLDER = "GOSI_DOCX"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
MAIN_URL = "https://www.gosi.gov.sa/ar/SystemsAndRegulations"

def make_rtl(paragraph):
    """Set paragraph to right-to-left direction."""
    p = paragraph._p
    pPr = p.get_or_add_pPr()
    bidi = OxmlElement('w:bidi')
    bidi.set(qn('w:val'), 'true')
    pPr.append(bidi)

def debug_website_structure():
    """Debug function to check what's available on the website"""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    wait = WebDriverWait(driver, 20)
    
    try:
        print("üåê Navigating to GOSI website...")
        driver.get(MAIN_URL)
        print(f"‚úÖ Page loaded. Title: {driver.title}")
        
        # Check if the page loaded correctly
        print(f"üìÑ Current URL: {driver.current_url}")
        
        # Wait and check for various possible selectors
        time.sleep(5)  # Give page time to load
        
        # Check for different possible selectors
        selectors_to_check = [
            "#mediaCenterElements li",
            ".media-center-elements li", 
            "[id*='media'] li",
            "[class*='media'] li",
            "ul li",
            ".list-item",
            ".menu-item"
        ]
        
        for selector in selectors_to_check:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                print(f"üîç Selector '{selector}': Found {len(elements)} elements")
                if elements:
                    for i, elem in enumerate(elements[:5]):  # Show first 5
                        text = elem.text.strip()
                        print(f"   [{i+1}] '{text}'")
                    if len(elements) > 5:
                        print(f"   ... and {len(elements)-5} more")
                    break
            except Exception as e:
                print(f"‚ùå Error with selector '{selector}': {e}")
        
        # Check page source for debugging
        page_source = driver.page_source
        print(f"\nüìä Page source length: {len(page_source)} characters")
        
        # Look for key Arabic terms in the page source
        arabic_terms = ["ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©", "ÿßŸÑŸÑŸàÿßÿ¶ÿ≠", "ŸÉÿ™Ÿäÿ®ÿßÿ™", "GOSI"]
        for term in arabic_terms:
            if term in page_source:
                print(f"‚úÖ Found '{term}' in page source")
            else:
                print(f"‚ùå '{term}' not found in page source")
        
    except Exception as e:
        print(f"‚ùå Error during debugging: {e}")
    finally:
        driver.quit()

# Run the debug function
debug_website_structure()

üåê Navigating to GOSI website...
‚úÖ Page loaded. Title: ÿßŸÑŸÖÿ§ÿ≥ÿ≥ÿ© ÿßŸÑÿπÿßŸÖÿ© ŸÑŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ©
üìÑ Current URL: https://www.gosi.gov.sa/ar/SystemsAndRegulations
üîç Selector '#mediaCenterElements li': Found 10 elements
   [1] 'ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1421 Ÿá'
   [2] 'ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1445Ÿá'
   [3] 'ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ŸÇÿßÿπÿØ ÿßŸÑŸÖÿØŸÜŸä'
   [4] 'ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ŸÇÿßÿπÿØ ÿßŸÑÿπÿ≥ŸÉÿ±Ÿä'
   [5] 'ŸÜÿ∏ÿßŸÖ ÿ™ÿ®ÿßÿØŸÑ ÿßŸÑŸÖŸÜÿßŸÅÿπ'
   ... and 5 more

üìä Page source length: 207739 characters
‚úÖ Found 'ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©' in page source
‚úÖ Found 'ÿßŸÑŸÑŸàÿßÿ¶ÿ≠' in page source
‚úÖ Found 'ŸÉÿ™Ÿäÿ®ÿßÿ™' in page source
‚úÖ Found 'GOSI' in page source


In [7]:
# Test clicking on one item and extracting content
def test_single_item_extraction():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    wait = WebDriverWait(driver, 20)
    
    try:
        print("üåê Navigating to GOSI website...")
        driver.get(MAIN_URL)
        
        # Wait for the list to load
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mediaCenterElements li")))
        items_all = driver.find_elements(By.CSS_SELECTOR, "#mediaCenterElements li")
        
        # Get the first item (excluding "ŸÉÿ™Ÿäÿ®ÿßÿ™ ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©")
        test_item = None
        test_title = None
        for item in items_all:
            text = item.text.strip()
            if text and text != "ŸÉÿ™Ÿäÿ®ÿßÿ™ ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©":
                test_item = item
                test_title = text
                break
        
        if not test_item:
            print("‚ùå No suitable test item found")
            return
            
        print(f"üß™ Testing with item: '{test_title}'")
        
        # Click the item
        driver.execute_script("arguments[0].click();", test_item)
        print("‚úÖ Clicked on item")
        
        # Wait for content to load and check different selectors
        time.sleep(3)
        
        content_selectors = [
            "#systemsAndRegulationsPageContent",
            "[id*='content']",
            "[class*='content']",
            ".page-content",
            ".main-content",
            "main",
            ".container"
        ]
        
        for selector in content_selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                print(f"üîç Content selector '{selector}': Found {len(elements)} elements")
                if elements:
                    content = elements[0].get_attribute('innerHTML')
                    print(f"   Content length: {len(content)} characters")
                    if content.strip():
                        # Try to extract text
                        plain_text = BeautifulSoup(content, "html.parser").get_text(separator="\n", strip=True)
                        print(f"   Plain text length: {len(plain_text)} characters")
                        print(f"   First 200 chars: {plain_text[:200]}...")
                        
                        # Try to save as DOCX
                        if plain_text.strip():
                            doc = Document()
                            para = doc.add_paragraph(plain_text)
                            make_rtl(para)
                            safe_name = "test_file"
                            output_path = os.path.join(OUTPUT_FOLDER, f"{safe_name}.docx")
                            doc.save(output_path)
                            print(f"‚úÖ Successfully saved test file: {output_path}")
                            return
                    break
            except Exception as e:
                print(f"‚ùå Error with content selector '{selector}': {e}")
        
        print("‚ùå No content found with any selector")
        
    except Exception as e:
        print(f"‚ùå Error during testing: {e}")
    finally:
        driver.quit()

# Run the test
test_single_item_extraction()

SessionNotCreatedException: Message: session not created
from chrome not reachable; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception
Stacktrace:
	GetHandleVerifier [0x0x690c13+66051]
	GetHandleVerifier [0x0x690c54+66116]
	(No symbol) [0x0x46d980]
	(No symbol) [0x0x4614fa]
	(No symbol) [0x0x4a7a36]
	(No symbol) [0x0x4a30ec]
	(No symbol) [0x0x49dcda]
	(No symbol) [0x0x4e5c54]
	(No symbol) [0x0x4e555a]
	(No symbol) [0x0x4d9d86]
	(No symbol) [0x0x4ab53e]
	(No symbol) [0x0x4ac414]
	GetHandleVerifier [0x0x8d8a13+2457603]
	GetHandleVerifier [0x0x8d39d2+2437058]
	GetHandleVerifier [0x0x6b97f2+232930]
	GetHandleVerifier [0x0x6a9a18+167944]
	GetHandleVerifier [0x0x6b092d+196381]
	GetHandleVerifier [0x0x698ee8+99544]
	GetHandleVerifier [0x0x699082+99954]
	GetHandleVerifier [0x0x68322a+10266]
	BaseThreadInitThunk [0x0x772e5d49+25]
	RtlInitializeExceptionChain [0x0x77c3d6db+107]
	RtlGetAppContainerNamedObjectPath [0x0x77c3d661+561]


In [8]:
# Fixed version with better Chrome management and error handling
import os
import time
import subprocess
from bs4 import BeautifulSoup
from docx import Document
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def kill_chrome_processes():
    """Kill any existing Chrome processes"""
    try:
        subprocess.run(["taskkill", "/f", "/im", "chrome.exe"], 
                      capture_output=True, text=True)
        subprocess.run(["taskkill", "/f", "/im", "chromedriver.exe"], 
                      capture_output=True, text=True)
        time.sleep(2)
        print("üßπ Cleaned up existing Chrome processes")
    except:
        pass

def scrape_and_save_docx_rtl_fixed():
    # Clean up any existing Chrome processes first
    kill_chrome_processes()
    
    OUTPUT_FOLDER = "GOSI_DOCX"
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    MAIN_URL = "https://www.gosi.gov.sa/ar/SystemsAndRegulations"
    
    # Set up Chrome options with additional stability options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-plugins")
    chrome_options.add_argument("--disable-images")
    chrome_options.add_argument("--disable-javascript")  # We'll try without JS first
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    driver = None
    try:
        # Initialize webdriver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.set_page_load_timeout(30)
        wait = WebDriverWait(driver, 15)
        
        print("üåê Navigating to GOSI website...")
        driver.get(MAIN_URL)
        print(f"‚úÖ Page loaded successfully")
        
        # Wait for the list to load
        print("‚è≥ Waiting for navigation elements...")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mediaCenterElements li")))
        items_all = driver.find_elements(By.CSS_SELECTOR, "#mediaCenterElements li")
        
        # Collect titles except "ŸÉÿ™Ÿäÿ®ÿßÿ™ ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©"
        titles = []
        for item in items_all:
            text = item.text.strip()
            if text and text != "ŸÉÿ™Ÿäÿ®ÿßÿ™ ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©":
                titles.append(text)
        
        print(f"‚úÖ Found {len(titles)} items to process")
        
        # Process each item
        for idx, title in enumerate(titles[:3], start=1):  # Start with first 3 items
            print(f"\n‚û° Processing ({idx}/3): {title}")
            
            try:
                # Go back to main page
                driver.get(MAIN_URL)
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mediaCenterElements li")))
                
                # Find and click the item
                items = driver.find_elements(By.CSS_SELECTOR, "#mediaCenterElements li")
                item_to_click = None
                for item in items:
                    if item.text.strip() == title:
                        item_to_click = item
                        break
                
                if not item_to_click:
                    print(f"‚ö† Could not find item: {title}")
                    continue
                
                # Click using JavaScript
                driver.execute_script("arguments[0].click();", item_to_click)
                print("‚úÖ Clicked item")
                
                # Wait and try to find content
                time.sleep(3)
                
                # Try multiple content selectors
                content_found = False
                content_selectors = [
                    "#systemsAndRegulationsPageContent",
                    "[id*='content']",
                    ".content",
                    "main",
                    "body"
                ]
                
                for selector in content_selectors:
                    try:
                        content_elements = driver.find_elements(By.CSS_SELECTOR, selector)
                        if content_elements:
                            html_content = content_elements[0].get_attribute('innerHTML')
                            if html_content and html_content.strip():
                                # Parse HTML to plain text
                                plain_text = BeautifulSoup(html_content, "html.parser").get_text(separator="\n", strip=True)
                                
                                if plain_text and len(plain_text.strip()) > 50:  # Ensure meaningful content
                                    # Save as DOCX with RTL
                                    doc = Document()
                                    para = doc.add_paragraph(plain_text)
                                    make_rtl(para)
                                    safe_name = "".join(c for c in title if c.isalnum() or c in " _-")
                                    output_path = os.path.join(OUTPUT_FOLDER, f"{safe_name}.docx")
                                    doc.save(output_path)
                                    print(f"‚úÖ Saved: {output_path} ({len(plain_text)} chars)")
                                    content_found = True
                                    break
                    except Exception as e:
                        continue
                
                if not content_found:
                    print(f"‚ö† No content found for: {title}")
                    
            except Exception as e:
                print(f"‚ùå Error processing {title}: {str(e)}")
                continue
        
        print(f"\n‚úÖ Processing complete! Check the '{OUTPUT_FOLDER}' folder.")
        
    except Exception as e:
        print(f"‚ùå Fatal error: {str(e)}")
    finally:
        if driver:
            driver.quit()
        # Clean up processes again
        kill_chrome_processes()

# Run the fixed scraper
scrape_and_save_docx_rtl_fixed()

üßπ Cleaned up existing Chrome processes
üåê Navigating to GOSI website...
‚úÖ Page loaded successfully
‚è≥ Waiting for navigation elements...
‚úÖ Found 9 items to process

‚û° Processing (1/3): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1421 Ÿá
‚úÖ Clicked item
‚úÖ Saved: GOSI_DOCX\ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1421 Ÿá.docx (8072 chars)

‚û° Processing (2/3): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1445Ÿá
‚úÖ Clicked item
‚úÖ Saved: GOSI_DOCX\ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1445Ÿá.docx (8143 chars)

‚û° Processing (3/3): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ŸÇÿßÿπÿØ ÿßŸÑŸÖÿØŸÜŸä
‚úÖ Clicked item
‚úÖ Saved: GOSI_DOCX\ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ŸÇÿßÿπÿØ ÿßŸÑŸÖÿØŸÜŸä.docx (8072 chars)

‚úÖ Processing complete! Check the 'GOSI_DOCX' folder.
üßπ Cleaned up existing Chrome processes


In [9]:
# Complete version - Process ALL items
def scrape_all_gosi_documents():
    # Clean up any existing Chrome processes first
    kill_chrome_processes()
    
    OUTPUT_FOLDER = "GOSI_DOCX"
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    MAIN_URL = "https://www.gosi.gov.sa/ar/SystemsAndRegulations"
    
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-plugins")
    chrome_options.add_argument("--disable-images")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    driver = None
    try:
        # Initialize webdriver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.set_page_load_timeout(30)
        wait = WebDriverWait(driver, 15)
        
        print("üåê Navigating to GOSI website...")
        driver.get(MAIN_URL)
        print(f"‚úÖ Page loaded successfully")
        
        # Wait for the list to load
        print("‚è≥ Waiting for navigation elements...")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mediaCenterElements li")))
        items_all = driver.find_elements(By.CSS_SELECTOR, "#mediaCenterElements li")
        
        # Collect titles except "ŸÉÿ™Ÿäÿ®ÿßÿ™ ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©"
        titles = []
        for item in items_all:
            text = item.text.strip()
            if text and text != "ŸÉÿ™Ÿäÿ®ÿßÿ™ ÿßŸÑÿ£ŸÜÿ∏ŸÖÿ©":
                titles.append(text)
        
        print(f"‚úÖ Found {len(titles)} total items to process\n")
        
        # Process ALL items
        successful_downloads = 0
        for idx, title in enumerate(titles, start=1):
            print(f"‚û° Processing ({idx}/{len(titles)}): {title}")
            
            try:
                # Go back to main page
                driver.get(MAIN_URL)
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mediaCenterElements li")))
                
                # Find and click the item
                items = driver.find_elements(By.CSS_SELECTOR, "#mediaCenterElements li")
                item_to_click = None
                for item in items:
                    if item.text.strip() == title:
                        item_to_click = item
                        break
                
                if not item_to_click:
                    print(f"‚ö† Could not find item: {title}")
                    continue
                
                # Click using JavaScript
                driver.execute_script("arguments[0].click();", item_to_click)
                
                # Wait and try to find content
                time.sleep(2)
                
                # Try multiple content selectors
                content_found = False
                content_selectors = [
                    "#systemsAndRegulationsPageContent",
                    "[id*='content']",
                    ".content",
                    "main",
                    "body"
                ]
                
                for selector in content_selectors:
                    try:
                        content_elements = driver.find_elements(By.CSS_SELECTOR, selector)
                        if content_elements:
                            html_content = content_elements[0].get_attribute('innerHTML')
                            if html_content and html_content.strip():
                                # Parse HTML to plain text
                                plain_text = BeautifulSoup(html_content, "html.parser").get_text(separator="\n", strip=True)
                                
                                if plain_text and len(plain_text.strip()) > 50:  # Ensure meaningful content
                                    # Save as DOCX with RTL
                                    doc = Document()
                                    para = doc.add_paragraph(plain_text)
                                    make_rtl(para)
                                    safe_name = "".join(c for c in title if c.isalnum() or c in " _-")
                                    output_path = os.path.join(OUTPUT_FOLDER, f"{safe_name}.docx")
                                    doc.save(output_path)
                                    print(f"‚úÖ Saved: {safe_name}.docx ({len(plain_text)} chars)")
                                    successful_downloads += 1
                                    content_found = True
                                    break
                    except Exception as e:
                        continue
                
                if not content_found:
                    print(f"‚ö† No content found for: {title}")
                    
            except Exception as e:
                print(f"‚ùå Error processing {title}: {str(e)}")
                continue
        
        print(f"\nüéâ Processing complete!")
        print(f"üìä Successfully downloaded {successful_downloads} out of {len(titles)} documents")
        print(f"üìÅ Files saved in '{OUTPUT_FOLDER}' folder")
        
    except Exception as e:
        print(f"‚ùå Fatal error: {str(e)}")
    finally:
        if driver:
            driver.quit()
        # Clean up processes again
        kill_chrome_processes()

# Run the complete scraper
print("üöÄ Starting complete GOSI document scraping...")
scrape_all_gosi_documents()

üöÄ Starting complete GOSI document scraping...
üßπ Cleaned up existing Chrome processes
üåê Navigating to GOSI website...
‚úÖ Page loaded successfully
‚è≥ Waiting for navigation elements...
‚úÖ Found 9 total items to process

‚û° Processing (1/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1421 Ÿá
‚úÖ Saved: ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1421 Ÿá.docx (8072 chars)
‚û° Processing (2/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1445Ÿá
‚úÖ Saved: ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜÿßÿ™ ÿßŸÑÿßÿ¨ÿ™ŸÖÿßÿπŸäÿ© 1445Ÿá.docx (8143 chars)
‚û° Processing (3/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ŸÇÿßÿπÿØ ÿßŸÑŸÖÿØŸÜŸä
‚úÖ Saved: ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ŸÇÿßÿπÿØ ÿßŸÑŸÖÿØŸÜŸä.docx (8072 chars)
‚û° Processing (4/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ŸÇÿßÿπÿØ ÿßŸÑÿπÿ≥ŸÉÿ±Ÿä
‚úÖ Saved: ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ŸÇÿßÿπÿØ ÿßŸÑÿπÿ≥ŸÉÿ±Ÿä.docx (8089 chars)
‚û° Processing (5/9): ŸÜÿ∏ÿßŸÖ ÿ™ÿ®ÿßÿØŸÑ ÿßŸÑŸÖŸÜÿßŸÅÿπ
‚úÖ Saved: ŸÜÿ∏ÿßŸÖ ÿ™ÿ®ÿßÿØŸÑ ÿßŸÑŸÖŸÜÿßŸÅÿπ.docx (8143 chars)
‚û° Processing (6/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑÿ™ÿ£ŸÖŸäŸÜ ÿ∂ÿØ ÿßŸÑÿ™ÿπÿ∑ŸÑ ÿπŸ