In [1]:
#Dependencies
%pip install -q selenium webdriver-manager beautifulsoup4 pdf2docx

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# This section opens https://nezams.com/ in headless Chrome, collects the entries' IDs, titles, and URLs from the links on the page, and saves them in a file "Nezams_IDs.{date}.json".

import json
import time
import urllib.parse
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Name pattern of the JSON file that stores the collected links; "{date}" is
# filled in with the current date formatted as MM.DD.YYYY when the file is saved.
OUTPUT_FILENAME_TEMPLATE = "Nezams_IDs.{date}.json"

def _links_from_selectors(driver, limit):
    """Return link records from the first CSS selector that yields a usable link.

    Each selector is tried in order. For the first ``limit`` matching elements
    (``None`` means all), an element is kept when it has visible text and an
    ``href`` containing "nezam". Returns a list of ``{"id", "name", "url"}``
    dicts, or an empty list when no selector produced anything.
    """
    # Common selectors for navigation menus or lists
    selectors_to_try = [
        'a[href*="nezam"]',
        'a[href*="system"]',
        '.menu-item a',
        '.nav-item a',
        'ul li a',
        '[data-id]',
        '.system-link'
    ]

    for selector in selectors_to_try:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
        except Exception:
            # Best effort: a failing selector must not prevent trying the others.
            continue
        if not elements:
            continue

        print(f"‚úÖ Found {len(elements)} elements with selector: {selector}")
        systems = []
        for i, elem in enumerate(elements[:limit]):
            try:
                text = elem.text.strip()
                href = elem.get_attribute('href')
            except Exception:
                # The element may have gone stale; skip it. ``Exception`` (rather
                # than a bare except) keeps Ctrl-C / kernel interrupt working.
                continue
            # NOTE(review): the site host itself is "nezams.com", so this check
            # presumably passes for every same-site link — confirm the intended filter.
            if text and href and 'nezam' in href.lower():
                systems.append({
                    "id": i + 1,  # position among the matched elements, 1-based
                    "name": text,
                    "url": href
                })
        if systems:
            return systems
    return []


def _links_from_javascript(driver, limit):
    """Fallback: ask the page itself for link data via injected JavaScript.

    Returns at most ``limit`` entries (``None`` means all) when the script
    yields a non-empty list, otherwise an empty list. Script failures are
    reported and treated as "nothing found".
    """
    try:
        # Look for any global variables or data structures
        js_result = driver.execute_script("""
                    // Look for common global variables that might contain data
                    var data = [];
                    if (typeof window.nezams_data !== 'undefined') {
                        return window.nezams_data;
                    }
                    if (typeof window.systems !== 'undefined') {
                        return window.systems;
                    }
                    // Look for menu data
                    var links = document.querySelectorAll('a[href*="nezam"], a[href*="system"]');
                    for (var i = 0; i < links.length; i++) {
                        if (links[i].href && links[i].textContent.trim()) {
                            data.push({
                                id: i + 1,
                                name: links[i].textContent.trim(),
                                url: links[i].href
                            });
                        }
                    }
                    return data;
                """)
    except Exception as e:
        print(f"‚ùå JavaScript execution failed: {e}")
        return []

    if js_result and isinstance(js_result, list):
        systems = js_result[:limit]
        print(f"‚úÖ Found {len(systems)} systems through JavaScript")
        return systems
    return []


def fetch_and_save_ids(max_links_per_selector=10, max_js_links=20):
    """Collect entry links from nezams.com and save them as a dated JSON file.

    Opens https://nezams.com/ in headless Chrome, gathers links first through
    CSS selectors and, if that finds nothing, through injected JavaScript, then
    writes the result to ``OUTPUT_FILENAME_TEMPLATE`` with today's date
    (MM.DD.YYYY) substituted. Progress and errors are printed; nothing is
    returned and no exception from the scraping step propagates.

    Args:
        max_links_per_selector: how many matched elements to inspect per CSS
            selector; ``None`` inspects all. Defaults to 10.
        max_js_links: how many entries to keep from the JavaScript fallback;
            ``None`` keeps all. Defaults to 20.
    """
    # Set up Chrome options
    chrome_options = Options()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
    ):
        chrome_options.add_argument(flag)

    # Initialize webdriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        print("üåê Navigating to Nezams website...")
        driver.get("https://nezams.com/")

        time.sleep(3)  # Give time for AJAX requests

        systems = _links_from_selectors(driver, max_links_per_selector)

        # If we didn't find systems through selectors, try a different approach
        if not systems:
            print("‚ö† No systems found through standard selectors, trying alternative approach...")
            systems = _links_from_javascript(driver, max_js_links)

        # Save the data
        if systems:
            today = datetime.now().strftime("%m.%d.%Y")
            filename = OUTPUT_FILENAME_TEMPLATE.format(date=today)

            with open(filename, "w", encoding="utf-8") as f:
                json.dump(systems, f, ensure_ascii=False, indent=2)

            print(f"‚úÖ Saved {len(systems)} entries to {filename}")

            # Print first few entries for verification
            print("\nüìã First 5 entries:")
            for i, system in enumerate(systems[:5], 1):
                print(f"  {i}. {system['name']}")
                print(f"     URL: {system['url']}")
        else:
            print("‚ùå No systems found. The website structure might have changed.")
            print("üîç You may need to inspect the website manually to find the correct selectors.")

    except Exception as e:
        print(f"‚ùå Error: {e}")
    finally:
        # Always release the browser, even when navigation or saving failed.
        driver.quit()

# Run the function: the scrape happens immediately when this cell executes;
# progress and any error are printed, and no value is returned.
fetch_and_save_ids()

üåê Navigating to Nezams website...
‚ùå Error: HTTPConnectionPool(host='localhost', port=64377): Read timed out. (read timeout=120)
‚ùå Error: HTTPConnectionPool(host='localhost', port=64377): Read timed out. (read timeout=120)


In [None]:
# This section loads the JSON file saved earlier the same day (its name embeds today's date), visits each URL, and writes one right-aligned DOCX file per page containing the page title and the cleaned body text.

import os
import json
import time
import subprocess
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn

# Output folder: every generated .docx file is written here; it is created
# up front if it does not exist yet.
output_dir = "Nezams_Docs"
os.makedirs(output_dir, exist_ok=True)

# Selectors to remove: elements matching any of these CSS selectors are deleted
# from the extracted content container before its text is collected.
unwanted_selectors = [
    "div.fontsize.no-print",
    "span.share-icon",
    "span.total-readers",
    "div.subject-share",
    "span.numbe-s",
    "div#more-items",
    "ul#subject-nav-links"
]

def kill_chrome_processes():
    """Force-kill all running Chrome and ChromeDriver processes (best effort).

    Uses the Windows ``taskkill`` command by image name, so it terminates every
    ``chrome.exe`` / ``chromedriver.exe`` process, not only those started by
    this notebook. A non-zero ``taskkill`` exit status (e.g. nothing to kill)
    is ignored. If ``taskkill`` cannot be launched at all (for example on a
    non-Windows system) the function returns silently.
    """
    try:
        for image_name in ("chrome.exe", "chromedriver.exe"):
            subprocess.run(["taskkill", "/f", "/im", image_name],
                           capture_output=True, text=True)
        time.sleep(2)  # give the OS a moment to tear the processes down
        print("üßπ Cleaned up existing Chrome processes")
    except OSError:
        # Only "could not start taskkill" is swallowed; a bare except here would
        # also swallow KeyboardInterrupt raised while sleeping.
        pass

# Save DOCX with RTL and right-aligned paragraphs
def save_docx(title, body, filename):
    """Write a two-paragraph DOCX file: the title first, then the body.

    The document's 'Normal' style is set to 14 pt Arial (including the East
    Asian font slot), and both paragraphs are right-aligned.

    Args:
        title: text of the first paragraph.
        body: text of the second paragraph.
        filename: path the document is saved to.
    """
    document = Document()
    # NOTE(review): python-docx documents no `right_to_left` attribute on Section
    # or ParagraphFormat — confirm these assignments affect the saved file.
    document.sections[0].right_to_left = True

    normal_style = document.styles['Normal']
    normal_style.font.name = 'Arial'
    # font.name only covers the default font slot; set the East Asian one too.
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), 'Arial')
    normal_style.font.size = Pt(14)

    # Title and body get identical paragraph formatting, in that order.
    for text in (title, body):
        paragraph = document.add_paragraph()
        paragraph.paragraph_format.right_to_left = True
        paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        paragraph.add_run(text)

    document.save(filename)

def _safe_docx_stem(title, fallback):
    """Turn a page title into a file-name stem without path-hostile characters.

    Slashes and backslashes become "-", ":" is substituted, and * ? " < > |
    are dropped; the result is stripped and cut to 100 characters. ``fallback``
    is returned when nothing is left.
    """
    replacements = str.maketrans({
        "/": "-",
        ":": "ÿå",
        "\\": "-",
        "*": "",
        "?": "",
        "\"": "",
        "<": "",
        ">": "",
        "|": "",
    })
    stem = title.translate(replacements).strip()
    stem = stem[:100]  # Limit filename length
    return stem or fallback


# Synchronous scraper using Selenium
def scrape_and_save_all(json_path=None):
    """Visit every URL listed in the saved JSON file and store each page as DOCX.

    For each entry the page is loaded in headless Chrome, the title and content
    container are located, unwanted elements are removed, and the remaining
    text is written with ``save_docx`` into ``output_dir``. Per-page failures
    are printed and skipped; a summary is printed at the end. All Chrome
    processes are killed before and after the run via ``kill_chrome_processes``.

    Args:
        json_path: path of the JSON file to read. Defaults to
            ``Nezams_IDs.<MM.DD.YYYY>.json`` for today's date.
    """
    # Clean up any existing Chrome processes first
    kill_chrome_processes()

    if json_path is None:
        today = datetime.now().strftime("%m.%d.%Y")
        json_path = f"Nezams_IDs.{today}.json"
    filename = json_path

    try:
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Ignore entries that are not objects or carry no URL instead of crashing on them.
        items_to_scrape = [item for item in data if isinstance(item, dict) and item.get('url')]
        print(f"Loaded {len(items_to_scrape)} URLs from the JSON file: {filename}\n")
    except FileNotFoundError:
        print(f"Error: JSON file '{filename}' not found. Please ensure the file exists.")
        return
    except json.JSONDecodeError:
        print("Error: JSON file is not valid.")
        return

    # Set up Chrome options
    chrome_options = Options()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
    ):
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Initialize webdriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    total = len(items_to_scrape)
    try:
        # Inside the try block so the browser is still closed if this call fails.
        # NOTE(review): this presumably only patches the document loaded at this
        # moment, not pages opened by later driver.get() calls — confirm.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        successful_downloads = 0
        used_stems = set()  # file-name stems already written during this run

        for count, item in enumerate(items_to_scrape, 1):
            url = item['url']
            item_id = item.get('id', 'N/A')

            try:
                print(f"üåê Processing ({count}/{total}): ID {item_id}")

                driver.get(url)
                time.sleep(2)  # Wait for page to load

                html = driver.page_source
                soup = BeautifulSoup(html, 'html.parser')

                # Try to find title and content with various selectors
                title_tag = soup.select_one("body > div.page > h1")
                if not title_tag:
                    # Try alternative selectors for title
                    title_tag = soup.select_one("h1") or soup.select_one(".page-title") or soup.select_one("title")

                content_div = soup.select_one("body > div.page > div.post-page > div")
                if not content_div:
                    # Try alternative selectors for content
                    content_div = soup.select_one(".post-page") or soup.select_one(".content") or soup.select_one("main")

                if not title_tag or not content_div:
                    print(f"‚ö† No title or content found for ID {item_id}")
                    continue

                title = title_tag.get_text(strip=True)

                # Remove unwanted elements
                for selector in unwanted_selectors:
                    for tag in content_div.select(selector):
                        tag.decompose()

                # Handle nested spans: collapse the inner spans of each coloured
                # outer span into a single space-joined string.
                for outer in content_div.select('span.selectionShareable[style="color: #993300;"]'):
                    inner_spans = outer.select('span.selectionShareable')
                    combined = ' '.join(s.get_text(strip=True) for s in inner_spans if s.get_text(strip=True))
                    if combined:
                        outer.string = combined
                    for s in inner_spans:
                        s.decompose()

                body_text = content_div.get_text(separator="\n", strip=True)

                if not body_text.strip():
                    print(f"‚ö† No content text found for ID {item_id}")
                    continue

                # Create safe filename; the loop counter (always a plain integer)
                # serves as fallback and as disambiguator so that an empty or
                # repeated title never overwrites an earlier document.
                safe_title = _safe_docx_stem(title, fallback=f"Nezam_{count}")
                if safe_title in used_stems:
                    safe_title = f"{safe_title}_{count}"
                used_stems.add(safe_title)

                output_filename = os.path.join(output_dir, f"{safe_title}.docx")
                save_docx(title, body_text, output_filename)
                successful_downloads += 1
                print(f"‚úÖ Saved ({count}/{total}): {safe_title} ({len(body_text)} chars)")

            except Exception as e:
                print(f"‚ùå Failed ({count}/{total}) ID {item_id}: {str(e)}")
                continue

        print("\nüéâ Processing complete!")
        print(f"üìä Successfully downloaded {successful_downloads} out of {total} documents")
        print(f"üìÅ Files saved in '{output_dir}' folder")

    except Exception as e:
        print(f"‚ùå Fatal error: {str(e)}")
    finally:
        try:
            driver.quit()
        finally:
            # Run the process cleanup even if quitting the driver raised.
            kill_chrome_processes()

# Run the scraper: announce the start, then process every URL from the JSON
# file dated today (the function prints its own progress and summary).
print("üöÄ Starting Nezams document scraping...")
scrape_and_save_all()

üöÄ Starting Nezams document scraping...
üßπ Cleaned up existing Chrome processes
Loaded 9 URLs from the JSON file: Nezams_IDs.09.19.2025.json

üßπ Cleaned up existing Chrome processes
Loaded 9 URLs from the JSON file: Nezams_IDs.09.19.2025.json

üåê Processing (1/9): ID 2
üåê Processing (1/9): ID 2
‚úÖ Saved (1/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑŸÜŸÇŸÑ ÿßŸÑÿ®ÿ±Ÿä ÿπŸÑŸâ ÿßŸÑÿ∑ÿ±ŸÇ (21953 chars)
üåê Processing (2/9): ID 3
‚úÖ Saved (1/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑŸÜŸÇŸÑ ÿßŸÑÿ®ÿ±Ÿä ÿπŸÑŸâ ÿßŸÑÿ∑ÿ±ŸÇ (21953 chars)
üåê Processing (2/9): ID 3
‚úÖ Saved (2/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑŸÖŸàÿßÿØ ÿßŸÑÿ®ÿ™ÿ±ŸàŸÑŸäÿ© ŸàÿßŸÑÿ®ÿ™ÿ±ŸàŸÉŸäŸÖÿßŸàŸäÿ© (17974 chars)
üåê Processing (3/9): ID 4
‚úÖ Saved (2/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑŸÖŸàÿßÿØ ÿßŸÑÿ®ÿ™ÿ±ŸàŸÑŸäÿ© ŸàÿßŸÑÿ®ÿ™ÿ±ŸàŸÉŸäŸÖÿßŸàŸäÿ© (17974 chars)
üåê Processing (3/9): ID 4
‚úÖ Saved (3/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑŸÇŸäÿßÿ≥ ŸàÿßŸÑŸÖÿπÿßŸäÿ±ÿ© (14686 chars)
üåê Processing (4/9): ID 5
‚úÖ Saved (3/9): ŸÜÿ∏ÿßŸÖ ÿßŸÑŸÇŸäÿßÿ≥ ŸàÿßŸÑŸÖÿπÿßŸäÿ±ÿ© (14686 chars)
üåê Processing (4/9): ID 5
‚úÖ Saved (4/9): ŸÜÿ