In [2]:
!pip install playwright
!playwright install


Collecting playwright
  Downloading playwright-1.53.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.53.0-py3-none-manylinux1_x86_64.whl (45.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.53.0 pyee-13.0.0
Downloading Chromium 138.0.7204.23 (playwright build v1179)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1179/chromium-linux.zip[22m
[1G171.6 MiB [] 0% 0.0s[0K[1G171.6 MiB [] 0% 32.0s[0K[1G171.6 MiB [] 0% 13.5s[0K[1G171.6 MiB [] 0% 7.1s[0K[1G171.6 MiB [] 1% 5.3s[0K[1G171.6 MiB [] 1% 4.5s[0K[1G171.6 MiB [] 2% 4.1s[0K[1G171.6 MiB [] 2% 3.7s[0K[1G171.6 MiB [] 3% 3.4s[0K[1G171.6 MiB [] 4% 3.3s

In [14]:
import asyncio
import re
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

async def get_blog_ids():
    url = "https://www.bog.gov.sa/ScientificContent/JudicialBlogs/1443/Pages/default.aspx"
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        await page.wait_for_selector("button[title='تحميل']")  # wait for buttons

        html = await page.content()
        soup = BeautifulSoup(html, "html.parser")
        blog_ids = []
        for btn in soup.find_all("button", {"title": "تحميل"}):
            target = btn.get("data-bs-target", "")
            m = re.search(r'blog_items_(\d+)', target)
            if m:
                blog_ids.append(int(m.group(1)))

        await browser.close()
        return blog_ids

blog_ids = asyncio.run(get_blog_ids())
print(f"Found blog IDs: {blog_ids}")


RuntimeError: asyncio.run() cannot be called from a running event loop

# Task
Explain the error in the selected code, fix it if possible, and incorporate the changes into the existing code. If fixing is not possible, diagnose the error.

## Fetch blog ids and pdf links

### Subtask:
Run the code to extract blog IDs from the main page and then query an API for each ID to get the PDF links. Save the links to `pdf_links.txt`.


**Reasoning**:
The current environment is an interactive notebook, which already has a running event loop. Calling `asyncio.run()` in this context will raise a `RuntimeError`. To fix this, I need to use `await` to call the asynchronous function `get_blog_ids()` directly within the existing event loop. Then, I will iterate through the retrieved blog IDs and use `aiohttp` within an `asyncio` context to fetch the PDF links from the API. Finally, I will write the collected PDF links to a file.



In [19]:
# -*- coding: utf-8 -*-
"""
Playwright Script for Google Colab to Fetch Dynamic Download Links (Async Version)
"""

# -----------------------------------------------------------------------------
# STEP 1: Install Playwright and its browser binaries
# Run these commands in a Colab cell. It takes a minute or two.
# -----------------------------------------------------------------------------
!pip install playwright
!playwright install chromium
!pip install nest_asyncio # Install nest_asyncio for Colab environment

# -----------------------------------------------------------------------------
# STEP 2: Import necessary libraries
# -----------------------------------------------------------------------------
from playwright.async_api import async_playwright
import asyncio
import nest_asyncio
import json
import re
import time

# Apply nest_asyncio to allow the async loop to run inside Colab's loop
nest_asyncio.apply()

# -----------------------------------------------------------------------------
# STEP 3: Define the main function to fetch links (now async and multi-step)
# -----------------------------------------------------------------------------
async def get_multi_step_download_links(
    initial_url,
    first_click_selector, # This can now be an XPath or CSS selector
    modal_trigger_button_selector,
    download_links_container_selector,
    final_download_link_selector,
    api_endpoint_regex=None
):
    """
    Asynchronously navigates to an initial URL, performs two sequential clicks
    (one to navigate/load content, one to open a modal), waits for dynamic content,
    and extracts download links from the final modal, also capturing relevant API responses.

    Args:
        initial_url (str): The initial URL of the page to start from.
        first_click_selector (str): CSS selector or XPath for the first link/element to click.
        modal_trigger_button_selector (str): CSS selector for the button that opens the download modal.
        download_links_container_selector (str): CSS selector for the container (the modal itself)
                                                  where the final download links will appear.
        final_download_link_selector (str): CSS selector for the actual <a> tags
                                            containing the download links within the modal.
        api_endpoint_regex (re.Pattern, optional): A regex pattern to filter and capture specific API responses.

    Returns:
        tuple: A tuple containing:
            - list: A list of found download link URLs.
            - list: A list of captured API responses (dictionaries with 'url', 'content_type', 'status').
    """
    links_found = []
    captured_api_responses = []

    def handle_response(response):
        if api_endpoint_regex and api_endpoint_regex.search(response.url):
            content_type = response.headers.get("Content-Type", "")
            captured_api_responses.append({
                "url": response.url,
                "content_type": content_type,
                "status": response.status
            })

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        page = await browser.new_page()

        page.on("request", lambda request: print(f"Colab Debug: REQ >> {request.method.upper()} {request.url}"))
        page.on("response", handle_response)

        print(f"Colab Output: Navigating to initial page: {initial_url}...")
        try:
            await page.goto(initial_url, wait_until="domcontentloaded", timeout=60000)
            print("Colab Output: Initial page loaded.")
        except Exception as e:
            print(f"Colab Error: Failed to load initial page {initial_url}: {e}")
            await browser.close()
            return [], []

        # --- Step 1: Click the first link (using XPath) ---
        print(f"Colab Output: Attempting to click first link with selector: {first_click_selector}")
        try:
            # Use page.locator with xpath= for XPath selectors
            await page.locator(xpath=first_click_selector).click()
            print("Colab Output: First link clicked.")

            # Wait for the modal trigger button to be visible on the new content area.
            # This implicitly waits for the content loaded by the first click.
            await page.wait_for_selector(modal_trigger_button_selector, state='visible', timeout=30000)
            print("Colab Output: Content after first click loaded. Modal trigger button visible.")

        except Exception as e:
            print(f"Colab Error: Error clicking first link or waiting for subsequent content: {e}")
            await browser.close()
            return [], []

        # --- Step 2: Click the modal trigger button ---
        print(f"Colab Output: Attempting to click modal trigger button with selector: {modal_trigger_button_selector}")
        try:
            await page.locator(modal_trigger_button_selector).click()
            print("Colab Output: Modal trigger button clicked.")

            # --- Step 3: Wait for the modal to appear ---
            await page.wait_for_selector(download_links_container_selector, state='visible', timeout=15000)
            print(f"Colab Output: Download modal container ({download_links_container_selector}) is visible.")

            # --- Step 4: Extract the download links from inside the modal ---
            elements = await page.query_selector_all(final_download_link_selector)
            if elements:
                for element in elements:
                    href = await element.get_attribute('href')
                    if href:
                        # Attempt to get the full resolved URL, which is often more reliable
                        # than manual string concatenation for relative URLs.
                        full_href = await element.evaluate('node => node.href')
                        links_found.append(full_href)
            else:
                print(f"Colab Output: No download links found with selector: {final_download_link_selector} inside the modal.")

        except Exception as e:
            print(f"Colab Error: Error interacting with modal button or finding links: {e}")

        await browser.close()

    return links_found, captured_api_responses

# -----------------------------------------------------------------------------
# STEP 4: Configuration and Execution (Async call)
# -----------------------------------------------------------------------------

target_url = "https://www.bog.gov.sa/ScientificContent/JudicialBlogs/1443/Pages/default.aspx"

# First link to click (using the provided XPath)
first_click_selector = "/html/body/form/div[5]/div[2]/div/div/div[4]/div[4]/div/div/div/div/div[2]/div[3]/section/div/div/div[2]/div[1]/ul/li[2]/a"

# Modal trigger button (تحميل)
modal_trigger_button_selector = "button#modalLink[title='تحميل']"

# Container for download links (the modal content)
download_links_container_selector = "div#ex0blog_items_259"

# Actual download links within the modal (any <a> tag with an href)
final_download_link_selector = f"{download_links_container_selector} a[href]"

# Optional: Regex for specific API calls if you want to capture them
api_url_pattern = re.compile(r"BOGBlogHandler\.ashx|sp\.web\.getcontextwebthemedata")

print(f"Colab Output: Starting script for {target_url}")

download_links, captured_api_responses = await get_multi_step_download_links(
    target_url,
    first_click_selector,
    modal_trigger_button_selector,
    download_links_container_selector,
    final_download_link_selector,
    api_endpoint_regex=api_url_pattern
)

# -----------------------------------------------------------------------------
# STEP 5: Display Results
# -----------------------------------------------------------------------------
if download_links:
    print("\n--- Colab Result: Found Download Links ---")
    for link in download_links:
        print(link)
else:
    print("\n--- Colab Result: No direct download links found after interaction. ---")
    print("This might mean selectors are incorrect, or content loaded differently.")

if captured_api_responses:
    print("\n--- Colab Result: Captured API Responses (Review for URLs and Status) ---")
    for i, res in enumerate(captured_api_responses):
        print(f"\nResponse {i+1} from URL: {res['url']}")
        print(f"  Content Type: {res.get('content_type', 'N/A')}")
        print(f"  Status: {res.get('status', 'N/A')}")
else:
    print("\n--- Colab Result: No specific API responses were captured matching the pattern (if provided). ---")
    print("Review 'REQ' debug lines above for all network activity.")

Colab Output: Starting script for https://www.bog.gov.sa/ScientificContent/JudicialBlogs/1443/Pages/default.aspx
Colab Output: Navigating to initial page: https://www.bog.gov.sa/ScientificContent/JudicialBlogs/1443/Pages/default.aspx...
Colab Debug: REQ >> GET https://www.bog.gov.sa/ScientificContent/JudicialBlogs/1443/Pages/default.aspx
Colab Debug: REQ >> GET https://www.bog.gov.sa/Style%20Library/ar-SA/Themable/Core%20Styles/pagelayouts15.css
Colab Debug: REQ >> GET https://www.bog.gov.sa/_layouts/15/1025/styles/Themable/oslo.css?rev=NcGXZT9PATKQlWAgwTxJOQ%3D%3DTAG0
Colab Debug: REQ >> GET https://www.bog.gov.sa/Style%20Library/chat-widget/widgets.min.css
Colab Debug: REQ >> GET https://www.bog.gov.sa/_layouts/15/1025/initstrings.js?rev=u0%2Bzzti0ebUXx1Nufovceg%3D%3DTAG0
Colab Debug: REQ >> GET https://www.bog.gov.sa/_layouts/15/init.js?rev=YNGa0jKPBejT2TMT7y9VpQ%3D%3DTAG0
Colab Debug: REQ >> GET https://www.bog.gov.sa/_layouts/15/theming.js?rev=SuLUbbwTXcxFWOWLFUsOkw%3D%3DTAG0
Cola