In [14]:
import pandas as pd
import asyncio
import re
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import aiofiles
import random

# from tqdm.asyncio import tqdm  # Usuń to dla Jupyter

FIRST_OBKW_MAX = 1436239
FIRST_OBKW_MIN = 1404408

SECOND_OBKW_MAX = 1499538
SECOND_OBKW_MIN = 1467396

URL_TEMPLATE_FIRST = "https://wybory.gov.pl/prezydent2025/pl/obkw/1/{}"
URL_TEMPLATE_SECOND = "https://wybory.gov.pl/prezydent2025/pl/obkw/2/{}"

async def scrape_single_page(page, url):
    """Scrapowanie pojedynczej strony"""
    try:
        await page.goto(url, wait_until='domcontentloaded', timeout=10000)
        
        # Try to wait for visible table, but don't fail if it's hidden
        try:
            await page.wait_for_selector('.table-responsive', state='visible', timeout=5000)
        except:
            # If table is not visible, wait for DOM to be ready and continue
            await page.wait_for_load_state('networkidle', timeout=5000)
        
        # Pobierz HTML
        html = await page.content()
        soup = BeautifulSoup(html, 'html.parser')
        
        commission_id = url.split('/')[-1]
        
        # Wyciągnij adres
        address = "Unknown"
        address_element = soup.select_one('.col-xs-12.col-sm-8.col-lg-9.col-xl-10')
        if address_element:
            address = address_element.get_text(strip=True)
        
        nawrocki_votes = 0
        trzaskowski_votes = 0
        
        # Znajdź tabele z wynikami
        tables = soup.find_all('table')
        
        for table in tables:
            rows = table.find_all('tr')
            
            for row in rows:
                row_text = row.get_text()
                
                # Szukaj Trzaskowskiego
                if 'Trzaskowski' in row_text or 'TRZASKOWSKI' in row_text:
                    numbers = re.findall(r'\d+', row_text.replace(' ', ''))
                    if numbers:
                        trzaskowski_votes = int(numbers[-1])
                
                # Szukaj Nawrockiego
                if 'Nawrocki' in row_text or 'NAWROCKI' in row_text:
                    numbers = re.findall(r'\d+', row_text.replace(' ', ''))
                    if numbers:
                        nawrocki_votes = int(numbers[-1])
        
        # Jeśli nie znaleziono w tabelach, szukaj w całej stronie
        if nawrocki_votes == 0 and trzaskowski_votes == 0:
            page_text = soup.get_text()
            
            trzaskowski_pattern = r'Trzaskowski[^0-9]*(\d+)'
            nawrocki_pattern = r'Nawrocki[^0-9]*(\d+)'
            
            trzaskowski_match = re.search(trzaskowski_pattern, page_text, re.IGNORECASE)
            nawrocki_match = re.search(nawrocki_pattern, page_text, re.IGNORECASE)
            
            if trzaskowski_match:
                trzaskowski_votes = int(trzaskowski_match.group(1))
            if nawrocki_match:
                nawrocki_votes = int(nawrocki_match.group(1))
        
        result = {
            "id": commission_id,
            "address": address,
            "Nawrocki": nawrocki_votes,
            "Trzaskowski": trzaskowski_votes,
        }
        
        print(f"✓ {commission_id}: T={trzaskowski_votes}, N={nawrocki_votes}")
        return result
        
    except Exception as e:
        print(f"✗ Error {url}: {str(e)}")
        return None

async def scrape_batch_async(urls, max_concurrent=10):
    """Asynchroniczne scrapowanie batcha URLi"""
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--disable-images',
                '--disable-plugins',
                '--disable-extensions'
            ]
        )
        
        # Utwórz semafore do kontroli liczby równoczesnych połączeń
        semaphore = asyncio.Semaphore(max_concurrent)
        
        async def scrape_with_semaphore(url):
            async with semaphore:
                page = await browser.new_page()
                try:
                    result = await scrape_single_page(page, url)
                    return result
                finally:
                    await page.close()
        
        # Uruchom wszystkie zadania równocześnie
        tasks = [scrape_with_semaphore(url) for url in urls]
        results = await asyncio.gather(*tasks)  # Zmieniono z tqdm.gather
        
        await browser.close()
        
        # Filtruj tylko udane wyniki
        return [r for r in results if r is not None]

async def scrape_in_batches_async(start_id, end_id, url_template, batch_size=50, max_concurrent=10):
    """Scrapowanie w batchach asynchronicznie"""
    all_ids = list(range(start_id, end_id + 1))
    batches = [all_ids[i:i + batch_size] for i in range(0, len(all_ids), batch_size)]
    
    all_results = []
    
    for i, batch in enumerate(batches):
        print(f"Batch {i+1}/{len(batches)} - IDs: {batch[0]} do {batch[-1]}")
        
        batch_urls = [url_template.format(obkw_id) for obkw_id in batch]
        
        # Scrapuj batch asynchronicznie
        batch_results = await scrape_batch_async(batch_urls, max_concurrent)
        all_results.extend(batch_results)
        
        # Zapisz po każdym batchu (zwykły pandas)
        temp_df = pd.DataFrame(all_results)
        temp_df.to_csv(f"results_batch_{i+1}.csv", index=False)
        
        print(f"Batch {i+1} ukończony: {len(batch_results)}/{len(batch)} sukces")
        
        # Krótka przerwa między batchami
        await asyncio.sleep(random.uniform(0.3, 1.0))
    
    return all_results

async def main():
    print("ASYNC PLAYWRIGHT SCRAPER STARTUJE!")

    # Pełne scrapowanie pierwszej tury
    print("Scrapowanie pierwszej tury...")
    results_first = await scrape_in_batches_async(
        FIRST_OBKW_MIN, 
        FIRST_OBKW_MAX, 
        URL_TEMPLATE_FIRST,
        batch_size=20, 
        max_concurrent=20
    )
    
    # Zapisz finalne wyniki pierwszej tury
    election_results_first = pd.DataFrame(results_first)
    election_results_first.to_csv("election_results_first_FINAL.csv", index=False)
    print(f"Pierwsza tura ukończona: {len(results_first)} wyników")

    # Scrapuj drugą turę
    print("Scrapowanie drugiej tury...")
    results_second = await scrape_in_batches_async(
        SECOND_OBKW_MIN, 
        SECOND_OBKW_MAX, 
        URL_TEMPLATE_SECOND,
        batch_size=20, 
        max_concurrent=40
    )
    
    # Zapisz finalne wyniki drugiej tury
    election_results_second = pd.DataFrame(results_second)
    election_results_second.to_csv_("election_results_second_FINAL.csv", index=False)
    print(f"Druga tura ukończona: {len(results_second)} wyników")
    
    print("SCRAPING ZAKOŃCZONY!")
    
    print("SCRAPING ZAKOŃCZONY!")

# DLA JUPYTER NOTEBOOK - użyj tego zamiast asyncio.run():
await main()

ASYNC PLAYWRIGHT SCRAPER STARTUJE!
Scrapowanie pierwszej tury...
Batch 1/1592 - IDs: 1404408 do 1404427
✓ 1404408: T=361, N=287
✓ 1404415: T=334, N=219
✓ 1404409: T=381, N=228
✓ 1404419: T=335, N=213
✓ 1404414: T=413, N=228
✓ 1404417: T=272, N=203
✓ 1404420: T=330, N=162
✓ 1404412: T=343, N=202
✓ 1404421: T=452, N=225
✓ 1404416: T=337, N=182
✓ 1404418: T=318, N=166
✓ 1404422: T=396, N=194
✓ 1404413: T=212, N=170
✓ 1404423: T=367, N=191
✓ 1404410: T=356, N=241
✓ 1404425: T=271, N=156
✓ 1404424: T=250, N=178
✓ 1404426: T=475, N=197
✓ 1404427: T=473, N=234
✓ 1404411: T=390, N=217


Future exception was never retrieved
future: <Future finished exception=BrokenPipeError(32, 'Broken pipe')>
Traceback (most recent call last):
  File "/tmp/ipykernel_36580/70449549.py", line 25, in scrape_single_page
    await page.wait_for_selector('.table-responsive', state='visible', timeout=5000)
  File "/home/ai/s16/lib/python3.10/site-packages/playwright/async_api/_generated.py", line 8180, in wait_for_selector
    await self._impl_obj.wait_for_selector(
  File "/home/ai/s16/lib/python3.10/site-packages/playwright/_impl/_page.py", line 425, in wait_for_selector
    return await self._main_frame.wait_for_selector(**locals_to_params(locals()))
  File "/home/ai/s16/lib/python3.10/site-packages/playwright/_impl/_frame.py", line 323, in wait_for_selector
    await self._channel.send("waitForSelector", locals_to_params(locals()))
  File "/home/ai/s16/lib/python3.10/site-packages/playwright/_impl/_connection.py", line 61, in send
    return await self._connection.wrap_api_call(
  File "

Batch 1 ukończony: 20/20 sukces
Batch 2/1592 - IDs: 1404428 do 1404447
✓ 1404428: T=341, N=205
✓ 1404429: T=62, N=79
✓ 1404433: T=193, N=129
✓ 1404430: T=241, N=147
✓ 1404431: T=73, N=70
✓ 1404436: T=117, N=122
✓ 1404434: T=66, N=64
✓ 1404442: T=66, N=71
✓ 1404441: T=27, N=32
✓ 1404440: T=30, N=30
✓ 1404438: T=231, N=136
✓ 1404439: T=81, N=220
✓ 1404443: T=42, N=34
✓ 1404446: T=38, N=46
✓ 1404445: T=147, N=116
✓ 1404447: T=75, N=85
✓ 1404444: T=22, N=30
✓ 1404432: T=83, N=206
✓ 1404437: T=443, N=219
✓ 1404435: T=175, N=195
Batch 2 ukończony: 20/20 sukces
Batch 3/1592 - IDs: 1404448 do 1404467
✓ 1404448: T=71, N=67
✓ 1404450: T=306, N=312
✓ 1404454: T=49, N=64
✓ 1404449: T=44, N=39
✓ 1404455: T=52, N=111
✓ 1404456: T=85, N=139
✓ 1404462: T=246, N=369
✓ 1404458: T=72, N=61
✓ 1404461: T=60, N=81
✓ 1404459: T=72, N=255
✓ 1404464: T=80, N=86
✓ 1404451: T=47, N=208
✓ 1404452: T=45, N=59
✓ 1404463: T=239, N=297
✓ 1404457: T=102, N=207
✓ 1404465: T=155, N=325
✓ 1404460: T=134, N=217
✓ 1404453:

pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe clo

✗ Error https://wybory.gov.pl/prezydent2025/pl/obkw/1/1406028: Page.content: Target page, context or browser has been closed
✗ Error https://wybory.gov.pl/prezydent2025/pl/obkw/1/1406029: Page.content: Target page, context or browser has been closed
✗ Error https://wybory.gov.pl/prezydent2025/pl/obkw/1/1406030: Page.content: Connection closed while reading from the driver
✗ Error https://wybory.gov.pl/prezydent2025/pl/obkw/1/1406031: Page.content: Connection closed while reading from the driver
✗ Error https://wybory.gov.pl/prezydent2025/pl/obkw/1/1406032: Page.content: Connection closed while reading from the driver
✗ Error https://wybory.gov.pl/prezydent2025/pl/obkw/1/1406033: Page.content: Connection closed while reading from the driver
✗ Error https://wybory.gov.pl/prezydent2025/pl/obkw/1/1406034: Page.content: Connection closed while reading from the driver
✗ Error https://wybory.gov.pl/prezydent2025/pl/obkw/1/1406035: Page.content: Connection closed while reading from the driver


Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed\nCall log:\n  - waiting for locator(".table-responsive") to be visible\n    5 × locator resolved to hidden <div class="table-responsive"></div>\n    3 × locator resolved to 2 elements. Proceeding with the first one: <div class="table-responsive"></div>\n')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed
Call log:
  - waiting for locator(".table-responsive") to be visible
    5 × locator resolved to hidden <div class="table-responsive"></div>
    3 × locator resolved to 2 elements. Proceeding with the first one: <div class="table-responsive"></div>

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed\nCall log:\n  - waiting for locator(".table-responsive") to be visible\n    5 × locator resolved to hidden <div class="tab

Exception: Page.close: Connection closed while reading from the driver