<h2>5</h2>

In [4]:
# ====== Config ======
EXCEL_IN    = "detalle_pn5.xlsx"                # workbook that contains the URLs
COL_URL     = "URL Ficha"                       # EXACT name of the column holding the URL
CSV_OUT     = "num_empleados_caratula_pn5.csv"  # output file
PERIODO     = "12"                              # option value chosen in the form's first <select> (presumably the reporting month)
ANIO        = "2024"                            # option value/label chosen in the form's second <select> (presumably the reporting year)
CONCURRENCY = 4                                  # pages scraped in parallel
HEADLESS    = True                               # set to False to watch the browser

# ====== Código ======
import asyncio
import logging
import time
from datetime import datetime

import nest_asyncio
import pandas as pd
from playwright.async_api import async_playwright

# Patch asyncio so event loops may be nested (the notebook kernel already runs one).
nest_asyncio.apply()

# Playwright selectors for the wanted row of the table inside div#caratula.
# Primary: the row that has a cell containing the text '1.15.00.00'.
ROW_SELECTOR = "div#caratula table tr:has(td:has-text('1.15.00.00'))"
# Fallback: the row that has a cell containing the text 'Empleados'.
EMP_FALLBACK = "div#caratula table tr:has(td:has-text('Empleados'))"

async def setup_browser(headless=HEADLESS):
    """Start Playwright and open a Chromium context prepared for scraping.

    The context uses the ``es-CL`` locale and a desktop Chrome user agent, and
    aborts requests for images, stylesheets, fonts and media to cut load time.

    Args:
        headless: Whether Chromium is launched without a visible window.

    Returns:
        A ``(playwright, browser, context)`` tuple. The caller owns all three
        and must close/stop them when done.

    Raises:
        Whatever Playwright raises while launching. Anything already started
        is shut down before the error propagates, so no driver process leaks.
    """
    blocked_types = frozenset({"image", "stylesheet", "font", "media"})

    async def _filter_request(route, request):
        # Drop heavy resources; let everything else through unchanged.
        if request.resource_type in blocked_types:
            await route.abort()
        else:
            await route.continue_()

    p = await async_playwright().start()
    browser = None
    try:
        browser = await p.chromium.launch(headless=headless)
        ctx = await browser.new_context(
            locale="es-CL",
            user_agent=("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                        "(KHTML, like Gecko) Chrome/120 Safari/537.36"),
        )
        await ctx.route("**/*", _filter_request)
    except BaseException:
        # Partial start-up: release what was acquired, then re-raise untouched.
        if browser is not None:
            try:
                await browser.close()
            except Exception:
                pass
        try:
            await p.stop()
        except Exception:
            pass
        raise
    return p, browser, ctx

async def scrape_url(ctx, url, periodo=PERIODO, anio=ANIO, timeout_ms=30000):
    """Open ``url`` in a new page and read the employee-count cell.

    Steps: navigate, best-effort fill the first form's two ``<select>``
    elements with ``periodo`` / ``anio`` and submit it, wait for the row
    matched by ``ROW_SELECTOR`` (falling back to ``EMP_FALLBACK`` once the
    table exists), then take the text of that row's last ``<td>``.

    Args:
        ctx: Playwright browser context used to open the page.
        url: Address to visit.
        periodo: Option value for the form's first ``<select>``.
        anio: Option value (or, failing that, label) for the second ``<select>``.
        timeout_ms: Default timeout applied to the page's operations.

    Returns:
        ``{'URL': url, 'N° de Empleados': valor}`` where ``valor`` is the
        stripped cell text, or ``None`` when the row was not found or any
        step failed.

    Notes:
        Only ``Exception`` subclasses are absorbed. Cancellation and
        ``KeyboardInterrupt`` propagate so an interrupted batch stops
        promptly instead of being retried through the fallbacks.
    """
    log = logging.getLogger(__name__)
    page = await ctx.new_page()
    page.set_default_timeout(timeout_ms)
    try:
        await page.goto(url, wait_until="domcontentloaded")

        # Set the selects and press "Consultar" (when they exist).
        try:
            form = page.locator("form").first
            selects = form.locator("select")
            if await selects.count() >= 1:
                await selects.nth(0).select_option(value=str(periodo))
            # Re-count: choosing the first option may have altered the form.
            if await selects.count() >= 2:
                try:
                    await selects.nth(1).select_option(value=str(anio))
                except Exception:
                    await selects.nth(1).select_option(label=str(anio))
            # Try progressively looser ways of locating the submit control.
            try:
                await form.get_by_role("button", name="Consultar").click()
            except Exception:
                try:
                    await page.locator("button:has-text('Consultar')").click()
                except Exception:
                    await page.locator("input[type=submit]").first.click()
        except Exception as exc:
            # The form is optional; carry on and look for the table anyway.
            log.debug("form step skipped for %s: %r", url, exc)

        # Wait for the row itself; if it never shows, wait for the table
        # and switch to the fallback selector.
        row = page.locator(ROW_SELECTOR).first
        try:
            await row.wait_for(state="visible", timeout=15000)
        except Exception:
            await page.wait_for_selector("div#caratula table", timeout=15000)
            row = page.locator(EMP_FALLBACK).first

        # The value lives in the last <td> of the row.
        valor = None
        if await row.count() > 0:
            tds = row.locator("td")
            n = await tds.count()
            if n > 0:
                valor = (await tds.nth(n - 1).inner_text()).strip()

        return {"URL": url, "N° de Empleados": valor}
    except Exception as exc:
        # Keep the batch going, but leave a trace of why this URL failed.
        log.warning("scrape_url failed for %s: %r", url, exc)
        return {"URL": url, "N° de Empleados": None}
    finally:
        try:
            await page.close()
        except Exception:
            pass

async def run_batch(urls, concurrency=CONCURRENCY):
    """Scrape every URL with a bounded number of pages open at once.

    A single browser context is shared by all workers. One progress line is
    printed per finished URL, and a summary line at the end.

    Args:
        urls: Sized sequence of URL strings.
        concurrency: Maximum number of pages scraped simultaneously; values
            below 1 are treated as 1 (a zero-permit semaphore would hang).

    Returns:
        A list of the dicts returned by ``scrape_url``, in the same order as
        ``urls``.

    Notes:
        If the batch is interrupted or fails, workers that have not finished
        are cancelled and awaited before the browser is closed, and the
        original exception propagates.
    """
    p, browser, ctx = await setup_browser()
    sem = asyncio.Semaphore(max(1, concurrency))

    total = len(urls)
    done = 0
    start_ts = time.time()
    # Pre-sized so each worker can store its result at its input position.
    results = [None] * total
    tasks = []

    async def worker(u, idx):
        nonlocal done
        async with sem:
            t0 = time.time()
            res = await scrape_url(ctx, u)
            elapsed = time.time() - t0
            done += 1

            # Progress line (flushed immediately).
            estado = "OK" if res.get("N° de Empleados") not in (None, "") else "—"
            print(f"[{done}/{total}] {estado}  {elapsed:5.2f}s  -> {u}", flush=True)

            results[idx] = res

    try:
        tasks = [asyncio.create_task(worker(u, i)) for i, u in enumerate(urls)]
        # Await in completion order so a failure surfaces as soon as it happens.
        for fut in asyncio.as_completed(tasks):
            await fut

        total_elapsed = time.time() - start_ts
        # Guard against a zero interval (empty batch / coarse clock).
        rate = total / total_elapsed if total_elapsed > 0 else 0.0
        print(f"\nFin batch: {total} URLs en {total_elapsed:0.1f}s  (~{rate:0.2f} urls/s)", flush=True)
    finally:
        # Stop workers still in flight before tearing the browser down.
        pending = [t for t in tasks if not t.done()]
        for t in pending:
            t.cancel()
        if pending:
            await asyncio.gather(*pending, return_exceptions=True)

        # Best-effort shutdown; cancellation is not swallowed.
        for closer in (ctx.close, browser.close, p.stop):
            try:
                await closer()
            except Exception:
                pass
    return results

# ====== Leer Excel, ejecutar y guardar ======
df_in = pd.read_excel(EXCEL_IN)
if COL_URL not in df_in.columns:
    raise ValueError(f"No encontré la columna '{COL_URL}' en {EXCEL_IN}")

urls = df_in[COL_URL].dropna().astype(str).tolist()

print(f"Inicio: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total URLs: {len(urls)} | Concurrency: {CONCURRENCY} | Headless: {HEADLESS}\n")

results = await run_batch(urls)

df_out = pd.DataFrame(results)
df_out.to_csv(CSV_OUT, index=False, encoding="utf-8-sig")
print(f"\n✅ Listo: {CSV_OUT} ({len(df_out)} filas)")
display(df_out.head())

Inicio: 2025-09-21 15:00:25
Total URLs: 999 | Concurrency: 4 | Headless: True

[1/999] —  32.06s  -> https://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=S&rut=570888&grupo=&tipoentidad=CSNAT&vig=NV&row=AAAwU3AAWAAAAFBAAp&control=svs&pestania=3
[2/999] —  32.09s  -> https://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=S&rut=9805660&grupo=&tipoentidad=CSNAT&vig=NV&row=AAAwU3AAWAAAAHAAAl&control=svs&pestania=3
[3/999] —  32.11s  -> https://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=S&rut=3786844&grupo=&tipoentidad=CSNAT&vig=NV&row=AAAwU3AAWAAAAESAAe&control=svs&pestania=3
[4/999] —  32.12s  -> https://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=S&rut=2714042&grupo=&tipoentidad=CSNAT&vig=NV&row=AAAwU3AAWAAAAEcAAr&control=svs&pestania=3
[5/999] —  31.46s  -> https://www.cmfchile.cl/institucional/mercados/entidad.php?auth=&send=&mercado=S&rut=7259435&grupo=&tipoentidad=CSNAT&vig=NV&row

CancelledError: 