In [1]:
!pip install --quiet playwright pandas openpyxl nest-asyncio
!playwright install chromium


In [None]:
# ====== Config ======
EXCEL_IN   = "detalle_pn1.xlsx"               # input workbook containing the URLs
COL_URL    = "URL Ficha"                     # EXACT name of the column that holds the URL
CSV_OUT    = "num_empleados_caratula_pn1.csv"    # output file
PERIODO    = "12"                            # value chosen in the form's first <select>
ANIO       = "2024"                          # value (or label) chosen in the form's second <select>
CONCURRENCY = 4                               # pages scraped in parallel (tune to your network/CPU)
HEADLESS    = True                            # set to False to watch the browser

# ====== Código ======
import asyncio, nest_asyncio, pandas as pd
from playwright.async_api import async_playwright

# Patch asyncio to allow re-entrant event loops; presumably needed because the
# notebook kernel already runs a loop — confirm for your Jupyter version.
nest_asyncio.apply()

# Primary target: the row of the table inside div#caratula that has a cell
# containing the text '1.15.00.00'.
ROW_SELECTOR = "div#caratula table tr:has(td:has-text('1.15.00.00'))"
# Fallback target: the row of that same table with a cell containing 'Empleados'.
EMP_FALLBACK = "div#caratula table tr:has(td:has-text('Empleados'))"

async def setup_browser(headless=HEADLESS):
    """Start Playwright and open a single Chromium context shared by the batch.

    Requests for images, stylesheets, fonts and media are aborted so pages
    load faster; every other request continues normally.

    Args:
        headless: Forwarded to ``chromium.launch``; ``False`` shows the window.

    Returns:
        Tuple ``(playwright, browser, context)``. The caller must close the
        context and the browser and stop Playwright when done.

    Raises:
        Any exception raised by Playwright during start-up. Whatever was
        already started is shut down before the exception propagates.
    """
    blocked_types = {"image", "stylesheet", "font", "media"}

    async def _block_heavy_resources(route, request):
        # Abort heavy resource types, let everything else through.
        if request.resource_type in blocked_types:
            await route.abort()
        else:
            await route.continue_()

    p = await async_playwright().start()
    browser = None
    try:
        browser = await p.chromium.launch(headless=headless)
        # One context for the whole batch.
        ctx = await browser.new_context(
            locale="es-CL",
            user_agent=("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                        "(KHTML, like Gecko) Chrome/120 Safari/537.36"),
        )
        await ctx.route("**/*", _block_heavy_resources)
    except BaseException:
        # Start-up failed half-way: do not leak the browser process or the
        # Playwright driver. Cleanup is best-effort; the original error wins.
        if browser is not None:
            try:
                await browser.close()
            except Exception:
                pass
        try:
            await p.stop()
        except Exception:
            pass
        raise
    return p, browser, ctx

async def _submit_query_form(page, periodo, anio):
    """Select period/year in the page's first form and click "Consultar".

    Raises whatever Playwright raises when a step fails; the caller decides
    whether that is fatal.
    """
    form = page.locator("form").first
    selects = form.locator("select")
    n_selects = await selects.count()
    if n_selects >= 1:
        await selects.nth(0).select_option(value=str(periodo))
    if n_selects >= 2:
        try:
            await selects.nth(1).select_option(value=str(anio))
        except Exception:
            # The year may be exposed as the option label rather than its value.
            await selects.nth(1).select_option(label=str(anio))

    # Try the submit control in decreasing order of specificity.
    candidates = (
        form.get_by_role("button", name="Consultar"),
        page.locator("button:has-text('Consultar')"),
        page.locator("input[type='submit']").first,
    )
    last_error = None
    for candidate in candidates:
        try:
            await candidate.click()
            return
        except Exception as exc:
            last_error = exc
    raise last_error


async def _last_cell_text(row):
    """Return the stripped text of the last <td> in ``row``, or None if absent."""
    if await row.count() == 0:
        return None
    cells = row.locator("td")
    n_cells = await cells.count()
    if n_cells == 0:
        return None
    return (await cells.nth(n_cells - 1).inner_text()).strip()


async def scrape_url(ctx, url, periodo=PERIODO, anio=ANIO):
    """Scrape the employee count shown on one page.

    Opens ``url`` in a new page of ``ctx``, submits the query form with the
    given period and year when the form is present, and reads the last cell
    of the row matching ``ROW_SELECTOR`` (falling back to ``EMP_FALLBACK``).

    Args:
        ctx: Playwright browser context used to open the page.
        url: Address of the page to scrape.
        periodo: Value selected in the form's first ``<select>``.
        anio: Value (or, failing that, label) selected in the second ``<select>``.

    Returns:
        ``{'URL': url, 'N° de Empleados': value}`` where ``value`` is the
        stripped cell text, or ``None`` when the page could not be opened,
        the row was not found, or any other error occurred.
    """
    page = None
    try:
        # Inside the try so a failure to open one page does not abort the batch.
        page = await ctx.new_page()
        page.set_default_timeout(30000)
        await page.goto(url, wait_until="domcontentloaded")

        # Setting the selects and pressing "Consultar" is best-effort: some
        # pages may already show the table without submitting the form.
        try:
            await _submit_query_form(page, periodo, anio)
        except Exception:
            pass

        # Wait for the target row directly; otherwise wait for the table and
        # use the fallback row.
        row = page.locator(ROW_SELECTOR).first
        try:
            await row.wait_for(state="visible", timeout=15000)
        except Exception:
            # Make sure the table exists before using the fallback selector.
            await page.wait_for_selector("div#caratula table", timeout=15000)
            row = page.locator(EMP_FALLBACK).first

        valor = await _last_cell_text(row)
        return {"URL": url, "N° de Empleados": valor}
    except Exception:
        # `except Exception` (not bare) so task cancellation still propagates.
        return {"URL": url, "N° de Empleados": None}
    finally:
        if page is not None:
            try:
                await page.close()
            except Exception:
                pass

async def run_batch(urls, concurrency=CONCURRENCY):
    """Scrape every URL with a bounded number of pages open at once.

    Args:
        urls: Iterable of page addresses.
        concurrency: Maximum number of pages scraped simultaneously. Values
            below 1 are treated as 1 (a zero-sized semaphore would block
            forever).

    Returns:
        List of result dicts from ``scrape_url``, in the same order as
        ``urls`` so each output row lines up with its input row.
    """
    urls = list(urls)
    if not urls:
        # Nothing to do: avoid launching a browser at all.
        return []

    max_parallel = max(1, int(concurrency))
    p, browser, ctx = await setup_browser()
    sem = asyncio.Semaphore(max_parallel)

    async def worker(u):
        async with sem:
            return await scrape_url(ctx, u)

    try:
        # gather() returns results in argument order regardless of which
        # task finishes first.
        return list(await asyncio.gather(*(worker(u) for u in urls)))
    finally:
        # Best-effort teardown: a failure closing one resource must not
        # prevent the attempt to close the next.
        for closer in (ctx.close, browser.close, p.stop):
            try:
                await closer()
            except Exception:
                pass

# ====== Read the Excel file, run the batch and save ======
df_in = pd.read_excel(EXCEL_IN)
# Fail early with a clear message when the configured URL column is missing.
if COL_URL not in df_in.columns:
    raise ValueError(f"No encontré la columna '{COL_URL}' en {EXCEL_IN}")

# Drop empty cells and convert the remaining values to str before scraping.
urls = df_in[COL_URL].dropna().astype(str).tolist()
# Top-level await: only valid where the cell runs inside an event loop
# (as IPython/Jupyter does).
results = await run_batch(urls)

df_out = pd.DataFrame(results)
# utf-8-sig prepends a BOM — presumably so spreadsheet tools detect UTF-8; confirm.
df_out.to_csv(CSV_OUT, index=False, encoding="utf-8-sig")
print(f"✅ Listo: {CSV_OUT} ({len(df_out)} filas)")
display(df_out.head())
