Make sure to install the required packages. They are listed in the pyproject.toml file. We recommend using uv to install them inside a virtual environment.

Instead of driving a local browser with Playwright directly, we recommend using BrightData's Browser API. It solves the captcha for you and helps you avoid rate limits.

In [1]:
from asyncio import Semaphore
from os import environ
from pathlib import Path

from playwright.async_api import Playwright, async_playwright
from tqdm.asyncio import tqdm_asyncio

# CDP endpoint of the remote browser, taken from the environment. It is passed
# to `connect_over_cdp` when scraping, so fail early if it is missing or empty.
cdp_url = environ.get("CDP_URL")

if cdp_url is None or cdp_url == "":
    raise Exception(
        "Provide Browser API credentials in CDP_URL environment variable or update the script."
    )


Create a file named "resumes.txt" in the current folder and list the Lattes URL of each resume in it, one URL per line. Example:

```text
http://lattes.cnpq.br/8894044025019842
http://lattes.cnpq.br/6290785760473426
http://lattes.cnpq.br/3201859120142095
```

In [2]:
# Load the resume URLs, one per line. Blank lines (e.g. a trailing empty line)
# are skipped so that an empty string is never handed to the scraper as a URL.
# The encoding is explicit to match the other file reads/writes in this notebook.
with open("resumes.txt", "r", encoding="utf-8") as f:
    resumes = [url for line in f if (url := line.strip())]


The semaphore limits the number of concurrent requests to the browser. We recommend a limit of 5. BrightData's Browser API may be able to handle more than 5 concurrent requests, but this is not guaranteed. Set a safe limit to avoid being blocked by BrightData.

In [15]:
# Shared limiter: every scrape() call holds this while it works, so at most
# five scrapes run at the same time.
semaphore = Semaphore(value=5)


async def scrape(playwright: Playwright, url: str) -> tuple[bool, str, str | None]:
    """
    Scrape a single Lattes resume URL using Playwright with captcha solving.

    This function connects to a browser via CDP, navigates to the provided URL,
    waits for captcha detection and solving, clicks the submit button, and
    saves the page content as an HTML file under ``./resumes/``. The output
    directory is created if it does not exist yet.

    Args:
        playwright: The Playwright instance for browser automation
        url: The Lattes resume URL to scrape

    Returns:
        A tuple containing:
        - bool: True if scraping was successful, False otherwise
        - str: The original URL that was scraped
        - str | None: Error message if scraping failed, None if successful

    Note:
        Errors raised while scraping are caught, printed, and reported through
        the returned tuple instead of being propagated to the caller. The
        module-level semaphore is held for the whole call to cap concurrency.
    """
    async with semaphore:
        try:
            print(f"Connecting to Browser for {url}...")
            browser = await playwright.chromium.connect_over_cdp(cdp_url)
            try:
                print(f"Connected! Navigating to {url}...")
                page = await browser.new_page()
                # CDP session used to send the captcha command to the remote browser.
                client = await page.context.new_cdp_session(page)
                await page.goto(url, timeout=2 * 60_000)
                print(f"Navigated! Waiting captcha to detect and solve for {url}...")
                result = await client.send(
                    "Captcha.waitForSolve",
                    {
                        "detectTimeout": 10 * 1000,
                    },
                )
                status = result["status"]
                print(f"Captcha status: {status} for {url}")
                if status != "solve_finished":
                    return False, url, "Captcha not solved"

                print(f"Clicking submit button for {url}...")
                # The click triggers a page load; wait for it before reading the content.
                async with page.expect_navigation(timeout=60_000):
                    await page.click("#submitBtn")

                print(f"Extracting page content for {url}...")
                content = await page.content()

                print(f"Saving page content for {url}...")
                # Name the file after the last path segment of the URL. Trailing
                # slashes are dropped first so the name is never empty.
                filename = url.rstrip("/").split("/")[-1]
                # Create the output folder on demand; otherwise the write below
                # fails on a fresh checkout after the captcha was already solved.
                output_dir = Path("./resumes")
                output_dir.mkdir(parents=True, exist_ok=True)
                with open(output_dir / f"{filename}.html", "w", encoding="utf-8") as f:
                    f.write(content)

                print(f"Page saved successfully for {url}!")
                return True, url, None

            finally:
                # Always release the remote browser, on success or failure.
                await browser.close()

        except Exception as e:
            # Deliberate catch-all: one failing URL must not abort a whole batch.
            error_msg = f"Error scraping {url}: {str(e)}"
            print(error_msg)
            return False, url, str(e)


async def scrape_all_urls(urls: list[str]) -> tuple[list[str], list[str]]:
    """
    Scrape multiple Lattes resume URLs concurrently with progress tracking.

    This function creates one scraping task per URL, runs them with a
    progress bar, and splits the results into successful and failed URLs.

    Args:
        urls: List of Lattes resume URLs to scrape

    Returns:
        A tuple containing:
        - list[str]: URLs that were successfully scraped
        - list[str]: URLs that failed to be scraped

    Note:
        Concurrency is limited inside scrape(), which acquires the shared
        semaphore. If ``urls`` is empty, two empty lists are returned
        without starting Playwright.
    """
    if not urls:
        # Nothing to scrape: skip starting the Playwright driver entirely.
        return [], []

    async with async_playwright() as playwright:
        tasks = [scrape(playwright, url) for url in urls]

        # scrape() reports failures through its return tuple, so every result
        # here is a (success, url, error) triple. tqdm's gather would propagate
        # an exception rather than return it, so no exception check is needed.
        results = await tqdm_asyncio.gather(*tasks)

    successful_urls: list[str] = []
    failed_urls: list[str] = []

    for success, url, _error in results:
        if success:
            successful_urls.append(url)
        else:
            failed_urls.append(url)

    return successful_urls, failed_urls


def save_failed_urls(failed_urls: list[str]) -> None:
    """
    Save a list of failed URLs to the failed_extractions.txt file.

    This function writes each failed URL on a separate line in the
    failed_extractions.txt file for later retry attempts, replacing any
    previous content of that file.

    Args:
        failed_urls: List of URLs that failed during scraping

    Note:
        If the failed_urls list is empty, no file is created, a message is
        printed indicating no failures occurred, and a failed_extractions.txt
        left over from an earlier run is removed so that a later retry does
        not re-scrape URLs that have since succeeded.
    """
    if failed_urls:
        with open("failed_extractions.txt", "w", encoding="utf-8") as f:
            f.writelines(f"{url}\n" for url in failed_urls)
        print(f"Saved {len(failed_urls)} failed URLs to failed_extractions.txt")
    else:
        # Drop the stale list; load_failed_urls() treats a missing file as "nothing to retry".
        Path("failed_extractions.txt").unlink(missing_ok=True)
        print("No failed URLs to save!")


def load_failed_urls() -> list[str]:
    """
    Read back the URLs recorded in failed_extractions.txt.

    Each line of the file is stripped of surrounding whitespace, and lines
    that end up empty are dropped. The remaining entries are returned in
    file order so they can be passed to another scraping attempt.

    Returns:
        list[str]: The URLs found in the file. An empty list is returned
                  when the file is empty or does not exist.

    Note:
        When the file does not exist, a notification message is printed
        and no exception is raised.
    """
    pending: list[str] = []
    try:
        with open("failed_extractions.txt", "r", encoding="utf-8") as source:
            for raw_line in source:
                candidate = raw_line.strip()
                if candidate:
                    pending.append(candidate)
    except FileNotFoundError:
        print("No failed_extractions.txt file found")
        return []
    return pending


In [None]:
async with async_playwright() as playwright:
    print(await scrape(playwright, "http://lattes.cnpq.br/8894044025019842"))


In [None]:
successful_urls, failed_urls = await scrape_all_urls(resumes)

print(f"Successfully scraped: {len(successful_urls)} URLs")
print(f"Failed to scrape: {len(failed_urls)} URLs")

save_failed_urls(failed_urls)


We have not yet implemented automatic retries for failed URLs. You can retry them manually by running the following code:

In [None]:
failed_urls = load_failed_urls()

if failed_urls:
    print(f"Retrying {len(failed_urls)} failed URLs...")
    successful_retry, still_failed = await scrape_all_urls(failed_urls)

    print(
        f"Retry results - Success: {len(successful_retry)}, Still failed: {len(still_failed)}"
    )

    save_failed_urls(still_failed)
else:
    print("No failed URLs to retry")
