In [15]:
import re
import os
import json
import asyncio
import stamina
import instructor
import polars as pl
from tqdm import tqdm
from pathlib import Path
from bs4 import BeautifulSoup
from openai import AsyncOpenAI
from xlsxwriter import Workbook
from tqdm.asyncio import tqdm_asyncio
from pydantic import BaseModel, Field


The following code processes the HTML files scraped from the Lattes platform. Because the HTML structure of the Lattes platform is not very consistent, the parsing code has to be adapted to the specific structure of each HTML file. To make things easier, we'll use an LLM to help extract the information. Here we use OpenRouter, but you can also use a sufficiently capable LLM from another provider, or even a local one.

In [44]:
# Read the OpenRouter key from the environment so it is never stored in the notebook.
openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")

# Fail fast here instead of on the first API request further down.
if not openrouter_api_key:
    raise ValueError("OPENROUTER_API_KEY is not set")
# Model identifier passed as `model=` on every completion request below.
default_model = "mistralai/mistral-medium-3"
# OpenAI-compatible async client pointed at the OpenRouter endpoint.
openai_client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1", api_key=openrouter_api_key
)
# Instructor wrapper around the client; the extraction coroutines call it with
# `response_model=` to get validated Pydantic objects back.
instructor_client = instructor.from_openai(openai_client)


Let's set up the Pydantic models for the publications and committees. These will guide the LLM to return the information in the correct format. Because the extracted data is in Portuguese, we'll define the models (field names and descriptions) in Portuguese as well.

In [38]:
# Response schema for one publication citation; used as `response_model` by
# process_citation.
# NOTE(review): pydantic exposes a model's class docstring as the `description` of
# its JSON schema, so the docstring below is probably part of what the LLM is
# shown -- confirm before rewording it.
class Publicacao(BaseModel):
    """
    Pydantic model representing a scientific publication extracted from Lattes CV.

    This model defines the structure for publications including journal articles,
    book chapters, and other academic works found in Brazilian Lattes CVs.

    Attributes:
        ano: The year the publication was published
        titulo: The title of the publication
        autores: Comma-separated list of authors
        periodico: The journal/periodical name (optional)
        editora: The publisher or media outlet (optional)
        cidade: The city of publication (optional)
        extra: Additional information about the publication format or type (optional)
    """

    # The Portuguese `description` strings are the per-field instructions for the
    # model; they are runtime text, not comments.
    ano: int = Field(description="O ano da publicação.")
    titulo: str = Field(description="O título da publicação.")
    autores: str = Field(
        description="A lista de autores da publicação, separados por vírgulas."
    )
    # The `str | None` fields declare no default. NOTE(review): under pydantic v2
    # that makes the key required while allowing a null value, so "optional" in the
    # docstring means "may be null", not "may be omitted" -- confirm this is intended.
    periodico: str | None = Field(
        description="O periódico da publicação. Não deve incluir a cidade. Ex.: para 'Civitas (Porto Alegre)' o periódico é 'Civitas'"
    )
    editora: str | None = Field(
        description="A editora da publicação. Este campo pode ser None caso a publicação não tenha uma editora. Para alguns tipos de publicação isso pode ser uma emissora de rádio ou televisão ou outro tipo de mídia."
    )
    cidade: str | None = Field(
        description="A cidade ou local da publicação e/ou editora. Ex.: para 'Civitas (Porto Alegre)' o local é 'Porto Alegre'."
    )
    extra: str | None = Field(
        description="Algumas publicações podem incluir informações adicionais que não estão presentes nos outros campos. Por exemplo, descrições mais detalhadas sobre o tipo de publicação, o formato, etc. Ex.: 'Material Didático em PPT sobre Comunicação, Arte e Reprodutibilidade Técnica'."
    )


# Response schema for one committee (banca) entry; used as `response_model` by
# process_committees.
# NOTE(review): pydantic exposes a model's class docstring as the `description` of
# its JSON schema, so the docstring below is probably part of what the LLM is
# shown -- confirm before rewording it.
class Banca(BaseModel):
    """
    Pydantic model representing an academic committee/examination board (banca).

    This model defines the structure for thesis defense committees, qualification
    exams, and other academic evaluation boards found in Brazilian Lattes CVs.

    Attributes:
        ano: The year the committee evaluation took place
        titulo: The title of the work or project being evaluated
        membros: Comma-separated list of committee members
        candidato: The name of the candidate being evaluated (optional)
        instituicao: The institution where the evaluation took place (optional)
    """

    # The Portuguese `description` strings are the per-field instructions for the
    # model; they are runtime text, not comments.
    ano: int = Field(description="O ano em que a banca foi realizada.")
    titulo: str = Field(
        description="O título do trabalho avaliado pela banca ou o nome do projeto."
    )
    membros: str = Field(
        description="A lista de membros da banca, separados por vírgulas."
    )
    # No default is declared on the nullable fields. NOTE(review): under pydantic v2
    # the key is then required but may be null -- confirm this is intended.
    candidato: str | None = Field(description="O nome do candidato que foi avaliado.")
    instituicao: str | None = Field(
        description="A instituição onde a banca foi realizada, onde o projeto foi desenvolvido ou onde o trabalho foi realizado."
    )


In [36]:
# System message sent with every extraction request (publications and committees
# alike). It is written in Portuguese and speaks of a "bibliographic reference".
# NOTE(review): committee entries reuse this same wording -- confirm that is acceptable.
system_prompt = "Você é um assistente prestativo que extrai informações de um texto fornecido. Você receberá o texto de uma referência bibliográfica e precisará extrair as informações no formato especificado pelo esquema."


We'll also need a function that extracts the clean text from an HTML element and returns it as a string, ignoring images and other unwanted content.

In [27]:
def extract_clean_text(element):
    """
    Return the readable text of an HTML element as a single normalized line.

    The element is copied first, so the caller's parse tree is left untouched.
    Every 'img' and 'sup' tag is dropped from the copy, the remaining text is
    joined with spaces, and any run of whitespace is collapsed to one space.

    Args:
        element: A BeautifulSoup element (or None / an empty value).

    Returns:
        str: The cleaned text, or "" when `element` is falsy.
    """
    if not element:
        return ""

    # Work on a detached copy: decompose() below is destructive.
    scratch = element.__copy__()

    for tag_name in ("img", "sup"):
        for node in scratch.find_all(tag_name):
            node.decompose()

    joined = scratch.get_text(separator=" ", strip=True)
    return re.sub(r"\s+", " ", joined).strip()


The HTML structure of the Lattes platform is not very consistent, so this isn't a generic function. For each section you want to extract citations from, you'll need to provide the correct CSS selector. This approach is not scalable and is prone to breaking, but it works for now.

In [28]:
def extract_publications(container):
    """
    Extract citations grouped by section from a Lattes CV HTML container.

    Every 'div.cita-artigos' found under `container` is treated as a section
    header. Starting from each header, the following siblings are scanned for
    'span.transform' elements until the next sibling that is itself a
    'cita-artigos' element, or the end of the sibling chain.

    Args:
        container: BeautifulSoup element wrapping the section to scan.

    Returns:
        list[dict]: One dict per header, in document order, with:
            - 'name' (str): cleaned text of the header div
            - 'citations' (list[str]): cleaned, non-empty texts of the
              'transform' spans that belong to that header

    Note:
        Tailored to the Lattes HTML layout; a layout change on the platform
        will likely require updating the selectors used here.
    """
    publications = []

    for header_div in container.find_all("div", class_="cita-artigos"):
        citations = []

        # `.next_siblings` ends only at the real end of the sibling chain; a
        # truthiness-based walk would also stop on an empty text node.
        for sibling in header_div.next_siblings:
            # Text nodes expose no `.get`; only tags can be the next header.
            if hasattr(sibling, "get") and "cita-artigos" in (
                sibling.get("class") or []
            ):
                break

            if hasattr(sibling, "find_all"):
                for span in sibling.find_all("span", class_="transform"):
                    citation_text = extract_clean_text(span)
                    if citation_text:  # skip spans that are empty after cleaning
                        citations.append(citation_text)

        publications.append(
            {"name": extract_clean_text(header_div), "citations": citations}
        )

    return publications


In [29]:
def extract_name(soup) -> str:
    """
    Return the CV owner's name from a parsed Lattes document.

    Looks up the first 'div.infpessoa' and, inside it, the first 'h2.nome',
    then returns that heading's text with surrounding whitespace removed.

    Args:
        soup: BeautifulSoup object for the whole HTML document.

    Returns:
        str: The stripped name.

    Raises:
        AttributeError: If either the 'infpessoa' div or the 'nome' heading
            is missing (the lookup returns None).
    """
    personal_info = soup.find("div", class_="infpessoa")
    heading = personal_info.find("h2", class_="nome")
    return heading.text.strip()


In [None]:
def process_all_resumes():
    """
    Convert every HTML file in ./resumes into a JSON file in ./jsons.

    For each '*.html' file the function parses the document, locates the
    "ProducoesCientificas" and "Bancas" sections through their anchor inside
    'div.title-wrapper', extracts the owner's name plus the citations of both
    sections, and writes the result as '<stem>.json'.

    Files are read and written as UTF-8 explicitly. The later cells open the
    generated JSON with encoding="utf-8", so relying on the platform default
    here would break them wherever that default is not UTF-8.

    Returns:
        None

    Side Effects:
        - Creates ./jsons if it does not exist (only when HTML files are found)
        - Writes one JSON file per successfully processed HTML file
        - Prints progress, a final summary and the list of failures

    Note:
        Any exception raised while handling one file is recorded and reported;
        processing continues with the next file.
    """
    resumes_path = Path("./resumes")
    html_files = list(resumes_path.glob("*.html"))

    if not html_files:
        print("No HTML files found in the resumes folder")
        return

    print(f"Found {len(html_files)} HTML files to process")

    jsons_path = Path("./jsons")
    jsons_path.mkdir(exist_ok=True)

    successful = 0
    failed = 0
    failed_files = []

    for html_file in html_files:
        try:
            print(f"Processing: {html_file.name}")

            # NOTE(review): assumes the scraper saved the pages as UTF-8 -- confirm.
            with open(html_file, "r", encoding="utf-8") as file:
                html_content = file.read()

            soup = BeautifulSoup(html_content, "lxml")

            # The anchor only marks the section; its parent wrapper holds the content.
            publications_container = soup.select_one(
                'div.title-wrapper > a[name="ProducoesCientificas"]:first-child'
            )

            if publications_container is not None:
                publications_container = publications_container.parent

            committees_container = soup.select_one(
                'div.title-wrapper > a[name="Bancas"]:first-child'
            )

            if committees_container is not None:
                committees_container = committees_container.parent

            # Committees share the publications markup, hence the same extractor.
            data = {
                "name": extract_name(soup),
                "publications": extract_publications(publications_container)
                if publications_container
                else [],
                "committees": extract_publications(committees_container)
                if committees_container
                else [],
            }

            json_filename = f"{html_file.stem}.json"
            json_output_path = jsons_path / json_filename

            # ensure_ascii=False emits raw accented characters, so the file
            # encoding must be pinned to what the readers expect.
            with open(json_output_path, "w", encoding="utf-8") as file:
                json.dump(data, file, indent=4, ensure_ascii=False)

            successful += 1
            print(f"  ✓ Saved to {json_output_path}")

        except Exception as e:
            failed += 1
            failed_files.append((html_file.name, str(e)))
            print(f"  ✗ Failed to process {html_file.name}: {e}")

    print("\nProcessing complete!")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")

    if failed_files:
        print("\nFailed files:")
        for filename, error in failed_files:
            print(f"  {filename}: {error}")


# Run the HTML -> JSON extraction: reads ./resumes/*.html and writes ./jsons/*.json.
process_all_resumes()


In [54]:
# Shared by process_citation and process_committees: at most 10 LLM requests are
# in flight at any time.
semaphore = asyncio.Semaphore(10)


@stamina.retry(on=Exception, attempts=5)
async def process_citation(citation):
    """
    Ask the LLM to parse one publication citation into a `Publicacao` dict.

    The request is guarded by the module-level `semaphore`, and the whole call
    is retried by stamina on any exception, for up to 5 attempts.

    Args:
        citation: Raw citation text, sent as the user message.

    Returns:
        dict: `Publicacao.model_dump()` of the parsed response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": citation},
    ]

    async with semaphore:
        publication = await instructor_client.chat.completions.create(
            model=default_model,
            messages=conversation,
            response_model=Publicacao,
        )

    return publication.model_dump()


@stamina.retry(on=Exception, attempts=5)
async def process_committees(citation):
    """
    Ask the LLM to parse one committee (banca) entry into a `Banca` dict.

    Mirrors `process_citation`: the request is guarded by the module-level
    `semaphore` and retried by stamina on any exception, for up to 5 attempts.
    Without the retry, a single transient API error would fail the committee
    entry outright, while publications would be retried.

    Args:
        citation: Raw committee entry text, sent as the user message.

    Returns:
        dict: `Banca.model_dump()` of the parsed response.
    """
    async with semaphore:
        response = await instructor_client.chat.completions.create(
            model=default_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": citation},
            ],
            response_model=Banca,
        )
        return response.model_dump()


In [None]:
async def _capture_exception(awaitable):
    """Await `awaitable` and return its result, or the exception it raised."""
    try:
        return await awaitable
    except Exception as exc:
        return exc


async def process_json(json_data: dict, filename: str = "") -> tuple[dict, list[dict]]:
    """
    Replace every raw citation in `json_data` with its LLM-extracted structure.

    All citations under "publications" and "committees" are processed
    concurrently behind a tqdm progress bar. A citation whose extraction fails
    keeps its original raw text and is reported in the returned failure list,
    so one bad citation no longer aborts the whole file.

    Args:
        json_data (dict): The parsed JSON data. It is modified in place.
        filename (str): The source filename, recorded on each failed request.

    Returns:
        tuple[dict, list[dict]]: The same `json_data` object (enriched) and a
        list of failed-request records with the keys "filename",
        "citation_text", "category_type", "category_idx", "citation_idx",
        "section_name", "error", "error_type" and "timestamp".
    """
    tasks = []
    citation_mappings = []
    failed_requests = []

    for category_type in ("publications", "committees"):
        for category_idx, category in enumerate(json_data.get(category_type, [])):
            section_name = category.get("name", "Unknown Section")
            for citation_idx, citation in enumerate(category.get("citations", [])):
                if category_type == "publications":
                    request = process_citation(citation)
                else:
                    request = process_committees(citation)
                # tqdm's gather wrapper has no `return_exceptions` option, so each
                # request is wrapped to hand its exception back as a value.
                tasks.append(_capture_exception(request))
                citation_mappings.append(
                    (category_type, category_idx, citation_idx, citation, section_name)
                )

    if tasks:
        results = await tqdm_asyncio.gather(*tasks, desc="Processing citations")

        for result, (
            category_type,
            category_idx,
            citation_idx,
            citation_text,
            section_name,
        ) in zip(results, citation_mappings):
            if isinstance(result, Exception):
                # Keep everything needed to retry this citation later.
                failed_requests.append(
                    {
                        "filename": filename,
                        "citation_text": citation_text,
                        "category_type": category_type,
                        "category_idx": category_idx,
                        "citation_idx": citation_idx,
                        "section_name": section_name,
                        "error": str(result),
                        "error_type": type(result).__name__,
                        # Event-loop monotonic clock, not wall-clock time.
                        "timestamp": asyncio.get_event_loop().time(),
                    }
                )
                print(f"Error processing citation in {section_name}: {result}")
                continue

            category = json_data[category_type][category_idx]
            if not isinstance(category["citations"], list):
                # Make the container index-assignable before the first write.
                category["citations"] = [None] * len(category["citations"])
            category["citations"][citation_idx] = result

    return json_data, failed_requests


async def process_all_jsons():
    """
    Enrich every JSON file in ./jsons and store the result in ./enriched_jsons.

    Each file is loaded, passed through `process_json`, and saved under the
    same filename. Citations that could not be processed are recorded per file
    in ./failed_requests/<stem>_failed.json and, aggregated across all files,
    in ./failed_requests_all.json. Files that fail as a whole are listed on the
    console and in failed_extractions.txt.

    Returns:
        None
    """
    jsons_path = Path("./jsons")
    enriched_path = Path("./enriched_jsons")
    failed_requests_path = Path("./failed_requests")

    # Both output folders are created up front, even when there is no input.
    for output_dir in (enriched_path, failed_requests_path):
        output_dir.mkdir(exist_ok=True)

    json_files = list(jsons_path.glob("*.json"))
    if not json_files:
        print("No JSON files found in the jsons folder")
        return

    print(f"Found {len(json_files)} JSON files to process")

    successful = 0
    failed = 0
    failed_files = []
    all_failed_requests = []

    for json_file in tqdm_asyncio(json_files, desc="Processing JSON files"):
        try:
            print(f"\nProcessing: {json_file.name}")

            with open(json_file, "r", encoding="utf-8") as file:
                data = json.load(file)

            enriched_data, failed_requests = await process_json(data, json_file.name)
            all_failed_requests += failed_requests

            # The enriched copy keeps the source filename.
            enriched_output_path = enriched_path / json_file.name
            with open(enriched_output_path, "w", encoding="utf-8") as file:
                json.dump(
                    enriched_data, file, indent=4, ensure_ascii=False, default=str
                )

            if failed_requests:
                # Per-file record of the citations that still need a retry.
                failed_requests_file = (
                    failed_requests_path / f"{json_file.stem}_failed.json"
                )
                with open(failed_requests_file, "w", encoding="utf-8") as file:
                    json.dump(failed_requests, file, indent=4, ensure_ascii=False)
                print(
                    f"  ⚠ {len(failed_requests)} failed citations saved to {failed_requests_file}"
                )

            successful += 1
            print(f"  ✓ Saved enriched data to {enriched_output_path}")

        except Exception as e:
            failed += 1
            failed_files.append((json_file.name, str(e)))
            print(f"  ✗ Failed to process {json_file.name}: {e}")

    if all_failed_requests:
        # One combined file so a batch retry can pick everything up at once.
        all_failed_requests_file = Path("./failed_requests_all.json")
        with open(all_failed_requests_file, "w", encoding="utf-8") as file:
            json.dump(all_failed_requests, file, indent=4, ensure_ascii=False)
        print(
            f"\n📝 Total of {len(all_failed_requests)} failed citation requests saved to {all_failed_requests_file}"
        )

    print("\nProcessing complete!")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"Total failed citations: {len(all_failed_requests)}")

    if not failed_files:
        return

    print("\nFailed files:")
    for filename, error in failed_files:
        print(f"  {filename}: {error}")

    with open("failed_extractions.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{filename}: {error}\n" for filename, error in failed_files)


# Top-level `await` is only valid because this runs in a notebook cell (IPython
# executes it on its own event loop); in a plain script use asyncio.run(...).
await process_all_jsons()


In [None]:
async def retry_failed_requests(
    failed_requests_file: str = "./failed_requests_all.json",
):
    """
    Re-run the LLM extraction for citations recorded as failed.

    Requests are retried one after another. Each outcome is the original
    record extended with either "retry_result" or "retry_error" /
    "retry_error_type", plus "retry_timestamp". Successes are written to
    ./successful_retries.json and remaining failures to
    ./still_failed_requests.json; each file is written only when non-empty.

    Args:
        failed_requests_file (str): Path to the JSON file containing failed requests.

    Returns:
        None
    """
    failed_requests_path = Path(failed_requests_file)
    if not failed_requests_path.exists():
        print(f"Failed requests file not found: {failed_requests_file}")
        return

    with open(failed_requests_path, "r", encoding="utf-8") as file:
        failed_requests = json.load(file)

    if not failed_requests:
        print("No failed requests to retry")
        return

    print(f"Found {len(failed_requests)} failed requests to retry")

    successful_retries = []
    still_failed = []

    for request in tqdm_asyncio(failed_requests, desc="Retrying failed requests"):
        try:
            # Anything that is not a publication is handled as a committee entry.
            is_publication = request["category_type"] == "publications"
            extractor = process_citation if is_publication else process_committees
            result = await extractor(request["citation_text"])
        except Exception as e:
            still_failed.append(
                {
                    **request,
                    "retry_error": str(e),
                    "retry_error_type": type(e).__name__,
                    "retry_timestamp": asyncio.get_event_loop().time(),
                }
            )
        else:
            successful_retries.append(
                {
                    **request,
                    "retry_result": result,
                    "retry_timestamp": asyncio.get_event_loop().time(),
                }
            )

    if successful_retries:
        successful_retries_file = Path("./successful_retries.json")
        with open(successful_retries_file, "w", encoding="utf-8") as file:
            json.dump(successful_retries, file, indent=4, ensure_ascii=False)
        print(
            f"✓ {len(successful_retries)} successful retries saved to {successful_retries_file}"
        )

    if still_failed:
        still_failed_file = Path("./still_failed_requests.json")
        with open(still_failed_file, "w", encoding="utf-8") as file:
            json.dump(still_failed, file, indent=4, ensure_ascii=False)
        print(
            f"⚠ {len(still_failed)} requests still failed, saved to {still_failed_file}"
        )

    print("\nRetry complete!")
    print(f"Successful retries: {len(successful_retries)}")
    print(f"Still failed: {len(still_failed)}")


async def apply_successful_retries(retries_file: str = "./successful_retries.json"):
    """
    Write successful retry results back into the files under ./enriched_jsons.

    Retries are grouped by their "filename". For every file that exists, the
    entry at (category_type, category_idx, citation_idx) is overwritten with
    the record's "retry_result" and the file is saved again. Files that are
    missing are reported and skipped.

    Args:
        retries_file (str): Path to the JSON file containing successful retries.

    Returns:
        None
    """
    retries_path = Path(retries_file)
    if not retries_path.exists():
        print(f"Successful retries file not found: {retries_file}")
        return

    with open(retries_path, "r", encoding="utf-8") as file:
        successful_retries = json.load(file)

    if not successful_retries:
        print("No successful retries to apply")
        return

    enriched_path = Path("./enriched_jsons")

    # filename -> retries for that file, so each file is loaded and saved once.
    files_to_update = {}
    for retry in successful_retries:
        files_to_update.setdefault(retry["filename"], []).append(retry)

    print(
        f"Applying {len(successful_retries)} successful retries to {len(files_to_update)} files"
    )

    for filename, retries in files_to_update.items():
        enriched_file_path = enriched_path / filename

        if not enriched_file_path.exists():
            print(f"Warning: Enriched file not found: {enriched_file_path}")
            continue

        with open(enriched_file_path, "r", encoding="utf-8") as file:
            enriched_data = json.load(file)

        for retry in retries:
            # Anything that is not "publications" is stored under "committees".
            if retry["category_type"] == "publications":
                top_level_key = "publications"
            else:
                top_level_key = "committees"
            section = enriched_data[top_level_key][retry["category_idx"]]
            section["citations"][retry["citation_idx"]] = retry["retry_result"]

        with open(enriched_file_path, "w", encoding="utf-8") as file:
            json.dump(enriched_data, file, indent=4, ensure_ascii=False, default=str)

        print(f"✓ Updated {filename} with {len(retries)} retry results")

    print("Successfully applied all retry results!")


In [16]:
def _collect_citation_rows(categories: list[dict]) -> tuple[list[dict], int]:
    """Flatten categories into row dicts tagged with "tipo"; also count skipped entries."""
    rows = []
    skipped = 0
    for category in categories:
        category_name = category.get("name")
        for citation in category.get("citations", []):
            # A citation whose extraction failed is still the raw text, not a dict.
            if not isinstance(citation, dict):
                skipped += 1
                continue
            # Build a new dict so the loaded JSON structure is not mutated.
            rows.append({**citation, "tipo": category_name})
    return rows, skipped


def _ordered_frame(rows: list[dict], column_order: list[str]):
    """Build a polars DataFrame from `rows`, keeping only known columns in `column_order`."""
    frame = pl.DataFrame(rows)
    return frame.select([col for col in column_order if col in frame.columns])


def process_json_to_excel(json_file_path: Path) -> tuple[bool, str]:
    """
    Process a single enriched JSON file and create an Excel workbook from it.

    The workbook is written to ./excel/<name>.xlsx, where <name> is the JSON
    "name" value lower-cased with spaces replaced by underscores. It holds two
    worksheets: "publicações" and "bancas", each with a "tipo" column carrying
    the section name the row came from.

    Citations that are not dicts (entries whose LLM extraction failed and were
    left as raw text) are skipped instead of failing the whole file; their
    count is appended to the success message.

    Args:
        json_file_path: Path to the JSON file to process

    Returns:
        tuple[bool, str]: (success, message). Any exception is caught and
        reported as (False, "Error processing <file>: <error>").
    """
    try:
        with open(json_file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)

        original_name = json_data.get("name", "default_name")
        formatted_name = original_name.lower().replace(" ", "_")

        # Create excel directory if it doesn't exist
        excel_dir = Path("./excel")
        excel_dir.mkdir(exist_ok=True)

        excel_file_name = excel_dir / f"{formatted_name}.xlsx"

        publication_rows, skipped_publications = _collect_citation_rows(
            json_data.get("publications", [])
        )
        df_publications = _ordered_frame(
            publication_rows,
            [
                "tipo",
                "ano",
                "titulo",
                "autores",
                "periodico",
                "editora",
                "cidade",
                "extra",
            ],
        )

        committee_rows, skipped_committees = _collect_citation_rows(
            json_data.get("committees", [])
        )
        df_committees = _ordered_frame(
            committee_rows,
            ["tipo", "ano", "titulo", "membros", "candidato", "instituicao"],
        )

        with Workbook(str(excel_file_name)) as wb:
            # "@" is the Excel text number format for the year column.
            df_publications.write_excel(
                workbook=wb,
                worksheet="publicações",
                autofit=True,
                column_formats={"ano": "@"},
            )

            df_committees.write_excel(
                workbook=wb,
                worksheet="bancas",
                autofit=True,
                column_formats={"ano": "@"},
            )

        message = f"Successfully created {excel_file_name}"
        skipped = skipped_publications + skipped_committees
        if skipped:
            message += f" ({skipped} unprocessed citations skipped)"
        return True, message

    except Exception as e:
        return False, f"Error processing {json_file_path.name}: {str(e)}"


def process_all_enriched_jsons():
    """
    Create one Excel workbook per JSON file found in ./enriched_jsons.

    Each file is handed to `process_json_to_excel`. Failures are printed as
    they occur and listed again after the final success/failure summary.

    Returns:
        None
    """
    enriched_jsons_path = Path("./enriched_jsons")
    if not enriched_jsons_path.exists():
        print("enriched_jsons folder not found!")
        return

    json_files = list(enriched_jsons_path.glob("*.json"))
    if not json_files:
        print("No JSON files found in enriched_jsons folder!")
        return

    print(f"Found {len(json_files)} JSON files to process")

    successful = 0
    failed_files = []

    for json_file in tqdm(json_files, desc="Creating Excel files"):
        success, message = process_json_to_excel(json_file)
        if not success:
            failed_files.append(message)
            # Leading newline keeps the message off the progress-bar line.
            print(f"\n  ✗ {message}")
            continue
        successful += 1

    failed = len(failed_files)

    print("\nProcessing complete!")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")

    if failed_files:
        print("\nFailed files:")
        for error_msg in failed_files:
            print(f"  {error_msg}")


# Run the Excel export: one workbook in ./excel per file in ./enriched_jsons.
process_all_enriched_jsons()


Found 104 JSON files to process


Creating Excel files: 100%|██████████| 104/104 [00:04<00:00, 24.84it/s]


Processing complete!
Successful: 104
Failed: 0



