In [22]:
import re
import os
import json
import asyncio
import instructor
from pathlib import Path
from bs4 import BeautifulSoup
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
from pydantic import BaseModel, Field


The following code processes the HTML files scraped from the Lattes platform. Because the HTML structure of the Lattes platform is not very consistent, the code has to be adapted to the specific structure of the HTML file. To make things easier, we'll use an LLM to help extract the information. Here we use OpenRouter, but you can also use a sufficiently capable LLM from another provider, or even a local LLM.

In [3]:
# The credential is read from the environment so it never lives in the notebook.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not openrouter_api_key:
    # Triggers for an unset variable and for an empty string alike.
    raise ValueError("OPENROUTER_API_KEY is not set")

# Default model identifier; nothing in this cell uses it.
default_model = "google/gemini-2.0-flash-lite-001"

# The async OpenAI client is pointed at the OpenRouter endpoint and then
# wrapped with instructor.
openai_client = AsyncOpenAI(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1",
)
instructor_client = instructor.from_openai(openai_client)


Let's set up the Pydantic model for the article. This will guide the LLM to extract the information in the correct format.

In [4]:
# Structured-output schema for a single article citation.
# The Portuguese field descriptions below are runtime text (they are part of the
# schema pydantic generates), not comments, so they must be left untouched.
# NOTE: documented with `#` comments on purpose. Pydantic copies a class
# docstring into the generated JSON schema description, so adding one would
# change what is sent along with the schema.
# extra="allow" means keys beyond the declared fields are kept, not rejected.
class Article(BaseModel, extra="allow"):
    # All authors in one string, comma-separated (per the field description).
    authors: str = Field(
        description="A lista de autores do artigo, separados por vírgulas"
    )
    # Title of the article.
    title: str = Field(description="O título do artigo")
    # Journal the article appeared in.
    journal: str = Field(description="O periódico do artigo")
    # City associated with the article.
    city: str = Field(description="A cidade do artigo")
    # Year of the article; validated as an integer.
    year: int = Field(description="O ano do artigo")


In [5]:
# System message for the extraction calls. Adjacent literals are concatenated by
# the parser, so the resulting string is a single line of Portuguese text.
system_prompt = (
    "Você é um assistente prestativo que extrai informações de um texto fornecido. "
    "Você receberá o texto de uma citação e precisará extrair as informações "
    "no formato especificado pelo esquema."
)


We'll also need a function that extracts the clean text from an HTML element and returns it as a string, ignoring images and other unwanted content.

In [6]:
def extract_clean_text(element):
    """
    Return the text of an HTML element as one whitespace-normalised string.

    The work is done on a copy obtained from ``element.__copy__()`` rather than
    on ``element`` itself: every ``img`` and ``sup`` descendant found in the
    copy is decomposed before the text is read. A falsy ``element`` (for
    example ``None``) yields an empty string.
    """
    if not element:
        return ""

    scratch = element.__copy__()

    # Remove the unwanted descendants, one tag name at a time.
    for tag_name in ("img", "sup"):
        for node in scratch.find_all(tag_name):
            node.decompose()

    raw_text = scratch.get_text(separator=" ", strip=True)

    # Collapse every run of whitespace into a single space and trim the ends.
    return re.sub(r"\s+", " ", raw_text).strip()


The HTML structure of the Lattes platform is not very consistent, so this isn't a generic function. For each section we want to extract citations from, we need to provide the correct CSS selector. This approach is not scalable and is prone to breaking, but it works for now.

In [7]:
def extract_publications(container):
    """
    Group citation texts by the section heading that precedes them.

    Every ``div`` with class ``cita-artigos`` inside ``container`` is treated as
    a section heading. The siblings that follow a heading are scanned until one
    carrying the ``cita-artigos`` class is met (or the siblings run out), and
    the cleaned text of each ``span`` with class ``transform`` found inside
    them is collected. Spans whose cleaned text is empty are skipped.

    Returns a list with one ``{"name": <heading text>, "citations": [...]}``
    dict per heading, in the order ``find_all`` returned the headings.
    """
    sections = []

    for heading in container.find_all("div", class_="cita-artigos"):
        collected = []

        for sibling in heading.next_siblings:
            # A sibling carrying the heading class closes the current section.
            if hasattr(sibling, "get") and "cita-artigos" in (
                sibling.get("class") or []
            ):
                break

            # Nodes without find_all (e.g. bare strings) cannot hold spans.
            if not hasattr(sibling, "find_all"):
                continue

            for span in sibling.find_all("span", class_="transform"):
                text = extract_clean_text(span)
                if text:
                    collected.append(text)

        sections.append(
            {"name": extract_clean_text(heading), "citations": collected}
        )

    return sections


In [18]:
def extract_name(soup):
    """
    Return the person's name from a parsed résumé page.

    The name is the stripped text of the ``h2`` with class ``nome`` located
    inside the ``div`` with class ``infpessoa``.

    Args:
        soup: Parsed document (or any node) exposing ``find``.

    Returns:
        The name with leading and trailing whitespace removed.

    Raises:
        ValueError: If the ``div.infpessoa`` block or its ``h2.nome`` heading is
            missing. Previously this surfaced as an opaque ``AttributeError``
            on ``None``, which made the batch failure log hard to read.
    """
    info_container = soup.find("div", class_="infpessoa")
    if info_container is None:
        raise ValueError("Personal info block (div.infpessoa) not found")

    name_heading = info_container.find("h2", class_="nome")
    if name_heading is None:
        raise ValueError("Name heading (h2.nome) not found inside div.infpessoa")

    return name_heading.text.strip()


In [None]:
def _extract_section(soup, anchor_name):
    """Extract the grouped citations of the section anchored by ``<a name=anchor_name>``; ``[]`` if absent."""
    anchor = soup.select_one(
        f'div.title-wrapper > a[name="{anchor_name}"]:first-child'
    )
    if anchor is None:
        return []
    # The section container is the div.title-wrapper that holds the anchor.
    return extract_publications(anchor.parent)


def process_all_resumes(resumes_dir="./resumes", jsons_dir="./jsons"):
    """
    Convert every HTML résumé in ``resumes_dir`` into a JSON file in ``jsons_dir``.

    For each ``*.html`` file the person's name, the "ProducoesCientificas"
    section and the "Bancas" section are extracted and written to
    ``<jsons_dir>/<file stem>.json`` as UTF-8 JSON. Progress and a final summary
    are printed.

    Args:
        resumes_dir: Folder containing the HTML files. Defaults to ``./resumes``.
        jsons_dir: Output folder; created (including missing parents) if
            needed. Defaults to ``./jsons``.

    Returns:
        None. When no HTML file is found the function returns before creating
        the output folder.
    """
    resumes_path = Path(resumes_dir)
    # Sorted so files are processed and logged in the same order on every run
    # (glob order is otherwise filesystem-dependent).
    html_files = sorted(resumes_path.glob("*.html"))

    if not html_files:
        print("No HTML files found in the resumes folder")
        return

    print(f"Found {len(html_files)} HTML files to process")

    jsons_path = Path(jsons_dir)
    jsons_path.mkdir(parents=True, exist_ok=True)

    successful = 0
    failed_files = []

    for html_file in html_files:
        try:
            print(f"Processing: {html_file.name}")

            soup = BeautifulSoup(html_file.read_text(encoding="utf-8"), "lxml")

            data = {
                "name": extract_name(soup),
                "publications": _extract_section(soup, "ProducoesCientificas"),
                "committees": _extract_section(soup, "Bancas"),
            }

            # Output keeps the input's base name, with a .json extension.
            json_output_path = jsons_path / f"{html_file.stem}.json"
            with open(json_output_path, "w", encoding="utf-8") as file:
                json.dump(data, file, indent=4, ensure_ascii=False)

            successful += 1
            print(f"  ✓ Saved to {json_output_path}")

        except Exception as e:
            # Deliberately broad: one malformed file must not abort the batch.
            # The failure is recorded and reported in the summary below.
            failed_files.append((html_file.name, str(e)))
            print(f"  ✗ Failed to process {html_file.name}: {e}")

    print("\nProcessing complete!")
    print(f"Successful: {successful}")
    print(f"Failed: {len(failed_files)}")

    if failed_files:
        print("\nFailed files:")
        for filename, error in failed_files:
            print(f"  {filename}: {error}")


# Run the processing over ./resumes/*.html; results are written as JSON files under ./jsons
process_all_resumes()
