In [1]:
import copy
import re
from os import environ

import instructor
from bs4 import BeautifulSoup
from openai import AsyncOpenAI
from pydantic import BaseModel, Field


In [2]:
# --- Credentials -----------------------------------------------------------
# The OpenRouter key is read from the environment; stop immediately when it is
# missing or empty instead of failing later on the first request.
openrouter_api_key = environ.get("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise ValueError("OPENROUTER_API_KEY is not set")

# --- Model and clients -----------------------------------------------------
# Model identifier passed as `model=` on every completion request below.
default_model = "google/gemini-2.0-flash-lite-001"

# Async OpenAI SDK client pointed at OpenRouter's endpoint.
openai_client = AsyncOpenAI(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1",
)

# instructor wrapper around the client; used with `response_model=` below.
instructor_client = instructor.from_openai(openai_client)


In [11]:
# Structured-output schema for a single bibliographic citation; it is passed as
# `response_model` to the instructor client.
#
# The `Field(description=...)` strings are runtime text: they become part of the
# model's JSON schema, so change them deliberately.
#
# Documented with `#` comments on purpose: pydantic copies a model's class
# docstring into the JSON schema description, so adding a docstring here would
# change the schema that is sent along with each request.
#
# extra="allow" lets callers attach attributes that are not declared below
# (this notebook sets `year` on instances after extraction).
class Article(BaseModel, extra="allow"):
    authors: str = Field(
        description="The list of authors of the article, separated by commas"
    )
    title: str = Field(description="The title of the article")
    journal: str = Field(description="The journal of the article")
    city: str = Field(description="The city of the article")


In [7]:
# Portuguese-language system message sent in the "system" role of every
# extraction request. Split across lines for readability only; the pieces are
# concatenated at compile time into exactly the same single string.
system_prompt = (
    "Você é um assistente prestativo que extrai informações de um texto fornecido. "
    "Você receberá o texto de uma citação e precisará extrair as informações "
    "no formato especificado pelo esquema."
)


In [4]:
def extract_clean_text(element, unwanted_tags=("img", "sup")):
    """
    Return the text of an HTML element as one whitespace-normalized line.

    The element is copied before anything is removed, so the cleanup is applied
    to the copy rather than to the object the caller passed in. Every descendant
    whose tag name appears in ``unwanted_tags`` is removed from the copy, then
    the remaining text is collected.

    Args:
        element: A BeautifulSoup-style element supporting ``copy.copy``,
            ``find_all`` and ``get_text``. A falsy value (e.g. ``None``)
            yields ``""``.
        unwanted_tags: Iterable of tag names to strip before reading the text.
            Must be an iterable of names, not a single string. Defaults to
            ``("img", "sup")``, the tags this function has always removed.

    Returns:
        The remaining text with its pieces joined by single spaces, runs of
        whitespace collapsed to one space, and no leading/trailing whitespace.
    """
    if not element:
        return ""

    # copy.copy() is the public way to trigger the element's __copy__ hook;
    # working on the copy keeps the decompose() calls below off the original.
    temp_element = copy.copy(element)

    # Drop every occurrence of each unwanted tag from the copy.
    for tag in unwanted_tags:
        for unwanted in temp_element.find_all(tag):
            unwanted.decompose()

    text = temp_element.get_text(separator=" ", strip=True)

    # Text nodes can still contain internal newlines/tabs/double spaces.
    return re.sub(r"\s+", " ", text).strip()


In [None]:
# Read the saved HTML page into memory and parse it with the lxml backend.
# NOTE(review): open() is called without an explicit encoding, so the file is
# decoded with the platform default — confirm the file's encoding and pin it.
with open("./resumes/0036766179644104.html") as file:
    html_content = file.read()
soup = BeautifulSoup(html_content, "lxml")


async def extract_article(soup):
    """
    Extract structured article data from the "artigos-completos" section.

    Looks up ``<div id="artigos-completos">`` in ``soup``. For every
    ``<span class="transform">`` inside it, the cleaned citation text is sent
    to ``instructor_client`` with ``Article`` as the response model, and the
    year read from the entry's ``data-tipo-ordenacao="ano"`` span is attached
    to the result as an extra ``year`` attribute. Requests are awaited one at
    a time, in the order ``find_all`` returns the entries.

    Args:
        soup: Parsed document exposing the BeautifulSoup ``find`` API.

    Returns:
        A list with one extracted object per entry, or ``[]`` when the section
        is not present. Each object's ``year`` is the stripped text of the
        year span, or ``None`` when the entry has no such span.
    """
    artigos_completos = soup.find("div", id="artigos-completos")
    if not artigos_completos:
        return []

    artigos = []
    for entry in artigos_completos.find_all("span", class_="transform"):
        # find() returns None when the entry carries no year span; calling
        # .get_text() on that would raise AttributeError and abort the whole run.
        year_span = entry.find("span", attrs={"data-tipo-ordenacao": "ano"})
        year = year_span.get_text(strip=True) if year_span is not None else None

        citation = extract_clean_text(entry)
        data = await instructor_client.chat.completions.create(
            model=default_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": citation},
            ],
            response_model=Article,
        )
        # `year` is not a declared field of Article, hence the type-checker waiver.
        data.year = year  # type: ignore
        artigos.append(data)
    return artigos


In [12]:
try:
    response = await instructor_client.chat.completions.create(
        model=default_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": citations[0]},
        ],
        response_model=Article,
    )
except Exception as e:
    print(e)

print(response)


authors='GOHN, MARIA DA GLORIA; PENTEADO, CLAUDIO LUIS DE CAMARGO; MORALES, ERNESTO' title='Colectivos' journal='Civitas' city='Porto Alegre'


In [13]:
# `year` is not a declared field of Article; the assignment is accepted because
# the model is declared with extra="allow".
response.year = 2025


In [14]:
# Print the model again: the extra `year` attribute is now part of its output.
print(response)


authors='GOHN, MARIA DA GLORIA; PENTEADO, CLAUDIO LUIS DE CAMARGO; MORALES, ERNESTO' title='Colectivos' journal='Civitas' city='Porto Alegre' year=2025
