https://g1.globo.com/economia/noticia/2025/07/15/tarifaco-de-50percent-ao-brasil-eleva-custos-para-familias-e-afeta-empresas-norte-americanas-dizem-camaras-de-comercio.ghtml

In [None]:
import datetime
import logging
import pandas as pd
import xml.etree.ElementTree as ET

In [None]:
# Route log records of level INFO and above to the file 'xml_processing.log',
# one record per line in the form "timestamp|LEVEL|message".
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s|%(levelname)s|%(message)s",
    level=logging.INFO,
    filename="xml_processing.log",
)

In [3]:
# Reference: https://www.gov.br/inpi/pt-br/servicos/patentes/informacao-tecnologica
# Source: https://www.gov.br/inpi/pt-br/servicos/patentes/copy_of_BR_AF_20250103.zip
# Parse the XML file into an ElementTree. The path is relative, so the file
# must be present in the current working directory.
tree = ET.parse('BR_AF_20250103.xml')

In [4]:
# Root element of the parsed document; entries are searched among its direct children.
root = tree.getroot()

In [5]:
# Flatten every <authority-file-entry> under the root into one record
# (country, doc-number, kind, date) taken from
# <publication-reference>/<document-id>.
#
# An entry that lacks a required element, or whose <date> is missing or is not
# a valid YYYYMMDD value, is logged, counted in error_count and skipped, so a
# single bad entry does not abort the whole run.
entry_list = []
error_count = 0

for entry_index, entry in enumerate(root.findall('authority-file-entry')):

    publication_reference = entry.find('publication-reference')

    if publication_reference is None:
        error_count += 1
        logging.error(f"Missing <publication-reference> in entry {entry_index}")
        continue

    document_id = publication_reference.find('document-id')

    if document_id is None:
        error_count += 1
        logging.error(f"Missing <document-id> in entry {entry_index}")
        continue

    # findtext() returns None when the child element is absent.
    country = document_id.findtext('country')
    doc_number = document_id.findtext('doc-number')
    date = document_id.findtext('date')
    kind = document_id.findtext('kind')

    # strptime() raises TypeError for None (missing <date>) and ValueError for
    # text that is not a valid YYYYMMDD date; treat both like the other
    # per-entry errors instead of letting them stop the loop.
    try:
        parsed_date = datetime.datetime.strptime(date, "%Y%m%d")
    except (TypeError, ValueError):
        error_count += 1
        logging.error(f"Missing or invalid <date> {date!r} in entry {entry_index}")
        continue

    entry_list.append({
        'country': country,
        'doc-number': doc_number,
        'kind': kind,
        'date': parsed_date,
    })

# Summary line: total = successfully extracted records + skipped entries.
logging.info(f"Processed {len(entry_list) + error_count} entries, "
             f"success: {len(entry_list)}, "
             f"errors: {error_count}")

In [6]:
# Build a DataFrame with one row per record collected in entry_list.
df = pd.DataFrame(entry_list)

In [7]:
# Retain only the rows whose 'kind' value is exactly 'B1' or 'Y1'.
df = df[df['kind'].isin(['B1', 'Y1'])]

In [11]:
# Export the DataFrame to an Excel workbook in the working directory.
# With pandas' default settings the index is written as the first column.
df.to_excel('BR_AF_20250103.xlsx')

In [13]:
# Persist the same DataFrame as a pickle file in the working directory.
# NOTE: pickle files should only ever be loaded back from trusted sources.
df.to_pickle('BR_AF_20250103.pkl')