In [2]:
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import xml.etree.ElementTree as ET
import fitz
import os
import re
import pdfplumber
import pandas as pd

1. Extraer XML de los PDF

In [19]:
def extract_embedded_xml_from_pdf(pdf_path, out_dir="resultados"):
    """Write every file embedded in a PDF into ``out_dir`` with an ``.xml`` name.

    The output name is derived from the PDF file name, not from the name
    stored inside the PDF: ``<pdf>.xml`` when there is a single attachment,
    ``<pdf>_1.xml``, ``<pdf>_2.xml``, ... when there are several. Existing
    files with the same name are overwritten.

    Note that attachments are not inspected: whatever is embedded is written
    out with an ``.xml`` extension.

    Args:
        pdf_path: Path of the PDF to inspect.
        out_dir: Directory that receives the extracted files; created if
            it does not exist.

    Returns:
        List of the paths written, in attachment order. Empty list (and a
        ``[SIN XML]`` message on stdout) when the PDF has no attachments.
    """
    os.makedirs(out_dir, exist_ok=True)
    doc = fitz.open(pdf_path)
    # try/finally guarantees the document handle is released even when
    # reading an attachment or writing it to disk fails.
    try:
        embedded_count = doc.embfile_count()

        # Nothing attached: report it and stop early.
        if embedded_count == 0:
            print(f"[SIN XML] {os.path.basename(pdf_path)} no tiene XML adjuntos")
            return []

        # PDF file name without its extension; used as the output stem.
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]

        xml_paths = []
        for i in range(embedded_count):
            # embfile_get returns the attachment's raw bytes.
            data = doc.embfile_get(i)

            # Add a 1-based suffix only when several attachments would
            # otherwise collide on the same output name.
            if embedded_count > 1:
                name = f"{base_name}_{i+1}.xml"
            else:
                name = f"{base_name}.xml"

            out_path = os.path.join(out_dir, name)
            with open(out_path, "wb") as f:
                f.write(data)
            xml_paths.append(out_path)

        return xml_paths
    finally:
        doc.close()

# Usage: pull the embedded XML out of every PDF found in a folder.

pdf_folder = "."
all_xml = []
for f in os.listdir(pdf_folder):
    # Skip anything that is not a PDF (extension match is case-insensitive).
    if not f.lower().endswith(".pdf"):
        continue
    all_xml.extend(extract_embedded_xml_from_pdf(os.path.join(pdf_folder, f)))


[SIN XML] factura_001.pdf no tiene XML adjuntos
[SIN XML] factura_002.pdf no tiene XML adjuntos
[SIN XML] factura_003.pdf no tiene XML adjuntos
[SIN XML] temp_factura.pdf no tiene XML adjuntos


In [17]:
# Show the XML paths collected by the extraction loop.
all_xml

[]

2. Extraer contenido de XML

In [None]:
def parse_invoice_xml(path):
    """Parse one invoice XML file into a summary dict and per-item rows.

    The fields are read from these paths relative to the root element:
    ``Header/{DocumentNumber,IssueDate,Currency}``,
    ``Supplier/{RUC,Name,Address}``,
    ``Customer/{DocumentType,DocumentNumber,Name,Address}``,
    ``Summary/{SubTotal,Discounts,TaxTotal,GrandTotal}`` and one
    ``Items/Item`` per invoice line.

    Missing or empty text fields become ``None``; missing or empty numeric
    fields (including ``Quantity`` and ``Tax``) become ``0.0``.

    Args:
        path: Path of the XML file to parse.

    Returns:
        ``(summary, detail_rows)`` where ``summary`` is a dict with header,
        totals, supplier and customer fields, and ``detail_rows`` is a list
        with one dict per ``Item`` (header, line, supplier and customer
        fields).

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        ValueError: if a numeric field contains non-numeric text.
    """
    # NOTE(review): ElementTree is not hardened against maliciously crafted
    # XML; confirm these files come from a trusted source.
    root = ET.parse(path).getroot()

    def text_of(parent, tag):
        """Stripped text of ``parent.find(tag)``; None if absent or empty."""
        el = parent.find(tag)
        return el.text.strip() if el is not None and el.text else None

    def number_of(parent, tag):
        """Text of ``parent.find(tag)`` as float; 0.0 if absent or empty."""
        return float(text_of(parent, tag) or 0.0)

    def attr_of(el, key):
        """Attribute ``key`` of ``el``; None if the element or key is absent."""
        return el.get(key) if el is not None else None

    # Header
    doc = text_of(root, "./Header/DocumentNumber")
    date = text_of(root, "./Header/IssueDate")
    ccy = text_of(root, "./Header/Currency")

    # Issuer
    supplier = {
        "supplier_ruc": text_of(root, "./Supplier/RUC"),
        "supplier_name": text_of(root, "./Supplier/Name"),
        "supplier_address": text_of(root, "./Supplier/Address"),
    }

    # Customer
    customer = {
        "customer_doc_type": text_of(root, "./Customer/DocumentType"),
        "customer_doc_number": text_of(root, "./Customer/DocumentNumber"),
        "customer_name": text_of(root, "./Customer/Name"),
        "customer_address": text_of(root, "./Customer/Address"),
    }

    # Summary
    summary = {
        # Same key the detail rows use, so both outputs can be joined on it.
        "document_number": doc,
        # Misspelled key kept only for backward compatibility.
        # NOTE(review): drop once no caller reads it.
        "documente_number": doc,
        "issue_date": date,
        "currency": ccy,
        "subtotal": number_of(root, "./Summary/SubTotal"),
        "discounts": number_of(root, "./Summary/Discounts"),
        "tax_total": number_of(root, "./Summary/TaxTotal"),
        "grand_total": number_of(root, "./Summary/GrandTotal"),
    }
    summary.update(supplier)
    summary.update(customer)

    # Item detail
    detail_rows = []
    for it in root.findall("./Items/Item"):
        # Looked up once because both text and attributes are read; either
        # element may be missing, which attr_of/number_of tolerate.
        qty_el = it.find("Quantity")
        tax_el = it.find("Tax")

        line = {
            "document_number": doc,
            "issue_date": date,
            "currency": ccy,
            # NOTE(review): sample runs show this is always None; the XML
            # probably names the line id differently. Confirm against the
            # actual schema.
            "line_id": text_of(it, "LineID"),
            "description": text_of(it, "Description"),
            "quantity": number_of(it, "Quantity"),
            "unit": attr_of(qty_el, "unit"),
            "unit_price": number_of(it, "UnitPrice"),
            "line_extension_amount": number_of(it, "LineExtensionAmount"),
            "tax_type": attr_of(tax_el, "type"),
            "tax_rate": float(attr_of(tax_el, "rate") or 0.0),
            "tax_amount": number_of(it, "Tax"),
            "line_total": number_of(it, "LineTotal"),
        }
        # Repeat supplier and customer fields on every row.
        line.update(supplier)
        line.update(customer)
        detail_rows.append(line)
    return summary, detail_rows
# Parse every XML file in the folder, accumulating one summary per invoice
# and a flat list of item rows across all invoices.
xml_folder = "."
summaries = []
details = []

for fname in os.listdir(xml_folder):
    # Skip anything that is not an XML file (case-insensitive match).
    if not fname.lower().endswith(".xml"):
        continue
    print(fname)
    s, d = parse_invoice_xml(os.path.join(xml_folder, fname))
    summaries.append(s)
    details.extend(d)

factura_001.xml
factura_002.xml


In [24]:
# Show the item rows gathered from all parsed XML files.
details

[{'document_number': 'F001-00001234',
  'issue_date': '2025-08-15',
  'currency': 'PEN',
  'line_id': None,
  'description': 'Licencia Software Pro',
  'quantity': 2.0,
  'unit': 'NIU',
  'unit_price': 120.0,
  'line_extension_amount': 240.0,
  'tax_type': 'IGV',
  'tax_rate': 0.18,
  'tax_amount': 43.2,
  'line_total': 283.2,
  'supplier_ruc': '20123456789',
  'supplier_name': 'Comercial Andina S.A.C.',
  'supplier_address': 'Av. Principal 123, Lima, PE',
  'customer_doc_type': 'RUC',
  'customer_doc_number': '20678901234',
  'customer_name': 'Servicios Integrales del Pacífico S.A.',
  'customer_address': 'Jr. Los Olivos 456, Lima, PE'},
 {'document_number': 'F001-00001234',
  'issue_date': '2025-08-15',
  'currency': 'PEN',
  'line_id': None,
  'description': 'Servicio de Implementación',
  'quantity': 1.0,
  'unit': 'ZZ',
  'unit_price': 350.0,
  'line_extension_amount': 350.0,
  'tax_type': 'IGV',
  'tax_rate': 0.18,
  'tax_amount': 63.0,
  'line_total': 413.0,
  'supplier_ruc': '2