In [None]:
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import xml.etree.ElementTree as ET
import fitz
import os
import re
import pdfplumber
import pandas as pd

1. Extraer los XML incrustados en los PDF

In [4]:
def extract_embedded_xml_from_pdf(pdf_path, out_dir="resultados"):
    """Write every file embedded in a PDF to ``out_dir`` and return the paths.

    Each embedded file is saved under the PDF's base name with an ``.xml``
    extension (``<pdf>.xml``), or ``<pdf>_<n>.xml`` (1-based) when the PDF
    carries more than one embedded file. The embedded file's own name and type
    are not inspected: every attachment is written as ``.xml``.

    Args:
        pdf_path: Path of the PDF to inspect.
        out_dir: Folder that receives the extracted files; created if missing.

    Returns:
        List of written file paths, or an empty list (after printing a
        ``[SIN XML]`` notice) when the PDF has no embedded files.
    """
    os.makedirs(out_dir, exist_ok=True)
    doc = fitz.open(pdf_path)
    try:
        # Ask once; the count cannot change while we only read the document.
        embedded_count = doc.embfile_count()

        # Nothing embedded: report it and bail out.
        if embedded_count == 0:
            print(f"[SIN XML] {os.path.basename(pdf_path)} no tiene XML adjuntos")
            return []

        # Base name of the PDF without its extension.
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]

        xml_paths = []
        for i in range(embedded_count):
            # embfile_get returns the raw bytes of the i-th embedded file.
            data = doc.embfile_get(i)

            # Only add a numeric suffix when there are several attachments.
            if embedded_count > 1:
                name = f"{base_name}_{i+1}.xml"
            else:
                name = f"{base_name}.xml"

            out_path = os.path.join(out_dir, name)
            with open(out_path, "wb") as f:
                f.write(data)
            xml_paths.append(out_path)

        return xml_paths
    finally:
        # Release the document even if reading or writing an attachment fails.
        doc.close()

# Usage: extract the embedded XML of every PDF found in a folder.

pdf_folder = "."
all_xml = []
for f in os.listdir(pdf_folder):
    # Skip anything that is not a PDF (extension check is case-insensitive).
    if not f.lower().endswith(".pdf"):
        continue
    all_xml.extend(extract_embedded_xml_from_pdf(os.path.join(pdf_folder, f)))


[SIN XML] factura_001.pdf no tiene XML adjuntos
[SIN XML] factura_002.pdf no tiene XML adjuntos
[SIN XML] factura_003.pdf no tiene XML adjuntos
[SIN XML] temp_factura.pdf no tiene XML adjuntos


In [17]:
# Display the list of XML paths collected by the PDF scan.
all_xml

[]

2. Extraer contenido de XML

In [6]:
def parse_invoice_xml(path):
    """Parse an invoice XML file into a summary record and its item lines.

    The document is read relative to its root element using the paths
    ``Header/*``, ``Supplier/*``, ``Customer/*``, ``Summary/*`` and
    ``Items/Item``.

    Text fields that are missing or empty become ``None``; numeric fields that
    are missing or empty become ``0.0``. This also holds for an item's
    ``Quantity`` and ``Tax`` elements and the ``rate`` attribute of ``Tax``,
    so an incomplete item no longer aborts the whole parse.

    Args:
        path: Path (or file object) accepted by ``ET.parse``.

    Returns:
        ``(summary, detail_rows)`` where ``summary`` is a dict with the header,
        totals, supplier and customer fields, and ``detail_rows`` is a list
        with one dict per ``Item``; each line repeats the document number,
        date, currency, supplier and customer fields.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        ValueError: if a numeric field holds non-numeric text.
    """
    root = ET.parse(path).getroot()

    def text_of(parent, tag):
        """Stripped text of ``parent.find(tag)``, or None when absent/empty."""
        el = parent.find(tag)
        return el.text.strip() if el is not None and el.text else None

    def number_of(raw):
        """Float value of an optional numeric string; None/blank counts as 0.0."""
        raw = raw.strip() if raw else ""
        return float(raw) if raw else 0.0

    def attrs_of(parent, tag):
        """Attribute mapping of ``parent.find(tag)``, or {} when absent."""
        el = parent.find(tag)
        return el.attrib if el is not None else {}

    # Header
    doc = text_of(root, "./Header/DocumentNumber")
    date = text_of(root, "./Header/IssueDate")
    ccy = text_of(root, "./Header/Currency")

    # Supplier (issuer)
    supplier = {
        "supplier_ruc": text_of(root, "./Supplier/RUC"),
        "supplier_name": text_of(root, "./Supplier/Name"),
        "supplier_address": text_of(root, "./Supplier/Address"),
    }

    # Customer
    customer = {
        "customer_doc_type": text_of(root, "./Customer/DocumentType"),
        "customer_doc_number": text_of(root, "./Customer/DocumentNumber"),
        "customer_name": text_of(root, "./Customer/Name"),
        "customer_address": text_of(root, "./Customer/Address"),
    }

    # Summary: key order matters because it becomes the DataFrame column order.
    summary = {
        "document_number": doc,
        "issue_date": date,
        "currency": ccy,
        "subtotal": number_of(text_of(root, "./Summary/SubTotal")),
        "discounts": number_of(text_of(root, "./Summary/Discounts")),
        "tax_total": number_of(text_of(root, "./Summary/TaxTotal")),
        "grand_total": number_of(text_of(root, "./Summary/GrandTotal")),
    }
    summary.update(supplier)
    summary.update(customer)

    # Item lines
    detail_rows = []
    for it in root.findall("./Items/Item"):
        quantity_attrs = attrs_of(it, "Quantity")
        tax_attrs = attrs_of(it, "Tax")

        line = {
            "document_number": doc,
            "issue_date": date,
            "currency": ccy,
            "line_id": text_of(it, "LineID"),
            "description": text_of(it, "Description"),
            "quantity": number_of(text_of(it, "Quantity")),
            "unit": quantity_attrs.get("unit"),
            "unit_price": number_of(text_of(it, "UnitPrice")),
            "line_extension_amount": number_of(text_of(it, "LineExtensionAmount")),
            "tax_type": tax_attrs.get("type"),
            "tax_rate": number_of(tax_attrs.get("rate")),
            "tax_amount": number_of(text_of(it, "Tax")),
            "line_total": number_of(text_of(it, "LineTotal")),
        }
        # Repeat the supplier/customer fields on every line so the detail
        # table is self-contained.
        line.update(supplier)
        line.update(customer)
        detail_rows.append(line)

    return summary, detail_rows
# Parse every XML invoice in the folder, collecting one summary per file and
# all item lines in a single flat list.
xml_folder = "."
summaries, details = [], []

for fname in os.listdir(xml_folder):
    # Extension check is case-insensitive; everything else is ignored.
    if not fname.lower().endswith(".xml"):
        continue
    print(fname)
    s, d = parse_invoice_xml(os.path.join(xml_folder, fname))
    summaries.append(s)
    details += d

factura_001.xml
factura_002.xml


In [7]:
# One row per invoice, ordered by date and then document number.
df_summary = (
    pd.DataFrame(summaries)
    .sort_values(by=["issue_date", "document_number"])
)
# One row per invoice line. NOTE: line_id is kept as text by the parser, so
# the last sort key orders lexicographically ("10" sorts before "2").
df_detail = (
    pd.DataFrame(details)
    .sort_values(by=["issue_date", "document_number", "line_id"])
)



In [15]:
# Preview the first five summary rows (df_detail can be previewed the same way).
print(df_summary.head(5))



  document_number  issue_date currency  subtotal  discounts  tax_total  \
0   F001-00001234  2025-08-15      PEN     590.0        0.0     106.20   
1   F001-00001235  2025-08-20      PEN    1241.2       50.0     214.42   

   grand_total supplier_ruc            supplier_name  \
0       696.20  20123456789  Comercial Andina S.A.C.   
1      1405.62  20123456789  Comercial Andina S.A.C.   

              supplier_address customer_doc_type customer_doc_number  \
0  Av. Principal 123, Lima, PE               RUC         20678901234   
1  Av. Principal 123, Lima, PE               DNI            45678901   

                            customer_name              customer_address  
0  Servicios Integrales del Pacífico S.A.  Jr. Los Olivos 456, Lima, PE  
1                              Juan Pérez       Mz. B Lt. 7, Callao, PE  


In [16]:
# Persist both tables as CSV (without the index column).
# The output folder is created here too: otherwise it only exists as a side
# effect of extract_embedded_xml_from_pdf having been called at least once.
os.makedirs("resultados", exist_ok=True)
df_summary.to_csv("resultados/facturas_resumen.csv", index=False)
df_detail.to_csv("resultados/facturas_detalles.csv", index=False)

3. Extraer componentes de PDF que no tienen XML incrustado

In [2]:
# Path of the PDF to read
pdf_path = "factura_003.pdf"

# Extract the text of every page, one trailing newline per page.
# extract_text() can yield None for a page without extractable text (depending
# on the pdfplumber version), so fall back to "" instead of failing on
# None + "\n".
with pdfplumber.open(pdf_path) as pdf:
    texto = "".join((pagina.extract_text() or "") + "\n" for pagina in pdf.pages)

print("Texto Extraido:")
print(texto[:500])  # show only the first 500 characters

Texto Extraido:
COMERCIAL ANDINA S.A.C. FACTURA ELECTRÓNICA
RUC: 20123456789 F001-00001236
Cliente: Logística Andina EIRL
RUC/DNI: 20543210987
Cant. Descripción P. Unit IGV Total
10 Paquete de Soporte Anual 25.00 43.62 285.93
2 Servicio de Capacitación (Exonerado) 200.00 0.00 387.69
TOTAL: 673.62 USD



In [4]:
# Regular expressions for the header fields. Each pattern has exactly one
# capture group holding the value; every pattern is searched with
# re.IGNORECASE, so [A-Z] below also matches lowercase letters.
patrones ={
    # "RUC", optional colon, then the digits
    "RUC": r"RUC\s*:?\s*(\d+)",
    # Series-number such as F001-00001236 or B001-123
    "Serie_Numero" : r"(F\d{3}-\d+|B\d{3}-\d+)",
    # dd/mm/yyyy
    "Fecha" : r"(\d{2}/\d{2}/\d{4})",
    # "TOTAL"/"IGV", optional colon, any run of whitespace, "S", "/" or "."
    # (e.g. a "S/." prefix), then an amount with two decimals
    "Total" : r"TOTAL\s*:?[\sS/.]*([\d,]+\.\d{2})",
    "IGV" : r"IGV\s*:?[\sS/.]*([\d,]+\.\d{2})",
    # Letters/whitespace followed by S.A.C. or S.R.L.
    "Razon_Social" : r"([A-Z\s]+S\.A\.C\.|[A-Z\s]+S\.R\.L\.)"
}

# Keep the first match of each pattern, or None when the field is absent.
# The captured text is stripped because the \s inside the Razon_Social class
# lets a match start with the newline/space that precedes the company name.
datos = {}
for campo, regex in patrones.items():
    match = re.search(regex, texto, re.IGNORECASE)
    datos[campo] = match.group(1).strip() if match else None

# Convert to a single-row DataFrame
df = pd.DataFrame([datos])
print("\nDataFrame con los datos extraidos:")
print(df)



DataFrame con los datos extraidos:
           RUC   Serie_Numero Fecha   Total   IGV             Razon_Social
0  20123456789  F001-00001236  None  673.62  None  COMERCIAL ANDINA S.A.C.


In [5]:
# Extract the item rows of the invoice table.
# A row is: integer quantity, description, then three amounts with exactly two
# decimals (unit price, IGV, total). The amounts may contain "," characters.
patron = r"(\d+)\s+(.+?)\s+([\d,]+\.\d{2})\s+([\d,]+\.\d{2})\s+([\d,]+\.\d{2})"

matches = re.findall(patron, texto)

# Convert to a DataFrame, one row per matched item line
df_detalle = pd.DataFrame(matches, columns=["Cantidad", "Descripcion", "P.Unit", "IGV", "Total"])

# Convert the numeric columns. The pattern accepts "," inside amounts
# (e.g. "1,250.00"), which pd.to_numeric cannot parse, so drop it first.
columnas_numericas = ["Cantidad", "P.Unit", "IGV", "Total"]
df_detalle[columnas_numericas] = df_detalle[columnas_numericas].apply(
    lambda columna: pd.to_numeric(columna.str.replace(",", "", regex=False))
)
print(df_detalle)

   Cantidad                           Descripcion  P.Unit    IGV   Total
0        10              Paquete de Soporte Anual    25.0  43.62  285.93
1         2  Servicio de Capacitación (Exonerado)   200.0   0.00  387.69


4. Escritura de un PDF basado en un XML

In [None]:
def xml_to_pdf(xml_file, pdf_file):
    """Render an invoice XML file as a simple A4 PDF.

    The PDF shows the supplier name and RUC, the document number, the customer
    name and document number, one table row per ``Items/Item`` (quantity,
    description, unit price, tax, line total) and the grand total with its
    currency.

    Differences from a naive one-page rendering:
      * the supplier name is read from ``Supplier/Name`` (falling back to the
        previous fixed text when the element is missing);
      * missing or empty XML nodes are drawn as empty text instead of raising;
      * the folder of ``pdf_file`` is created when it does not exist;
      * item rows continue on a new page before reaching the totals line.

    Args:
        xml_file: Path of the invoice XML to read.
        pdf_file: Destination passed to ``canvas.Canvas`` (normally a path).

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml_file`` is not well-formed.
    """
    root = ET.parse(xml_file).getroot()

    def text_at(node, path, default=""):
        """Stripped text of ``node.find(path)``, or ``default`` if absent/empty."""
        el = node.find(path)
        return el.text.strip() if el is not None and el.text else default

    # Only path-like destinations have a folder to create; anything else is
    # handed to the canvas untouched.
    if isinstance(pdf_file, (str, os.PathLike)):
        out_dir = os.path.dirname(pdf_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Create the PDF
    c = canvas.Canvas(pdf_file, pagesize=A4)
    height = A4[1]

    # Supplier header
    c.setFont("Helvetica-Bold", 14)
    c.drawString(50, height - 50, text_at(root, "./Supplier/Name", "Comercial Andina SAC"))
    c.setFont("Helvetica", 10)
    c.drawString(50, height - 65, "RUC: " + text_at(root, "./Supplier/RUC"))

    # Document type and number
    c.setFont("Helvetica-Bold", 12)
    c.drawString(350, height - 50, "FACTURA ELECTRONICA")
    c.drawString(350, height - 65, text_at(root, "./Header/DocumentNumber"))

    # Customer
    c.setFont("Helvetica", 10)
    c.drawString(50, height - 100, "Cliente: " + text_at(root, "./Customer/Name"))
    c.drawString(50, height - 115, "RUC/DNI: " + text_at(root, "./Customer/DocumentNumber"))

    def draw_table_header(y):
        """Draw the column titles at height ``y``; return the first row's y."""
        c.drawString(50, y, "Cant. ")
        c.drawString(100, y, "Descripción")
        c.drawString(300, y, "P. Unit")
        c.drawString(380, y, "IGV")
        c.drawString(450, y, "Total")
        return y - 20

    # Simple item table
    y = draw_table_header(height - 160)

    for it in root.findall("./Items/Item"):
        # Keep rows clear of the totals line drawn at y=100: continue on a
        # fresh page instead. showPage() resets the font, so set it again.
        if y < 130:
            c.showPage()
            c.setFont("Helvetica", 10)
            y = draw_table_header(height - 50)

        c.drawString(50, y, text_at(it, "Quantity"))
        c.drawString(100, y, text_at(it, "Description"))
        c.drawRightString(340, y, text_at(it, "UnitPrice"))
        c.drawRightString(420, y, text_at(it, "Tax"))
        c.drawRightString(500, y, text_at(it, "LineTotal"))
        y -= 20

    # Totals (on the last page)
    c.setFont("Helvetica-Bold", 12)
    total_text = (
        "Total: "
        + text_at(root, "./Summary/GrandTotal")
        + " "
        + text_at(root, "./Header/Currency")
    )
    c.drawRightString(500, 100, total_text)

    c.save()
# Example run: render factura_001.xml as a PDF under the "resultados" folder.
xml_to_pdf("factura_001.xml","resultados/factura_001_xml_a_pdf.pdf")
