In [83]:
import os
from PIL import Image, ImageEnhance
import pytesseract
import cv2
import numpy as np
from pdf2image import convert_from_path
import pandas as pd
import tempfile
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

### Page segmentation modes:

0. Orientation and script detection (OSD) only.

1. Automatic page segmentation with OSD.

2. Automatic page segmentation, but no OSD, or OCR. (not implemented)

3. Fully automatic page segmentation, but no OSD. (Default)

4. Assume a single column of text of variable sizes.

5. Assume a single uniform block of vertically aligned text.

6. Assume a single uniform block of text.

7. Treat the image as a single text line.

8. Treat the image as a single word.

9. Treat the image as a single word in a circle.

10. Treat the image as a single character.

11. Sparse text. Find as much text as possible in no particular order.

12. Sparse text with OSD.

13. Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.

### OCR Engine modes:

0. Legacy engine only.
1. Neural nets LSTM engine only.
2. Legacy + LSTM engines.
3. Default, based on what is available.

In [84]:
ROIS_COORDENADAS = {
    'roi_importe': (335, 357, 193, 295),
    'roi_fecha': (335, 356, 86, 161),
    'roi_factura': (610, 628, 665, 736),
    'roi_empresa': (116, 135, 66, 281)
}

CONFIG = '--psm 8 --oem 3 --psm 7'

In [85]:
def preprocesar_imagen(ruta_imagen):
    #imagen = cv2.imread(ruta_imagen)
    img_pil = Image.open(ruta_imagen)
    enhancer = ImageEnhance.Contrast(img_pil)
    imagen = enhancer.enhance(1.5)


    enhancer = ImageEnhance.Sharpness(imagen)
    imagen = enhancer.enhance(1.5)
    img_cv = np.array(imagen)
    img_gris = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)

    umbral = cv2.threshold(
        img_gris, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    )[1]

    umbral = cv2.GaussianBlur(umbral, (3, 3), 0)

    cv2.imwrite('temp.png', umbral)
    return umbral
    

In [None]:
def resize_roi(roi):
    return cv2.resize(roi, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)


def extraer_texto_roi(imagen_umbral, roi_coordenadas, CONFIG):

    y1, y2, x1, x2 = roi_coordenadas
    roi = imagen_umbral[y1:y2, x1:x2]
    resized_roi = resize_roi(roi)
    return pytesseract.image_to_string(resized_roi, lang='spa', config=CONFIG)

In [87]:
def procesar_documento(ruta_documento):
    datos_facturas = []

    if ruta_documento.lower().endswith('.pdf'):
        paginas = convert_from_path(ruta_documento, dpi = 100)

        if not isinstance(paginas, list):
            paginas = [paginas]
        print(type(paginas))

        for i, pagina in enumerate(paginas):
            with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as temp_file:
                ruta_temp = temp_file.name
                pagina.save(ruta_temp, 'JPEG')

            umbral_img = preprocesar_imagen(ruta_temp)
            os.remove(ruta_temp)
            
            if umbral_img is not None:
                datos_pagina = {}
                for key, coords in ROIS_COORDENADAS.items():
                    datos_pagina[key.replace('roi_', 'text_')] = extraer_texto_roi(umbral_img, coords, CONFIG)
                datos_facturas.append(datos_pagina)

    else:
        umbral = preprocesar_imagen(ruta_documento)
        if umbral_img is not None:
            datos_pagina = {}
            for key, coords in ROIS_COORDENADAS.items():
                datos_pagina[key.replace('roi_', 'text_')] = extraer_texto_roi(umbral_img, coords, CONFIG)
            datos_facturas.append(datos_pagina)
        

    return datos_facturas

In [88]:
def guardar_csv(datos, nombre_csv):
    if not datos:
        print("No hay datos para guardar en el CSV.")
        return
    
    df = pd.DataFrame(datos)
    header = not os.path.exists(nombre_csv) or os.path.getsize(nombre_csv) == 0
    df.to_csv(nombre_csv, mode='a', header=header, index=False, encoding='utf-8')
    

In [None]:
carpeta_facturas = 'Facturas'
todos_los_datos_facturas = []


for nombre_archivo in os.listdir(carpeta_facturas):
    ruta_documento_actual = os.path.join(carpeta_facturas, nombre_archivo)

    if os.path.isfile(ruta_documento_actual) and (nombre_archivo.lower().endswith(('.pdf', '.png', '.jpg', '.jpeg'))):
        
        datos_factura = procesar_documento(ruta_documento_actual)
        for datos_pagina in datos_factura:
            datos_pagina['nombre_archivo_original'] = nombre_archivo 
            for clave, valor in datos_pagina.items():
                if isinstance(valor, str):
                    datos_pagina[clave] = valor.strip()
                print(f"  {clave}: {valor}")
            todos_los_datos_facturas.append(datos_pagina)
            
    
# Guardar en CSV
guardar_csv(todos_los_datos_facturas, 'factura_ejemplo.csv')

### OBTENER ROI INTERACTIVAMENTE

In [None]:
import cv2
import numpy as np
from pdf2image import convert_from_path

# Variables globales para rastrear el rectángulo
rect_start = None
rect_end = None
drawing = False

def click_event(event, x, y, flags, param):
    global rect_start, rect_end, drawing, img

    if event == cv2.EVENT_LBUTTONDOWN:
        if not drawing:
            # Primer clic: inicio del rectángulo
            rect_start = (x, y)
            drawing = True
            cv2.circle(img, rect_start, 5, (0, 255, 0), -1)
        elif rect_start is not None and drawing:
            # Segundo clic: fin del rectángulo
            rect_end = (x, y)
            drawing = False
            
            # Dibujar el rectángulo
            cv2.rectangle(img, rect_start, rect_end, (0, 255, 0), 2)
            
            # Calcular ancho y alto
            x1, y1 = rect_start
            x2, y2 = rect_end
            ancho = abs(x2 - x1)
            alto = abs(y2 - y1)
            
            # Mostrar coordenadas y tamaño
            print(f"Coordenadas: (x1={x1}, y1={y1}, x2={x2}, y2={y2})")
            print(f"Tamaño: ancho={ancho} píxeles, alto={alto} píxeles")
            
        cv2.imshow('Imagen', img)

# Ruta del PDF y Poppler
ruta_documento = 'FacturaSencilla.pdf'  # Ajusta según tu archivo

paginas = convert_from_path(ruta_documento, dpi= 100)
img = cv2.cvtColor(np.array(paginas[0]), cv2.COLOR_RGB2BGR)

cv2.imshow('Imagen', img)
cv2.setMouseCallback('Imagen', click_event)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
cv2.imshow('Imagen de pagina_0', resize_roi(roi_importe))

cv2.waitKey(0)

cv2.destroyAllWindows()

NameError: name 'roi_importe' is not defined