# Invoide reader

In [1]:
# Instalar bibliotecas necesarias
!pip install opencv-python-headless Pillow



In [2]:
import os

# Definir rutas
DATASET_PATH = "./train"
IMAGES_PATH = os.path.join(DATASET_PATH, "image_50")
LABELS_PATH = os.path.join(DATASET_PATH, "json_50")

# Verificar que las carpetas existan
print("Imágenes:", os.listdir(IMAGES_PATH)[:5])
print("Etiquetas:", os.listdir(LABELS_PATH)[:5])


Imágenes: ['receipt_00004.png', 'receipt_00003.png', 'receipt_00040.png', 'receipt_00036.png', 'receipt_00048.png']
Etiquetas: ['receipt_00009.json', 'receipt_00026.json', 'receipt_00020.json', 'receipt_00027.json', 'receipt_00010.json']


In [3]:
import json
import cv2
import os
from PIL import Image

# Configuración
TARGET_SIZE = (512, 512)  # Tamaño deseado para las imágenes
OUTPUT_PATH = "./preprocessed_dataset"
OUTPUT_IMAGES_PATH = os.path.join(OUTPUT_PATH, "images_50")
OUTPUT_LABELS_PATH = os.path.join(OUTPUT_PATH, "labels_50")

# Crear directorios para los datos preprocesados
os.makedirs(OUTPUT_IMAGES_PATH, exist_ok=True)
os.makedirs(OUTPUT_LABELS_PATH, exist_ok=True)

def preprocess_image(image_path):
    # Leer imagen con OpenCV
    img = cv2.imread(image_path)
    h, w, _ = img.shape

    # Escalar la imagen manteniendo proporción
    scale = min(TARGET_SIZE[0] / h, TARGET_SIZE[1] / w)
    new_h, new_w = int(h * scale), int(w * scale)
    resized_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # Crear imagen cuadrada con padding
    padded_img = cv2.copyMakeBorder(
        resized_img, 0, TARGET_SIZE[0] - new_h, 0, TARGET_SIZE[1] - new_w,
        cv2.BORDER_CONSTANT, value=(255, 255, 255))
    return padded_img, scale, (new_h, new_w)

def preprocess_annotations(label_path, scale, new_size):
    with open(label_path, "r") as f:
        data = json.load(f)

    # Ajustar coordenadas
    for line in data["valid_line"]:
        for word in line["words"]:
            quad = word["quad"]
            quad["x1"] = int(quad["x1"] * scale)
            quad["y1"] = int(quad["y1"] * scale)
            quad["x2"] = int(quad["x2"] * scale)
            quad["y2"] = int(quad["y2"] * scale)
            quad["x3"] = int(quad["x3"] * scale)
            quad["y3"] = int(quad["y3"] * scale)
            quad["x4"] = int(quad["x4"] * scale)
            quad["y4"] = int(quad["y4"] * scale)
    return data

# Proceso principal
for img_file in os.listdir(IMAGES_PATH):
    if img_file.endswith(".png") or img_file.endswith(".jpg"):
        # Preprocesar imagen
        img_path = os.path.join(IMAGES_PATH, img_file)
        preprocessed_img, scale, new_size = preprocess_image(img_path)

        # Guardar imagen preprocesada
        output_img_path = os.path.join(OUTPUT_IMAGES_PATH, img_file)
        cv2.imwrite(output_img_path, preprocessed_img)

        # Preprocesar anotaciones
        label_file = img_file.replace(".png", ".json").replace(".jpg", ".json")
        label_path = os.path.join(LABELS_PATH, label_file)
        if os.path.exists(label_path):  # Asegurarse de que exista un JSON para cada imagen
            preprocessed_annotations = preprocess_annotations(label_path, scale, new_size)

            # Guardar las anotaciones ajustadas
            output_label_path = os.path.join(OUTPUT_LABELS_PATH, label_file)
            with open(output_label_path, "w") as f:
                json.dump(preprocessed_annotations, f, indent=4)


In [4]:
# Revisar imágenes preprocesadas
print("Imágenes preprocesadas:", os.listdir(OUTPUT_IMAGES_PATH)[:5])

# Revisar etiquetas preprocesadas
print("Etiquetas preprocesadas:", os.listdir(OUTPUT_LABELS_PATH)[:5])


Imágenes preprocesadas: ['receipt_00004.png', 'receipt_00003.png', 'receipt_00040.png', 'receipt_00036.png', 'receipt_00048.png']
Etiquetas preprocesadas: ['receipt_00009.json', 'receipt_00026.json', 'receipt_00020.json', 'receipt_00027.json', 'receipt_00010.json']


In [6]:
!pip3 install transformers datasets torch torchvision



In [7]:
!pip install transformers datasets torch



In [8]:
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
import torch
from PIL import Image
import json
import os

# Inicializamos el procesador de LayoutLMv3
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")

# Si planeas usar un modelo preentrenado (por ejemplo, para Fine-Tuning)
# model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [13]:
from datasets import Dataset

def process_example(image_path, label_path):
    # Cargar imagen
    image = Image.open(image_path).convert("RGB")

    # Cargar anotaciones
    with open(label_path, "r") as f:
        data = json.load(f)

    # Procesar el texto y las coordenadas de las palabras
    words = []
    boxes = []
    word_labels = []

    for line in data["valid_line"]:
        for word in line["words"]:
            text = word["text"]
            quad = word["quad"]

            # Las coordenadas de las bounding boxes (x1, y1, x2, y2, etc.) deben ser en formato de lista
            boxes.append([quad["x1"], quad["y1"], quad["x2"], quad["y2"], quad["x3"], quad["y3"], quad["x4"], quad["y4"]])
            words.append(text)
            word_labels.append(word.get("is_key", 0))  # Puedes usar "is_key" o cualquier otra etiqueta según el caso

    # Usar el procesador de LayoutLMv3 para convertir imágenes y texto en un formato adecuado
    encoding = processor(image, words, boxes=boxes, padding="max_length", truncation=True, return_tensors="pt")

    return {
        "input_ids": encoding["input_ids"].squeeze(0),  # Eliminar la dimensión de batch
        "attention_mask": encoding["attention_mask"].squeeze(0),
        "bbox": encoding["bbox"].squeeze(0),
        "labels": torch.tensor(word_labels),  # Etiquetas de las palabras
    }

# Ejemplo de cómo procesar una imagen y su etiqueta
example_image_path = os.path.join(OUTPUT_IMAGES_PATH, "receipt_00000.png")
example_label_path = os.path.join(OUTPUT_LABELS_PATH, "receipt_00000.json")
processed_example = process_example(example_image_path, example_label_path)

# Verificar el resultado
print(processed_example)


ValueError: You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True.