In [None]:
pip install transformers datasets torch torchvision opencv-python pytesseract

In [None]:
!sudo apt-get install tesseract-ocr -y


In [None]:
!apt update
!apt install -y tesseract-ocr
!apt install -y libtesseract-dev


In [4]:
import os
import cv2
import xml.etree.ElementTree as ET
from tqdm import tqdm
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import pytesseract
import torch
from PIL import Image

os.environ["WANDB_DISABLED"] = "true"

In [5]:
# Directories
IMAGE_DIR = "/content/drive/MyDrive/DATASET/Paredes - Reglas generales"
ANNOTATION_DIR = "/content/drive/MyDrive/DATASET/photo_anna"

# Label mapping from your dataset (for now we assume one class "text")
LABEL2ID = {"background": 0, "text": 1}
ID2LABEL = {0: "background", 1: "text"}

In [6]:
# Load processor and model
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    num_labels=2,
    id2label=ID2LABEL,
    label2id=LABEL2ID
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def parse_xml(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    filename = root.find("filename").text
    boxes = []

    for obj in root.findall("object"):
        label = obj.find("name").text
        bbox = obj.find("bndbox")
        x1 = int(bbox.find("xmin").text)
        y1 = int(bbox.find("ymin").text)
        x2 = int(bbox.find("xmax").text)
        y2 = int(bbox.find("ymax").text)
        boxes.append({
            "bbox": [x1, y1, x2, y2],
            "label": LABEL2ID.get(label, 1)
        })
    return filename, boxes

In [8]:
def match_tokens_to_labels(image_path, annotations):
    image = Image.open(image_path).convert("RGB")
    width, height = image.size

    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    words, boxes, labels = [], [], []

    for i in range(len(ocr_data["text"])):
        word = ocr_data["text"][i].strip()
        if not word:
            continue

        # Original bbox in pixel units
        x1 = ocr_data["left"][i]
        y1 = ocr_data["top"][i]
        x2 = x1 + ocr_data["width"][i]
        y2 = y1 + ocr_data["height"][i]

        # Normalize bbox to 0–1000
        norm_x1 = int(1000 * x1 / width)
        norm_y1 = int(1000 * y1 / height)
        norm_x2 = int(1000 * x2 / width)
        norm_y2 = int(1000 * y2 / height)

        # Clamp to [0, 1000]
        norm_x1 = min(max(norm_x1, 0), 1000)
        norm_y1 = min(max(norm_y1, 0), 1000)
        norm_x2 = min(max(norm_x2, 0), 1000)
        norm_y2 = min(max(norm_y2, 0), 1000)

        # Default label is "O" (1)
        label_id = 1

        for ann in annotations:
            ax1, ay1, ax2, ay2 = ann["bbox"]
            if x1 >= ax1 and y1 >= ay1 and x2 <= ax2 and y2 <= ay2:
                if isinstance(ann["label"], str):
                    label_str = ann["label"].lower()
                    label_id = LABEL2ID.get(label_str, 1)
                else:
                    label_id = ann["label"]
                break

        words.append(word)
        boxes.append([norm_x1, norm_y1, norm_x2, norm_y2])
        labels.append(label_id)

    if not words:
        return None

    inputs = processor(image, words, boxes=boxes, word_labels=labels,
                       truncation=True, padding="max_length", max_length=512, return_tensors="pt",
                       return_overflowing_tokens=False)

    return {
        "input_ids": inputs.input_ids.squeeze(0),
        "attention_mask": inputs.attention_mask.squeeze(0),
        "bbox": inputs.bbox.squeeze(0),
        "labels": inputs.labels.squeeze(0),
        "image_path": image_path
    }


In [9]:
data = []
for xml_file in tqdm(os.listdir(ANNOTATION_DIR)):
    if not xml_file.endswith(".xml"):
        continue
    xml_path = os.path.join(ANNOTATION_DIR, xml_file)
    image_filename, annotations = parse_xml(xml_path)
    image_path = os.path.join(IMAGE_DIR, image_filename)

    if not os.path.exists(image_path):
        continue

    result = match_tokens_to_labels(image_path, annotations)
    if result:
        data.append(result)

hf_dataset = Dataset.from_list(data)

100%|██████████| 57/57 [22:20<00:00, 23.51s/it]


In [1]:
import numpy as np

return_entity_level_metrics = False

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

In [3]:
args = TrainingArguments(
    output_dir="./layoutlmv3-xml",
    per_device_train_batch_size=2,
    num_train_epochs=15,
    logging_steps=10,
    save_steps=50,
    evaluation_strategy="no",
    save_total_limit=2,
    report_to="none"  # 👈 disables W&B and others
)

NameError: name 'TrainingArguments' is not defined

In [11]:
%env CUDA_LAUNCH_BLOCKING=1

env: CUDA_LAUNCH_BLOCKING=1


In [2]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=hf_dataset,
    compute_metrics=compute_metrics,
)

NameError: name 'Trainer' is not defined

In [13]:
trainer.train()



Step,Training Loss
10,0.0619
20,0.0001
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0




TrainOutput(global_step=145, training_loss=0.004284400099804143, metrics={'train_runtime': 77.3195, 'train_samples_per_second': 3.686, 'train_steps_per_second': 1.875, 'total_flos': 75124156446720.0, 'train_loss': 0.004284400099804143, 'epoch': 5.0})

In [15]:
model.save_pretrained("/content/drive/MyDrive/DATASET/layoutlmv3-xml")
processor.save_pretrained("/content/drive/MyDrive/DATASET/layoutlmv3-xml")


[]