In [2]:
input_path = "inputs/receipt_walmart.png"

# microsoft/layoutlmv3-base

In [None]:
pip install transformers datasets pytesseract torchvision torch Pillow

In [None]:
# Load path
from PIL import Image
input_path = "inputs/receipt_walmart.png"
image = Image.open(input_path).convert("RGB")

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("microsoft/layoutlmv3-base", torch_dtype="auto"),

In [None]:
import pytesseract

ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

words = []
boxes = []

for i in range(len(ocr_data['text'])):
    word = ocr_data['text'][i]
    if word.strip() == "":
        continue

    x, y, w, h = (ocr_data['left'][i], ocr_data['top'][i], ocr_data['width'][i], ocr_data['height'][i])
    words.append(word)
    boxes.append([x, y, x + w, y + h])

In [None]:
width, height = image.size

def normalize_bbox(box, width, height):
    return [
        int(1000 * box[0] / width),
        int(1000 * box[1] / height),
        int(1000 * box[2] / width),
        int(1000 * box[3] / height),
    ]

normalized_boxes = [normalize_bbox(b, width, height) for b in boxes]

In [None]:
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")

encoding = processor(
    image,
    words,
    boxes=normalized_boxes,
    return_tensors="pt",
    truncation=True,
    padding="max_length"
)

In [None]:
import torch

with torch.no_grad():
    outputs = model(**encoding)
    logits = outputs.logits

predictions = torch.argmax(logits, dim=2)

In [None]:
tokens = processor.tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
labels = predictions[0].tolist()

for token, label in zip(tokens, labels):
    print(f"{token} -> {label}")

# clovaai/donut

In [None]:
pip install transformers torchvision pytorch-lightning protobuf

In [1]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Choose the best-suited pretrained model
model_id = "naver-clova-ix/donut-base-finetuned-cord-v2"

processor = DonutProcessor.from_pretrained(model_id)
model = VisionEncoderDecoderModel.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


VisionEncoderDecoderModel(
  (encoder): DonutSwinModel(
    (embeddings): DonutSwinEmbeddings(
      (patch_embeddings): DonutSwinPatchEmbeddings(
        (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DonutSwinEncoder(
      (layers): ModuleList(
        (0): DonutSwinStage(
          (blocks): ModuleList(
            (0): DonutSwinLayer(
              (layernorm_before): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
              (attention): DonutSwinAttention(
                (self): DonutSwinSelfAttention(
                  (query): Linear(in_features=128, out_features=128, bias=True)
                  (key): Linear(in_features=128, out_features=128, bias=True)
                  (value): Linear(in_features=128, out_features=128, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )

In [3]:
image = Image.open(input_path).convert("RGB")

# Resize + normalize internally
pixel_values = processor(image, return_tensors="pt").pixel_values
pixel_values = pixel_values.to(device)

In [4]:
# Set task-specific prompt
prompt = "<s_cord-v2>"  # this varies based on fine-tuned model
decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)

# Generate
outputs = model.generate(
    pixel_values,
    decoder_input_ids=decoder_input_ids,
    max_length=512,
    early_stopping=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id
)

# Decode and parse output
output_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
structured_output = processor.token2json(output_text)

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [5]:
from pprint import pprint
pprint(structured_output)

{'menu': [{'cnt': {'unitprice': '7KZGFSL2FF'},
           'nm': 'See bock of receipt for your chance',
           'price': ')',
           'unitprice': '$1000'},
          {'cnt': '(',
           'nm': 'MANGGER MIKE MEDIUNGER',
           'price': '8755',
           'unitprice': '970'},
          {'cnt': {'unitprice': '00014組成 TE* 0270'},
           'nm': 'DUNANGO DEL RIO',
           'price': '0.95',
           'unitprice': '1165 S東北 DURAGO CO 81303'},
          {'cnt': '0.95',
           'nm': 'BEYERAGE',
           'price': '2.00 R',
           'unitprice': '003120oz075'},
          {'cnt': '2.00',
           'nm': 'OS R',
           'price': '0.96',
           'unitprice': '0674717820'},
          {'cnt': {'unitprice': '3.78'},
           'nm': 'STUBURY CC',
           'price': '3.78',
           'unitprice': '0.96'},
          {'cnt': '44503',
           'nm': 'HARAL SALAMI 0205336036與 F',
           'price': '3.98',
           'unitprice': '3.98'},
          {'cnt': '4',
        