# LayoutLMv3 Experiment
Evaluating LayoutLMv3 for extracting key data fields from invoices.



# config

note: pytesseract is necesseriy for running the LayoutLMv3 Processer!


In [1]:
import os
from transformers import LayoutLMv3ForQuestionAnswering, AutoTokenizer, LayoutLMv3Processor, LayoutLMv3FeatureExtractor
from PIL import Image

invoice_file = ["/content/BRE-03_page1.png"] # add your png file path here

processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlmv3-base")

Some weights of LayoutLMv3ForQuestionAnswering were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['qa_outputs.dense.bias', 'qa_outputs.dense.weight', 'qa_outputs.out_proj.bias', 'qa_outputs.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install pytesseract

# Picture pre processing and OCR

In [2]:
image_path = invoice_file[0]

try:
    # Open and convert the image to RGB
    image = Image.open(image_path).convert("RGB")

    # Process the image using the loaded processor
    inputs = processor(images=image, return_tensors="pt")

    # Use the loaded model to get outputs
    outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Determine the start and end indices
    start_index = start_logits.argmax()
    end_index = end_logits.argmax()

    # Get input_ids from processed inputs
    input_ids = inputs["input_ids"][0]

    # Decode the tokens
    extracted_text = tokenizer.decode(input_ids[start_index:end_index+1])

    # Print the extracted text
    print("Extracted Text from Image:")
    print(extracted_text)

except FileNotFoundError:
    print(f"Error: File not found at {image_path}")
except Exception as e:
    print(f"An error occurred: {e}")



Extracted Text from Image:



# End-to-End Question Answering

In [5]:
# Define the questions
questions = [
    "What is the total amount?",
    "Who is the recipient?"
]

extracted_answers = {}
image_path = invoice_file[0]

try:
    # Open and convert the image to RGB
    image = Image.open(image_path).convert("RGB")

    for question in questions:
        # Prepare inputs for the model
        inputs = processor(images=image, text=question, return_tensors="pt", padding="max_length", truncation=True)

        # Get model outputs
        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Find the answer span
        start_index = start_logits.argmax()
        end_index = end_logits.argmax()

        # Decode the answer
        answer_tokens = inputs["input_ids"][0, start_index : end_index + 1]
        answer = tokenizer.decode(answer_tokens)

        # Store answers associated with the image path and question
        if image_path not in extracted_answers:
            extracted_answers[image_path] = {}
        extracted_answers[image_path][question] = answer

except FileNotFoundError:
    print(f"Error: The file {image_path} was not found.")
except Exception as e:
    print(f"An error occurred with file {image_path}: {e}")

# Print the extracted answers
print("\nExtracted Answers:")
if extracted_answers:
    for image_path, answers in extracted_answers.items():
        print(f"Answers for {image_path}:")
        for question, answer in answers.items():
            print(f"  Question: {question}")
            print(f"  Answer: {answer}")
else:
    print("No answers were extracted.")




Extracted Answers:
Answers for /content/BRE-03_page1.png:
  Question: What is the total amount?
  Answer: 
  Question: Who is the recipient?
  Answer: 


# Turn Text into Lines with Postions

In [4]:
# Third Main Section: Text Chunking and Bounding Boxes

image_path = invoice_file[0] # Define the image path

try:
    # Open and convert the image to RGB
    image = Image.open(image_path).convert("RGB")

    # Using the processor to get encoding which includes bbox and text
    encoding = processor(images=image, return_tensors="pt")

    # Extract tokens, bounding boxes, and text
    tokens = encoding.input_ids[0]
    bboxes = encoding.bbox[0]
    words = processor.tokenizer.batch_decode(tokens, skip_special_tokens=True)

    # Create a list of dictionaries for each token
    token_info = []
    for token, bbox, word in zip(tokens, bboxes, words):
        # Filter out empty words or special tokens if necessary
        if word.strip() and token not in processor.tokenizer.all_special_ids:
             token_info.append({"text": word, "bbox": bbox.tolist(), "token_id": token.item()})

    # Sort tokens primarily by y-coordinate to group by line
    token_info_sorted_y = sorted(token_info, key=lambda x: x["bbox"][1])

    # Group tokens into lines based on vertical proximity
    line_tolerance = 10 # Pixels; adjust as needed

    sorted_lines = [] # List of tuples (y_coordinate, list_of_tokens_in_line)
    current_line = []
    current_y = -1

    for token in token_info_sorted_y:
        # Use the y-coordinate of the top-left corner for grouping
        token_y = token["bbox"][1]

        if not current_line:
            current_line.append(token)
            current_y = token_y
        elif abs(token_y - current_y) < line_tolerance:
            current_line.append(token)
            # Update current_y to the average y of the line for better grouping
            current_y = sum(t["bbox"][1] for t in current_line) / len(current_line)
        else:
            # Sort tokens within the line by x-coordinate
            current_line_sorted_x = sorted(current_line, key=lambda x: x["bbox"][0])
            sorted_lines.append((current_y, current_line_sorted_x))
            current_line = [token]
            current_y = token_y

    # Add the last line
    if current_line:
         current_line_sorted_x = sorted(current_line, key=lambda x: x["bbox"][0])
         sorted_lines.append((current_y, current_line_sorted_x))

    print(f"Processed image and grouped tokens into {len(sorted_lines)} lines.")

    # Split lines into Chunks
    chunk_gap_threshold = 40   # Pixel; adjust as needed or calculate dynamically

    chunks = []  # List of lists; each inner list contains the tokens of a chunk

    for y, line_tokens in sorted_lines:
        # Sort tokens in the line left->right
        line_tokens_sorted = sorted(line_tokens, key=lambda x: x["bbox"][0])
        if not line_tokens_sorted:
            continue

        current_chunk = [line_tokens_sorted[0]]

        for tok in line_tokens_sorted[1:]:
            prev_right = current_chunk[-1]["bbox"][2]   # right edge of the last token
            gap       = tok["bbox"][0] - prev_right     # distance to the next token's start

            # If gap is too large, start a new chunk
            if gap > chunk_gap_threshold:
                chunks.append(current_chunk)
                current_chunk = [tok]
            else:
                current_chunk.append(tok)

        # Append the last chunk of the line
        chunks.append(current_chunk)

    # Convert tokens to Text + Collective BBox per Chunk
    chunk_objects = []
    for chunk in chunks:
        # Assemble raw text; .strip() removes leading/trailing whitespace
        chunk_text = "".join(t["text"] for t in chunk).strip()
        if not chunk_text:          # skip empty chunks
            continue

        # Collective BBox of the chunk
        min_x = min(t["bbox"][0] for t in chunk)
        min_y = min(t["bbox"][1] for t in chunk)
        max_x = max(t["bbox"][2] for t in chunk)
        max_y = max(t["bbox"][3] for t in chunk)

        chunk_objects.append(
            {"text": chunk_text,
             "bbox": [min_x, min_y, max_x, max_y]}
        )

    # Print the identified chunks
    print("\nGefundene Chunks:")
    for obj in chunk_objects:
        print(obj)

except FileNotFoundError:
    print(f"Error: File not found at {image_path}")
except Exception as e:
    print(f"Error processing file {image_path}: {e}")

Processed image and grouped tokens into 33 lines.

Gefundene Chunks:
{'text': 'USTERMANA', 'bbox': [239, 78, 333, 86]}
{'text': 'Mustermann GmbH.', 'bbox': [635, 113, 782, 123]}
{'text': 'Test 123', 'bbox': [633, 130, 694, 139]}
{'text': 'Abs.: Mustermann GmbH. | HauptstraBe 123 | 5020 Salzburg', 'bbox': [106, 143, 458, 153]}
{'text': '5020 Salzburg Osterreich', 'bbox': [634, 144, 821, 157]}
{'text': 'Tel.:+43 1234 123456', 'bbox': [633, 161, 790, 170]}
{'text': 'Herr Dr. Hubert Brinkmann', 'bbox': [122, 184, 318, 193]}
{'text': 'office@mustermann.com', 'bbox': [627, 176, 815, 195]}
{'text': 'Hauptstrasse 125/7/3', 'bbox': [122, 202, 278, 213]}
{'text': '83395 BERLIN', 'bbox': [121, 219, 222, 228]}
{'text': 'DEUTSCHLAND', 'bbox': [122, 237, 234, 245]}
{'text': 'Rechnung: Re-2/2015', 'bbox': [740, 270, 898, 281]}
{'text': 'Datum: 12.03.2015', 'bbox': [762, 288, 898, 296]}
{'text': 'Kundennummer: 11', 'bbox': [753, 305, 894, 314]}
{'text': 'Rechnung', 'bbox': [443, 326, 566, 345]}
{'text