In [50]:
import json
from datasets import load_dataset

In [51]:
from transformers import AutoProcessor

In [52]:
train = load_dataset("naver-clova-ix/cord-v2", split="train")
test = load_dataset("naver-clova-ix/cord-v2", split="test")
val = load_dataset("naver-clova-ix/cord-v2", split="validation")

In [53]:
import json

print(json.loads(train[0]["ground_truth"]).keys())
print("gt_parse keys: ", json.loads(train[0]["ground_truth"])["gt_parse"].keys())
print("meta keys: ", json.loads(train[0]["ground_truth"])["meta"].keys())
print("valid_line keys: ", json.loads(train[0]["ground_truth"])["valid_line"][0].keys())
print("roi keys: ", json.loads(train[0]["ground_truth"])["roi"].keys())

dict_keys(['gt_parse', 'meta', 'valid_line', 'roi', 'repeating_symbol', 'dontcare'])
gt_parse keys:  dict_keys(['menu', 'sub_total', 'total'])
meta keys:  dict_keys(['version', 'split', 'image_id', 'image_size'])
valid_line keys:  dict_keys(['words', 'category', 'group_id', 'sub_group_id'])
roi keys:  dict_keys([])


In [54]:
import json

print(json.loads(train[1]["ground_truth"]).keys())
print("gt_parse keys: ", json.loads(train[1]["ground_truth"])["gt_parse"].keys())
print("meta keys: ", json.loads(train[1]["ground_truth"])["meta"].keys())
print("valid_line keys: ", json.loads(train[1]["ground_truth"])["valid_line"][0].keys())
print("roi keys: ", json.loads(train[1]["ground_truth"])["roi"].keys())

dict_keys(['gt_parse', 'meta', 'valid_line', 'roi', 'repeating_symbol', 'dontcare'])
gt_parse keys:  dict_keys(['menu', 'sub_total', 'total'])
meta keys:  dict_keys(['version', 'split', 'image_id', 'image_size'])
valid_line keys:  dict_keys(['words', 'category', 'group_id', 'sub_group_id'])
roi keys:  dict_keys([])


In [55]:
print(json.loads(train[0]["ground_truth"])["gt_parse"]["menu"][0])

{'nm': 'Nasi Campur Bali', 'cnt': '1 x', 'price': '75,000'}


In [56]:
from transformers import AutoProcessor, AutoModelForTokenClassification

processor = AutoProcessor.from_pretrained("nielsr/layoutlmv3-finetuned-cord", apply_ocr=False)
model = AutoModelForTokenClassification.from_pretrained("nielsr/layoutlmv3-finetuned-cord")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'LayoutLMv3TokenizerFast'.


In [57]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [58]:
import torch

# Move encoding to the same device as the model (e.g., 'cuda' if available)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

In [59]:
words = []
bboxes = []
word_labels = [] # Will be populated if label_map is provided

label_map = {
        "menu.nm": 0, "menu.cnt": 1, "menu.price": 2,
        "sub_total.price": 3, "total.total_price": 4,
        "discount.price": 5, "void_menu.nm": 6, "item.cnt": 7,
        "item.price": 8, "item.nm": 9, "sub_total.discount_price": 10,
        "cashprice": 11, "changeprice": 12, "emoneyprice": 13,
        "menu.unitprice": 14, "service_charge.price": 15, "vat.price": 16,
        "O": 17
    }

id_to_label = {v: k for k, v in label_map.items()}
num_labels = len(label_map)

def normalize_bbox(bbox, width, height):
    return [
        int(1000 * (bbox[0] / width)),
        int(1000 * (bbox[1] / height)),
        int(1000 * (bbox[2] / width)),
        int(1000 * (bbox[3] / height)),
    ]

width, height = train[0]["image"].size

# CORD v2 annotations are in 'valid_line'
for line in json.loads(train[0]["ground_truth"])["valid_line"]:
    for word_info in line['words']:
        text = word_info['text']
        # Convert polygon (quad) to bounding box (min_x, min_y, max_x, max_y)
        # CORD quad format: [x1, y1, x2, y2, x3, y3, x4, y4]
        quad_dict = word_info['quad']
        # Extract x and y coordinates from the dictionary
        x_coords = [quad_dict['x1'], quad_dict['x2'], quad_dict['x3'], quad_dict['x4']]
        y_coords = [quad_dict['y1'], quad_dict['y2'], quad_dict['y3'], quad_dict['y4']]

        # Get min/max for rectangular bbox
        x_min = min(x_coords)
        y_min = min(y_coords)
        x_max = max(x_coords)
        y_max = max(y_coords)

        # Ensure valid bounding box (e.g., x_min <= x_max, y_min <= y_max)
        # This can happen if OCR is poor or annotations are malformed
        if x_min > x_max: x_min, x_max = x_max, x_min
        if y_min > y_max: y_min, y_max = y_max, y_min

        # Normalize bounding box
        normalized_bbox = normalize_bbox([x_min, y_min, x_max, y_max], width, height)

        # Append word and bbox
        words.append(text)
        bboxes.append(normalized_bbox)

        # If doing token classification, map the category to an ID
        if label_map:
            category = line['category'] # Category is at the line level in CORD
            if category in label_map:
                word_labels.append(label_map[category])
            else:
                word_labels.append(label_map["O"]) # "O" for "Other" or unknown

In [68]:
words[0], bboxes[0], id_to_label[word_labels[0]]

('1', [268, 287, 282, 300], 'menu.cnt')

In [61]:
encoding = processor(    
    images=train[0]["image"],
    text=words,
    boxes=bboxes,
    word_labels=word_labels if label_map else None,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt"
)

In [62]:

model_inputs = {k: v for k, v in encoding.items()}

In [63]:
# 4. Pass the encoding to the model
with torch.no_grad(): # Disable gradient calculation for inference
    outputs = model(**model_inputs)

In [64]:
# 5. Process the model output
# For token classification, outputs will typically have 'logits'
# logits.shape will be (batch_size, sequence_length, num_labels)
logits = outputs.logits
print(logits)

tensor([[[ 0.0434,  0.7787, -0.1083,  ..., -0.2363, -0.4289, -0.0496],
         [-0.2689,  1.0043,  0.0466,  ..., -0.1882, -0.2804, -0.0453],
         [-0.2277,  1.1927,  0.1188,  ..., -0.1765, -0.2872, -0.1512],
         ...,
         [ 0.0361,  0.8066, -0.1061,  ..., -0.2353, -0.4506, -0.0483],
         [ 0.0361,  0.8066, -0.1061,  ..., -0.2353, -0.4506, -0.0483],
         [ 0.0361,  0.8066, -0.1061,  ..., -0.2353, -0.4506, -0.0483]]])


In [65]:
# 5. Process the model output
# For token classification, outputs will typically have 'logits'
# logits.shape will be (batch_size, sequence_length, num_labels)
logits = outputs.logits
print(f"\nModel output logits shape: {logits.shape}")

# Get predicted labels
predictions = logits.argmax(dim=-1).squeeze().tolist()

# Decode the predictions back to human-readable labels
# Remember that the tokenizer adds special tokens (like [CLS], [SEP])
# and can split words into subwords. The labels are aligned with these tokens.
# Tokens with label_ids == -100 are special tokens or padding that should be ignored.
tokenized_input_ids = encoding['input_ids'].squeeze().tolist()
tokens = processor.tokenizer.convert_ids_to_tokens(tokenized_input_ids)

print("\n--- Model Predictions ---")
for token, pred_id, original_label_id in zip(tokens, predictions, encoding['labels'].squeeze().tolist()):
    if pred_id != -100: # -100 is typically used for special tokens or ignored padding
        predicted_label = id_to_label.get(pred_id, "UNK")
        original_label = id_to_label.get(original_label_id, "UNK_ORIG") if original_label_id != -100 else "N/A"
        print(f"Token: {token:<15} | Predicted: {predicted_label:<15} | Original: {original_label}")
    else:
        print(f"Token: {token:<15} | Predicted: (Ignored)    | Original: (Ignored)")


Model output logits shape: torch.Size([1, 512, 61])

--- Model Predictions ---
Token: <s>             | Predicted: void_menu.nm    | Original: N/A
Token: Ġ1              | Predicted: total.total_price | Original: menu.cnt
Token: Ġx              | Predicted: total.total_price | Original: menu.cnt
Token: ĠN              | Predicted: menu.cnt        | Original: menu.nm
Token: asi             | Predicted: UNK             | Original: N/A
Token: ĠCamp           | Predicted: menu.cnt        | Original: menu.nm
Token: ur              | Predicted: UNK             | Original: N/A
Token: ĠB              | Predicted: menu.cnt        | Original: menu.nm
Token: ali             | Predicted: UNK             | Original: N/A
Token: Ġ75             | Predicted: void_menu.nm    | Original: menu.price
Token: ,               | Predicted: void_menu.nm    | Original: N/A
Token: 000             | Predicted: void_menu.nm    | Original: N/A
Token: Ġ1              | Predicted: total.total_price | Original: menu.