In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from datasets import load_dataset

ds = load_dataset("katanaml-org/invoices-donut-data-v1")

In [3]:
# Target output schema for invoice extraction.
# The structure (header / items / summary) is the JSON shape the model is asked
# to return; each value is a natural-language description of the field, not data.
# The dict is serialised with json.dumps and embedded in the `instruction` prompt.
schema_dict = {
    "header": {
        "invoice_no": "Invoice number of the document",
        "invoice_date": "Date when the invoice was issued",
        "seller": "Full address and name of the seller",
        "client": "Full address and name of the client",
        "seller_tax_id": "Tax identification number of the seller",
        "client_tax_id": "Tax identification number of the client",
        "iban": "Bank IBAN number of the seller"
    },
    # A single template entry describing the fields of one line item.
    "items": [
        {
            "item_desc": "Description of the service or product sold",
            "item_qty": "Quantity of the item (usually in units or pieces)",
            "item_net_price": "Unit price excluding VAT",
            "item_net_worth": "Total price excluding VAT",
            "item_vat": "VAT rate applied to this item",
            "item_gross_worth": "Total price including VAT"
        }
    ],
    "summary": {
        "total_net_worth": "Total net amount before VAT",
        "total_vat": "Total VAT amount",
        "total_gross_worth": "Final total amount including VAT"
    }
}

In [4]:
import json
instruction = f"""You are a specialized in invoice and your role is to extract information from any invoice that is provided to you in the following valid json format. if the corresponding value is not present, leave the key with empty string.

{json.dumps(schema_dict)}

Fill the keys only when the information is available.
"""

In [5]:
instruction

'You are a specialized in invoice and your role is to extract information from any invoice that is provided to you in the following valid json format. if the corresponding value is not present, leave the key with empty string.\n\n{"header": {"invoice_no": "Invoice number of the document", "invoice_date": "Date when the invoice was issued", "seller": "Full address and name of the seller", "client": "Full address and name of the client", "seller_tax_id": "Tax identification number of the seller", "client_tax_id": "Tax identification number of the client", "iban": "Bank IBAN number of the seller"}, "items": [{"item_desc": "Description of the service or product sold", "item_qty": "Quantity of the item (usually in units or pieces)", "item_net_price": "Unit price excluding VAT", "item_net_worth": "Total price excluding VAT", "item_vat": "VAT rate applied to this item", "item_gross_worth": "Total price including VAT"}], "summary": {"total_net_worth": "Total net amount before VAT", "total_vat"

In [6]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token=secret_value_0)

In [7]:
# ! pip install vllm

In [8]:
import PIL

def create_vllm_messages(image_path,prompt):
    """Build a single-image multimodal request dict with a chat-formatted prompt.

    Args:
        image_path: Path of the image file to open with Pillow.
        prompt: User text inserted after the image placeholder in the user turn.

    Returns:
        dict with two keys:
        - "prompt": the chat-formatted prompt string (system turn, user turn
          containing the image placeholder and `prompt`, open assistant turn).
        - "multi_modal_data": {"image": <image object returned by Image.open>}.
    """
    # `import PIL` on its own does not load the `PIL.Image` submodule, so
    # `PIL.Image.open` only works when some other library imported it first.
    # Importing the submodule explicitly removes that hidden dependency.
    from PIL import Image

    placeholder = "<|image_pad|>"
    vllm_prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
          f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
          f"{prompt}<|im_end|>\n"
          "<|im_start|>assistant\n")

    image = Image.open(image_path)
    inputs = {
        "prompt": vllm_prompt,
        "multi_modal_data": {
            "image": image
        }
    }
    return inputs

In [9]:
image = ds["test"][2]['image']

In [10]:
image.save("test_image.png")

In [11]:
# Use the same relative path the previous cell saved to, instead of assuming
# the working directory is /kaggle/working.
create_vllm_messages("test_image.png",instruction)

{'prompt': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>You are a specialized in invoice and your role is to extract information from any invoice that is provided to you in the following valid json format. if the corresponding value is not present, leave the key with empty string.\n\n{"header": {"invoice_no": "Invoice number of the document", "invoice_date": "Date when the invoice was issued", "seller": "Full address and name of the seller", "client": "Full address and name of the client", "seller_tax_id": "Tax identification number of the seller", "client_tax_id": "Tax identification number of the client", "iban": "Bank IBAN number of the seller"}, "items": [{"item_desc": "Description of the service or product sold", "item_qty": "Quantity of the item (usually in units or pieces)", "item_net_price": "Unit price excluding VAT", "item_net_worth": "Total price excluding VAT", "item_vat": "VAT rate applied to this

In [12]:
import os

# exist_ok=True makes the cell idempotent without a separate (racy) exists() check.
os.makedirs("testing", exist_ok=True)

In [13]:
# Run this in a cell before initializing your model
import torch
torch.cuda.empty_cache()

In [14]:
! pip install git+https://github.com/huggingface/transformers accelerate

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-5eca45jc
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-5eca45jc
  Resolved https://github.com/huggingface/transformers to commit c338fd43b0be2c7f5d73e693fa6fb1b5e7a0bdc2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [15]:
! pip install qwen-vl-utils[decord]==0.0.8



In [16]:
!nvidia-smi

Tue Jul 22 14:35:41 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [17]:
# from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# from transformers import BitsAndBytesConfig

model_id = "Adarsh203/qwen-2.5-vl-3b-invoices"


# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     model_id,
#     device_map="auto")
# processor = AutoProcessor.from_pretrained(model_id)


In [18]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
# import transformers

# default: Load the model on the available device(s)
# `model` is a notebook-level name that `infer` reads later on.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)

# default processor; `infer` uses it for the chat template, input tensors and decoding
processor = AutoProcessor.from_pretrained(model_id)


2025-07-22 14:35:46.434657: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753194946.457045     174 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753194946.463861     174 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [19]:
import gradio as gr
from PIL import Image
import os
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import json

# Default prompt
instruction = f"""You are a specialized in invoice and your role is to extract information from any invoice that is provided to you in the following valid json format. if the corresponding value is not present, leave the key with empty string.

{json.dumps(schema_dict)}

Fill the keys only when the information is available.
"""

# Function to create messages
def create_messages(image_path, prompt, resized_height=640, resized_width=640):
    """Build a one-turn chat message list holding one image and one text prompt.

    Args:
        image_path: Value stored under the image entry's "image" key
            (the callers in this notebook pass a file path).
        prompt: Text stored in the text entry that follows the image.
        resized_height: Value for the image entry's "resized_height" key.
            Defaults to 640, the value that used to be hard-coded.
        resized_width: Value for the image entry's "resized_width" key.
            Defaults to 640, the value that used to be hard-coded.

    Returns:
        A list with a single {"role": "user", "content": [...]} dict whose
        content is the image entry followed by the text entry.
    """
    # NOTE(review): the resize keys are presumably consumed by
    # qwen_vl_utils.process_vision_info -- confirm against that library.
    image_entry = {
        "type": "image",
        "image": image_path,
        "resized_height": resized_height,
        "resized_width": resized_width,
    }
    text_entry = {"type": "text", "text": prompt}
    return [{"role": "user", "content": [image_entry, text_entry]}]

# Inference logic
def infer(messages):
    """Run one generation pass for `messages` and return the decoded completion.

    Uses the notebook-level `processor` and `model`. The chat template is
    rendered to text, vision inputs are extracted with `process_vision_info`,
    and only the tokens produced after the prompt are decoded.

    Args:
        messages: Chat messages in the structure built by `create_messages`.

    Returns:
        The decoded text of the first (only) sequence in the batch.
    """
    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    images, videos = process_vision_info(messages)

    batch = processor(
        text=[chat_text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    sequences = model.generate(**batch, max_new_tokens=2048)

    # Drop the prompt tokens so that only the newly generated part is decoded.
    completions = []
    for prompt_ids, sequence in zip(batch.input_ids, sequences):
        completions.append(sequence[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]

# Gradio handler
def invoice_parser(image: Image.Image):
    """Gradio handler: run invoice extraction on an uploaded image.

    Args:
        image: The uploaded image, or None when Submit is clicked with no
            image selected.

    Returns:
        The model's text output, or a short hint asking for an upload when
        `image` is None.
    """
    import tempfile

    # Clicking Submit without an upload passes None; previously this crashed
    # with AttributeError on `image.save`.
    if image is None:
        return "Please upload an invoice image before submitting."

    os.makedirs("inference", exist_ok=True)
    # Each request gets its own file: the app is launched with share=True, so
    # concurrent requests writing one fixed path could overwrite each other.
    fd, image_path = tempfile.mkstemp(suffix=".png", dir="inference")
    os.close(fd)
    try:
        image.save(image_path)
        messages = create_messages(image_path=image_path, prompt=instruction)
        return infer(messages)
    finally:
        # The file is only needed for the duration of this request.
        os.remove(image_path)

def reset_fields():
    """Return one cleared (None) value for each of the two outputs wired to Reset."""
    cleared = (None, None)
    return cleared

# Gradio UI
with gr.Blocks() as demo:
    # One row with two equally scaled columns: upload controls, then the result.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Upload Invoice")
            # type="pil" pairs with invoice_parser, which calls .save() on its argument.
            image_input = gr.Image(type="pil", label="Invoice Image")
            submit_btn = gr.Button("Submit")
            reset_btn = gr.Button("Reset")

        with gr.Column(scale=1):
            gr.Markdown("### Parsed Invoice Details")
            output_text = gr.Textbox(label="Extracted Data", lines=10)

    # Submit: run the parser on the uploaded image and show its text output.
    submit_btn.click(fn=invoice_parser, inputs=image_input, outputs=output_text)
    # Reset: reset_fields returns one None per listed output, clearing both.
    reset_btn.click(fn=reset_fields, inputs=[], outputs=[image_input, output_text])

# share=True also requests a public URL in addition to the local one
# (both appear in this cell's output).
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://febb3627aa01439150.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [20]:
import os

# exist_ok=True makes the cell idempotent without a separate (racy) exists() check.
os.makedirs("tests", exist_ok=True)

In [21]:
from tqdm import tqdm

# Create the output folder here so this cell does not depend on another cell
# having been run first.
os.makedirs("tests", exist_ok=True)

ground_truth_list = []
predictions = []

# enumerate replaces the hand-maintained counter used to name the image files.
for i, row in enumerate(tqdm(ds["test"])):
    # The inference helpers take a file path, so each test image is written to disk.
    image = row['image']
    image_path = os.path.join("tests", f"test_{i}.png")
    image.save(image_path)

    # The reference annotation is a JSON string; the target lives under "gt_parse".
    ground_truth = json.loads(row['ground_truth'])["gt_parse"]
    ground_truth_list.append(ground_truth)

    message = create_messages(image_path=image_path, prompt=instruction)
    predict = infer(message)
    predictions.append(predict)
    

100%|██████████| 26/26 [17:28<00:00, 40.31s/it]


In [22]:
predictions[0]

'{"header": {"invoice_no": "97159829", "invoice_date": "09/18/2015", "seller": "Bradley-Andrade 9879 Elizabeth Common Lake Jonathan, RI 12335", "client": "Castro PLC Unit 9678 Box 9664 DPO AP 69387", "seller_tax_id": "985-73-8194", "client_tax_id": "994-72-1270", "iban": "GB81LLZWO32519172531418"}, "items": [{"item_desc": "12\\" Marble Lapis Inlay Chess Table Top With 2\\" Pieces & 15\\" Wooden Stand W037", "item_qty": "2,00", "item_net_price": "444,60", "item_net_worth": "889,20", "item_vat": "10%", "item_gross_worth": "978,12"}], "summary": {"total_net_worth": "$ 889,20", "total_vat": "$ 88,92", "total_gross_worth": "$ 978,12"}}'

In [23]:
ground_truth_list[0]

{'header': {'invoice_no': '97159829',
  'invoice_date': '09/18/2015',
  'seller': 'Bradley-Andrade 9879 Elizabeth Common Lake Jonathan, RI 12335',
  'client': 'Castro PLC Unit 9678 Box 9664 DPO AP 69387',
  'seller_tax_id': '985-73-8194',
  'client_tax_id': '994-72-1270',
  'iban': 'GB81LZWO32519172531418'},
 'items': [{'item_desc': '12" Marble Lapis Inlay Chess Table Top With 2" Pieces & 15" Wooden Stand W537',
   'item_qty': '2,00',
   'item_net_price': '444,60',
   'item_net_worth': '889,20',
   'item_vat': '10%',
   'item_gross_worth': '978,12'}],
 'summary': {'total_net_worth': '$ 889,20',
  'total_vat': '$ 88,92',
  'total_gross_worth': '$ 978,12'}}

In [24]:
import json

def flatten_dict(d, parent_key='', sep=' | '):
    """Render a nested dict/list structure as one `sep`-joined string.

    Dict keys are appended to the path with `sep`, list positions as `[i]`,
    and every leaf becomes "<path>: <value>".
    """
    if isinstance(d, dict):
        pieces = [
            flatten_dict(
                value,
                f"{key}" if parent_key == '' else f"{parent_key}{sep}{key}",
                sep=sep,
            )
            for key, value in d.items()
        ]
    elif isinstance(d, list):
        pieces = [
            flatten_dict(element, f"{parent_key}[{position}]", sep=sep)
            for position, element in enumerate(d)
        ]
    else:
        pieces = [f"{parent_key}: {d}"]
    return sep.join(pieces)

# Prepare normalized text lists
# Prepare normalized text lists
norm_preds = []
norm_refs = []

for pred_str, gt_dict in zip(predictions, ground_truth_list):
    try:
        pred_dict = json.loads(pred_str)
    except (ValueError, TypeError):
        # Only malformed JSON (ValueError, which covers JSONDecodeError) or a
        # non-string prediction (TypeError) is treated as "no prediction";
        # any other error is a real bug and should surface.
        print("JSON parse error in prediction:", pred_str)
        pred_dict = {}

    pred_text = flatten_dict(pred_dict)
    gt_text = flatten_dict(gt_dict)

    # Lower-cased so the text comparison is case-insensitive.
    norm_preds.append(pred_text.lower())
    norm_refs.append(gt_text.lower())


In [25]:
norm_preds

['header | invoice_no: 97159829 | header | invoice_date: 09/18/2015 | header | seller: bradley-andrade 9879 elizabeth common lake jonathan, ri 12335 | header | client: castro plc unit 9678 box 9664 dpo ap 69387 | header | seller_tax_id: 985-73-8194 | header | client_tax_id: 994-72-1270 | header | iban: gb81llzwo32519172531418 | items[0] | item_desc: 12" marble lapis inlay chess table top with 2" pieces & 15" wooden stand w037 | items[0] | item_qty: 2,00 | items[0] | item_net_price: 444,60 | items[0] | item_net_worth: 889,20 | items[0] | item_vat: 10% | items[0] | item_gross_worth: 978,12 | summary | total_net_worth: $ 889,20 | summary | total_vat: $ 88,92 | summary | total_gross_worth: $ 978,12',
 "header | invoice_no: 13194726 | header | invoice_date: 05/29/2021 | header | seller: hopkins and sons 62283 flores tunnel north luis, ia 69983 | header | client: sims plc uss kramer fpo aa 81651 | header | seller_tax_id: 952-73-7223 | header | client_tax_id: 995-88-9495 | header | iban: gb31

In [26]:
!pip install evaluate rouge-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f622bb2b965ab9d40611f14029e9a1e396b90e5f23928a14f1271965bc87df96
  Stored

#### Model Evaluation

In [27]:
import evaluate

rouge = evaluate.load("rouge")
results = rouge.compute(predictions=norm_preds, references=norm_refs)
print(results)

Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': 0.9717973759033505, 'rouge2': 0.9517742369542243, 'rougeL': 0.9705970469093224, 'rougeLsum': 0.9705499501599857}


In [28]:
from collections import Counter

def flatten_dict(d,parent_keys=""):
    """Flatten a nested dict into a single-level dict keyed by dotted paths.

    Nested dicts extend the path with ".key". A list made up only of dicts is
    expanded per element as "key[i]"; any other list is stored unchanged.

    NOTE: this reuses the name of the string-returning `flatten_dict` defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        d: The dict to flatten.
        parent_keys: Path prefix for all produced keys ("" for the top level).

    Returns:
        dict mapping flattened path -> leaf value.
    """
    items = {}
    for k,v in d.items():
        # The body used to read an undefined name `parent_key`, which raised
        # NameError for every non-empty dict; it now reads the real parameter.
        new_key = f"{parent_keys}.{k}" if parent_keys else k

        if isinstance(v,dict):
            items.update(flatten_dict(v,new_key))
        elif isinstance(v,list):
            if all(isinstance(i,dict) for i in v):
                for idx,item in enumerate(v):
                    items.update(flatten_dict(item,f"{new_key}[{idx}]"))
            else:
                items[new_key] = v
        else:
            items[new_key] = v
    return items

In [31]:
# Explicit UTF-8 so the file content does not depend on the platform's default
# encoding (the predictions are free-form model text).
with open("predictions.txt", 'w', encoding="utf-8") as file:
    for item in predictions:
        file.write(str(item) + '\n')


In [32]:
import json, ast, re
from decimal import Decimal, InvalidOperation
from collections import defaultdict


def parse_pred(pred_str):
    """Parse a prediction leniently.

    A dict is returned as-is. Otherwise `json.loads` is tried first, then
    `ast.literal_eval`; if both fail an empty dict is returned.
    """
    if isinstance(pred_str, dict):
        return pred_str

    for parser in (json.loads, ast.literal_eval):
        try:
            return parser(pred_str)
        except Exception:
            continue
    return {}


_ws_re = re.compile(r"\s+")

def norm_text(x):
    """Normalise a value for text comparison.

    None becomes "". Anything else is converted with str(), backslash-escaped
    double quotes are unescaped, whitespace runs collapse to one space, and the
    result is stripped and lower-cased.
    """
    if x is None:
        return ""
    unescaped = str(x).replace('\\"', '"')  # unescape quotes
    collapsed = _ws_re.sub(" ", unescaped)
    return collapsed.strip().lower()

_currency_re = re.compile(r"[^\d,.\-]")

def norm_number(x):
    """Return a canonical plain-decimal string, or normalised text if unparseable.

    Steps:
      * None -> "".
      * Everything except digits, ',', '.' and '-' is removed (currency
        symbols, '%', spaces used as thousands separators, ...).
      * If both ',' and '.' occur, the right-most one is taken as the decimal
        mark and the other is dropped as a grouping separator
        ("1,234.56" and "1.234,56" both become "1234.56").
      * A single ',' with no '.' is read as a decimal comma ("444,60").
      * Several '.' with no ',' keep only the last one as the decimal point.
      * More than four decimals are rounded to four; trailing zeros are
        removed; zero is always "0"; output never uses exponent notation.

    Falls back to `norm_text(x)` when the cleaned string is not a number.
    """
    if x is None:
        return ""
    s = _currency_re.sub("", str(x).strip())

    last_comma = s.rfind(",")
    last_dot = s.rfind(".")
    if last_comma != -1 and last_dot != -1:
        # Mixed separators: the right-most one is the decimal mark.
        if last_comma > last_dot:
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif s.count(",") == 1:
        s = s.replace(",", ".")

    if s.count(".") > 1:
        head, _, tail = s.rpartition(".")
        s = head.replace(".", "") + "." + tail

    try:
        val = Decimal(s)
        if val.as_tuple().exponent < -4:
            val = val.quantize(Decimal("0.0001"))
        if val.is_zero():
            # Avoids "-0" and "0" comparing as different values.
            return "0"
        # normalize() alone renders 10 as "1E+1"; the "f" format keeps plain digits.
        return format(val.normalize(), "f")
    except (InvalidOperation, ValueError):
        return norm_text(x)

# Substrings that mark a flattened field name as holding a numeric value.
NUMERIC_FIELD_FRAGMENTS = {
    "qty", "price", "worth", "vat", "total", "amount", "gross", "net"
}

def is_numeric_key(flat_key):
    """Return True when the lower-cased key contains any numeric-field fragment."""
    lowered = flat_key.lower()
    for fragment in NUMERIC_FIELD_FRAGMENTS:
        if fragment in lowered:
            return True
    return False

def normalize_value(flat_key, value):
    """Normalise `value` as a number when `flat_key` looks numeric, otherwise as text."""
    if is_numeric_key(flat_key):
        return norm_number(value)
    return norm_text(value)


def flatten(obj, prefix=""):
    """
    Map every leaf of a nested dict/list structure to a flat key.

    Dict keys are joined with "." and list positions are written as "[i]".
    A non-container `obj` yields a single entry stored under `prefix`.
    """
    def _walk(node, path):
        # Depth-first traversal yielding (flat_key, leaf_value) in visit order.
        if isinstance(node, dict):
            for key, child in node.items():
                yield from _walk(child, f"{path}.{key}" if path else key)
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from _walk(child, f"{path}[{position}]")
        else:
            yield path, node

    return dict(_walk(obj, prefix))


def evaluate_structured_predictions(predictions, ground_truth_list, return_per_example=True):
    """
    Score predictions against references field by field.

    predictions: list[str|dict]
    ground_truth_list: list[dict]

    Each prediction is parsed with `parse_pred`; prediction and reference are
    flattened with `flatten`; every reference field is compared to the
    predicted value under the same flat key after `normalize_value`. Fields
    present only in the prediction are not scored.

    Returns `report` (counts, micro/macro accuracy, per-field accuracy sorted
    by key), plus the per-example records when `return_per_example` is true.
    """
    assert len(predictions) == len(ground_truth_list), "Length mismatch."

    seen_per_field = {}
    hits_per_field = {}
    example_records = []
    unparsed = 0

    for position, (raw_prediction, reference) in enumerate(zip(predictions, ground_truth_list)):
        parsed = parse_pred(raw_prediction)
        # An empty/falsy parse result counts as a parse failure.
        if not parsed:
            unparsed += 1

        reference_fields = flatten(reference)
        predicted_fields = flatten(parsed)

        mismatches = []
        matched = 0
        for field, expected in reference_fields.items():
            seen_per_field[field] = seen_per_field.get(field, 0) + 1

            predicted = predicted_fields.get(field, None)
            expected_norm = normalize_value(field, expected)
            predicted_norm = normalize_value(field, predicted)

            if predicted_norm == expected_norm:
                hits_per_field[field] = hits_per_field.get(field, 0) + 1
                matched += 1
                continue

            mismatches.append({
                "field": field,
                "gt": expected,
                "pred": predicted,
                "norm_gt": expected_norm,
                "norm_pred": predicted_norm
            })

        example_records.append({
            "index": position,
            "num_fields": len(reference_fields),
            "num_correct": matched,
            "errors": mismatches,
        })

    # Per-field accuracy
    field_accuracy = {
        field: hits_per_field.get(field, 0) / total
        for field, total in seen_per_field.items()
    }

    # Macro accuracy: mean over fields
    if field_accuracy:
        macro_acc = sum(field_accuracy.values()) / len(field_accuracy)
    else:
        macro_acc = 0.0

    # Micro accuracy: all correct fields / all fields
    total_correct = sum(hits_per_field.values())
    total_fields = sum(seen_per_field.values())
    if total_fields:
        micro_acc = total_correct / total_fields
    else:
        micro_acc = 0.0

    report = {
        "n_examples": len(predictions),
        "parse_failures": unparsed,
        "micro_accuracy": micro_acc,
        "macro_accuracy": macro_acc,
        "field_accuracy": dict(sorted(field_accuracy.items())),
    }

    if return_per_example:
        return report, example_records
    return report


def print_field_report(report, top_n=None):
    """Print the summary numbers of `report`, then one line per field accuracy.

    When `top_n` is given, only the first `top_n` fields (in the report's own
    order) are listed.
    """
    n_examples = report['n_examples']
    print(f"Examples: {n_examples}")
    failures = report['parse_failures']
    print(f"Prediction parse failures: {failures}")
    micro = report['micro_accuracy']
    print(f"Micro accuracy: {micro:.4f}")
    macro = report['macro_accuracy']
    print(f"Macro accuracy: {macro:.4f}")
    print("\nPer-field accuracy:")

    # Slicing with None keeps every entry, so one expression covers both cases.
    shown = list(report["field_accuracy"].items())[:top_n]
    for field_name, accuracy in shown:
        print(f"  {field_name:<40} {accuracy:.4f}")


In [33]:
report, per_ex = evaluate_structured_predictions(predictions, ground_truth_list)
print_field_report(report)

# Inspect a specific example with errors
i = 0
print("\nErrors for example", i, ":")
for err in per_ex[i]["errors"]:
    print(f"- {err['field']}: GT={err['gt']} | PRED={err['pred']}")


Examples: 26
Prediction parse failures: 0
Micro accuracy: 0.8939
Macro accuracy: 0.8606

Per-field accuracy:
  header.client                            0.7308
  header.client_tax_id                     1.0000
  header.iban                              0.5000
  header.invoice_date                      1.0000
  header.invoice_no                        1.0000
  header.seller                            0.5769
  header.seller_tax_id                     1.0000
  items[0].iban                            0.0000
  items[0].item_desc                       0.3462
  items[0].item_gross_worth                1.0000
  items[0].item_net_price                  1.0000
  items[0].item_net_worth                  1.0000
  items[0].item_qty                        1.0000
  items[0].item_vat                        1.0000
  items[1].iban                            0.0000
  items[1].item_desc                       0.4762
  items[1].item_gross_worth                1.0000
  items[1].item_net_price                