In [6]:
# Imports
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

In [7]:
# Model identifier handed to `from_pretrained` when the wrapper is built.
VLM_QWEN3_VL_2B = "Qwen/Qwen3-VL-2B-Instruct"

# Image file used by the OCR query cell (path is relative to the working directory).
image_path = (
    "invoices_dataset/unified_dataset/images/"
    "dataset1_katanaml_test_katanaml_0001.png"
)

In [None]:
class VLMQwen3:
    """Wrapper pairing a Qwen3-VL model with its processor for simple queries.

    `process` answers a question about one image through the chat template;
    `process_text` generates from a plain text prompt.
    """

    def __init__(self, model_type):
        """Load the model (switched to eval mode) and its processor.

        Args:
            model_type: Identifier passed to `from_pretrained` for both the
                model and the processor.
        """
        self.model = Qwen3VLForConditionalGeneration.from_pretrained(
            model_type, device_map="auto", dtype='auto'
        ).eval()
        self.processor = AutoProcessor.from_pretrained(model_type)

    def process(self, image_path, image_query, max_new_tokens=400):
        """Ask the model a question about a single image.

        Args:
            image_path: Image location, placed under the "url" key of the
                image entry in the chat message.
            image_query: Question about the image. Any literal "<image>"
                placeholder is removed and surrounding whitespace stripped.
            max_new_tokens: Upper bound on generated tokens (default 400).

        Returns:
            The list produced by `batch_decode` over the generated sequences,
            with the prompt tokens removed from each sequence.
        """
        image_query_chat = image_query.replace("<image>", "").strip()

        system_prompt = 'You are a helpful assistant.'
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": image_path},
                    {"type": "text", "text": image_query_chat},
                ],
            },
        ]

        inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            add_generation_prompt=True,
        )
        inputs = inputs.to(self.model.device)

        generation = self.model.generate(**inputs, max_new_tokens=max_new_tokens)

        # Each generated sequence begins with its prompt; keep only what follows.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generation)
        ]
        return self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True
        )

    def process_text(self, text_query, max_new_tokens=400):
        """Generate a sampled completion for a text-only prompt.

        The prompt is tokenized directly (no chat template is applied) and
        generation uses `do_sample=True`, so repeated calls may differ.

        Args:
            text_query: Prompt text handed to the processor.
            max_new_tokens: Upper bound on generated tokens (default 400).

        Returns:
            The decoded completion of the first sequence, without the prompt.
            The same string is also printed.
        """
        inputs = self.processor(
            text=text_query,
            return_tensors="pt",
            padding=True,
        )

        device = self.model.device
        inputs = {
            k: v.to(device) if hasattr(v, "to") else v
            for k, v in inputs.items()
        }

        generation = self.model.generate(
            **inputs, max_new_tokens=max_new_tokens, do_sample=True
        )

        # The output starts with the (padded) prompt tokens; drop them so the
        # prompt is not echoed back, mirroring the trimming done in `process`.
        prompt_length = len(inputs["input_ids"][0])
        decoded = self.processor.decode(
            generation[0][prompt_length:], skip_special_tokens=True
        )

        print(decoded)
        return decoded

In [9]:
# Build the wrapper once; later cells reuse this instance.
vlm_processor = VLMQwen3(VLM_QWEN3_VL_2B)

In [10]:
# OCR prompt sent together with the image.
image_query_vlm_ocr = (
    "Read the OCR in the image. "
    "Do not repeat any text that has already been identified."
)
response_vlm_ocr = vlm_processor.process(image_path, image_query_vlm_ocr)
print(response_vlm_ocr)

['Invoice no: 97159829\nDate of issue: 09/18/2015\n\nSeller:\nBradley-Andrade\n9879 Elizabeth Common\nLake Jonathan, RI 12335\nTax Id: 985-73-8194\nIBAN: GB81LZWO32519172531418\n\nClient:\nCastro PLC\nUnit 9678 Box 9664\nDPO AP 69387\nTax Id: 994-72-1270\n\nITEMS\nNo. Description Qty UM Net price Net worth VAT [%] Gross worth\n1. 12" Marble Lapis Inlay Chess Table Top With 2" Pieces & 15" Wooden Stand W537 2,00 each 444,60 889,20 10% 978,12\n\nSUMMARY\nVAT [%] Net worth VAT Gross worth\n10% 889,20 88,92 978,12\nTotal $ 889,20 $ 88,92 $ 978,12']
['Invoice no: 97159829\nDate of issue: 09/18/2015\n\nSeller:\nBradley-Andrade\n9879 Elizabeth Common\nLake Jonathan, RI 12335\nTax Id: 985-73-8194\nIBAN: GB81LZWO32519172531418\n\nClient:\nCastro PLC\nUnit 9678 Box 9664\nDPO AP 69387\nTax Id: 994-72-1270\n\nITEMS\nNo. Description Qty UM Net price Net worth VAT [%] Gross worth\n1. 12" Marble Lapis Inlay Chess Table Top With 2" Pieces & 15" Wooden Stand W537 2,00 each 444,60 889,20 10% 978,12\n\nS