In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from peft import PeftModel
from qwen_vl_utils import process_vision_info

In [None]:
# Identifier of the base Qwen2-VL checkpoint; used for both the model weights
# and the processor.
BASE_MODEL = "Qwen/Qwen2-VL-2B-Instruct"
# Identifier of the Qari-OCR adapter that is applied on top of the base model.
ADAPTER_MODEL = "NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct"

In [None]:
# The processor (tokenizer + image preprocessing) comes from the base checkpoint;
# it does not depend on the model weights, so it is loaded first.
processor = AutoProcessor.from_pretrained(BASE_MODEL)

# Load the base vision-language model. Both dtype and device placement are left
# to the library ("auto") rather than being pinned here.
base_model = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE_MODEL, torch_dtype="auto", device_map="auto"
)

# Wrap the base model with the OCR adapter; `model` is what the inference cell uses.
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)

In [None]:
# Run OCR on a single page image and print the transcription.

IMAGE_PATH = "path/image.jpg"  # placeholder: point this at the page image to transcribe

# Instruction sent to the model alongside the image (kept verbatim).
prompt = "Below is the image of one page of a document, as well as some raw textual content that was previously extracted for it. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate."
max_tokens = 2000  # upper bound on the number of newly generated tokens

# Single-turn conversation: one user message holding the image, then the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": f"file://{IMAGE_PATH}"},
            {"type": "text", "text": prompt},
        ],
    }
]

# Render the conversation as a prompt string (tokenize=False) that ends with the
# generation prompt, then collect the image/video inputs referenced by the messages.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)

# Move the batch to wherever the model actually lives instead of hard-coding "cuda":
# the model was loaded with device_map="auto", so a fixed "cuda" target fails on
# machines without a CUDA device and may not match the model's placement.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)

# Drop the leading prompt tokens from each output sequence so that only the
# newly generated continuation is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

# Use a single formatted string: passing two arguments to print() would insert a
# separator space after the newline and indent the first line of the transcription.
print(f"OCR Output:\n{output_text}")