# Testbed

Ones that work:
- Qwen
- Ovis
- EasyOCR
- TrOCR
- PyTesseract

In [1]:
from qwen_vl_utils import process_vision_info
from transformers import Qwen3VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, AutoModel, AutoImageProcessor, AutoModelForCausalLM, TrOCRProcessor, VisionEncoderDecoderModel, default_data_collator, BitsAndBytesConfig
from PIL import Image
import torch
import pytesseract as pt
import easyocr
import os
import numpy as np
import time

device = "cuda" if torch.cuda.is_available() else "cpu"

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

### Ovis 2.5

In [12]:
MODEL_PATH = "AIDC-AI/Ovis2.5-9B"

# Total tokens for thinking + answer. Ensure: max_new_tokens > thinking_budget + 25
max_new_tokens = 64
ovis_quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization_config=ovis_quantization_config,
    device_map="auto"
    
).eval().cuda()



`torch_dtype` is deprecated! Use `dtype` instead!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
img = Image.open("C:\\PhD\\DissolutionProgramming\\NLP---New-Land-Paper\\Data\\Processed\\subsidy1524\\value_lines\\value_12.png")
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": img},
        {"type": "text", "text": "OCR this, outputting ONLY the text. Be very careful not to accidentally misidentify a fraction as a numeral. Fractions will be one small number on top of another with no line."},
    ],
}]

input_ids, pixel_values, grid_thws = model.preprocess_inputs(
    messages=messages,
    add_generation_prompt=True,
    enable_thinking=False
)
input_ids = input_ids.cuda()
pixel_values = pixel_values.cuda() if pixel_values is not None else None
grid_thws = grid_thws.cuda() if grid_thws is not None else None

outputs = model.generate(
    inputs=input_ids,
    pixel_values=pixel_values,
    grid_thws=grid_thws,
    enable_thinking=False,
    max_new_tokens=max_new_tokens,
)

response = model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Richard Langworthie G 3 1/3


### Model Loading

In [None]:
# Qwen
qwen_quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct", # Use the standard model, BNB quantizes it on the fly
    quantization_config=qwen_quantization_config,
    device_map="auto",
    attn_implementation="sdpa"
).eval().cuda()

qwen_processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True)
print("Qwen model loaded.")

# Pytesseract
pt.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
pt_config = r'--oem 3 --psm 7 -c tessedit_char_whitelist= 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/ '

# EasyOCR
eo_model = easyocr.Reader(['en'])

# TrOCR
trocr_model_id = "microsoft/trocr-large-printed"

trocr_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_skip_modules=["pooler"] 
)

trocr_model = VisionEncoderDecoderModel.from_pretrained(
    trocr_model_id,
    quantization_config=trocr_quant_config,
    device_map="auto",
    low_cpu_mem_usage=True
)

trocr_processor = TrOCRProcessor.from_pretrained(trocr_model_id)
trocr_tokenizer = AutoTokenizer.from_pretrained(trocr_model_id)
trocr_model.to("cuda")
print('All models loaded successfully.')

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Qwen model loaded.


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


All models loaded successfully.


In [3]:
pt_config = r'--oem 3 --psm 7 -c tessedit_char_whitelist= 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/ '

### Qwen

In [None]:
for img_path in os.listdir("C:\\PhD\\DissolutionProgramming\\NLP---New-Land-Paper\\Data\\Processed\\subsidy1524\\value_lines"):
    start_time = time.time()
    img = Image.open(os.path.join("C:\\PhD\\DissolutionProgramming\\NLP---New-Land-Paper\\Data\\Processed\\subsidy1524\\value_lines", img_path))
    # Qwen
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": img,
                },
                {"type": "text", "text": "OCR this, outputting ONLY the text. Be very careful not to accidentally misidentify a fraction as a numeral. Fractions will be one small number on top of another with no line."},
            ],
        }
    ]

    text = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = qwen_processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        return_tensors="pt",
    )

    inputs = inputs.to("cuda")

    generated_ids = qwen_model.generate(**inputs, max_new_tokens=64)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    qwen_text = qwen_processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0].strip()
    # EasyOCR
    img_array = np.array(img)
    eo_text = eo_model.readtext(img_array)
    eo_text = ' '.join([x[1] for x in eo_text]).strip()
    #TrOCR
    pixel_values = trocr_processor(img, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    generated_ids = trocr_model.generate(pixel_values)
    generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    tr_text = generated_text.strip()

    # PyTesseract
    pt_text = pt.image_to_string(img, config=pt_config).strip()
    print("Qwen VL OCR Text:"
        , qwen_text)
    print("EasyOCR OCR Text:"
        , eo_text)
    print("TrOCR OCR Text:"
            , tr_text)
    print("Pytesseract OCR Text:"
            , pt_text)
    end_time = time.time()
    print(f"Total Time Taken: {end_time - start_time} seconds")
    print('------------------------------------------------\n')

Qwen VL OCR Text: Amy at Ven wid G 2
EasyOCR OCR Text: at Ven wid G   2 Amy
TrOCR OCR Text: AMY AT VEN WID G 2
Pytesseract OCR Text: Amy at Ven wid G 2
Total Time Taken: 4.59190034866333 seconds
------------------------------------------------

Qwen VL OCR Text: John Couse G 4
EasyOCR OCR Text: John Couse 6
TrOCR OCR Text: JOHH COUSE G 4
Pytesseract OCR Text: John Couse G 4
Total Time Taken: 2.602468490600586 seconds
------------------------------------------------

Qwen VL OCR Text: Alice Full G.
EasyOCR OCR Text: Alice Full G
TrOCR OCR Text: ALICE FULL ORIG.
Pytesseract OCR Text: Alice Full G .
Total Time Taken: 1.9561901092529297 seconds
------------------------------------------------

Qwen VL OCR Text: Wm Man of Boldburgh G 83
EasyOCR OCR Text: Wm Man of Boldburgh G 8 3
TrOCR OCR Text: WM MAN OF BOLDBURGH G 85
Pytesseract OCR Text: Wm Man of Boldburgh G 83
Total Time Taken: 2.89148211479187 seconds
------------------------------------------------

Qwen VL OCR Text: Richard Langwor

### EasyOCR

### TrOCR

### PyTesseract

Qwen VL OCR Text: ['Richard Langworthie G 3½']
EasyOCR OCR Text: 1 Richard Langworthie G 33
TrOCR OCR Text: RICHARD LANGWORTHIE G 3%
Pytesseract OCR Text: Richard Langworthie G 3s
Total Time Taken: 4.944117546081543 seconds
