In [13]:
import os
import sys
from itertools import islice

import pandas as pd
import torch
from dotenv import load_dotenv
from PIL import Image
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import VisionEncoderDecoderModel, TrOCRProcessor

# Pull variables from a .env file (if any) into os.environ.
load_dotenv()

# Put the directory two levels above the working directory on the import path;
# the `src` package imported below is expected to live there.
sys.path.append(os.path.abspath("../.."))
from src.datasets.kaggledataset import KaggleDataset  # noqa: E402

In [3]:
# Load model and processor.
# Both are loaded from the same checkpoint id, kept in one constant so the
# processor and the weights cannot drift apart when the checkpoint is changed.
MODEL_ID = "anuashok/ocr-captcha-v3"
processor = TrOCRProcessor.from_pretrained(MODEL_ID)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)



preprocessor_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

The image processor of type `ViTImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/480 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

In [7]:
# The dataset location is read from the KAGGLE_ROOT_DIR environment variable;
# a missing variable raises KeyError here.
# NOTE(review): preload=False presumably defers reading images until they are
# indexed -- confirm against KaggleDataset.
dataset = KaggleDataset(
    os.environ["KAGGLE_ROOT_DIR"],
    preload=False,
)

In [17]:
# ----------------------
# Configuration
# ----------------------
batch_size = 32  # Adjust depending on GPU memory

# Use CUDA when torch reports it as available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device and put it in evaluation mode.
model.to(device).eval()

# Collate function that returns images and labels as two plain lists
def collate_fn(batch):
    """Split a batch of ``(image, label)`` pairs into two parallel lists.

    Args:
        batch: Sequence of 2-item samples as produced by the dataset.

    Returns:
        A tuple ``(images, labels)`` where both entries are lists in the
        same order as ``batch``. No stacking or tensor conversion is done.
    """
    image_column, label_column = zip(*batch)
    return [*image_column], [*label_column]

# DataLoader: serves the dataset in its original order (no shuffling),
# loading samples in background worker processes.
loader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    batch_size=batch_size,
    num_workers=8,  # number of loader worker processes
    pin_memory=True,
    shuffle=False,
)

# ----------------------
# Inference Loop
# ----------------------
max_batches = 5  # Number of batches you want to run
results = []


def _flatten_onto_white(img):
    """Return ``img`` as an RGB image with transparency composited onto white.

    Args:
        img: A PIL image in any mode.

    Returns:
        A new RGB PIL image of the same size.
    """
    img_rgba = img.convert("RGBA")
    background = Image.new("RGBA", img_rgba.size, (255, 255, 255, 255))
    return Image.alpha_composite(background, img_rgba).convert("RGB")


# Bound the iterator itself instead of breaking out of a loop over the whole
# loader, and give tqdm the real number of batches so the progress bar reflects
# the work actually done.
n_batches = min(max_batches, len(loader))
limited_loader = islice(loader, n_batches)

for batch_idx, (images, labels) in enumerate(
    tqdm(limited_loader, total=n_batches, desc="Running OCR inference")
):
    # 1) Preprocess: flatten any alpha channel onto a white background
    processed_images = [_flatten_onto_white(img) for img in images]

    # 2) Build model inputs and move them to the model's device
    pixel_values = processor(processed_images, return_tensors="pt").pixel_values.to(device)

    # 3) Generate predictions without tracking gradients, then decode to text
    with torch.no_grad():
        generated_ids = model.generate(pixel_values)
    preds = processor.batch_decode(generated_ids, skip_special_tokens=True)

    # 4) Convert labels to strings: each element of a label is mapped through
    #    dataset.idx_to_char via its .item() value
    actuals = ["".join(dataset.idx_to_char[i.item()] for i in t) for t in labels]

    # 5) Store results. The running offset is taken from `results` itself, so
    #    `idx` stays correct even if the global `batch_size` no longer matches
    #    the batch size the loader was built with.
    offset = len(results)
    for i, (a, p) in enumerate(zip(actuals, preds)):
        results.append({
            "idx": offset + i,
            "actual": a,
            "pred": p
        })


Running OCR inference:   0%|          | 0/3534 [00:00<?, ?it/s]

In [18]:
# Name the columns explicitly so the frame keeps its schema even when `results`
# is empty; otherwise later column lookups raise KeyError.
df_results = pd.DataFrame(results, columns=["idx", "actual", "pred"])
# Bare last expression: the notebook renders the frame with its rich display.
df_results.head()

   idx actual    pred
0    0  x8jy8   x8jy8
1    1  1VTNw   NVTNW
2    2  N324n  TN324n
3    3  1ZlCO  JZ100-
4    4  OvTL4  TOTAL4


In [20]:
# Exact-match accuracy in percent: share of rows where the prediction equals the label.
df_results["actual"].eq(df_results["pred"]).mean() * 100

np.float64(9.375)