In [1]:
import pandas as pd
from pathlib import Path
import json

BASE_DIR = Path("/kaggle/input/textocr-text-extraction-from-images-dataset")
IMG_DIR = BASE_DIR / "train_val_images/train_images"

annot = pd.read_parquet(BASE_DIR / "annot.parquet")
imgs = pd.read_parquet(BASE_DIR / "img.parquet").rename(columns={"id": "image_id"})

# Group annotations by image_id
grouped = (
    annot.groupby("image_id")
    .agg(bbox_list=("bbox", list), text_list=("utf8_string", list))
    .reset_index()
)

df = imgs.merge(grouped, on="image_id", how="inner")

df["image_path"] = df["file_name"].apply(
    lambda fn: str(IMG_DIR / Path(fn).name)
)

# ---------------------------
# Normalize bounding boxes to 0–1000
# ----------------------------
def normalize_bbox(bbox, width, height):
    x, y, w, h = bbox
    x1, y1 = x, y
    x2, y2 = x + w, y + h
    return [
        max(0, min(1000, int((x1 / width) * 1000))),
        max(0, min(1000, int((y1 / height) * 1000))),
        max(0, min(1000, int((x2 / width) * 1000))),
        max(0, min(1000, int((y2 / height) * 1000))),
    ]

df["annotations"] = df.apply(
    lambda row: [
        {
            "bbox_2d": normalize_bbox(bbox, row["width"], row["height"]),
            "text_content": str(text)
        }
        for bbox, text in zip(row["bbox_list"], row["text_list"])
    ],
    axis=1
)

# ----------------------------
# Build messages with JSON block
# ----------------------------
def build_message(row):
    annotations_json = json.dumps(row["annotations"], ensure_ascii=False)
    annotations_json_block = f"```json\n{annotations_json}\n```"

    return {
        "image": row["image_path"],
        "conversations": [
            {
                "from": "human",
                "value": (
                    "<image> Spot all text in the image at line-level. "
                    "Output the result in valid JSON format like below:\n"
                    "```json\n[{'bbox_2d': [x1, y1, x2, y2], 'text_content': 'text'}, ...]\n```"
                )
            },
            {
                "from": "assistant",
                "value": annotations_json_block
            }
        ]
    }

df["message"] = df.apply(build_message, axis=1)

# ----------------------------
# print first decoded message
# ----------------------------
import pprint
pprint.pprint(df["message"].iloc[0])


{'conversations': [{'from': 'human',
                    'value': '<image> Spot all text in the image at '
                             'line-level. Output the result in valid JSON '
                             'format like below:\n'
                             '```json\n'
                             "[{'bbox_2d': [x1, y1, x2, y2], 'text_content': "
                             "'text'}, ...]\n"
                             '```'},
                   {'from': 'assistant',
                    'value': '```json\n'
                             '[{"bbox_2d": [625, 3, 861, 36], "text_content": '
                             '"Performance"}, {"bbox_2d": [636, 63, 745, 100], '
                             '"text_content": "Sport"}, {"bbox_2d": [746, 62, '
                             '861, 93], "text_content": "Watch"}, {"bbox_2d": '
                             '[687, 138, 862, 180], "text_content": '
                             '"...period."}, {"bbox_2d": [465, 160, 537, 197], '
       

In [2]:
import re
from pathlib import Path
from typing import Dict, Any, List
import torch

IGNORE_INDEX = -100  

# ----------------------------
# Helper: make absolute paths
# ----------------------------
def _make_abs_paths(base_path: Path, file_path: str) -> str:
    return str(base_path / file_path)


# ----------------------------
# Build chat messages from item
# ----------------------------
def _build_messages(item: Dict[str, Any], base_path: Path) -> List[Dict[str, Any]]:
    # Handle images/videos
    images = item.get("image") or []
    if isinstance(images, str):
        images = [images]

    videos = item.get("video") or []
    if isinstance(videos, str):
        videos = [videos]

    image_pool = [{"type": "image", "image": _make_abs_paths(base_path, img)} for img in images]
    video_pool = [{"type": "video", "video": _make_abs_paths(base_path, vid)} for vid in videos]

    messages = []

    for turn in item["conversations"]:
        role = "user" if turn["from"] == "human" else "assistant"
        text: str = turn["value"]

        if role == "user":
            content: List[Dict[str, Any]] = []
            # Split on placeholders like <image> or <video>
            text_parts = re.split(r"(<image>|<video>)", text)

            for seg in text_parts:
                seg = seg.strip()
                if seg == "<image>":
                    if not image_pool:
                        raise ValueError("Too many <image> placeholders")
                    content.append(image_pool.pop(0))
                elif seg == "<video>":
                    if not video_pool:
                        raise ValueError("Too many <video> placeholders")
                    content.append(video_pool.pop(0))
                elif seg:
                    content.append({"type": "text", "text": seg})

            messages.append({"role": role, "content": content})

        else:  # assistant
            messages.append({
                "role": role,
                "content": [{"type": "text", "text": text}]
            })

    if image_pool:
        raise ValueError(f"{len(image_pool)} unused image(s)")
    if video_pool:
        raise ValueError(f"{len(video_pool)} unused video(s)")

    return messages


# ----------------------------
# Preprocess for Qwen visual
# ----------------------------
def preprocess_qwen_visual(sources: List[Dict[str, Any]], processor) -> Dict[str, Any]:
    if len(sources) != 1:
        raise ValueError(f"Expected 1 source, got {len(sources)}")

    source = sources[0]
    base_path = Path(source.get("data_path", ""))
    messages = _build_messages(source, base_path)

    # Apply processor
    full_result = processor.apply_chat_template(
        messages,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    )

    input_ids = full_result["input_ids"]
    if isinstance(input_ids, list):
        input_ids = torch.tensor(input_ids).unsqueeze(0)

    labels = torch.full_like(input_ids, IGNORE_INDEX)

    # Set labels for answer portion (based on Qwen special tokens)
    input_ids_flat = input_ids[0].tolist()
    L = len(input_ids_flat)
    pos = 0
    while pos < L:
        if input_ids_flat[pos] == 77091:  # <|im_start|>assistant token
            ans_start = pos + 2
            ans_end = ans_start
            while ans_end < L and input_ids_flat[ans_end] != 151645:  # <|im_end|> token
                ans_end += 1
            if ans_end < L:
                labels[0, ans_start:ans_end + 2] = input_ids[0, ans_start:ans_end + 2]
                pos = ans_end
        pos += 1

    full_result["labels"] = labels
    full_result["input_ids"] = input_ids
    return full_result


In [3]:
import torch
from transformers import AutoProcessor
from pathlib import Path

IGNORE_INDEX = -100  # make sure this matches your preprocess function

model_id = "Qwen/Qwen3-VL-4B-Instruct"
processor = AutoProcessor.from_pretrained(model_id)

row = df["message"].iloc[0]

images = row.get("image")
if isinstance(images, str):
    images = [images]

item = {
    "image": images,
    "conversations": row["conversations"],
    "data_path": "/kaggle/input/textocr-text-extraction-from-images-dataset/train_val_images/train_images"
}

result = preprocess_qwen_visual([item], processor)

print("Input IDs:\n", result["input_ids"])
print("\nDecoded Text:\n", processor.batch_decode(result["input_ids"])[0])
print("\nLabels:\n", result["labels"])


2026-02-11 16:10:38.547587: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770826238.792081      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770826238.861206      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770826239.411450      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770826239.411503      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770826239.411506      17 computation_placer.cc:177] computation placer alr

preprocessor_config.json:   0%|          | 0.00/390 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

Input IDs:
 tensor([[151644,    872,    198,  ...,  73594, 151645,    198]])

Decoded Text:
 <|im_start|>user
<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|i

In [4]:
print("\nLabels:\n", result["labels"])


Labels:
 tensor([[  -100,   -100,   -100,  ...,  73594, 151645,    198]])


In [5]:
# torch.set_printoptions(profile="full")
# print(result["labels"])

In [6]:
df.columns

Index(['image_id', 'width', 'height', 'set', 'file_name', 'bbox_list',
       'text_list', 'image_path', 'annotations', 'message'],
      dtype='object')

In [7]:
print("Keys in result:", result.keys())
if "pixel_values" in result:
    print("Pixel Values Shape:", result["pixel_values"].shape)
if "image_grid_thw" in result:
    print("Grid THW:", result["image_grid_thw"])

Keys in result: KeysView({'input_ids': tensor([[151644,    872,    198,  ...,  73594, 151645,    198]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]), 'pixel_values': tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]), 'image_grid_thw': tensor([[ 1, 64, 52]]), 'labels': tensor([[  -100,   -100,   -100,  ...,  73594, 151645,    198]])})
Pixel Values Shape: torch.Size([3328, 1536])
Grid THW: tensor([[ 1, 64, 52]])


In [8]:
print(result.keys())

KeysView({'input_ids': tensor([[151644,    872,    198,  ...,  73594, 151645,    198]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]), 'pixel_values': tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]), 'image_grid_thw': tensor([[ 1, 64, 52]]), 'labels': tensor([[  -100,   -100,   -100,  ...,  73594, 151645,    198]])})
