In [1]:
import pandas as pd
from pathlib import Path
import json

BASE_DIR = Path("/kaggle/input/textocr-text-extraction-from-images-dataset")
IMG_DIR = BASE_DIR / "train_val_images/train_images"

annot = pd.read_parquet(BASE_DIR / "annot.parquet")
imgs = pd.read_parquet(BASE_DIR / "img.parquet").rename(columns={"id": "image_id"})

# Group annotations by image_id
grouped = (
    annot.groupby("image_id")
    .agg(bbox_list=("bbox", list), text_list=("utf8_string", list))
    .reset_index()
)

df = imgs.merge(grouped, on="image_id", how="inner")

df["image_path"] = df["file_name"].apply(
    lambda fn: str(IMG_DIR / Path(fn).name)
)

# ---------------------------
# Normalize bounding boxes to 0–1000
# ----------------------------
def normalize_bbox(bbox, width, height):
    x, y, w, h = bbox
    x1, y1 = x, y
    x2, y2 = x + w, y + h
    return [
        max(0, min(1000, int((x1 / width) * 1000))),
        max(0, min(1000, int((y1 / height) * 1000))),
        max(0, min(1000, int((x2 / width) * 1000))),
        max(0, min(1000, int((y2 / height) * 1000))),
    ]

df["annotations"] = df.apply(
    lambda row: [
        {
            "bbox_2d": normalize_bbox(bbox, row["width"], row["height"]),
            "text_content": str(text)
        }
        for bbox, text in zip(row["bbox_list"], row["text_list"])
    ],
    axis=1
)

# ----------------------------
# Build messages with JSON block
# ----------------------------
def build_message(row):
    annotations_json = json.dumps(row["annotations"], ensure_ascii=False)
    annotations_json_block = f"```json\n{annotations_json}\n```"

    return {
        "image": row["image_path"],
        "conversations": [
            {
                "from": "human",
                "value": (
                    "<image> Spot all text in the image at line-level. "
                    "Output the result in valid JSON format like below:\n"
                    "```json\n[{'bbox_2d': [x1, y1, x2, y2], 'text_content': 'text'}, ...]\n```"
                )
            },
            {
                "from": "assistant",
                "value": annotations_json_block
            }
        ]
    }

df["message"] = df.apply(build_message, axis=1)

# ----------------------------
# print first decoded message
# ----------------------------
import pprint
pprint.pprint(df["message"].iloc[0])


{'conversations': [{'from': 'human',
                    'value': '<image> Spot all text in the image at '
                             'line-level. Output the result in valid JSON '
                             'format like below:\n'
                             '```json\n'
                             "[{'bbox_2d': [x1, y1, x2, y2], 'text_content': "
                             "'text'}, ...]\n"
                             '```'},
                   {'from': 'assistant',
                    'value': '```json\n'
                             '[{"bbox_2d": [625, 3, 861, 36], "text_content": '
                             '"Performance"}, {"bbox_2d": [636, 63, 745, 100], '
                             '"text_content": "Sport"}, {"bbox_2d": [746, 62, '
                             '861, 93], "text_content": "Watch"}, {"bbox_2d": '
                             '[687, 138, 862, 180], "text_content": '
                             '"...period."}, {"bbox_2d": [465, 160, 537, 197], '
       

In [2]:
import re
from pathlib import Path
from typing import Dict, Any, List
import torch

IGNORE_INDEX = -100  

# ----------------------------
# Helper: make absolute paths
# ----------------------------
def _make_abs_paths(base_path: Path, file_path: str) -> str:
    return str(base_path / file_path)


# ----------------------------
# Build chat messages from item
# ----------------------------
def _build_messages(item: Dict[str, Any], base_path: Path) -> List[Dict[str, Any]]:
    # Handle images/videos
    images = item.get("image") or []
    if isinstance(images, str):
        images = [images]

    videos = item.get("video") or []
    if isinstance(videos, str):
        videos = [videos]

    image_pool = [{"type": "image", "image": _make_abs_paths(base_path, img)} for img in images]
    video_pool = [{"type": "video", "video": _make_abs_paths(base_path, vid)} for vid in videos]

    messages = []

    for turn in item["conversations"]:
        role = "user" if turn["from"] == "human" else "assistant"
        text: str = turn["value"]

        if role == "user":
            content: List[Dict[str, Any]] = []
            # Split on placeholders like <image> or <video>
            text_parts = re.split(r"(<image>|<video>)", text)

            for seg in text_parts:
                seg = seg.strip()
                if seg == "<image>":
                    if not image_pool:
                        raise ValueError("Too many <image> placeholders")
                    content.append(image_pool.pop(0))
                elif seg == "<video>":
                    if not video_pool:
                        raise ValueError("Too many <video> placeholders")
                    content.append(video_pool.pop(0))
                elif seg:
                    content.append({"type": "text", "text": seg})

            messages.append({"role": role, "content": content})

        else:  # assistant
            messages.append({
                "role": role,
                "content": [{"type": "text", "text": text}]
            })

    if image_pool:
        raise ValueError(f"{len(image_pool)} unused image(s)")
    if video_pool:
        raise ValueError(f"{len(video_pool)} unused video(s)")

    return messages


# ----------------------------
# Preprocess for Qwen visual
# ----------------------------
def preprocess_qwen_visual_batch(sources: List[Dict[str, Any]], processor) -> Dict[str, Any]:
    all_messages = []
    for source in sources:
        base_path = Path(source.get("data_path", ""))
        messages = _build_messages(source, base_path)
        all_messages.append(messages)

    # Apply processor to batch of messages with padding
    full_result = processor.apply_chat_template(
        all_messages,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        padding=True,
        pad_to_multiple_of=8
    )

    input_ids = full_result["input_ids"]
    if isinstance(input_ids, list):
        input_ids = torch.tensor(input_ids)

    # Prepare labels with IGNORE_INDEX
    labels = torch.full_like(input_ids, IGNORE_INDEX)

    # Qwen special token logic for masking assistant answers
    for batch_idx in range(input_ids.size(0)):
        input_ids_flat = input_ids[batch_idx].tolist()
        L = len(input_ids_flat)
        pos = 0
        while pos < L:
            if input_ids_flat[pos] == 77091:  # <|im_start|>assistant token
                ans_start = pos + 2
                ans_end = ans_start
                while ans_end < L and input_ids_flat[ans_end] != 151645:  # <|im_end|> token
                    ans_end += 1
                if ans_end < L:
                    labels[batch_idx, ans_start:ans_end + 2] = input_ids[batch_idx, ans_start:ans_end + 2]
                    pos = ans_end
            pos += 1

    full_result["input_ids"] = input_ids
    full_result["labels"] = labels
    return full_result


In [3]:
# Flash attention 2 will work for Ampere and above series 
# !pip install flash-attn --no-build-isolation

In [4]:
!pip install bitsandbytes peft



In [5]:
import torch
from transformers import AutoProcessor , Qwen3VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from pathlib import Path

IGNORE_INDEX = -100  
model_name = "Qwen/Qwen3-VL-2B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
device_to_use = "cuda:0"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_name,
    device_map={"":device_to_use},
    quantization_config=bnb_config,
    dtype=torch.float16,      
    trust_remote_code=True,
    attn_implementation="sdpa"
)

# Load processor for VL tasks
processor = AutoProcessor.from_pretrained(model_name)

2026-02-14 18:04:41.764835: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771092281.786404    1159 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771092281.792798    1159 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771092281.810035    1159 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771092281.810054    1159 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771092281.810056    1159 computation_placer.cc:177] computation placer alr

In [6]:
rows = [
    df["message"].iloc[0],
    df["message"].iloc[1]  # another row
]

# Convert rows to proper dict format for preprocess
sources = []
for row in rows:
    images = row.get("image")
    if isinstance(images, str):
        images = [images]
    sources.append({
        "image": images,
        "conversations": row["conversations"]
    })

result = preprocess_qwen_visual_batch(sources, processor)

print(result["input_ids"].shape)  
print(result["labels"].shape)

torch.Size([2, 2328])
torch.Size([2, 2328])


In [7]:
print(result.keys())

KeysView({'input_ids': tensor([[151644,    872,    198,  ..., 151643, 151643, 151643],
        [151644,    872,    198,  ...,  73594, 151645,    198]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'pixel_values': tensor([[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
        [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
        [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
        ...,
        [-0.0588, -0.0431, -0.0431,  ..., -0.4902, -0.4588, -0.4118],
        [-0.0824, -0.0980, -0.0980,  ..., -0.4745, -0.4902, -0.4431],
        [ 0.0431,  0.0431,  0.0118,  ..., -0.4588, -0.5451, -0.5922]]), 'image_grid_thw': tensor([[ 1, 64, 52],
        [ 1, 42, 64]]), 'labels': tensor([[  -100,   -100,   -100,  ...,   -100,   -100,   -100],
        [  -100,   -100,   -100,  ...,  73594, 151645,    198]])})


In [8]:
print("Input IDs:\n", result["input_ids"])
print("\nDecoded Text:\n", processor.batch_decode(result["input_ids"])[0])
print("\nLabels:\n", result["labels"])

Input IDs:
 tensor([[151644,    872,    198,  ..., 151643, 151643, 151643],
        [151644,    872,    198,  ...,  73594, 151645,    198]])

Decoded Text:
 <|im_start|>user
<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|im

In [9]:
# Note: These parameters are subject to the VRAM of GPU , if you have a better GPU feel free to tune the Config params

Max_Context_Length = 1024
Max_pixels = 196*32*32 # increase it 
Min_pixels = 56*56
Factor = 32
Max_Long_Side = 8192

In [10]:
import math

def round_by_factor(number, factor):
    return round(number / factor) * factor

def floor_by_factor(number, factor):
    return math.floor(number / factor) * factor

def ceil_by_factor(number, factor):
    return math.ceil(number / factor) * factor

def smart_resize(height: int, width: int, factor: int = Factor, 
                 min_pixels: int = Min_pixels, 
                 max_pixels: int = Max_pixels, 
                 max_long_side: int = Max_Long_Side) -> tuple[int, int]:
    
    
    if max(height, width) > max_long_side:
        beta = max(height, width) / max_long_side
        height, width = int(height / beta), int(width / beta)

    h_bar = round_by_factor(height, factor)
    w_bar = round_by_factor(width, factor)
    
    if h_bar * w_bar > max_pixels:
        # Calculate scaling beta based on the area ratio
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    
    return h_bar, w_bar

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from PIL import Image
from pathlib import Path

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

class QwenVisualDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe.to_dict('records')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        item = row["message"]
        base_path = Path(item.get("data_path", ""))
        messages = _build_messages(item, base_path)
        
        for msg in messages:
            if isinstance(msg["content"], list):
                for part in msg["content"]:
                    if part["type"] == "image" and isinstance(part["image"], str):
                        img = Image.open(part["image"]).convert("RGB")
                        
                        # Apply smart_resize
                        w, h = img.size
                        new_h, new_w = smart_resize(h, w)
                        img = img.resize((new_w, new_h), resample=Image.LANCZOS)
                        
                        part["image"] = img
        return messages

def collate_fn(batch):
    full_result = processor.apply_chat_template(
        batch,
        tokenize=True,
        return_dict=True,
        truncation=True,            
        max_length=Max_Context_Length,     
        return_tensors="pt",
        padding=True,
        pad_to_multiple_of=8
    )

    input_ids = full_result["input_ids"]
    labels = torch.full_like(input_ids, -100)

    for batch_idx in range(input_ids.size(0)):
        ids = input_ids[batch_idx].tolist()
        L = len(ids)
        pos = 0
        while pos < L:
            if ids[pos] == 77091:
                ans_start = pos + 2
                ans_end = ans_start
                while ans_end < L and ids[ans_end] != 151645:
                    ans_end += 1
                if ans_end < L:
                    labels[batch_idx, ans_start:ans_end + 2] = input_ids[batch_idx, ans_start:ans_end + 2]
                    pos = ans_end
            pos += 1

    full_result["labels"] = labels
    return full_result

train_dataset = QwenVisualDataset(train_df)
test_dataset = QwenVisualDataset(test_df)

train_loader = DataLoader(
    train_dataset, 
    batch_size=1,
    num_workers=1,
    pin_memory=True,
    persistent_workers=True,
    shuffle=True, 
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=1,
    num_workers=1,
    pin_memory=True,
    persistent_workers=True,
    shuffle=False, 
    collate_fn=collate_fn
)

device = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"], 
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 1,605,632 || all params: 2,129,137,664 || trainable%: 0.0754


In [13]:
from torch.optim import AdamW
from tqdm import tqdm
import torch
from torch.cuda.amp import GradScaler

train_losses, test_losses = [], []
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-6)
scaler = GradScaler()
accumulation_steps = 4 
test_iter = iter(test_loader)

model.train()
for epoch in range(1):
    pbar = tqdm(enumerate(train_loader), total=20, desc=f"Epoch {epoch}")
    for step, batch in pbar:
        if step >= 20: 
            break
            
        batch = {k: v.to("cuda") if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
        if "pixel_values" in batch:
            batch["pixel_values"] = batch["pixel_values"].to(torch.float16)

        with torch.autocast(device_type="cuda", dtype=torch.float16):
            outputs = model(**batch)
            raw_loss = outputs.loss
            loss = raw_loss / accumulation_steps

        if not torch.isfinite(raw_loss):
            print(f"NaN detected at step {step}. Skipping...")
            optimizer.zero_grad(set_to_none=True)
            continue 

        scaler.scale(loss).backward()
        train_losses.append(raw_loss.item())

        model.eval()
        try:
            test_batch = next(test_iter)
        except StopIteration:
            test_iter = iter(test_loader)
            test_batch = next(test_iter)
        
        test_batch = {k: v.to("cuda") if isinstance(v, torch.Tensor) else v for k, v in test_batch.items()}
        if "pixel_values" in test_batch: 
            test_batch["pixel_values"] = test_batch["pixel_values"].to(torch.float16)

        with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
            test_outputs = model(**test_batch)
            test_losses.append(test_outputs.loss.item())
        model.train()

        if (step + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            
        pbar.set_postfix({"tr": raw_loss.item(), "te": test_losses[-1]})

model.save_pretrained("./qwen3_vl_adapter")

  scaler = GradScaler()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 0:   5%|▌         | 1/20 [00:01<00:28,  1.51s/it]

NaN detected at step 0. Skipping...


Epoch 0:  15%|█▌        | 3/20 [00:03<00:20,  1.18s/it, tr=0.958, te=nan]

NaN detected at step 2. Skipping...


Epoch 0:  50%|█████     | 10/20 [00:13<00:11,  1.18s/it, tr=0.578, te=0.55]

NaN detected at step 9. Skipping...


Epoch 0:  55%|█████▌    | 11/20 [00:13<00:09,  1.05s/it, tr=0.578, te=0.55]

NaN detected at step 10. Skipping...


Epoch 0:  70%|███████   | 14/20 [00:18<00:07,  1.22s/it, tr=0.777, te=0.767]

NaN detected at step 13. Skipping...


Epoch 0:  75%|███████▌  | 15/20 [00:18<00:05,  1.08s/it, tr=0.777, te=0.767]

NaN detected at step 14. Skipping...


Epoch 0:  90%|█████████ | 18/20 [00:23<00:02,  1.24s/it, tr=0.578, te=0.506]

NaN detected at step 17. Skipping...


Epoch 0: 100%|██████████| 20/20 [00:26<00:00,  1.32s/it, tr=0.616, te=0.784]
