In [1]:
# -*- coding: utf-8 -*-
# ============================================================
# Qwen2.5-VL-3B-Instruct — Single-Model MathBench Generation
# - No MoE, no splitting
# - Full question in, up to 2048 new tokens out
# - Input:  validationHard.jsonl  (id, diff, topic, query)
# - Output: ValidationHard_Qwen2_5_VL_3B.txt / .md
# ============================================================

import json
import time
from pathlib import Path

import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# -------------------------
# Paths (edit if needed)
# -------------------------
# Local directory the processor and model are loaded from.
MODEL_DIR = r"C:\Users\super\Desktop\MoE_LLM\Pretrained\Qwen2.5-VL-3B-Instruct"

# JSONL problem set to run through the model.
VAL_JSONL = Path(r"C:\Users\super\Desktop\MoE_LLM\Pretrained\validationHard.jsonl")

# Both reports are written next to the input file.
OUT_TXT = VAL_JSONL.with_name("ValidationHard_Qwen2_5_VL_3B.txt")
OUT_MD  = VAL_JSONL.with_name("ValidationHard_Qwen2_5_VL_3B.md")

# Passed to generate() as max_new_tokens: upper bound on tokens per answer.
MAX_NEW_TOKENS = 2048

# -------------------------
# Helpers
# -------------------------
def load_mathbench(path: Path) -> list:
    """Load a JSON Lines problem file into a list of parsed values.

    Every non-blank line is parsed with ``json.loads``; blank and
    whitespace-only lines are skipped.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with the file path and the 1-based line number.
    """
    items = []
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise survive strip() and make json.loads reject line 1.
    with path.open("r", encoding="utf-8-sig") as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue
            try:
                items.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type as before, now pointing at the bad line.
                raise json.JSONDecodeError(
                    f"{path}, line {lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return items


def build_conversation(example):
    """Build the single-turn, text-only chat messages for one problem.

    The result has the Qwen2.5-VL chat shape::

        [{"role": "user", "content": [{"type": "text", "text": "..."}]}]

    Difficulty and topic are passed as bracketed context ahead of the
    question; the question itself is never split into sub-tasks. A tag is
    left out when its value is missing, ``None`` or empty, so the prompt never
    contains ``[Topic: ]`` or ``[Topic: None]``.

    Args:
        example: Problem record. Must contain ``"query"``; ``"diff"`` and
            ``"topic"`` are optional.

    Returns:
        A one-element list holding the user message.

    Raises:
        KeyError: If ``example`` has no ``"query"`` key.
    """
    question = example["query"]

    # Keep only the metadata tags that carry a value.
    tags = []
    for label, key in (("Difficulty", "diff"), ("Topic", "topic")):
        value = example.get(key)
        if value is None or value == "":
            continue
        tags.append(f"[{label}: {value}]")

    if tags:
        header = " ".join(tags)
        prompt = f"{header}\n\n{question}"
    else:
        prompt = f"{question}"

    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt}
            ],
        }
    ]


def main():
    """Run the model over every problem and save the answers.

    Reads the problems from ``VAL_JSONL``, loads the processor and model from
    ``MODEL_DIR``, generates up to ``MAX_NEW_TOKENS`` new tokens per problem
    with sampling disabled, and writes each answer to ``OUT_TXT`` and
    ``OUT_MD`` (both opened in write mode) while printing progress.

    Raises:
        FileNotFoundError: If ``VAL_JSONL`` is not an existing file.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"[DEV] device={device} | torch={torch.__version__}")

    # ---- Data: checked before the model is loaded ----
    if not VAL_JSONL.is_file():
        raise FileNotFoundError(f"Validation file not found: {VAL_JSONL}")

    problems = load_mathbench(VAL_JSONL)
    total = len(problems)
    print(f"[DATA] Loaded {total} problems from {VAL_JSONL}")

    # ---- Processor + model ----
    print(f"[MODEL] Loading Qwen2.5-VL-3B from: {MODEL_DIR}")

    processor = AutoProcessor.from_pretrained(MODEL_DIR)

    # bfloat16 on GPU, float32 on CPU.
    weights_dtype = torch.float32
    if device == "cuda":
        weights_dtype = torch.bfloat16

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_DIR,
        torch_dtype=weights_dtype,
    )
    model.to(device)
    model.eval()

    # Give generation a pad token (falling back to EOS) and mirror the
    # tokenizer's pad/EOS ids into the model's generation config.
    if hasattr(processor, "tokenizer"):
        tokenizer = processor.tokenizer
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        gen_cfg = model.generation_config
        gen_cfg.pad_token_id = tokenizer.pad_token_id
        gen_cfg.eos_token_id = tokenizer.eos_token_id

    print("[MODEL] Loaded. Starting generation...\n")

    report_title = "# Qwen2.5-VL-3B-Instruct — MathBench Validation\n"
    txt_separator = "\n" + "=" * 72 + "\n\n"

    # ---- Generate and write one record per problem ----
    with OUT_TXT.open("w", encoding="utf-8") as txt_out:
        with OUT_MD.open("w", encoding="utf-8") as md_out:
            txt_out.write(report_title + f"# Source: {VAL_JSONL.name}\n\n")
            md_out.write(report_title + f"- Source: `{VAL_JSONL.name}`\n\n")

            for number, problem in enumerate(problems, start=1):
                pid = problem.get("id", number)
                diff = problem.get("diff", "")
                topic = problem.get("topic", "")
                query = problem.get("query", "")

                print(f"[{number:03d}/{total:03d}] ID={pid} | {topic} ({diff})")

                # Text-only chat prompt: no images or videos are attached.
                chat = build_conversation(problem)
                inputs = processor.apply_chat_template(
                    chat,
                    add_generation_prompt=True,
                    tokenize=True,
                    return_tensors="pt",
                    return_dict=True,
                )
                inputs = inputs.to(device)
                prompt_len = inputs["input_ids"].shape[1]

                with torch.no_grad():
                    started = time.time()
                    output_ids = model.generate(
                        **inputs,
                        max_new_tokens=MAX_NEW_TOKENS,
                        do_sample=False,
                        temperature=1.0,
                        top_p=1.0,
                    )
                    elapsed = time.time() - started

                # Keep only what was generated after the prompt tokens.
                new_tokens = output_ids[:, prompt_len:]
                decoded = processor.batch_decode(
                    new_tokens,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True,
                )
                answer = decoded[0].strip()

                print(f"   → tokens={new_tokens.shape[1]} | {elapsed:.2f}s")
                print("-" * 60)

                question = query.strip()

                # Plain-text record; flushed right after it is written.
                txt_out.write(
                    f"=== Problem {number}/{total} ===\n"
                    f"ID    : {pid}\n"
                    f"Topic : {topic}\n"
                    f"Diff  : {diff}\n\n"
                    "---- Question ----\n"
                    + question + "\n\n"
                    + "---- Qwen2.5-VL-3B Answer ----\n"
                    + answer + "\n\n"
                    + txt_separator
                )
                txt_out.flush()

                # Markdown record; the metadata bullets appear only when at
                # least one of topic/difficulty is set.
                md_parts = [f"## Problem {number}/{total} — ID {pid}\n\n"]
                if topic or diff:
                    md_parts.append(f"- **Topic:** {topic}\n")
                    md_parts.append(f"- **Difficulty:** {diff}\n\n")
                md_parts.append("### Question\n\n")
                md_parts.append(question + "\n\n")
                md_parts.append("### Qwen2.5-VL-3B Answer\n\n")
                md_parts.append(answer + "\n\n---\n\n")
                md_out.write("".join(md_parts))
                md_out.flush()

    print("\n[DONE]")
    print(f"TXT : {OUT_TXT}")
    print(f"MD  : {OUT_MD}")


# Run the generation pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


[DEV] device=cuda | torch=2.9.0+cu130
[DATA] Loaded 50 problems from C:\Users\super\Desktop\MoE_LLM\Pretrained\validationHard.jsonl
[MODEL] Loading Qwen2.5-VL-3B from: C:\Users\super\Desktop\MoE_LLM\Pretrained\Qwen2.5-VL-3B-Instruct


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[MODEL] Loaded. Starting generation...

[001/050] ID=1 | Real Analysis (Engineering hard)
   → tokens=1079 | 37.46s
------------------------------------------------------------
[002/050] ID=2 | Measure & Integration (Engineering hard)
   → tokens=1465 | 51.39s
------------------------------------------------------------
[003/050] ID=3 | Functional Analysis (Engineering hard)
   → tokens=1500 | 55.09s
------------------------------------------------------------
[004/050] ID=4 | Real Analysis / BV & AC (Engineering hard)
   → tokens=1336 | 49.46s
------------------------------------------------------------
[005/050] ID=5 | PDE / Sturm–Liouville (Engineering hard)
   → tokens=1362 | 51.77s
------------------------------------------------------------
[006/050] ID=6 | Probability / Martingales (Engineering hard)
   → tokens=1222 | 48.39s
------------------------------------------------------------
[007/050] ID=7 | Probability / Limit Theorems (Engineering hard)
   → tokens=1232 | 42.75s
---