In [1]:
import torch
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

from datasets import load_dataset
from PIL import Image
from tqdm import tqdm
import wandb
from transformers import AutoProcessor
from vllm import LLM, SamplingParams

import subprocess
import os

# Source /etc/network_turbo in a bash subshell and copy every environment
# line that mentions "proxy" into this kernel's environment, so that later
# network calls in this process see the same proxy variables.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout
for line in output.splitlines():
    # Skip anything that is not a NAME=value pair.
    if '=' not in line:
        continue
    # Split on the first '=' only; the value itself may contain '='.
    var, value = line.split('=', 1)
    os.environ[var] = value


  from .autonotebook import tqdm as notebook_tqdm


INFO 12-12 00:36:57 [__init__.py:216] Automatically detected platform cuda.


In [2]:
# Ordered labels for the checkpoints produced by interpolate_phases(): the
# i-th label is paired with the confidence of the token located at fraction
# i / (len(PHASES) - 1) of the generated sequence.
PHASES = ["read", "explore", "insight", "solve", "verify"]

@dataclass
class ConfidenceSummary:
    """Aggregate confidence statistics for one generation, built by extract_confidence()."""

    # Mean of the per-token confidence values collected over the generation.
    absolute: float
    # Mean of the per-token margin values (largest minus second-largest
    # returned probability at each step).
    relative: float
    # (phase label, normalised confidence) pairs returned by interpolate_phases().
    phase_curve: List[Tuple[str, float]]

@dataclass
class SampleResult:
    """Everything recorded for one evaluated sample, as assembled by run_sample()."""

    # Sample identifier, stringified from the sample's "pid" or "id" field
    # ("unknown" when neither is present).
    question_id: str
    # Question text copied from the sample.
    question: str
    # Reference answer from the sample ("" when the sample has none).
    answer: str
    # Stripped text generated by the model.
    prediction: str
    # Confidence statistics derived from the generation's log-probabilities.
    confidence: ConfidenceSummary
    # One dict per phase with keys "phase" and "confidence".
    # NOTE(review): run_sample stores the phase label (a str) under "phase",
    # so the value type is wider than the float declared here.
    trajectory: List[Dict[str, float]]



def extract_confidence(
    token_ids: List[int],
    logprob_trace: List[Dict],
) -> ConfidenceSummary:
    """Summarise per-token confidence from a generation's log-probabilities.

    Args:
        token_ids: Ids of the generated tokens, in generation order.
        logprob_trace: One mapping per generation step, keyed by candidate
            token id. Each value is either a bare log-probability (float/int)
            or an object exposing a ``logprob`` attribute (vLLM ``Logprob``
            objects have this shape). The trace may be shorter than
            ``token_ids`` and may contain ``None`` entries; such steps count
            as zero confidence.

    Returns:
        A ConfidenceSummary where ``absolute`` is the mean probability of the
        generated token, ``relative`` is the mean gap between the two most
        probable candidates at each step, and ``phase_curve`` comes from
        interpolate_phases() applied to the per-token probabilities.
    """
    # Function-scope import keeps this cell self-contained; math.exp avoids
    # allocating a tensor for every single candidate.
    import math

    token_confidences = []
    margin_confidences = []

    for step, token_id in enumerate(token_ids):
        # Missing or None step records are treated as "no candidates".
        step_logprobs = (logprob_trace[step] if step < len(logprob_trace) else None) or {}
        probs = []
        target_prob = None

        for key, entry in step_logprobs.items():
            if isinstance(entry, (float, int)):
                logprob = float(entry)
                entry_token_id = key
            else:
                logprob = getattr(entry, "logprob", None)
                # The candidate's id is the mapping key; an explicit
                # ``token_id`` attribute is still honoured when present.
                entry_token_id = getattr(entry, "token_id", None)
                if entry_token_id is None:
                    entry_token_id = key

            if logprob is None:
                continue

            prob = math.exp(logprob)
            probs.append(prob)
            if entry_token_id == token_id:
                target_prob = prob

        # Fall back to the first candidate only when the generated token
        # cannot be identified among the entries (e.g. non-id keys).
        if target_prob is None and probs:
            target_prob = probs[0]

        token_confidences.append(target_prob or 0.0)

        if len(probs) >= 2:
            best, runner_up = sorted(probs, reverse=True)[:2]
            margin_confidences.append(best - runner_up)
        elif probs:
            # A single candidate has no competitor; its probability is the margin.
            margin_confidences.append(probs[0])
        else:
            margin_confidences.append(0.0)

    # max(..., 1) guards the division when no tokens were generated.
    absolute = float(sum(token_confidences) / max(len(token_confidences), 1))
    relative = float(sum(margin_confidences) / max(len(margin_confidences), 1))
    phase_curve = interpolate_phases(token_confidences)

    return ConfidenceSummary(absolute, relative, phase_curve)


def interpolate_phases(token_confidences: List[float]) -> List[Tuple[str, float]]:
    """Sample a min-max normalised confidence curve at one point per phase.

    The confidences are rescaled to [0, 1] using their own minimum and
    maximum (the span is floored at 1e-6 so a constant sequence does not
    divide by zero). Each label in PHASES is then paired with the rescaled
    value at an evenly spaced position, from the first token to the last.

    Args:
        token_confidences: Per-token confidence values in generation order.

    Returns:
        One (phase label, value) pair per entry of PHASES; every value is
        0.0 when ``token_confidences`` is empty.
    """
    if not token_confidences:
        return [(phase, 0.0) for phase in PHASES]

    lowest = min(token_confidences)
    highest = max(token_confidences)
    spread = max(highest - lowest, 1e-6)
    scaled = [(value - lowest) / spread for value in token_confidences]

    last_index = len(scaled) - 1
    # Number of intervals between phases; floored at 1 for a single-phase list.
    intervals = max(len(PHASES) - 1, 1)

    return [
        (phase, float(scaled[int((rank / intervals) * last_index)]))
        for rank, phase in enumerate(PHASES)
    ]

def run_sample(llm, processor, sample, sampling_params, image_root="/root/autodl-tmp/data/"):
    """Generate an answer for one dataset sample and summarise its confidence.

    Args:
        llm: Object exposing ``generate(inputs, sampling_params=...)``;
            evaluate() passes a ``vllm.LLM``.
        processor: Object exposing ``apply_chat_template``; evaluate() passes
            the model's ``AutoProcessor``.
        sample: Mapping with an "image" path (relative to ``image_root``), a
            "query" used as the prompt text and a "question" stored in the
            result. "pid"/"id" and "answer" are optional.
        sampling_params: Forwarded unchanged to ``llm.generate``. Log-probs
            must be requested there; otherwise the confidence values are 0.
        image_root: Directory that ``sample["image"]`` is resolved against.

    Returns:
        A SampleResult holding the stripped prediction, the confidence
        summary, and the per-phase trajectory.
    """
    # Load the image from its on-disk path (the raw "image" field, not
    # "decoded_image"). The context manager closes the file handle;
    # convert() returns an independent in-memory copy.
    image_path = os.path.join(image_root, sample["image"])
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # OpenAI-style chat message: one image slot followed by the question text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": sample["query"]}
            ]
        }
    ]

    # Key step: let the processor render the prompt so that it contains the
    # model's own image placeholder tokens.
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # The image is passed as a one-element list under multi_modal_data.
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": [image]}
        },
        sampling_params=sampling_params,
    )

    # Single prompt, single completion.
    result = outputs[0].outputs[0]

    # logprobs is empty/None when sampling_params did not request them.
    logprob_trace = result.logprobs or []
    confidence = extract_confidence(result.token_ids, logprob_trace)

    return SampleResult(
        question_id=str(sample.get("pid", sample.get("id", "unknown"))),
        question=sample["question"],
        answer=sample.get("answer", ""),
        prediction=result.text.strip(),
        confidence=confidence,
        trajectory=[
            {"phase": phase, "confidence": value}
            for phase, value in confidence.phase_curve
        ],
    )
    
def log_to_wandb(table, result):
    """Append one evaluated sample to the results table.

    Args:
        table: Object exposing ``add_data(*values)``; evaluate() passes a
            ``wandb.Table``.
        result: SampleResult-like object for the sample.

    The row order is: question id, question, reference answer, prediction,
    absolute confidence, relative confidence, phase trajectory.
    """
    summary = result.confidence
    row = (
        result.question_id,
        result.question,
        result.answer,
        result.prediction,
        summary.absolute,
        summary.relative,
        result.trajectory,
    )
    table.add_data(*row)


In [3]:
# Evaluation settings read by evaluate().
model_id = "/root/autodl-tmp/model"  # local model path, used for both the processor and the vLLM engine
split = "testmini"  # dataset split passed to load_dataset
limit = 16  # evaluate only the first `limit` samples; a falsy value disables the cap
batch_size = 16  # chunk size of the progress loop (samples are still generated one at a time)
max_new_tokens = 1024  # generation budget, passed as SamplingParams.max_tokens
project = "StateReasoning"  # wandb project name
run_name = "qwen3-vl-4b-thinking"  # wandb run name
data_dir = None  # not referenced by any code visible in this notebook
# V100 (compute capability 7.0) does not support bfloat16; use float16 instead
# to avoid engine initialization failure.
dtype = "float16"


def evaluate():
    """Run the MathVista evaluation end to end and log the results to wandb.

    Reads the module-level settings ``model_id``, ``dtype``, ``split``,
    ``limit``, ``batch_size``, ``max_new_tokens``, ``project`` and
    ``run_name``. Loads the processor and vLLM engine, runs every selected
    sample through run_sample(), and uploads one table row per sample under
    the key "mathvista_eval". The wandb run is always closed, with a
    non-zero exit code when the evaluation did not complete.
    """
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    llm = LLM(
        model=model_id,
        trust_remote_code=True,
        dtype=dtype,
        # Limit context length to fit V100 32GB KV cache (was 262144 needing ~36 GiB)
        max_model_len=8192,
        # Fraction of GPU memory vLLM may claim; adjust if OOM
        gpu_memory_utilization=0.5,
        enable_chunked_prefill=False,  # recommended off for multimodal inputs
    )

    dataset = load_dataset("AI4Math/MathVista", split=split)
    if limit:
        # Clamp so a limit larger than the split does not raise in select().
        dataset = dataset.select(range(min(limit, len(dataset))))

    sampling_params = SamplingParams(
        max_tokens=max_new_tokens,
        temperature=0.1,
        logprobs=10,
    )

    wandb.init(project=project, name=run_name)

    # Assume failure until the table has actually been logged.
    exit_code = 1
    try:
        table = wandb.Table(
            columns=[
                "question_id",
                "question",
                "ground_truth",
                "prediction",
                "absolute_confidence",
                "relative_confidence",
                "phase_trajectory",
            ]
        )

        for start in tqdm(range(0, len(dataset), batch_size), desc="Evaluating"):
            end = min(start + batch_size, len(dataset))
            batch = dataset.select(range(start, end)).to_list()

            # Samples are generated one at a time; batch_size only sets the
            # granularity of the progress bar.
            for sample in batch:
                result = run_sample(llm, processor, sample, sampling_params)
                log_to_wandb(table, result)

        wandb.log({"mathvista_eval": table})
        exit_code = 0
    finally:
        # Close the run even when a sample raised, so it is not left dangling.
        wandb.finish(exit_code=exit_code)


In [5]:
# evaluate()
