In [None]:
# Core libraries
!pip install --upgrade transformers datasets peft evaluate qwen-vl-utils

# bitsandbytes: first remove any previously installed build whose package
# metadata may be incomplete, then reinstall it (per the original note, pip is
# expected to pick a build matching the local CUDA version).
!pip uninstall -y bitsandbytes
!pip install bitsandbytes

# Evaluation and visualisation
!pip install scikit-learn matplotlib

# Attribution (Grad-CAM). The training cell imports `captum.attr`, so captum
# has to be installed before that cell runs on a fresh kernel. Installing it
# here, before anything is imported, also means its numpy requirement is
# resolved before numpy is loaded.
!pip install captum

# Image handling, text metrics and Hub download helpers
!pip install pillow
!pip install rouge_score
# Quoted so the shell never treats the [extra] brackets as a glob pattern.
!pip install "huggingface_hub[hf_xet]"
!pip install git+https://github.com/salaniz/pycocoevalcap.git
!pip install bert_score
!pip install --upgrade transformers


In [None]:
import random
import numpy as np
import torch
import matplotlib.pyplot as plt
from captum.attr import LayerGradCam, LayerAttribution
from PIL import Image
from datasets import load_dataset

# ← Add this
from qwen_vl_utils import process_vision_info

from peft import LoraConfig, get_peft_model
from transformers import (
    AutoProcessor,
    Qwen2VLForConditionalGeneration,
    BitsAndBytesConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
import evaluate
from nltk.translate.bleu_score import SmoothingFunction
from pycocoevalcap.cider.cider import Cider
from bert_score import BERTScorer
import matplotlib
matplotlib.use("Agg")   # 使用 Agg 后端，不会尝试打开 GUI 窗口
import matplotlib.pyplot as plt
# ─── 1. Prompt pool & random sampling ─────────────────────────────────
# Paraphrased instructions; one is drawn per example so the model does not
# overfit to a single phrasing of the task.
prompts = [
    "Q: Does this image support the given statement? Explain your reasoning.",
    "Question: Is the sentence true for the picture shown? Give your reasons.",
    "Is the description accurate for this image? Explain why or why not.",
    "Please determine whether the image entails the sentence, and provide a detailed explanation.",
    "Analyze the image–text pair and explain if they match or conflict.",
    "Assess whether the statement is consistent with the image, and justify your answer.",
    "I want you to judge if this sentence correctly describes the image and explain your judgment.",
    "Your task: verify the sentence against the image and articulate the reasons behind your decision."
]


def sample_prompt(rng=None):
    """Return one instruction prompt drawn uniformly from ``prompts``.

    Args:
        rng: Optional generator exposing ``choice`` (e.g. ``random.Random(0)``).
            Pass a seeded instance to make the draw reproducible. When omitted,
            the module-level ``random`` generator is used, which is the
            original behaviour.

    Returns:
        str: One element of ``prompts``.
    """
    chooser = random if rng is None else rng
    return chooser.choice(prompts)

# ─── 2. Load & simplify the dataset ───────────────────────────────────
from datasets import load_dataset

# 0. DATASET: the complete J1mb0o/e-snli-ve dataset (all splits).
raw = load_dataset("J1mb0o/e-snli-ve")

# Tiny slices for a quick run: the first 8 train rows and the first 2 dev rows.
# "hypothesis" is exposed as "caption"; the label and Flickr id columns are dropped.
train_ds = load_dataset("J1mb0o/e-snli-ve", split="train[:8]")
train_ds = train_ds.rename_column("hypothesis", "caption")
train_ds = train_ds.remove_columns(["gold_label", "flickr_id"])

eval_ds = load_dataset("J1mb0o/e-snli-ve", split="dev[:2]")
eval_ds = eval_ds.rename_column("hypothesis", "caption")
eval_ds = eval_ds.remove_columns(["gold_label", "flickr_id"])

print(f"✔️ Train: {len(train_ds)}, Eval: {len(eval_ds)}")

# ─── 3. Processor & Model Setup ─────────────────────────────────────
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
import torch

# Checkpoint shared by the processor and the model.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=True)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    # Half-precision weights.
    torch_dtype=torch.float16,
    # Lower peak host memory while the weights are being loaded.
    low_cpu_mem_usage=True,
)

# LoRA adapters on the attention projections named q/k/v/o_proj.
peft_cfg = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_cfg)

# ─── 4. collate_fn_train: batches that carry labels ───────────────────
def collate_fn_train(examples):
    """Turn dataset rows into a padded training batch with ``labels``.

    For every row a random prompt is paired with the row's image as the user
    turn and the row's ``caption`` as the assistant turn. The chat template is
    rendered to text, and text plus images are encoded together by
    ``processor``.

    Args:
        examples: Iterable of rows exposing ``"image"`` (a PIL image, a dict
            with a ``"path"`` entry, or something ``Image.open`` accepts) and
            ``"caption"``.

    Returns:
        The processor output with an added ``"labels"`` entry: a copy of
        ``input_ids`` where vision-start / vision-end / image-pad tokens and
        padding tokens are replaced by -100.
    """
    chat_texts = []
    pil_images = []

    for sample in examples:
        instruction = sample_prompt()

        picture = sample["image"]
        if isinstance(picture, dict):
            picture = Image.open(picture["path"])
        elif not isinstance(picture, Image.Image):
            picture = Image.open(picture)

        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": picture},
                {"type": "text", "text": instruction},
            ],
        }
        assistant_turn = {"role": "assistant", "content": sample["caption"]}
        conversation = [user_turn, assistant_turn]

        chat_texts.append(
            processor.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=False
            )
        )
        vision_inputs, _ = process_vision_info(conversation)
        pil_images.append(vision_inputs[0])

    batch = processor(
        text=chat_texts,
        images=pil_images,
        padding=True,
        return_tensors="pt",
    )

    labels = batch.input_ids.clone()

    # Token ids that must not contribute to the loss: the three vision
    # markers and the padding token.
    tokenizer = processor.tokenizer
    ignored_ids = [
        tokenizer.convert_tokens_to_ids("<|vision_start|>"),
        tokenizer.convert_tokens_to_ids("<|vision_end|>"),
        tokenizer.convert_tokens_to_ids("<|image_pad|>"),
        tokenizer.pad_token_id,
    ]
    ignore_mask = labels == ignored_ids[0]
    for token_id in ignored_ids[1:]:
        ignore_mask = ignore_mask | (labels == token_id)
    labels[ignore_mask] = -100

    batch["labels"] = labels
    return batch

# ─── 5. collate_fn_eval: prompt + image only, used for generation ─────
def collate_fn_eval(examples):
    """Turn dataset rows into a generation-ready batch.

    Only the user turn (image + random prompt) is rendered, with the
    generation prompt appended, so the model continues with its own answer.
    The reference captions are carried along under ``"captions"`` so the
    caller can score the decoded generations.

    The batch is padded on the LEFT. The consumer of these batches slices the
    generated ids at ``input_ids.shape[-1]`` to drop the prompt, which is only
    correct when every prompt in the batch ends at the same column. The
    tokenizer's previous padding side is restored afterwards so other
    collators are unaffected.

    Args:
        examples: Iterable of rows exposing ``"image"`` (a PIL image, a dict
            with a ``"path"`` entry, or something ``Image.open`` accepts) and
            ``"caption"``.

    Returns:
        The processor output (tensors) plus ``"captions"``, a list of the
        reference strings in batch order.
    """
    texts, images, captions = [], [], []
    for ex in examples:
        prompt = sample_prompt()
        img = ex["image"]
        if isinstance(img, dict):
            img = Image.open(img["path"])
        elif not isinstance(img, Image.Image):
            img = Image.open(img)

        msgs = [
            {"role": "user", "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": prompt}
            ]}
        ]
        # Append the generation prompt token(s) so decoding starts at the
        # assistant turn.
        txt = processor.apply_chat_template(
            msgs, tokenize=False, add_generation_prompt=True
        )
        texts.append(txt)
        vis, _ = process_vision_info(msgs)
        images.append(vis[0])

        # Keep the reference caption for scoring.
        captions.append(ex["caption"])

    tokenizer = processor.tokenizer
    previous_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        batch = processor(
            text=texts, images=images,
            padding=True, return_tensors="pt"
        )
    finally:
        # Always restore, even if encoding fails, so training batches keep
        # whatever padding side was configured before.
        tokenizer.padding_side = previous_side

    # Non-tensor entry; consumers must filter tensors before calling the model.
    batch["captions"] = captions
    return batch


# ─── 6. compute_metrics_from_texts: metrics computed on plain text ────
# Metric objects are created once at module level and reused by
# compute_metrics_from_texts below.
# BLEU smoothing function (NLTK method1); not referenced by the code visible
# in this cell.
_smooth = SmoothingFunction().method1
rouge_metric     = evaluate.load("rouge")
bleu_metric      = evaluate.load("bleu")
meteor_metric    = evaluate.load("meteor")
bertscore_metric = evaluate.load("bertscore")
cider_scorer     = Cider()
# Stand-alone BERTScore scorer; compute_metrics_from_texts uses
# `bertscore_metric` instead, so this object is not referenced by the code
# visible in this cell.
bert_scorer      = BERTScorer(lang="en", rescale_with_baseline=True)

def compute_metrics_from_texts(preds, refs):
    """Score generated texts against one reference string each.

    Args:
        preds: Sequence of predicted strings.
        refs: Sequence of reference strings, aligned with ``preds``.

    Returns:
        Tuple ``(rougeL, bleu, meteor, cider, bertscore_f1)`` where the last
        entry is the mean BERTScore F1 over all pairs, as a float.
    """
    # ROUGE / BLEU / METEOR receive a list of references per prediction.
    wrapped_refs = [[reference] for reference in refs]

    rouge_l = rouge_metric.compute(
        predictions=preds, references=wrapped_refs, use_stemmer=True
    )["rougeL"]
    bleu = bleu_metric.compute(
        predictions=preds, references=wrapped_refs
    )["bleu"]
    meteor = meteor_metric.compute(
        predictions=preds, references=wrapped_refs
    )["meteor"]

    # CIDEr takes {id: [text]} mappings for references and candidates.
    gts = {idx: [reference] for idx, reference in enumerate(refs)}
    res = {idx: [prediction] for idx, prediction in enumerate(preds)}
    cider, _ = cider_scorer.compute_score(gts, res)

    bert = bertscore_metric.compute(
        predictions=preds,
        references=refs,
        lang="en",
        rescale_with_baseline=True,
    )
    f1_values = bert["f1"]
    mean_f1 = float(sum(f1_values) / len(f1_values))

    return rouge_l, bleu, meteor, cider, mean_f1
from torch.utils.data import DataLoader
from transformers import TrainerCallback
import torch, evaluate
from nltk.translate.bleu_score import SmoothingFunction
from pycocoevalcap.cider.cider import Cider

class LowMemEvalCallback(TrainerCallback):
    """Generate on a held-out set at the end of every epoch and print text metrics.

    The callback owns its own ``DataLoader`` so generation runs batch by
    batch: each batch is moved to the model device, decoded, and released
    before the next one is loaded.

    Args:
        eval_dataset: Dataset to generate on. Must support ``len`` and
            ``select`` when ``max_eval_samples`` is given.
        collate_fn_eval: Collator returning tensors for ``generate`` plus a
            ``"captions"`` list with the reference strings.
        processor: Processor used to decode generated ids.
        model: Model to call ``generate`` on.
        batch_size: Generation batch size.
        max_eval_samples: If given, only the first N rows are used. Values
            larger than the dataset are clamped to its length.
    """

    def __init__(
        self,
        eval_dataset,
        collate_fn_eval,
        processor,
        model,
        batch_size: int = 8,
        max_eval_samples: int = None
    ):
        self.processor       = processor
        self.model           = model
        self.collate_fn_eval = collate_fn_eval

        # Keep only the first N rows. Clamp N so an over-large request uses
        # the whole dataset instead of raising inside `select`.
        if max_eval_samples is not None:
            keep = max(0, min(int(max_eval_samples), len(eval_dataset)))
            eval_dataset = eval_dataset.select(range(keep))

        self.loader = DataLoader(
            eval_dataset,
            batch_size=batch_size,
            collate_fn=collate_fn_eval,
            shuffle=False
        )

        self.rouge  = evaluate.load("rouge")
        self.bleu   = evaluate.load("bleu")
        self.meteor = evaluate.load("meteor")
        self.cider  = Cider()
        # Kept as an attribute for callers; not used by on_epoch_end.
        self.smooth = SmoothingFunction().method1

    def _bleu(self, preds, refs, max_order):
        """Return BLEU up to ``max_order``-grams, or 0.0 if the metric divides by zero."""
        try:
            return self.bleu.compute(
                predictions=preds, references=refs, max_order=max_order
            )["bleu"]
        except ZeroDivisionError:
            # NOTE(review): presumably reachable when every prediction is
            # empty; a metric failure should not abort training either way.
            return 0.0

    def on_epoch_end(self, args, state, control, **kwargs):
        """Generate for every eval batch, then print BLEU-1/2, ROUGE-L, METEOR and CIDEr."""
        device = self.model.device
        all_preds, all_refs = [], []

        # Remember the current mode so training continues in the mode it was in.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                for batch in self.loader:
                    # 1) Move tensors to the model device; non-tensor entries
                    #    such as "captions" are left out of the model inputs.
                    inputs = {
                        k: v.to(device)
                        for k, v in batch.items()
                        if isinstance(v, torch.Tensor)
                    }
                    # 2) Generate.
                    outs = self.model.generate(
                        **inputs,
                        max_new_tokens=100,
                        num_beams=4,
                        early_stopping=True
                    )
                    # 3) Decode only the newly generated part.
                    #    `outs` is [B, prompt_len + new_tokens].
                    prefix_len = inputs["input_ids"].shape[-1]
                    texts = self.processor.batch_decode(
                        outs[:, prefix_len:],
                        skip_special_tokens=True
                    )
                    all_preds.extend(texts)

                    # 4) Reference captions travel with the batch.
                    all_refs.extend(batch["captions"])

                    # 5) Release per-batch tensors before loading the next batch.
                    del inputs, outs, texts
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
        finally:
            if was_training:
                self.model.train()

        # Nothing to score (empty eval set): skip metrics instead of failing.
        if not all_preds:
            print("LowMemEvalCallback: no eval samples, skipping metrics.")
            return control

        # BLEU expects a list of references per prediction.
        refs_for_bleu = [[r] for r in all_refs]

        rL = self.rouge.compute(predictions=all_preds, references=all_refs, use_stemmer=True)["rougeL"]
        b1 = self._bleu(all_preds, refs_for_bleu, max_order=1)
        b2 = self._bleu(all_preds, refs_for_bleu, max_order=2)

        m   = self.meteor.compute(predictions=all_preds, references=all_refs)["meteor"]
        c,_ = self.cider.compute_score(
            {i:[r] for i,r in enumerate(all_refs)},
            {i:[p] for i,p in enumerate(all_preds)}
        )
        print(f"BLEU-1 {b1:.4f}, BLEU-2 {b2:.4f}")
        print(f"\nEpoch {int(state.epoch)} ➜ "
              f"ROUGE-L {rL:.4f} "
              f"METEOR {m:.4f}, CIDEr {c:.4f}\n")

        return control



# ─── 7. Training & manual eval ───────────────────────────────────────
# NOTE(review): train_ds holds 8 rows, so with a batch size of 4 and 5 epochs
# there are roughly 10 optimizer steps on a single device — fewer than
# warmup_steps=20. Confirm this is intended for the smoke-test configuration.
training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    warmup_steps=20,
    weight_decay=0.01,

    # —— Logging —— #
    report_to=["none"],           # no WandB; log to the console only
    logging_strategy="steps",      # log per step
    logging_steps=1,               # print the loss every step
    logging_first_step=True,       # include the very first step

    # —— Automatic evaluation —— #
    eval_strategy="epoch",   # run the Trainer's evaluation once per epoch
    save_strategy="no",            # do not save checkpoints (could be "epoch")
    predict_with_generate=False,
    # presumably needed so the raw image/caption columns reach the collators
    remove_unused_columns=False,
)


# The Trainer uses collate_fn_train for both training and its own evaluation;
# generation-based scoring is done separately by LowMemEvalCallback, which
# builds its own loader with collate_fn_eval.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=collate_fn_train,
    callbacks=[
      LowMemEvalCallback(
        eval_dataset=eval_ds,
        collate_fn_eval=collate_fn_eval,
        processor=processor,
        model=model,
        batch_size=4,
        max_eval_samples=None
      )
    ],
)
trainer.train()

# Manual generation + text metrics on the eval split.
# `Seq2SeqTrainer.predict` takes the dataset plus generation kwargs only; it
# has no `data_collator` or `predict_with_generate` parameters, and
# `training_args.predict_with_generate` is False, so calling it here would not
# generate any text. Generation is therefore done explicitly with the eval
# collator, and the decoded texts are scored with compute_metrics_from_texts.
eval_gen_loader = DataLoader(
    eval_ds,
    batch_size=1,  # one sample per batch: no padding, so the prompt slice below is exact
    collate_fn=collate_fn_eval,
    shuffle=False,
)

pred_texts, ref_texts = [], []
model.eval()
with torch.no_grad():
    for eval_batch in eval_gen_loader:
        # Only tensors go to the model; "captions" is a plain list.
        gen_inputs = {
            key: value.to(model.device)
            for key, value in eval_batch.items()
            if isinstance(value, torch.Tensor)
        }
        generated = model.generate(
            **gen_inputs,
            max_new_tokens=100,
            num_beams=4,
            early_stopping=True,
        )
        # Drop the prompt tokens and keep only the newly generated ones.
        prompt_len = gen_inputs["input_ids"].shape[-1]
        pred_texts.extend(
            processor.batch_decode(generated[:, prompt_len:], skip_special_tokens=True)
        )
        ref_texts.extend(eval_batch["captions"])

if pred_texts:
    rouge_l, bleu, meteor, cider, bert_f1 = compute_metrics_from_texts(pred_texts, ref_texts)
    print(
        f"Final eval ➜ ROUGE-L {rouge_l:.4f}, BLEU {bleu:.4f}, "
        f"METEOR {meteor:.4f}, CIDEr {cider:.4f}, BERTScore-F1 {bert_f1:.4f}"
    )
else:
    print("Final eval skipped: the eval split is empty.")
# Training-loss curve from the Trainer's log history.
logs     = trainer.state.log_history
train_ep = [x["epoch"] for x in logs if "loss" in x]
train_ls = [x["loss"]  for x in logs if "loss" in x]

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(train_ep, train_ls, marker='o', label="train_loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Train Loss")
ax.legend()
ax.grid(True)

# The notebook forces the non-interactive "Agg" backend, under which
# plt.show() does not display anything. Persist the figure so the curve can
# actually be inspected.
import os
os.makedirs(training_args.output_dir, exist_ok=True)
loss_plot_path = os.path.join(training_args.output_dir, "train_loss.png")
fig.savefig(loss_plot_path, dpi=150, bbox_inches="tight")
print(f"Saved training-loss curve to {loss_plot_path}")
plt.show()



def visualize_gradcam(
    img: Image.Image,
    prompt: str,
    model: Qwen2VLForConditionalGeneration,
    processor: AutoProcessor,
    target_layer: torch.nn.Module,
    device: torch.device = None,
    upsample_size: tuple = None
):
    """Overlay a Grad-CAM heat map for one image + prompt pair.

    The score being explained is the largest logit at the last input
    position, i.e. the model's preferred first generated token. Activations
    of ``target_layer`` are captured with a forward hook, their gradient
    w.r.t. that score is taken, channel weights are the gradient averaged
    over all vision tokens, and each token's map value is the ReLU of the
    weighted activation sum.

    Args:
        img: Input PIL image.
        prompt: Text instruction shown to the model together with the image.
        model: Qwen2-VL model (optionally PEFT-wrapped).
        processor: Matching processor.
        target_layer: Module inside the vision tower whose output has one row
            per vision token (before or after patch merging).
        device: Device for the inputs; defaults to the device of the model's
            first parameter.
        upsample_size: ``(H, W)`` of the returned heat map; defaults to the
            image size.

    Returns:
        numpy.ndarray: Heat map of shape ``upsample_size`` scaled to [0, 1]
        (all zeros when the map has no positive value).

    Raises:
        RuntimeError: If ``target_layer`` is not executed in the forward pass.
        ValueError: If the layer's token count cannot be mapped to the patch grid.
    """
    model.eval()
    device = device or next(model.parameters()).device
    upsample_size = upsample_size or img.size[::-1]  # PIL size is (W, H); arrays are (H, W)

    # 1) Build the inputs exactly like the eval collator does.
    msgs = [{"role": "user", "content": [
        {"type": "image", "image": img},
        {"type": "text", "text": prompt}
    ]}]
    text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(msgs)  # list of images, not a tensor
    enc = processor(text=[text], images=image_inputs,
                    return_tensors="pt", padding=True)
    enc = {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in enc.items()}
    # Make the pixel tensor require grad so the hooked activation is part of
    # the autograd graph even when the layer's own weights are frozen.
    enc["pixel_values"] = enc["pixel_values"].detach().requires_grad_(True)

    # 2) Capture the target layer's output during the forward pass.
    captured = {}

    def _save_output(_module, _inputs, output):
        captured["act"] = output[0] if isinstance(output, (tuple, list)) else output

    handle = target_layer.register_forward_hook(_save_output)
    try:
        with torch.enable_grad():
            logits = model(**enc).logits              # [1, seq_len, vocab]
            last_logits = logits[0, -1, :].float()    # [vocab]
            score = last_logits[last_logits.argmax()]
            act = captured.get("act")
            if act is None:
                raise RuntimeError("target_layer was not executed during the forward pass")
            # Gradient w.r.t. the activation only; parameter .grad fields stay untouched.
            (grad,) = torch.autograd.grad(score, act)
    finally:
        handle.remove()

    # 3) Grad-CAM per vision token. The first dimension is treated as the
    #    token axis and everything else as channels.
    n_tokens = act.shape[0]
    acts_2d = act.detach().float().reshape(n_tokens, -1)
    grads_2d = grad.detach().float().reshape(n_tokens, -1)
    channel_weights = grads_2d.mean(dim=0)
    token_scores = torch.relu((acts_2d * channel_weights).sum(dim=1))

    # 4) Put tokens back on the (t, h, w) patch grid reported by the processor.
    # NOTE(review): assumes the Qwen2-VL image processor emits patches grouped
    # in merge_size x merge_size blocks, ordered (t, h_block, w_block, m_h, m_w)
    # — confirm against the installed transformers version.
    grid_t, grid_h, grid_w = (int(x) for x in enc["image_grid_thw"][0].tolist())
    merge = int(getattr(processor.image_processor, "merge_size", 2))
    blocks_h, blocks_w = grid_h // merge, grid_w // merge
    if n_tokens == grid_t * grid_h * grid_w:
        cam = (token_scores
               .reshape(grid_t, blocks_h, blocks_w, merge, merge)
               .permute(0, 1, 3, 2, 4)
               .reshape(grid_t, grid_h, grid_w))
    elif n_tokens == grid_t * blocks_h * blocks_w:
        cam = token_scores.reshape(grid_t, blocks_h, blocks_w)
    else:
        raise ValueError(
            f"target_layer produced {n_tokens} rows, which does not match the "
            f"patch grid {(grid_t, grid_h, grid_w)} (merge_size={merge})"
        )
    cam = cam.mean(dim=0)  # collapse the temporal axis (1 for a still image)

    # 5) Upsample to the requested size and normalise to [0, 1].
    heatmap = torch.nn.functional.interpolate(
        cam[None, None], size=tuple(upsample_size),
        mode="bilinear", align_corners=False
    )[0, 0].cpu().numpy()
    heatmap = np.maximum(heatmap, 0)
    peak = heatmap.max()
    if peak > 0:  # avoid 0/0 when nothing is positively attributed
        heatmap = heatmap / peak

    # 6) Overlay on the original image.
    plt.figure(figsize=(6, 6))
    plt.imshow(img)
    # Same extent as the image so the overlay aligns for any upsample_size.
    plt.imshow(heatmap, cmap="jet", alpha=0.5,
               extent=(-0.5, img.size[0] - 0.5, img.size[1] - 0.5, -0.5))
    plt.axis("off")
    plt.title("Grad-CAM")
    plt.show()

    return heatmap


# ─── Example call ────────────────────────────────────────────
if __name__ == "__main__":
    # Use the first row of eval_ds as the demo image.
    ex = eval_ds[0]
    img = ex["image"]
    if not isinstance(img, Image.Image):
        img = Image.open(img["path"] if isinstance(img, dict) else img)

    prompt = sample_prompt()

    # Grad-CAM target: the patch-embedding projection of the vision tower.
    # Looked up by module name instead of a fixed attribute path, because the
    # path depends on the PEFT wrapper and on the transformers version.
    proj_layers = [
        module for name, module in model.named_modules()
        if name.endswith("patch_embed.proj")
    ]
    if not proj_layers:
        raise AttributeError("no module named '*patch_embed.proj' found in the model")
    target_layer = proj_layers[0]

    visualize_gradcam(img, prompt, model, processor, target_layer)



In [None]:
# NOTE(review): the training cell above imports `captum.attr`; on a fresh
# kernel this install has to run before that cell.
!pip install captum


Collecting captum
  Downloading captum-0.8.0-py3-none-any.whl.metadata (26 kB)
Collecting numpy<2.0 (from captum)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.10->captum)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.10->captum)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.10->captum)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.10->captum)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1