In [None]:
# Complete Notebook with Grad-CAM Visualization


# ─── 0. Install Dependencies ────────────────────────────────────────────────
# Notebook shell magics (lines starting with "!") — these run in the kernel's
# shell, not in Python.
# Model / data / adapter / metric libraries plus the Qwen-VL vision helpers.
!pip install --upgrade transformers datasets peft evaluate qwen-vl-utils
# Remove whatever bitsandbytes is present, then install the latest release;
# it backs the BitsAndBytesConfig 4-bit load used further down.
!pip uninstall -y bitsandbytes
!pip install --upgrade bitsandbytes
# Metric back-ends (ROUGE, BERTScore), plotting, imaging and Hub transfer extras.
!pip install scikit-learn matplotlib pillow rouge_score huggingface_hub[hf_xet] bert_score
# CIDEr scorer, installed straight from the GitHub repository.
!pip install git+https://github.com/salaniz/pycocoevalcap.git
# Captum (imported below for LayerGradCam / LayerAttribution).
!pip install captum

# ─── 1. Imports ────────────────────────────────────────────────────────────
import random
import numpy as np
import torch
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from PIL import Image

from datasets import load_dataset
from transformers import (
    AutoProcessor,
    Qwen2VLForConditionalGeneration,
    BitsAndBytesConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from peft import LoraConfig, get_peft_model
from qwen_vl_utils import process_vision_info
import evaluate
from nltk.translate.bleu_score import SmoothingFunction
from pycocoevalcap.cider.cider import Cider
from bert_score import BERTScorer

# Captum for Grad-CAM
from captum.attr import LayerGradCam, LayerAttribution

# ─── 2. Prompt Pool ─────────────────────────────────────────────────────────
# Pool of instruction prompts the model is asked with.
# NOTE(review): the pool is elided in this export ("same as before"); as
# written the list contains only the Ellipsis object — restore the real
# prompt strings before running.
prompts = [ ... ]  # (same as before)


def sample_prompt():
    """Return one entry of the module-level ``prompts`` pool, chosen at random."""
    chosen = random.choice(prompts)
    return chosen

# ─── 3. Load & Sample Dataset ──────────────────────────────────────────────
def _sample_split(split, size):
    """Rename ``hypothesis`` to ``caption``, drop unused columns, and keep ``size`` shuffled rows."""
    renamed = split.rename_column("hypothesis", "caption")
    trimmed = renamed.remove_columns(["gold_label", "flickr_id"])
    # Fixed shuffle seed (42) so the same subset is drawn on every run.
    return trimmed.shuffle(42).select(range(size))


raw = load_dataset("J1mb0o/e-snli-ve")
# Small subsets: 80 training rows and 20 rows from the "dev" split for evaluation.
train_ds = _sample_split(raw["train"], 80)
eval_ds = _sample_split(raw["dev"], 20)




Found existing installation: bitsandbytes 0.46.1
Uninstalling bitsandbytes-0.46.1:
  Successfully uninstalled bitsandbytes-0.46.1
Collecting bitsandbytes
  Using cached bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Using cached bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.1
Collecting git+https://github.com/salaniz/pycocoevalcap.git
  Cloning https://github.com/salaniz/pycocoevalcap.git to /tmp/pip-req-build-4jkv0ppn
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap.git /tmp/pip-req-build-4jkv0ppn
  Resolved https://github.com/salaniz/pycocoevalcap.git to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25l[?25hdone


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/729 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/116 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/116 [00:00<?, ?it/s]

test-00000-of-00005.parquet:   0%|          | 0.00/93.8M [00:00<?, ?B/s]

test-00001-of-00005.parquet:   0%|          | 0.00/87.2M [00:00<?, ?B/s]

test-00002-of-00005.parquet:   0%|          | 0.00/90.1M [00:00<?, ?B/s]

test-00003-of-00005.parquet:   0%|          | 0.00/328M [00:00<?, ?B/s]

test-00004-of-00005.parquet:   0%|          | 0.00/390M [00:00<?, ?B/s]

dev-00000-of-00004.parquet:   0%|          | 0.00/470M [00:00<?, ?B/s]

dev-00001-of-00004.parquet:   0%|          | 0.00/470M [00:00<?, ?B/s]

dev-00002-of-00004.parquet:   0%|          | 0.00/469M [00:00<?, ?B/s]

dev-00003-of-00004.parquet:   0%|          | 0.00/457M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/116 [00:00<?, ?files/s]

train-00000-of-00116.parquet:   0%|          | 0.00/39.2M [00:00<?, ?B/s]

train-00001-of-00116.parquet:   0%|          | 0.00/41.1M [00:00<?, ?B/s]

train-00002-of-00116.parquet:   0%|          | 0.00/44.4M [00:00<?, ?B/s]

train-00003-of-00116.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

train-00004-of-00116.parquet:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

train-00005-of-00116.parquet:   0%|          | 0.00/42.6M [00:00<?, ?B/s]

train-00006-of-00116.parquet:   0%|          | 0.00/38.8M [00:00<?, ?B/s]

train-00007-of-00116.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

train-00008-of-00116.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

train-00009-of-00116.parquet:   0%|          | 0.00/42.8M [00:00<?, ?B/s]

train-00010-of-00116.parquet:   0%|          | 0.00/43.9M [00:00<?, ?B/s]

train-00011-of-00116.parquet:   0%|          | 0.00/44.7M [00:00<?, ?B/s]

train-00012-of-00116.parquet:   0%|          | 0.00/39.1M [00:00<?, ?B/s]

train-00013-of-00116.parquet:   0%|          | 0.00/41.1M [00:00<?, ?B/s]

train-00014-of-00116.parquet:   0%|          | 0.00/39.7M [00:00<?, ?B/s]

train-00015-of-00116.parquet:   0%|          | 0.00/42.7M [00:00<?, ?B/s]

train-00016-of-00116.parquet:   0%|          | 0.00/38.6M [00:00<?, ?B/s]

train-00017-of-00116.parquet:   0%|          | 0.00/43.0M [00:00<?, ?B/s]

train-00018-of-00116.parquet:   0%|          | 0.00/41.0M [00:00<?, ?B/s]

train-00019-of-00116.parquet:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

train-00020-of-00116.parquet:   0%|          | 0.00/41.4M [00:00<?, ?B/s]

train-00021-of-00116.parquet:   0%|          | 0.00/42.4M [00:00<?, ?B/s]

train-00022-of-00116.parquet:   0%|          | 0.00/39.3M [00:00<?, ?B/s]

train-00023-of-00116.parquet:   0%|          | 0.00/43.0M [00:00<?, ?B/s]

train-00024-of-00116.parquet:   0%|          | 0.00/40.9M [00:00<?, ?B/s]

train-00025-of-00116.parquet:   0%|          | 0.00/38.8M [00:00<?, ?B/s]

train-00026-of-00116.parquet:   0%|          | 0.00/40.0M [00:00<?, ?B/s]

train-00027-of-00116.parquet:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

train-00028-of-00116.parquet:   0%|          | 0.00/41.4M [00:00<?, ?B/s]

train-00029-of-00116.parquet:   0%|          | 0.00/43.0M [00:00<?, ?B/s]

train-00030-of-00116.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

train-00031-of-00116.parquet:   0%|          | 0.00/38.9M [00:00<?, ?B/s]

train-00032-of-00116.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

train-00033-of-00116.parquet:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

train-00034-of-00116.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

train-00035-of-00116.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

train-00036-of-00116.parquet:   0%|          | 0.00/42.8M [00:00<?, ?B/s]

train-00037-of-00116.parquet:   0%|          | 0.00/40.3M [00:00<?, ?B/s]

train-00038-of-00116.parquet:   0%|          | 0.00/40.9M [00:00<?, ?B/s]

train-00039-of-00116.parquet:   0%|          | 0.00/40.7M [00:00<?, ?B/s]

train-00040-of-00116.parquet:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

train-00041-of-00116.parquet:   0%|          | 0.00/40.8M [00:00<?, ?B/s]

train-00042-of-00116.parquet:   0%|          | 0.00/43.7M [00:00<?, ?B/s]

train-00043-of-00116.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

train-00044-of-00116.parquet:   0%|          | 0.00/39.9M [00:00<?, ?B/s]

train-00045-of-00116.parquet:   0%|          | 0.00/40.9M [00:00<?, ?B/s]

train-00046-of-00116.parquet:   0%|          | 0.00/39.8M [00:00<?, ?B/s]

train-00047-of-00116.parquet:   0%|          | 0.00/44.0M [00:00<?, ?B/s]

train-00048-of-00116.parquet:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

train-00049-of-00116.parquet:   0%|          | 0.00/40.8M [00:00<?, ?B/s]

train-00050-of-00116.parquet:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

train-00051-of-00116.parquet:   0%|          | 0.00/40.0M [00:00<?, ?B/s]

train-00052-of-00116.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

train-00053-of-00116.parquet:   0%|          | 0.00/44.5M [00:00<?, ?B/s]

train-00054-of-00116.parquet:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

train-00055-of-00116.parquet:   0%|          | 0.00/40.3M [00:00<?, ?B/s]

train-00056-of-00116.parquet:   0%|          | 0.00/38.3M [00:00<?, ?B/s]

train-00057-of-00116.parquet:   0%|          | 0.00/42.2M [00:00<?, ?B/s]

train-00058-of-00116.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

train-00059-of-00116.parquet:   0%|          | 0.00/42.5M [00:00<?, ?B/s]

train-00060-of-00116.parquet:   0%|          | 0.00/42.8M [00:00<?, ?B/s]

train-00061-of-00116.parquet:   0%|          | 0.00/43.0M [00:00<?, ?B/s]

train-00062-of-00116.parquet:   0%|          | 0.00/42.2M [00:00<?, ?B/s]

train-00063-of-00116.parquet:   0%|          | 0.00/41.0M [00:00<?, ?B/s]

train-00064-of-00116.parquet:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

train-00065-of-00116.parquet:   0%|          | 0.00/39.6M [00:00<?, ?B/s]

train-00066-of-00116.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

train-00067-of-00116.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

train-00068-of-00116.parquet:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

train-00069-of-00116.parquet:   0%|          | 0.00/41.0M [00:00<?, ?B/s]

train-00070-of-00116.parquet:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

train-00071-of-00116.parquet:   0%|          | 0.00/40.2M [00:00<?, ?B/s]

train-00072-of-00116.parquet:   0%|          | 0.00/43.0M [00:00<?, ?B/s]

train-00073-of-00116.parquet:   0%|          | 0.00/44.8M [00:00<?, ?B/s]

train-00074-of-00116.parquet:   0%|          | 0.00/43.3M [00:00<?, ?B/s]

train-00075-of-00116.parquet:   0%|          | 0.00/41.4M [00:00<?, ?B/s]

train-00076-of-00116.parquet:   0%|          | 0.00/40.2M [00:00<?, ?B/s]

train-00077-of-00116.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

train-00078-of-00116.parquet:   0%|          | 0.00/44.4M [00:00<?, ?B/s]

train-00079-of-00116.parquet:   0%|          | 0.00/39.8M [00:00<?, ?B/s]

train-00080-of-00116.parquet:   0%|          | 0.00/40.2M [00:00<?, ?B/s]

train-00081-of-00116.parquet:   0%|          | 0.00/43.2M [00:00<?, ?B/s]

train-00082-of-00116.parquet:   0%|          | 0.00/40.7M [00:00<?, ?B/s]

train-00083-of-00116.parquet:   0%|          | 0.00/40.5M [00:00<?, ?B/s]

train-00084-of-00116.parquet:   0%|          | 0.00/40.9M [00:00<?, ?B/s]

train-00085-of-00116.parquet:   0%|          | 0.00/40.9M [00:00<?, ?B/s]

train-00086-of-00116.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

train-00087-of-00116.parquet:   0%|          | 0.00/42.4M [00:00<?, ?B/s]

train-00088-of-00116.parquet:   0%|          | 0.00/39.7M [00:00<?, ?B/s]

train-00089-of-00116.parquet:   0%|          | 0.00/43.6M [00:00<?, ?B/s]

train-00090-of-00116.parquet:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

train-00091-of-00116.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

train-00092-of-00116.parquet:   0%|          | 0.00/41.2M [00:00<?, ?B/s]

train-00093-of-00116.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

train-00094-of-00116.parquet:   0%|          | 0.00/42.3M [00:00<?, ?B/s]

train-00095-of-00116.parquet:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

train-00096-of-00116.parquet:   0%|          | 0.00/41.2M [00:00<?, ?B/s]

train-00097-of-00116.parquet:   0%|          | 0.00/43.4M [00:00<?, ?B/s]

train-00098-of-00116.parquet:   0%|          | 0.00/40.7M [00:00<?, ?B/s]

train-00099-of-00116.parquet:   0%|          | 0.00/40.4M [00:00<?, ?B/s]

train-00100-of-00116.parquet:   0%|          | 0.00/40.0M [00:00<?, ?B/s]

train-00101-of-00116.parquet:   0%|          | 0.00/40.3M [00:00<?, ?B/s]

train-00102-of-00116.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

train-00103-of-00116.parquet:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

train-00104-of-00116.parquet:   0%|          | 0.00/39.2M [00:00<?, ?B/s]

train-00105-of-00116.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

train-00106-of-00116.parquet:   0%|          | 0.00/43.0M [00:00<?, ?B/s]

train-00107-of-00116.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

train-00108-of-00116.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

train-00109-of-00116.parquet:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

train-00110-of-00116.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

train-00111-of-00116.parquet:   0%|          | 0.00/50.9M [00:00<?, ?B/s]

train-00112-of-00116.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00113-of-00116.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

train-00114-of-00116.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00115-of-00116.parquet:   0%|          | 0.00/207M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/14740 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/14339 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/401717 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/111 [00:00<?, ?it/s]

In [None]:
# ─── 4. Processor & Model Setup ────────────────────────────────────────────
from captum.attr import LayerGradCam, LayerAttribution

# Processor, 4-bit quantized base model, and LoRA adapters for Qwen2-VL-2B-Instruct.
_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
processor = AutoProcessor.from_pretrained(_MODEL_ID)

# 4-bit NF4 weights with double quantization; bfloat16 as the compute dtype.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model and let device_map="auto" place it.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    _MODEL_ID,
    quantization_config=bnb_cfg,
    device_map="auto",
)

# LoRA (rank 32, alpha 32, dropout 0.1) on the q/k/v/o projection modules.
peft_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, peft_cfg)

# ─── 4. Collate functions ──────────────────────────────────────────────────
def collate_fn_train(examples):
    """Turn dataset rows into a padded training batch with ``labels``.

    Each row becomes a two-turn chat (user: image + random prompt, assistant:
    the row's caption) rendered through the processor's chat template.
    ``labels`` is a copy of ``input_ids`` in which vision start/end markers,
    image pad tokens and padding tokens are replaced by -100.

    Args:
        examples: iterable of rows exposing ``"image"`` and ``"caption"``.

    Returns:
        The processor's batch with an added ``"labels"`` tensor.
    """
    chat_texts = []
    chat_images = []
    for sample in examples:
        instruction = sample_prompt()

        # The image may arrive as a {"path": ...} dict, a PIL image, or
        # something Image.open accepts directly.
        picture = sample["image"]
        if isinstance(picture, dict):
            picture = Image.open(picture["path"])
        elif not isinstance(picture, Image.Image):
            picture = Image.open(picture)

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": picture},
                    {"type": "text", "text": instruction},
                ],
            },
            {"role": "assistant", "content": sample["caption"]},
        ]
        rendered = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
        vision_inputs, _ = process_vision_info(conversation)
        chat_texts.append(rendered)
        chat_images.append(vision_inputs[0])

    batch = processor(text=chat_texts, images=chat_images, padding=True, return_tensors="pt")

    # Token ids that must not contribute to the training target.
    tokenizer = processor.tokenizer
    ignored_ids = [
        tokenizer.convert_tokens_to_ids("<|vision_start|>"),
        tokenizer.convert_tokens_to_ids("<|vision_end|>"),
        tokenizer.convert_tokens_to_ids("<|image_pad|>"),
        tokenizer.pad_token_id,
    ]

    labels = batch.input_ids.clone()
    ignore_mask = labels == ignored_ids[0]
    for token_id in ignored_ids[1:]:
        ignore_mask = ignore_mask | (labels == token_id)
    labels[ignore_mask] = -100

    batch["labels"] = labels
    return batch

def collate_fn_eval(examples):
    """Turn dataset rows into a padded generation batch plus reference captions.

    Each row becomes a single user turn (image + random prompt) rendered with
    the generation prompt appended, so the model continues as the assistant.

    Args:
        examples: iterable of rows exposing ``"image"`` and ``"caption"``.

    Returns:
        The processor's batch with an extra ``"captions"`` entry holding the
        reference strings in row order.
    """
    chat_texts = []
    chat_images = []
    references = []
    for sample in examples:
        instruction = sample_prompt()

        # Accept a {"path": ...} dict, a PIL image, or anything Image.open takes.
        picture = sample["image"]
        if isinstance(picture, dict):
            picture = Image.open(picture["path"])
        elif not isinstance(picture, Image.Image):
            picture = Image.open(picture)

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": picture},
                    {"type": "text", "text": instruction},
                ],
            }
        ]
        rendered = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        vision_inputs, _ = process_vision_info(conversation)

        chat_texts.append(rendered)
        chat_images.append(vision_inputs[0])
        references.append(sample["caption"])

    batch = processor(text=chat_texts, images=chat_images, padding=True, return_tensors="pt")
    batch["captions"] = references
    return batch

# ─── 5. Metrics & Callback ─────────────────────────────────────────────────
# Module-level metric objects shared by compute_metrics_from_texts.
_smooth = SmoothingFunction().method1  # NLTK BLEU smoothing function (method1)
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")
meteor_metric = evaluate.load("meteor")
cider_scorer = Cider()
# English BERTScore, rescaled with the library's baseline.
bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)

def compute_metrics_from_texts(preds, refs):
    """Score generated texts against one reference string each.

    Uses the module-level metric objects (``rouge_metric``, ``bleu_metric``,
    ``meteor_metric``, ``cider_scorer``, ``bert_scorer``).

    Args:
        preds: list of generated strings.
        refs: list of reference strings, aligned index-by-index with ``preds``.

    Returns:
        dict with keys ``"rougeL"``, ``"bleu"``, ``"meteor"``, ``"cider"`` and
        ``"bertscore"`` (mean BERTScore F1 as a Python float).

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    preds, refs = list(preds), list(refs)
    if len(preds) != len(refs):
        raise ValueError(
            f"preds and refs must have the same length, got {len(preds)} and {len(refs)}"
        )
    if not preds:
        raise ValueError("cannot compute metrics on empty inputs")

    # ROUGE and BLEU are given one single-item reference list per prediction.
    refs_list = [[r] for r in refs]
    rL = rouge_metric.compute(predictions=preds, references=refs_list, use_stemmer=True)["rougeL"]
    b = bleu_metric.compute(predictions=preds, references=refs_list)["bleu"]
    m = meteor_metric.compute(predictions=preds, references=refs)["meteor"]

    # The CIDEr scorer takes {id: [text, ...]} dicts for references and candidates.
    c, _ = cider_scorer.compute_score(
        {i: [r] for i, r in enumerate(refs)},
        {i: [p] for i, p in enumerate(preds)},
    )

    # bert_score.BERTScorer exposes .score(cands, refs) -> (P, R, F1) tensors;
    # it has no .compute() method (that belongs to evaluate's "bertscore" metric).
    _, _, f1 = bert_scorer.score(preds, refs)

    return {"rougeL": rL, "bleu": b, "meteor": m, "cider": c, "bertscore": float(f1.mean())}

from torch.utils.data import DataLoader
from transformers import TrainerCallback

class LowMemEvalCallback(TrainerCallback):
    """Trainer callback that generates on a held-out set after each epoch and prints text metrics.

    Generation runs batch by batch under ``torch.no_grad()`` and the CUDA
    cache is emptied after every batch. ROUGE-L, BLEU-1, BLEU-2, METEOR and
    CIDEr are printed; nothing is written back into the trainer state.

    Args:
        eval_dataset: dataset to evaluate on; must support ``len`` and ``select``
            when ``max_eval_samples`` is given.
        collate_fn_eval: collate function producing model inputs plus a
            ``"captions"`` list of reference strings.
        processor: processor used to decode the generated token ids.
        model: the model to call ``generate`` on.
        batch_size: evaluation batch size.
        max_eval_samples: optional cap on the number of evaluated rows.
    """

    def __init__(self, eval_dataset, collate_fn_eval, processor, model, batch_size=8, max_eval_samples=None):
        self.processor, self.model, self.collate_fn_eval = processor, model, collate_fn_eval
        if max_eval_samples:
            # Clamp so a cap larger than the dataset does not make select() fail.
            limit = min(max_eval_samples, len(eval_dataset))
            eval_dataset = eval_dataset.select(range(limit))
        self.loader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=collate_fn_eval, shuffle=False)
        self.rouge = evaluate.load("rouge")
        self.bleu = evaluate.load("bleu")
        self.meteor = evaluate.load("meteor")
        self.cider = Cider()
        # Kept for backward compatibility; not used by on_epoch_end.
        self.smooth = SmoothingFunction().method1

    def on_epoch_end(self, args, state, control, **kwargs):
        """Generate on the evaluation loader, print metrics, and return ``control`` unchanged."""
        device = self.model.device
        all_preds, all_refs = [], []

        # The model is left in eval mode afterwards, as before.
        # NOTE(review): presumably the Trainer switches back to train mode at
        # its next training step — confirm against the installed version.
        self.model.eval()
        with torch.no_grad():
            for batch in self.loader:
                # Only tensors go to the model; "captions" stays on the host.
                inputs = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
                outs = self.model.generate(**inputs, max_new_tokens=100, num_beams=4, early_stopping=True)
                # Drop the prompt tokens so only the newly generated part is decoded.
                prefix = inputs["input_ids"].shape[-1]
                # Decode with the processor given to this callback, not a module global.
                texts = self.processor.batch_decode(outs[:, prefix:], skip_special_tokens=True)
                all_preds.extend(texts)
                all_refs.extend(batch["captions"])
                torch.cuda.empty_cache()

        epoch = int(round(state.epoch)) if state.epoch is not None else 0
        if not all_preds:
            print(f"Epoch {epoch} ➜ no evaluation samples, metrics skipped\n")
            return control

        refs_bleu = [[r] for r in all_refs]
        rL = self.rouge.compute(predictions=all_preds, references=all_refs, use_stemmer=True)["rougeL"]
        b1 = self.bleu.compute(predictions=all_preds, references=refs_bleu, max_order=1)["bleu"]
        b2 = self.bleu.compute(predictions=all_preds, references=refs_bleu, max_order=2)["bleu"]
        m = self.meteor.compute(predictions=all_preds, references=all_refs)["meteor"]
        c, _ = self.cider.compute_score(
            {i: [r] for i, r in enumerate(all_refs)},
            {i: [p] for i, p in enumerate(all_preds)},
        )
        print(f"Epoch {epoch} ➜ ROUGE-L {rL:.4f}  BLEU-1 {b1:.4f}  BLEU-2 {b2:.4f}  METEOR {m:.4f}  CIDEr {c:.4f}\n")
        return control

# ─── 6. Training ────────────────────────────────────────────────────────────
# Five epochs, batch size 4, per-epoch logging; the Trainer's own evaluation
# and checkpointing are switched off.
training_args = Seq2SeqTrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    warmup_steps=20,
    weight_decay=0.01,
    logging_strategy="epoch",
    eval_strategy="no",
    save_strategy="no",
    predict_with_generate=False,
    # Keep the raw dataset columns; the collate function reads them.
    remove_unused_columns=False,
)

# Evaluation happens in this callback at the end of every epoch instead.
epoch_eval_callback = LowMemEvalCallback(
    eval_dataset=eval_ds,
    collate_fn_eval=collate_fn_eval,
    processor=processor,
    model=model,
    batch_size=4,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=collate_fn_train,
    callbacks=[epoch_eval_callback],
)
trainer.train()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
No label_names provided fo

Step,Training Loss
20,3.572
40,2.0832
60,1.5282
80,1.4688
100,1.4424


Epoch 1 ➜ ROUGE-L 0.0807  BLEU-1 0.0833  BLEU-2 0.0325  METEOR 0.0758  CIDEr 0.0653

Epoch 2 ➜ ROUGE-L 0.2485  BLEU-1 0.2599  BLEU-2 0.1349  METEOR 0.2741  CIDEr 0.3208

Epoch 3 ➜ ROUGE-L 0.2940  BLEU-1 0.3318  BLEU-2 0.1910  METEOR 0.2917  CIDEr 0.7989

Epoch 4 ➜ ROUGE-L 0.3305  BLEU-1 0.3754  BLEU-2 0.2100  METEOR 0.2921  CIDEr 0.8655

Epoch 5 ➜ ROUGE-L 0.3305  BLEU-1 0.3754  BLEU-2 0.2100  METEOR 0.2921  CIDEr 0.8655



TrainOutput(global_step=100, training_loss=2.01894193649292, metrics={'train_runtime': 178.7918, 'train_samples_per_second': 2.237, 'train_steps_per_second': 0.559, 'total_flos': 1325181990494208.0, 'train_loss': 2.01894193649292, 'epoch': 5.0})

In [None]:
# ─── 8. Grad-CAM Visualization (Final Robust Implementation) ───────────────

# Grad-CAM over the vision patch embeddings for the model's first predicted token.
#
# One forward/backward pass is run with a hook on the patch-embedding
# convolution. The hook makes the layer output carry gradients, so after
# backward() both the activations and d(score)/d(activation) are available.

# 1) Pick one sample image from eval_ds
ex = eval_ds[16]
img = ex["image"]
if isinstance(img, dict):
    img = Image.open(img["path"])
elif not isinstance(img, Image.Image):
    img = Image.open(img)
img = img.convert("RGB")  # always 3 channels, whatever the source mode
original_size = img.size  # (width, height)

# 2) Build the chat-formatted input (image + prompt, generation prompt appended)
prompt = sample_prompt()
msgs = [{"role": "user", "content": [
    {"type": "image", "image": img},
    {"type": "text", "text": prompt}
]}]

inputs = processor(
    text=processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True),
    images=[img],
    return_tensors="pt",
    padding=True
)

# 3) Move the vision inputs to the model device
# NOTE(review): pixel_values is presumably one flattened patch per row and is
# passed to the model as produced by the processor (no extra batch dimension).
pixel_values = inputs["pixel_values"].to(model.device)
image_grid_thw = inputs["image_grid_thw"].to(model.device)  # per-image (t, h, w) patch grid

# 4) Move the text inputs to the model device
input_ids = inputs["input_ids"].to(model.device)
attention_mask = inputs["attention_mask"].to(model.device)

# 5) Locate the vision tower inside the (possibly PEFT-wrapped) model
wrapped = model.base_model if hasattr(model, "base_model") else model
vision = wrapped.model.visual

# Target layer: the patch-embedding convolution
target_layer = vision.patch_embed.proj
print(f"Using target layer: {target_layer}")

# 6) Forward hook that captures the layer output and makes it carry gradients
activation = {}
def forward_hook(module, input, output):
    if output.requires_grad:
        # Already part of the autograd graph: keep its gradient after backward().
        output.retain_grad()
    else:
        # No upstream tensor requires grad, so the output is a leaf tensor:
        # enable gradients here so backward() fills output.grad.
        output.requires_grad_(True)
    activation["features"] = output

# 7) Single forward + backward pass with the hook installed
model.eval()  # disable dropout so the prediction is deterministic
model.zero_grad()
hook = target_layer.register_forward_hook(forward_hook)
try:
    with torch.enable_grad():
        outputs = model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_grid_thw=image_grid_thw
        )
        # Explain the most likely next token at the last sequence position.
        logits = outputs.logits
        target_token = logits[0, -1].argmax().item()
        target_score = logits[0, -1, target_token]
        target_score.backward()
finally:
    # Always detach the hook, even if the forward or backward pass fails.
    hook.remove()

word = processor.tokenizer.decode([target_token]).strip()
print(f"Target token: {target_token} ({word})")

# 8) Collect activations and their gradients
if "features" not in activation:
    raise RuntimeError("The target layer was never called; no features were captured")
if activation["features"].grad is None:
    # A heatmap built from made-up gradients would not be a Grad-CAM map.
    raise RuntimeError("No gradient reached the target layer; cannot compute Grad-CAM")

# Flatten to [num_patches, channels] in float32 (half/bfloat16 cannot always
# be converted to NumPy later on).
features = activation["features"].detach().float()
gradients = activation["features"].grad.detach().float()
features = features.reshape(features.shape[0], -1)
gradients = gradients.reshape(gradients.shape[0], -1)
model.zero_grad()  # drop the gradients this pass left on trainable parameters
print(f"Feature map shape: {features.shape}")
print(f"Gradients shape: {gradients.shape}")

# 9) Patch-grid geometry from image_grid_thw
t, h, w = (int(v) for v in image_grid_thw[0].tolist())
print(f"Spatial dimensions: H={h}, W={w}")

num_patches = t * h * w
print(f"Total patches: {num_patches}")
if features.shape[0] != num_patches:
    raise ValueError(
        f"Expected {num_patches} patch features from image_grid_thw, got {features.shape[0]}"
    )

# 10) Grad-CAM: weight each channel by its mean gradient over all patches,
#     sum over channels, and keep only the positive evidence.
channel_weights = gradients.mean(dim=0)                       # [channels]
cam = torch.relu((features * channel_weights).sum(dim=1))     # [num_patches]

# 11) Put the per-patch scores back on the (h, w) grid.
# NOTE(review): the Qwen2-VL image processor appears to emit patches grouped
# in merge x merge blocks, i.e. ordered (t, h/merge, w/merge, merge, merge);
# confirm against the installed transformers version.
merge = int(getattr(vision, "spatial_merge_size", 2))
if merge < 1 or h % merge or w % merge:
    merge = 1  # fall back to plain row-major order
cam = (
    cam.reshape(t, h // merge, w // merge, merge, merge)
    .permute(0, 1, 3, 2, 4)
    .reshape(t, h, w)
    .mean(dim=0)  # collapse the temporal axis
)
print(f"Final feature map shape: {cam.shape}")

# 12) Normalize the heatmap to [0, 1]
cam = cam - cam.min()
cam = cam / (cam.max() + 1e-8)
cam_np = cam.cpu().numpy()

# 13) Upsample to the original image size
heatmap_img = Image.fromarray((cam_np * 255).astype(np.uint8))
if heatmap_img.size != original_size:
    heatmap_img = heatmap_img.resize(original_size, resample=Image.BILINEAR)

heatmap = np.array(heatmap_img) / 255.0

# 14) Visualize
plt.figure(figsize=(12, 6))

# Original image
plt.subplot(1, 2, 1)
plt.imshow(img)
plt.axis('off')
plt.title('Original Image')

# Grad-CAM overlay
plt.subplot(1, 2, 2)
plt.imshow(img)
plt.imshow(heatmap, cmap='jet', alpha=0.5)
plt.axis('off')
plt.title(f'Grad-CAM for: "{word}"')

plt.tight_layout()
plt.savefig("qwen_vl_gradcam.png", bbox_inches='tight', dpi=300)
plt.show()

print("Grad-CAM visualization saved to qwen_vl_gradcam.png")

Using target layer: Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
Target token: 32 (A)
Feature map shape: torch.Size([936, 1280])
Spatial dimensions: H=36, W=26
Total patches: 936
Cannot reshape feature map, using as is
Final feature map shape: torch.Size([1, 1, 936, 1280])
Grad-CAM visualization saved to qwen_vl_gradcam.png
