# 🔧 微调 BLIP 模型用于图像字幕生成
使用 `train.csv` 和 `test.csv` 中的图文对，对 `Salesforce/blip-image-captioning-base` 模型进行微调，并使用测试集验证生成效果。

In [1]:
import os

# Point the HF_ENDPOINT environment variable at the hf-mirror.com mirror.
# This runs before the transformers import in the following cell.
os.environ.update(HF_ENDPOINT="https://hf-mirror.com")

In [2]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import Dataset
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    Trainer,
    TrainingArguments,
    default_data_collator
)
import evaluate
from torchvision import transforms


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load both splits, discarding rows that lack a caption or an image id.
train_df, test_df = (
    pd.read_csv(csv_path).dropna(subset=["txt", "new_image_id"])
    for csv_path in ("data/train.csv", "data/test.csv")
)

print(f"训练样本数: {len(train_df)}, 测试样本数: {len(test_df)}")


训练样本数: 2852, 测试样本数: 713


In [4]:
class CaptionDataset(Dataset):
    """Image/caption pairs preprocessed for captioning fine-tuning.

    Each item is a dict of un-batched tensors produced by ``processor`` plus
    a ``labels`` tensor in which padded positions are set to ``-100``.

    Args:
        dataframe: Table with a ``new_image_id`` column (image file name
            relative to ``img_dir``) and a ``txt`` column (caption text).
        processor: Callable invoked with ``images=``, ``text=`` and
            padding/truncation options; must return a mapping of batched
            tensors that contains ``input_ids``.
        img_dir: Directory that contains the image files.
        max_length: Length every caption is padded or truncated to.
    """

    # Label value skipped by the loss; -100 is the default ``ignore_index``
    # of ``torch.nn.CrossEntropyLoss``.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, processor, img_dir, max_length=64):
        self.df = dataframe
        self.processor = processor
        self.img_dir = img_dir
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.df)

    @staticmethod
    def _build_labels(input_ids, attention_mask=None):
        """Return a copy of ``input_ids`` with padded positions set to -100.

        A copy is made so that masking the labels never alters the
        ``input_ids`` tensor that is fed to the model. When no attention
        mask is available the labels are an unmasked copy.
        """
        labels = input_ids.clone()
        if attention_mask is not None:
            labels[attention_mask == 0] = CaptionDataset.IGNORE_INDEX
        return labels

    def __getitem__(self, idx):
        """Load and preprocess the sample at positional index ``idx``.

        Returns:
            Dict of tensors without a batch dimension, including ``labels``.

        Raises:
            RuntimeError: If the image cannot be opened or decoded. An empty
                dict is deliberately not returned: the collator/Trainer do
                not skip such samples and would fail later with an
                unrelated error.
        """
        row = self.df.iloc[idx]
        image_path = os.path.join(self.img_dir, row["new_image_id"])
        try:
            # The context manager closes the file handle once pixels are read
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
        except Exception as e:
            raise RuntimeError(
                f"Error processing image {image_path}: {e}"
            ) from e

        # Run the processor on the image and its caption
        inputs = self.processor(
            images=image,
            text=row["txt"],
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )

        # Drop the batch dimension added by return_tensors="pt"
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = self._build_labels(
            inputs["input_ids"], inputs.get("attention_mask")
        )
        return inputs

In [5]:
# Load the pretrained processor first, then the captioning model, from the
# same checkpoint name.
processor, model = (
    loader.from_pretrained("Salesforce/blip-image-captioning-base")
    for loader in (BlipProcessor, BlipForConditionalGeneration)
)

In [6]:
# Wrap each split in a CaptionDataset; both read images from data/image.
train_dataset, test_dataset = (
    CaptionDataset(frame, processor, "data/image")
    for frame in (train_df, test_df)
)

In [7]:
# Sanity check: fetch the first training sample to inspect the processed tensors.
train_dataset[0]

{'pixel_values': tensor([[[-1.2083, -1.2083, -1.2229,  ...,  1.0982,  1.0836,  1.0544],
          [-1.2083, -1.2083, -1.2229,  ...,  1.0982,  1.0836,  1.0544],
          [-1.2083, -1.2229, -1.2229,  ...,  1.0982,  1.0836,  1.0544],
          ...,
          [-0.8726, -1.0039, -1.1791,  ..., -1.4127, -1.5149, -1.5587],
          [-0.8288, -0.9602, -1.1499,  ..., -1.3835, -1.4711, -1.5587],
          [-0.7996, -0.9310, -1.1499,  ..., -1.3543, -1.4565, -1.5587]],
 
         [[-1.7371, -1.7371, -1.7521,  ...,  1.2344,  1.2194,  1.1894],
          [-1.7371, -1.7371, -1.7521,  ...,  1.2344,  1.2194,  1.1894],
          [-1.7371, -1.7521, -1.7521,  ...,  1.2344,  1.2194,  1.1894],
          ...,
          [-0.9867, -1.1068, -1.2718,  ..., -1.4820, -1.5870, -1.6320],
          [-1.0167, -1.1368, -1.2869,  ..., -1.4669, -1.5720, -1.6621],
          [-1.0467, -1.1518, -1.2869,  ..., -1.4519, -1.5570, -1.6621]],
 
         [[-1.3949, -1.3949, -1.4091,  ...,  1.2643,  1.2500,  1.2216],
          [-

In [8]:
# Evaluation metric
bleu = evaluate.load("bleu")


def compute_metrics(eval_pred):
    """Compute BLEU between predicted and reference captions.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be
            token ids of shape ``(batch, seq)``, raw logits of shape
            ``(batch, seq, vocab)``, or a tuple whose first element is one
            of those. ``labels`` holds reference token ids with ``-100`` at
            ignored positions.

    Returns:
        The dict returned by ``bleu.compute``.
    """
    import numpy as np

    predictions, labels = eval_pred

    # A model that returns several outputs hands them over as a tuple;
    # assumes the first entry is the logits / token ids -- TODO confirm
    # against the model's output ordering.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Raw logits must be reduced to token ids before they can be decoded
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    if predictions.ndim == 2 and predictions.shape == labels.shape:
        # NOTE(review): assumes teacher-forced, next-token outputs (the
        # prediction at position t targets labels[t + 1]); confirm if the
        # predictions ever come from generate() instead. Predictions whose
        # target is ignored (-100) are dropped so padding positions do not
        # pollute the decoded text.
        pred_ids = [
            [
                int(tok)
                for tok, target in zip(pred_row[:-1], label_row[1:])
                if target != -100
            ]
            for pred_row, label_row in zip(predictions, labels)
        ]
    else:
        pred_ids = [[int(tok) for tok in pred_row] for pred_row in predictions]

    # -100 is not a valid token id and must be removed before decoding
    label_ids = [
        [int(tok) for tok in label_row if tok != -100] for label_row in labels
    ]

    decoded_preds = processor.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = processor.batch_decode(label_ids, skip_special_tokens=True)

    # BLEU expects a list of reference lists, one per prediction
    return bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels],
    )

In [9]:
# Training configuration, tuned to keep GPU memory usage low
training_args = TrainingArguments(
    # Output, checkpointing and logging
    output_dir="./blip_finetune_output",
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=50,
    report_to="none",
    # Batch sizing: one sample per step, four accumulated steps per update
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    # Mixed precision whenever a CUDA device is available
    fp16=torch.cuda.is_available(),
    # Keep every key returned by the dataset
    remove_unused_columns=False,
    # Evaluate each epoch and restore the checkpoint with the highest BLEU
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
)



In [10]:
def _logits_to_token_ids(logits, labels):
    """Reduce raw model outputs to predicted token ids during evaluation.

    The Trainer calls this on every evaluation batch before accumulating
    predictions, so only a ``(batch, seq)`` id tensor is kept instead of the
    full ``(batch, seq, vocab)`` logits and any extra model outputs.
    """
    # Multi-output models pass a tuple; assumes the logits come first --
    # TODO confirm against the model's output ordering.
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Build the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Persist the fine-tuned model and then the processor to the same directory
for artifact in (model, processor):
    artifact.save_pretrained("blip_finetuned")
print("微调完成，模型保存在 blip_finetuned")

In [None]:
# Simple inference example: load the image of the first test row as RGB
sample_image_path = os.path.join("data/image", test_df["new_image_id"].iloc[0])
sample_image = Image.open(sample_image_path)
sample_image = sample_image.convert("RGB")

In [None]:
# Inference
device = "cuda" if torch.cuda.is_available() else "cpu"
# The inputs are moved to `device` below, so the model has to live on the
# same device; otherwise generate() fails with a device-mismatch error when
# the model is still on the CPU.
model.to(device)
# Switch to evaluation mode so dropout is disabled while generating
model.eval()
inputs = processor(images=sample_image, return_tensors="pt").to(device)
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(f"生成的描述: {generated_caption}")
print(f"实际描述: {test_df.iloc[0]['txt']}")