In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor,AutoTokenizer
# Local directory that holds the checkpoint together with its processor/tokenizer files.
model_path = "./model"

# Load the vision-language model; dtype and device placement are both left on "auto".
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
)
# The processor is used later for combined image+text inputs, the tokenizer for text only.
processor = AutoProcessor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
from datasets import load_from_disk

# Read the dataset saved on disk and keep its first 500 rows as a smaller working subset.
dataset = load_from_disk("./dataset")
small_dataset = dataset.select(range(0, 500))

In [3]:
# Display the 'image' column of the 500-row subset.
small_dataset['image']

[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=500x100>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=200x50>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=200x50>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=120x50>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=200x40>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=120x50>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=120x50>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=400x50>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=240x40>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=120x50>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=500x100>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=360x40>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=280x40>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=200x40>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size=200x50>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGB size

In [4]:
# Display the 'text' field of the first row (the LaTeX string shown in the output).
small_dataset['text'][0]

'd s ^ { 2 } = ( 1 - { \\frac { q c o s \\theta } { r } } ) ^ { \\frac { 2 } { 1 + \\alpha ^ { 2 } } } \\lbrace d r ^ { 2 } + r ^ { 2 } d \\theta ^ { 2 } + r ^ { 2 } s i n ^ { 2 } \\theta d \\varphi ^ { 2 } \\rbrace - { \\frac { d t ^ { 2 } } { ( 1 - { \\frac { q c o s \\theta } { r } } ) ^ { \\frac { 2 } { 1 + \\alpha ^ { 2 } } } } } .'

In [5]:
# Token id used for padding; process_data uses it to pad input_ids.
tokenizer.pad_token_id

151643

In [20]:
import torch
from torchvision import transforms

# Image preprocessing: every image is resized to a fixed 300x300.
# NOTE(review): the source images are wide strips (e.g. 500x100, 200x50), so a square
# target changes their aspect ratio — confirm this is intended.
image_transform = transforms.Compose([transforms.Resize((300, 300))])
def process_data(example):
    """Turn one dataset row into fixed-length training features.

    The user prompt (image placeholder + instruction) is rendered with the chat
    template and passed through the processor together with the resized image;
    the tokenized target text is appended after it. Prompt positions and
    padding positions carry the label -100.

    Args:
        example: a single row with an ``image`` entry (fed to
            ``image_transform``) and a ``text`` entry (the target string).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` (1-D tensors
        of length ``MAX_LENGTH``, left-padded or truncated) plus
        ``pixel_values`` and ``image_grid_thw`` from the processor output.
    """
    MAX_LENGTH = 8192 // 4
    image = image_transform(example['image'])
    text = example['text']
    # The message is rendered to text only (tokenize=False); the real pixels are
    # handed to the processor below.
    # NOTE(review): "./1.png" looks like a placeholder path — confirm it is never opened.
    message = [
        {"role":"user",
        "content":[
            {
                "type":"image",
                "image":"./1.png",
                "width": 300,
                "height": 300
            },
            {
                "type":"text",
                "text":"请识别图片种的latex公式，并用文本描述："
            }
            ]
        },
    ]
    q_tokens = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
    # Keyword names must be spelled `videos` / `return_tensors`; the previous
    # misspellings were not recognised, so no tensors were ever requested.
    q_inputs = processor(text=[q_tokens], images=[image], videos=None, padding=True, return_tensors='pt')
    # NOTE(review): no end-of-sequence token is appended to the response — confirm
    # whether the model is expected to learn where to stop.
    response = tokenizer(text)

    # Convert the prompt tensors to plain lists so they can be concatenated
    # with the tokenizer's list output.
    prompt_ids = q_inputs['input_ids'][0].tolist()
    prompt_mask = q_inputs['attention_mask'][0].tolist()

    input_ids = prompt_ids + response['input_ids']
    attention_mask = prompt_mask + response['attention_mask']
    # Only the response tokens contribute to the loss.
    labels = [-100] * len(prompt_ids) + response['input_ids']

    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    else:
        # Left-pad all three sequences so they stay aligned.
        padding_length = MAX_LENGTH - len(input_ids)
        input_ids = [tokenizer.pad_token_id] * padding_length + input_ids
        attention_mask = [0] * padding_length + attention_mask
        labels = [-100] * padding_length + labels

    # Extra inputs required by the multimodal model; the processor already
    # returned tensors, so no re-wrapping is needed.
    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "labels": torch.tensor(labels),
        "pixel_values": q_inputs['pixel_values'],
        "image_grid_thw": q_inputs['image_grid_thw'].squeeze(0)
    }

In [21]:
# Apply process_data to every row of the 500-row subset.
# NOTE: this rebinds `dataset`, which previously held the full dataset loaded from disk.
dataset = small_dataset.map(process_data)

Map: 100%|██████████| 500/500 [00:24<00:00, 20.33 examples/s]


In [22]:
# Inspect the columns and row count after preprocessing.
dataset

Dataset({
    features: ['image', 'text', 'input_ids', 'attention_mask', 'labels', 'pixel_values', 'image_grid_thw'],
    num_rows: 500
})

In [23]:
from tqdm import tqdm
from torch.utils.data import DataLoader
# Device every batch is moved to inside the training loop.
device = "cuda:0"
# Field names of the model inputs; only the disabled set_format call below refers to them.
columns = [
    "input_ids",
    "attention_mask",
    "pixel_values",
    "labels",
    "image_grid_thw",
]
processed_dataset = dataset
# processed_dataset.set_format(type="torch", columns=columns)
def custom_collate_fn(batch):
    """Stack the per-example fields of ``batch`` into batched tensors.

    Args:
        batch: list of dicts, each holding ``input_ids``, ``attention_mask``,
            ``pixel_values``, ``labels`` and ``image_grid_thw`` in a form that
            ``torch.tensor`` accepts.

    Returns:
        dict of stacked tensors keyed ``input_ids``, ``attention_mask``,
        ``pixel_values``, ``labels`` and ``grid_thw`` (note the shortened key
        for the ``image_grid_thw`` field).
    """
    def _stacked(field):
        # Convert each example's field to a tensor and add a leading batch axis.
        return torch.stack([torch.tensor(sample[field]) for sample in batch])

    return {
        'input_ids': _stacked('input_ids'),
        'attention_mask': _stacked('attention_mask'),
        'pixel_values': _stacked('pixel_values'),
        'labels': _stacked('labels'),
        'grid_thw': _stacked('image_grid_thw'),
    }


# One example per batch, reshuffled every epoch.
train_loader = DataLoader(processed_dataset, batch_size=1, shuffle=True,collate_fn=custom_collate_fn)

# 6. Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# 7. Training loop
num_epochs = 3

# from_pretrained() returns the model in evaluation mode, so switch to
# training mode explicitly before optimizing.
model.train()

# NOTE(review): the recorded run of this cell stopped with a CUDA out-of-memory
# error on the first step. Every example is padded to 2048 tokens and all model
# parameters are optimized — memory use likely needs to be reduced before this
# can complete.
for epoch in tqdm(range(num_epochs)):
    print(f"Epoch {epoch+1}/{num_epochs}")
    loop = tqdm(train_loader)
    for batch in loop:
        # Move the batch tensors to the GPU.
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        # The model computes the loss itself when labels are supplied.
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            pixel_values=batch["pixel_values"],
            labels=batch["labels"],  # -100 everywhere except the response tokens
            image_grid_thw=batch['grid_thw']  # collate function emits this under 'grid_thw'
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f"Loss: {loss.item():.4f}")

# 8. Save the fine-tuned model together with its processor.
model.save_pretrained("./qwen2vl-finetuned")
processor.save_pretrained("./qwen2vl-finetuned")

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/3



  return F.conv3d(
  return F.conv3d(
  0%|          | 0/500 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 

In [None]:
# Take the first row of the subset as a sample to experiment with.
example = small_dataset[0]

# Single-turn user message: one image entry followed by the instruction text.
message = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": example['image'], "width": 300, "height": 300},
            {"type": "text", "text": "请识别图片种的latex公式，并用文本描述："},
        ],
    },
]


In [None]:
# Tokenize the sample row's target text to inspect what the tokenizer returns.
tokenizer(example['text'])

In [None]:
# Render the chat template to a prompt string (no tokenization at this step).
q_tokens = processor.apply_chat_template(message, tokenize=False)
# Run prompt + image through the processor and request PyTorch tensors.
# The keyword names are `videos` and `return_tensors`; misspelled variants are
# not recognised, so the tensor request would be lost.
q_inputs = processor(text=[q_tokens], images=[example['image']], videos=None, padding=True, return_tensors='pt')
q_inputs.keys()

In [None]:
# Decode a single token id to see which token it stands for.
# NOTE(review): presumably the image placeholder token — confirm from the cell output.
tokenizer.decode(151655)

In [None]:
from tqdm import tqdm
def process_data(example):
    """Turn one dataset row into fixed-length training features.

    The prompt (image + instruction) is rendered with the chat template and
    passed through the processor; the tokenized target text is appended after
    it. Prompt positions and padding positions carry the label -100.

    Args:
        example: a single row with an ``image`` entry and a ``text`` entry
            (the target string).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` (1-D tensors
        of length ``MAX_LENGTH``, left-padded or truncated) plus
        ``pixel_values`` and ``image_grid_thw`` from the processor output.
    """
    MAX_LENGTH = 8192
    image = example['image']
    target_text = example['text']
    messages = [
        {
            "role":"user",
            "content":[{
                "type":"image",
                 "image":image,
                },
                {"type":"text","text":"请描述图片种的内容"}]
        }
    ]
    q_tokens = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[q_tokens],
        images=[image],
        videos=None,
        padding=True,
        return_tensors="pt",
    )
    response = tokenizer(target_text, add_special_tokens=False)

    # The processor returned tensors; convert to lists so that `+` concatenates
    # instead of attempting element-wise addition with the tokenizer's lists.
    prompt_ids = inputs['input_ids'][0].tolist()
    prompt_mask = inputs['attention_mask'][0].tolist()

    input_ids = prompt_ids + response['input_ids']
    attention_mask = prompt_mask + response['attention_mask']
    # The tokenizer output has no 'label' key; the response token ids are the labels.
    labels = [-100] * len(prompt_ids) + response['input_ids']

    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    else:
        # Left-pad all three sequences by the same amount so they stay aligned.
        padding_length = MAX_LENGTH - len(input_ids)
        input_ids = [tokenizer.pad_token_id] * padding_length + input_ids
        attention_mask = [0] * padding_length + attention_mask
        labels = [-100] * padding_length + labels

    # Extra inputs required by the multimodal model; they are already tensors.
    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "labels": torch.tensor(labels),
        "pixel_values": inputs['pixel_values'],
        "image_grid_thw": inputs['image_grid_thw'].squeeze(0)
    }

    

In [None]:
# Single-turn user message built from the first image of the subset.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": small_dataset[0]['image']},
            {"type": "text", "text": "请描述图片种的内容"},
        ],
    }
]
# NOTE(review): this cell renders the template through the tokenizer, whereas the
# process_data definitions use processor.apply_chat_template — confirm both give the same prompt.
q_tokens = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
inputs = processor(
    text=[q_tokens],
    images=[small_dataset[0]['image']],
    videos=None,
    padding=True,
    return_tensors="pt",
)
# Show which fields the processor produced.
inputs.keys()

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration,AutoTokenizer
from datasets import load_from_disk
from tqdm import tqdm

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 1. Load the locally stored dataset (the path it was saved to earlier).
dataset = load_from_disk("./dataset")

# 2. Load the processor, the model (moved onto `device`) and the tokenizer.
model_path = "./model"
processor = AutoProcessor.from_pretrained(model_path)
model = Qwen2VLForConditionalGeneration.from_pretrained(model_path)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Put the model into training mode.
model.train()

# 3. Preprocessing function
def process_data(example):
    """Turn one dataset row into fixed-length training features.

    The prompt (image + instruction) is rendered with the chat template and
    passed through the processor; the tokenized target text is appended after
    it. Prompt positions and padding positions carry the label -100.

    Args:
        example: a single row (not a batch of rows) with an ``image`` entry and
            a ``text`` entry (the target string).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` (1-D tensors
        of length ``MAX_LENGTH``, left-padded or truncated) plus
        ``pixel_values`` and ``image_grid_thw`` from the processor output.
    """
    MAX_LENGTH = 8192
    image = example['image']
    target_text = example['text']
    messages = [
        {
            "role":"user",
            "content":[{
                "type":"image",
                 "image":image,
                },
                {"type":"text","text":"请描述图片种的内容"}]
        }
    ]
    q_tokens = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[q_tokens],
        images=[image],
        videos=None,
        padding=True,
        return_tensors="pt",
    )
    response = tokenizer(target_text, add_special_tokens=False)

    prompt_ids = inputs['input_ids'][0].tolist()
    input_ids = prompt_ids + response['input_ids']
    attention_mask = inputs['attention_mask'][0].tolist() + [1] * len(response['input_ids'])
    # Only the response tokens contribute to the loss.
    labels = [-100] * len(prompt_ids) + response['input_ids']

    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    else:
        # Left-pad all three sequences by the same amount; padding only
        # input_ids would leave the mask and labels shorter and misaligned.
        padding_length = MAX_LENGTH - len(input_ids)
        input_ids = [tokenizer.pad_token_id] * padding_length + input_ids
        attention_mask = [0] * padding_length + attention_mask
        labels = [-100] * padding_length + labels

    # Extra inputs required by the multimodal model; they are already tensors.
    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "labels": torch.tensor(labels),
        "pixel_values": inputs['pixel_values'],
        "image_grid_thw": inputs['image_grid_thw'].squeeze(0)
    }

# 4. Preprocess the whole dataset. process_data handles one row per call
#    (it reads example['image'] / example['text'] as single values), so map()
#    is run in its default per-row mode rather than with batched=True.
processed_dataset = dataset.map(process_data)

# 5. Expose exactly the fields the training loop consumes, as torch tensors.
#    `labels` and `image_grid_thw` must be included: the loop reads both.
columns = ["input_ids", "attention_mask", "labels", "pixel_values", "image_grid_thw"]
processed_dataset.set_format(type="torch", columns=columns)


def _collate_vision_batch(samples):
    """Merge per-row tensors into one batch for the vision-language model."""
    return {
        "input_ids": torch.stack([s["input_ids"] for s in samples]),
        "attention_mask": torch.stack([s["attention_mask"] for s in samples]),
        "labels": torch.stack([s["labels"] for s in samples]),
        # Images are not resized here, so rows can differ in patch count;
        # concatenate along the patch axis instead of stacking.
        "pixel_values": torch.cat([s["pixel_values"] for s in samples], dim=0),
        # One (t, h, w) row per image.
        "image_grid_thw": torch.stack([s["image_grid_thw"] for s in samples]),
    }


train_loader = DataLoader(
    processed_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=_collate_vision_batch,
)

# 6. Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# 7. Training loop
num_epochs = 3

for epoch in tqdm(range(num_epochs)):
    print(f"Epoch {epoch+1}/{num_epochs}")
    loop = tqdm(train_loader)
    for batch in loop:
        # Move the batch tensors to the target device.
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        # The model computes the loss itself when labels are supplied.
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            pixel_values=batch["pixel_values"],
            image_grid_thw=batch["image_grid_thw"],  # grid sizes matching pixel_values
            labels=batch["labels"],   # -100 everywhere except the response tokens
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f"Loss: {loss.item():.4f}")

# 8. Save the fine-tuned model together with its processor.
model.save_pretrained("./qwen2vl-finetuned")
processor.save_pretrained("./qwen2vl-finetuned")
