In [1]:
import torch
from transformers import (
    Qwen2VLForConditionalGeneration,  # Qwen2-VL
    AutoModelForCausalLM,                 # Qwen2 / Qwen2.5 coder
)

# Checkpoint locations: the Qwen2-VL model whose language weights get
# overwritten, and the Qwen2.5 text model that supplies the new weights.
qwen2vl_model_path = "/mnt/lingjiejiang/textual_aesthetics/model_checkpoint/vlm_checkpoints/Qwen2-VL-7B-Instruct"
qwen2_5_model_path = "/mnt/lingjiejiang/multimodal_code/checkpoints/llms/Qwen2.5-7B-Instruct"

# Both checkpoints are loaded with the same dtype / placement options.
_load_options = {"torch_dtype": "auto", "device_map": "auto"}

# 1) Load the Qwen2-VL model.
qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(qwen2vl_model_path, **_load_options)

# 2) Load the Qwen2.5 model (a Qwen2.5 coder or Qwen2-7B-Instruct checkpoint
#    could be used here instead).
qwen2_5 = AutoModelForCausalLM.from_pretrained(qwen2_5_model_path, **_load_options)

def _copy_matching_params(src_module, dst_module, label):
    """Overwrite tensors in ``dst_module`` with same-named, same-shaped tensors of ``src_module``.

    A tensor is copied only when its state-dict key exists in both modules and
    the shapes are equal; everything else is left untouched in ``dst_module``.
    Unlike a silent skip, the outcome is reported so that a key-layout or
    vocabulary-size difference between the two checkpoints is visible.

    Args:
        src_module: ``torch.nn.Module`` providing the weights.
        dst_module: ``torch.nn.Module`` whose weights are overwritten in place.
        label: Short name used in the printed summary and in the warning.

    Returns:
        ``(copied, skipped)``: lists of source state-dict keys that were copied,
        and keys that had no same-shaped counterpart in ``dst_module``.
    """
    import warnings

    src_sd = src_module.state_dict()
    dst_sd = dst_module.state_dict()
    copied, skipped = [], []

    # Per the PyTorch docs, state_dict() holds references to the module's
    # parameters, so the in-place copy_ below already updates dst_module.
    with torch.no_grad():
        for name, src_tensor in src_sd.items():
            dst_tensor = dst_sd.get(name)
            if dst_tensor is not None and dst_tensor.shape == src_tensor.shape:
                dst_tensor.copy_(src_tensor)
                copied.append(name)
            else:
                skipped.append(name)

    # Kept from the original flow: load the updated state dict back into the module.
    dst_module.load_state_dict(dst_sd)

    dst_only = [name for name in dst_sd if name not in src_sd]
    print(
        f"[{label}] copied {len(copied)} tensors, "
        f"skipped {len(skipped)} (missing or shape mismatch), "
        f"{len(dst_only)} destination-only tensors left unchanged"
    )
    if skipped:
        print(f"[{label}] skipped keys (first 10): {skipped[:10]}")
    if not copied:
        # Nothing matched: the "merged" module is identical to the original one.
        warnings.warn(
            f"[{label}] no tensor was copied; check that both modules use the same key layout"
        )
    return copied, skipped


# Replace the language-model backbone of Qwen2-VL with the Qwen2.5 weights.
_copy_matching_params(qwen2_5.model, qwen2vl.model, "backbone")

# Do the same for the output projection (lm_head).
_copy_matching_params(qwen2_5.lm_head, qwen2vl.lm_head, "lm_head")

# Write the merged model to disk.
qwen2vl.save_pretrained("/mnt/lingjiejiang/multimodal_code/checkpoints/vlms/Qwenvl2-text2.5-7B-Instruct_merge")


import os, shutil
from transformers import AutoTokenizer

# Qwen2-VL itself was already loaded by the code above
# (qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(...)); only its path is needed here.
# /mnt/lingjiejiang/multimodal_code/checkpoints/llms/Qwen2.5-Coder-7B-Instruct
qwen2vl_model_path = "/mnt/lingjiejiang/textual_aesthetics/model_checkpoint/vlm_checkpoints/Qwen2-VL-7B-Instruct"

# Directory in which the merged model is stored.
merged_path = "/mnt/lingjiejiang/multimodal_code/checkpoints/vlms/Qwenvl2-text2.5-7B-Instruct_merge"

# Load the original Qwen2-VL tokenizer and write it straight into the merged directory.
vl_tokenizer = AutoTokenizer.from_pretrained(qwen2vl_model_path)
vl_tokenizer.save_pretrained(merged_path)

# Some Qwen2-VL releases also ship processor-related files
# (chat_template.json, preprocessor_config.json); copy each one that exists.
for aux_file in ("chat_template.json", "preprocessor_config.json"):
    src_file = os.path.join(qwen2vl_model_path, aux_file)
    if os.path.exists(src_file):
        shutil.copy(src_file, os.path.join(merged_path, aux_file))

print("All done. Check the merged_path folder to see if tokenizer & chat_template are there.")


  from .autonotebook import tqdm as notebook_tqdm
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 5/5 [00:49<00:00,  9.95s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:54<00:00, 13.57s/it]


All done. Check the merged_path folder to see if tokenizer & chat_template are there.


In [12]:
import os, shutil
from transformers import AutoTokenizer

# Qwen2-VL was already loaded earlier
# (qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(...)).
# /mnt/lingjiejiang/multimodal_code/checkpoints/llms/Qwen2.5-Coder-7B-Instruct
qwen2vl_model_path = "/mnt/lingjiejiang/textual_aesthetics/model_checkpoint/vlm_checkpoints/Qwen2-VL-7B-Instruct"

# Output directory of the merged model; used for the weights, the tokenizer
# and the auxiliary files below.
merged_path = "/mnt/lingjiejiang/multimodal_code/checkpoints/vlms/Qwenvl2-text2.5-7B-Instruct_merge"

# Write the in-memory model to the merged directory.
qwen2vl.save_pretrained(merged_path)

# Save the original Qwen2-VL tokenizer next to the weights.
vl_tokenizer = AutoTokenizer.from_pretrained(qwen2vl_model_path)
vl_tokenizer.save_pretrained(merged_path)

# Some Qwen2-VL releases also carry "chat_template.json" and
# "preprocessor_config.json"; copy whichever of them is present in the source directory.
aux_file_pairs = [
    (os.path.join(qwen2vl_model_path, file_name), os.path.join(merged_path, file_name))
    for file_name in ("chat_template.json", "preprocessor_config.json")
]
for src_file, dst_file in aux_file_pairs:
    if os.path.exists(src_file):
        shutil.copy(src_file, dst_file)

print("All done. Check the merged_path folder to see if tokenizer & chat_template are there.")

All done. Check the merged_path folder to see if tokenizer & chat_template are there.


In [13]:
# Read the saved merged checkpoint back from disk into its own variable,
# separate from the in-memory `qwen2vl`.
qwen2vl_merged_path = "/mnt/lingjiejiang/multimodal_code/checkpoints/vlms/Qwenvl2-text2.5-7B-Instruct_merge"
_reload_options = {"torch_dtype": "auto", "device_map": "auto"}
qwen2vl_merged = Qwen2VLForConditionalGeneration.from_pretrained(qwen2vl_merged_path, **_reload_options)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:40<00:00, 10.08s/it]


In [8]:
# Bare expression: the notebook renders the repr of the in-memory `qwen2vl`
# model, i.e. its module tree.
qwen2vl

Qwen2VLForConditionalGeneration(
  (visual): Qwen2VisionTransformerPretrainedModel(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2VLVisionBlock(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): VisionSdpaAttention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): VisionMlp(
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): QuickGELUActivation()
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        )
      )
    )
    (merger): PatchMerger(
      (ln_q): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (mlp): Seq

In [11]:
# Display the state dict of `qwen2vl.model` (the in-memory model).
# NOTE(review): this renders every tensor in the dict; presumably meant for a
# visual comparison with the reloaded checkpoint — confirm.
qwen2vl.model.state_dict()

OrderedDict([('embed_tokens.weight',
              tensor([[-1.5503e-02, -4.0588e-03,  1.4832e-02,  ...,  1.0681e-02,
                        4.1748e-02, -1.7700e-02],
                      [ 1.5198e-02,  1.5564e-02,  1.7456e-02,  ..., -1.9409e-02,
                        1.6724e-02,  7.6599e-03],
                      [-8.4229e-03, -4.6082e-03,  4.3945e-03,  ..., -1.1292e-03,
                       -2.7954e-02,  7.6904e-03],
                      ...,
                      [-1.1755e-37,  1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
                        1.1755e-37,  1.1755e-37],
                      [ 1.1755e-37, -1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
                        1.1755e-37, -1.1755e-37],
                      [ 1.1755e-37, -1.1755e-37, -1.1755e-37,  ...,  1.1755e-37,
                        1.1755e-37, -1.1755e-37]], device='cuda:0', dtype=torch.bfloat16)),
             ('layers.0.self_attn.q_proj.weight',
              tensor([[-5.7220e-05, -5.1270e-03,  1.6602e-

In [14]:
# Display the state dict of `qwen2vl_merged.model` (the checkpoint reloaded
# from the merged directory).
# NOTE(review): comparing printed tensors by eye only covers the few values
# shown; a programmatic equality check would be more reliable.
qwen2vl_merged.model.state_dict()

OrderedDict([('embed_tokens.weight',
              tensor([[-1.5503e-02, -4.0588e-03,  1.4832e-02,  ...,  1.0681e-02,
                        4.1748e-02, -1.7700e-02],
                      [ 1.5198e-02,  1.5564e-02,  1.7456e-02,  ..., -1.9409e-02,
                        1.6724e-02,  7.6599e-03],
                      [-8.4229e-03, -4.6082e-03,  4.3945e-03,  ..., -1.1292e-03,
                       -2.7954e-02,  7.6904e-03],
                      ...,
                      [-1.1755e-37,  1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
                        1.1755e-37,  1.1755e-37],
                      [ 1.1755e-37, -1.1755e-37,  1.1755e-37,  ..., -1.1755e-37,
                        1.1755e-37, -1.1755e-37],
                      [ 1.1755e-37, -1.1755e-37, -1.1755e-37,  ...,  1.1755e-37,
                        1.1755e-37, -1.1755e-37]], device='cuda:0', dtype=torch.bfloat16)),
             ('layers.0.self_attn.q_proj.weight',
              tensor([[-5.7220e-05, -5.1270e-03,  1.6602e-

In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

qwen2vl_merged_model_path = "/mnt/lingjiejiang/multimodal_code/checkpoints/vlms/Qwenvl2-text2.5-7B-Instruct_merge"
# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    qwen2vl_merged_model_path, torch_dtype="auto", device_map="auto"
)

# default processor
processor = AutoProcessor.from_pretrained(qwen2vl_merged_model_path)

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# Single-turn conversation: one image plus a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference: render the chat template to a prompt string and
# collect the image / video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the model's own placement instead of hard-coding "cuda": the model is
# loaded with device_map="auto", so its device is not fixed by this cell.
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Drop the prompt tokens so that only the newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

  from .autonotebook import tqdm as notebook_tqdm
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|██████████| 4/4 [00:54<00:00, 13.65s/it]


["I apologize, but I'm unable to generate, create, edit, manipulate, or produce images. However, I can certainly help you describe a scene in detail or suggest ways to enhance your description. Would you like me to help you with a detailed description of the scene you've described, or do you have any other questions or topics you'd like to discuss related to this image?"]


In [2]:
# Text-only request, run through the same processor / model as the image example.
prompt = "write a quick sort algorithm."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# Preparation for inference: render the chat template to a prompt string and
# collect any image / video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the model's own placement instead of hard-coding "cuda": the model is
# loaded with device_map="auto", so its device is not fixed by this cell.
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Drop the prompt tokens so that only the newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

["Certainly! Quick sort is a popular and efficient sorting algorithm that uses a divide-and-conquer approach. Here's a simple implementation of the quick sort algorithm in Python:\n\n```python\ndef quick_sort(arr):\n    if len(arr) <= 1:\n        return arr\n    else:\n        pivot = arr[len(arr) // 2]\n        left = [x for x in arr if x < pivot]\n        middle = [x for x in arr if x == pivot]\n        right = [x for x in arr if x > pivot]\n        return quick_sort(left) + middle + quick_sort(right)\n\n# Example usage:\narr"]
