## Learning Transformers with HuggingFace

![huggingface](https://transformers.run/assets/title.jpg)

### Load model

In [1]:
!pip list

Package                           Version                          Editable project location
--------------------------------- -------------------------------- -----------------------------
accelerate                        1.8.1
addict                            2.4.0
aiofiles                          25.1.0
aiohappyeyeballs                  2.6.1
aiohttp                           3.13.2
aiosignal                         1.4.0
annotated-doc                     0.0.4
annotated-types                   0.7.0
anyio                             4.12.0
anykeystore                       0.2
asttokens                         3.0.0
async-timeout                     4.0.3
attrs                             25.4.0
av                                14.4.0
backoff                           2.2.1
beautifulsoup4                    4.14.3
beir                              2.2.0
bitarray                          3.8.0
certifi                           2025.11.12
cffi                              2.0.0
c

In [7]:
import torch
from transformers import Qwen3VLForConditionalGeneration,AutoProcessor

In [10]:
# Load the Qwen3-VL-2B-Instruct weights from a local checkpoint directory.
# No dtype or device argument is passed, so the library defaults apply.
# NOTE(review): the absolute path is machine-specific; adjust it before running elsewhere.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "/mnt/data1/ygm/models/Qwen3-VL-2B-Instruct",
)

In [11]:
# Display the module tree of the loaded model (repr of the cell's last expression).
model

Qwen3VLForConditionalGeneration(
  (model): Qwen3VLModel(
    (visual): Qwen3VLVisionModel(
      (patch_embed): Qwen3VLVisionPatchEmbed(
        (proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
      (pos_embed): Embedding(2304, 1024)
      (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-23): 24 x Qwen3VLVisionBlock(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (attn): Qwen3VLVisionAttention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (mlp): Qwen3VLVisionMLP(
            (linear_fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (linear_fc2): Linear(in_features=4096, out_features=1024, bias=True)
            (act_fn): GELUTanh()
          )
        )
      )
 

In [14]:
# Display the configuration object attached to the loaded model.
model.config

Qwen3VLConfig {
  "architectures": [
    "Qwen3VLForConditionalGeneration"
  ],
  "dtype": "float32",
  "image_token_id": 151655,
  "model_type": "qwen3_vl",
  "text_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "dtype": "float32",
    "eos_token_id": 151645,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "initializer_range": 0.02,
    "intermediate_size": 6144,
    "max_position_embeddings": 262144,
    "model_type": "qwen3_vl_text",
    "num_attention_heads": 16,
    "num_hidden_layers": 28,
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-06,
    "rope_scaling": {
      "mrope_interleaved": true,
      "mrope_section": [
        24,
        20,
        20
      ],
      "rope_type": "default"
    },
    "rope_theta": 5000000,
    "tie_word_embeddings": true,
    "use_cache": true,
    "vocab_size": 151936
  },
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "video_token_id":

In [12]:
# Load the processor from the same local checkpoint directory that the model was loaded from.
processor = AutoProcessor.from_pretrained("/mnt/data1/ygm/models/Qwen3-VL-2B-Instruct")

In [13]:
# Display the processor repr to inspect its settings.
processor

Qwen3VLProcessor:
- image_processor: Qwen2VLImageProcessorFast {
  "crop_size": null,
  "data_format": "channels_first",
  "default_to_square": true,
  "device": null,
  "disable_grouping": null,
  "do_center_crop": null,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_pad": null,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "Qwen2VLImageProcessorFast",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "input_data_format": null,
  "max_pixels": null,
  "merge_size": 2,
  "min_pixels": null,
  "pad_size": null,
  "patch_size": 16,
  "processor_class": "Qwen3VLProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "return_tensors": null,
  "size": {
    "longest_edge": 16777216,
    "shortest_edge": 65536
  },
  "temporal_patch_size": 2
}

- tokenizer: Qwen2TokenizerFast(name_or_path='/mnt/data1/ygm/models/Qwen3-VL-2B-Instruct', vocab_size=151643, model_max_length=262144, is_fast=True, paddi

In [15]:
# Count every parameter element exposed by model.parameters() and report the
# total in millions. `total_params` is reused later to compute the trainable ratio.
total_params = sum(param.numel() for param in model.parameters())
print(f"Model Size: {total_params / 1e6:.2f}M parameters")

Mosel Size: 2127.53M parameters


In [16]:
# Walk the model's named parameters and print only the qualified name of each;
# the tensor itself is not needed here.
for name, _param in model.named_parameters():
    print(name)

model.visual.patch_embed.proj.weight
model.visual.patch_embed.proj.bias
model.visual.pos_embed.weight
model.visual.blocks.0.norm1.weight
model.visual.blocks.0.norm1.bias
model.visual.blocks.0.norm2.weight
model.visual.blocks.0.norm2.bias
model.visual.blocks.0.attn.qkv.weight
model.visual.blocks.0.attn.qkv.bias
model.visual.blocks.0.attn.proj.weight
model.visual.blocks.0.attn.proj.bias
model.visual.blocks.0.mlp.linear_fc1.weight
model.visual.blocks.0.mlp.linear_fc1.bias
model.visual.blocks.0.mlp.linear_fc2.weight
model.visual.blocks.0.mlp.linear_fc2.bias
model.visual.blocks.1.norm1.weight
model.visual.blocks.1.norm1.bias
model.visual.blocks.1.norm2.weight
model.visual.blocks.1.norm2.bias
model.visual.blocks.1.attn.qkv.weight
model.visual.blocks.1.attn.qkv.bias
model.visual.blocks.1.attn.proj.weight
model.visual.blocks.1.attn.proj.bias
model.visual.blocks.1.mlp.linear_fc1.weight
model.visual.blocks.1.mlp.linear_fc1.bias
model.visual.blocks.1.mlp.linear_fc2.weight
model.visual.blocks.1.ml

In [32]:
# `print_trainable_parameters` is provided by the PEFT wrapper. In top-to-bottom
# order this cell runs before `get_peft_model` is applied, so `model` may still be
# the plain transformers model. Use the PEFT helper when it is available and
# otherwise print the same summary from a manual count, so the cell survives
# Restart & Run All.
if hasattr(model, "print_trainable_parameters"):
    model.print_trainable_parameters()
else:
    trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_count = sum(p.numel() for p in model.parameters())
    print(
        f"trainable params: {trainable_count:,d} || all params: {all_count:,d} "
        f"|| trainable%: {100 * trainable_count / all_count:.4f}"
    )

trainable params: 8,716,288 || all params: 2,136,248,320 || trainable%: 0.4080


In [18]:
# Freeze every parameter except those whose name contains
# "language_model.layers.27" or "language_model.norm", and tally the number of
# elements that stay trainable.
num_trainable_params = 0
for name, param in model.named_parameters():
    param.requires_grad = (
        "language_model.layers.27" in name or "language_model.norm" in name
    )
    if param.requires_grad:
        num_trainable_params += param.numel()

In [21]:
# Report the unfrozen subset twice: in millions of parameters, and as a fraction
# of the whole model (num_trainable_params / total_params).
print(
    f"Number of trainable parameters (last layer and norm):{num_trainable_params / 1e6:.2f}M parameters",
    f"Trainable parameters ratio: {num_trainable_params / total_params:.6f}",
    sep="\n",
)

Number of trainable parameters (last layer and norm):50.34M parameters
Trainable parameters ratio: 0.023660


In [22]:
from peft import PromptTuningConfig, get_peft_model,TaskType,PromptTuningInit

# Hard prompt: a prompt-tuning configuration that uses TEXT initialisation,
# with 20 virtual tokens and the tokenizer taken from the local checkpoint path.
config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,
    tokenizer_name_or_path="/mnt/data1/ygm/models/Qwen3-VL-2B-Instruct",
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="根据用户和机器人对话,学习生成文本",
)

In [23]:
# Display the current `config` object to inspect its resolved fields.
config

PromptTuningConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.PROMPT_TUNING: 'PROMPT_TUNING'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, num_virtual_tokens=20, token_dim=None, num_transformer_submodules=None, num_attention_heads=None, num_layers=None, prompt_tuning_init=<PromptTuningInit.TEXT: 'TEXT'>, prompt_tuning_init_text='根据用户和机器人对话,学习生成文本', tokenizer_name_or_path='/mnt/data1/ygm/models/Qwen3-VL-2B-Instruct', tokenizer_kwargs=None)

In [27]:
from peft import LoraConfig,TaskType, get_peft_model

# LoRA configuration. `target_modules` is a single regex string with two alternatives:
#   1. names that contain "model", have no "visual" anywhere after it, and then
#      contain one of the listed attention/MLP projection names;
#   2. names that contain "custom_text_proj".
# The two adjacent literals below concatenate into one pattern string.
# NOTE(review): task_type is "FEATURE_EXTRACTION" although the base model is a
# conditional-generation model — confirm this is intended.
config = LoraConfig(
    task_type="FEATURE_EXTRACTION",
    target_modules=(
        "(.*(model)(?!.*visual).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$"
        "|.*(custom_text_proj).*$)"
    ),
    bias="none",
)
# Last expression of the cell: show the resolved configuration.
config

LoraConfig(task_type='FEATURE_EXTRACTION', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules='(.*(model)(?!.*visual).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)', exclude_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [28]:
# Wrap the base model with the adapter described by `config`.
# NOTE(review): `model` is rebound to the wrapped object, so this cell is not
# idempotent — re-running it passes the already-wrapped model back in.
model = get_peft_model(model,config)

In [29]:
# Display the module tree again, now for the PEFT-wrapped model.
model

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): Qwen3VLForConditionalGeneration(
      (model): Qwen3VLModel(
        (visual): Qwen3VLVisionModel(
          (patch_embed): Qwen3VLVisionPatchEmbed(
            (proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))
          )
          (pos_embed): Embedding(2304, 1024)
          (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()
          (blocks): ModuleList(
            (0-23): 24 x Qwen3VLVisionBlock(
              (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
              (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
              (attn): Qwen3VLVisionAttention(
                (qkv): Linear(in_features=1024, out_features=3072, bias=True)
                (proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (mlp): Qwen3VLVisionMLP(
                (linear_fc1): Linear(in_features=1024, out_features=4096, bias=True)
      

In [30]:
# Print the qualified name of every parameter of the current `model`;
# the tensor itself is not needed here.
for name, _param in model.named_parameters():
    print(name)

base_model.model.model.visual.patch_embed.proj.weight
base_model.model.model.visual.patch_embed.proj.bias
base_model.model.model.visual.pos_embed.weight
base_model.model.model.visual.blocks.0.norm1.weight
base_model.model.model.visual.blocks.0.norm1.bias
base_model.model.model.visual.blocks.0.norm2.weight
base_model.model.model.visual.blocks.0.norm2.bias
base_model.model.model.visual.blocks.0.attn.qkv.weight
base_model.model.model.visual.blocks.0.attn.qkv.bias
base_model.model.model.visual.blocks.0.attn.proj.weight
base_model.model.model.visual.blocks.0.attn.proj.bias
base_model.model.model.visual.blocks.0.mlp.linear_fc1.weight
base_model.model.model.visual.blocks.0.mlp.linear_fc1.bias
base_model.model.model.visual.blocks.0.mlp.linear_fc2.weight
base_model.model.model.visual.blocks.0.mlp.linear_fc2.bias
base_model.model.model.visual.blocks.1.norm1.weight
base_model.model.model.visual.blocks.1.norm1.bias
base_model.model.model.visual.blocks.1.norm2.weight
base_model.model.model.visual.b

In [31]:
# Print the trainable / total parameter summary for the PEFT-wrapped model.
model.print_trainable_parameters()

trainable params: 8,716,288 || all params: 2,136,248,320 || trainable%: 0.4080
