In [36]:
# 导入AutoModel类，该类允许自动从预训练模型库加载模型
from modelscope import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
import torch.nn as nn
# Checkpoint name to load from the ModelScope hub: the Qwen2.5-0.5B-Instruct model.
check_point = "Qwen/Qwen2.5-0.5B-Instruct"
# Optional local model path, left commented out; to load from disk instead, uncomment it and point it at a valid local directory.
# model_path = "/home/egcs/models/chatglm3-6b"
# Load the base model with AutoModel.from_pretrained. trust_remote_code=True trusts the model repository,
# i.e. it allows unverified code shipped with the repository to be executed.
# dtype="auto" leaves the dtype choice to the loader; .half() then casts the weights to float16 and .cuda() moves them to the GPU.
model: nn.Module = AutoModel.from_pretrained(pretrained_model_name_or_path=check_point, trust_remote_code=True, dtype="auto").half().cuda()
# Load the tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=check_point)
# Load the configuration that belongs to the same checkpoint.
conf = AutoConfig.from_pretrained(pretrained_model_name_or_path=check_point)
# Last expression of the cell: renders the module tree of the loaded model.
model

Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2026-01-22 15:04:49,935 - modelscope - INFO - Target directory already exists, skipping creation.


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2026-01-22 15:04:52,443 - modelscope - INFO - Target directory already exists, skipping creation.


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2026-01-22 15:04:53,645 - modelscope - INFO - Target directory already exists, skipping creation.


Qwen2Model(
  (embed_tokens): Embedding(151936, 896)
  (layers): ModuleList(
    (0-23): 24 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (q_proj): Linear(in_features=896, out_features=896, bias=True)
        (k_proj): Linear(in_features=896, out_features=128, bias=True)
        (v_proj): Linear(in_features=896, out_features=128, bias=True)
        (o_proj): Linear(in_features=896, out_features=896, bias=False)
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
        (up_proj): Linear(in_features=896, out_features=4864, bias=False)
        (down_proj): Linear(in_features=4864, out_features=896, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
    )
  )
  (norm): Qwen2RMSNorm((896,), eps=1e-06)
  (rotary_emb): Qwen2RotaryEmbedding()
)

In [21]:
# model.add_module(module=nn.Linear(896, 2), name="classification_layer") # compatible with plain PyTorch module operations
# Build a model of the same architecture from the configuration object alone.
# Per the Transformers documentation, from_config does not load pretrained weights,
# so this instance is only useful for inspecting the structure.
model_from_conf = AutoModel.from_config(config=conf)
model_from_conf

Qwen2Model(
  (embed_tokens): Embedding(151936, 896)
  (layers): ModuleList(
    (0-23): 24 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (q_proj): Linear(in_features=896, out_features=896, bias=True)
        (k_proj): Linear(in_features=896, out_features=128, bias=True)
        (v_proj): Linear(in_features=896, out_features=128, bias=True)
        (o_proj): Linear(in_features=896, out_features=896, bias=False)
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
        (up_proj): Linear(in_features=896, out_features=4864, bias=False)
        (down_proj): Linear(in_features=4864, out_features=896, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
    )
  )
  (norm): Qwen2RMSNorm((896,), eps=1e-06)
  (rotary_emb): Qwen2RotaryEmbedding()
)

In [22]:
# Round-trip a short string through encode/decode. The result is not displayed,
# because only the last expression of a cell is rendered.
tokenizer.decode(tokenizer.encode("“你好”"))
# Encoding: tokenize a batch of three sentences, pad them to a common length and return PyTorch tensors.
inputs = tokenizer(["你好，你是谁？", "我谁也不是，我是你", "出口成章"], padding=True, truncation=True, return_tensors="pt")
# Keep a GPU copy of the token ids; `inputs` itself stays on the CPU.
inputs_ids = inputs['input_ids'].to("cuda")
# The next two bare expressions are evaluated but not shown (they are not the last expression of the cell).
inputs_ids.shape
inputs_ids
# Decoding: turn the ids of the third sentence back into text (also not shown).
tokenizer.decode(inputs_ids[2])
# Last expression of the cell: renders the tokenizer output (input_ids and attention_mask).
inputs

{'input_ids': tensor([[108386,   3837, 105043, 100165,  11319, 151643],
        [ 35946, 100165, 104993,   3837, 104198,  56568],
        [102048,  12857,  44928, 151643, 151643, 151643]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0]])}

# 检视模型（Qwen/Qwen2.5-0.5B-Instruct）

## embed_tokens
Embedding(151936, 896)，词嵌入层，输入维度为vocab_size=151936，转为896维向量
## layers（ModuleList）
一个标准的Decoder Only Transformers架构模型，这里Layer为TransformersBlocks；
1. 多头自注意力层（self_attn，每个Decoder层包含一个，Decoder层共重复24次）：
    1. query：表示权重，维度为896*896，使用Linear层便于权重和input序列的矩阵乘法和权重参数的初始化；
    2. key：k_proj权重维度为896*128；计算注意力分数时对每个头执行 query @ key.T，要求query的列数（head_dim）等于key.T的行数；
    3. value：与key的维度相同。k和v的输出维度为128而不是896，是因为该模型使用分组查询注意力（GQA）：query共14个头（14*64=896），而key/value只有2个头（2*64=128），每个key/value头由7个query头共享；
    4. output：o_proj将各注意力头拼接后的896维输出再做一次线性变换（权重维度为896*896），映射回896维的隐藏表示；
2. MLP（Qwen2MLP，多层感知机层）：是一个多层感知机，负责对经过自注意力机制处理后的向量表示进行非线性变换，以便捕捉更复杂的语义模式。
3. layernorm：每个Decoder层包含两个RMSNorm归一化层，与GPT-2类似，均为前置归一化（Pre-Norm）：input_layernorm作用于自注意力之前，post_attention_layernorm作用于自注意力之后、MLP之前，用于保证训练（前向与反向传播）过程中数值的稳定性；
## norm
模型的最后一层层归一化
## rotary_emb
### 概述
Rotary Position Embedding, RoPE模块，是一种位置编码技术，用于在Transformer模型中引入序列中token的相对位置信息，与传统绝对位置编码不同，RoPE通过旋转变换将位置信息融入到Query和Key向量中。
### 原理
将Query和Key的每一对维度 $(q_{2i}, q_{2i+1})$ 视作一个二维坐标；
根据位置索引 $pos$ 和一个频率参数，将这个坐标旋转一个角度；
旋转角度 $\theta_{pos,i} = pos \cdot base^{-2i/d}$，角度随位置 $pos$ 线性增长；$i$ 越大，旋转频率 $base^{-2i/d}$ 越小，相同位置下的旋转角度也越小。

In [23]:
# 检视模型
model

Qwen2Model(
  (embed_tokens): Embedding(151936, 896)
  (layers): ModuleList(
    (0-23): 24 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (q_proj): Linear(in_features=896, out_features=896, bias=True)
        (k_proj): Linear(in_features=896, out_features=128, bias=True)
        (v_proj): Linear(in_features=896, out_features=128, bias=True)
        (o_proj): Linear(in_features=896, out_features=896, bias=False)
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
        (up_proj): Linear(in_features=896, out_features=4864, bias=False)
        (down_proj): Linear(in_features=4864, out_features=896, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
    )
  )
  (norm): Qwen2RMSNorm((896,), eps=1e-06)
  (rotary_emb): Qwen2RotaryEmbedding()
)

In [None]:
# Model outputs
# Move every tensor of the tokenizer output (input_ids and attention_mask) to the GPU.
inputs_on_gpu = {key: value.to("cuda") for key, value in inputs.items()}
print(inputs_on_gpu)
outputs = model(**inputs_on_gpu) # `**` unpacks the dict into keyword arguments for the call
print(outputs.last_hidden_state.shape) # batch_size, sequence_length, hidden_size
# Last expression of the cell: renders the full output object returned by the model.
outputs

{'input_ids': tensor([[108386,   3837, 105043, 100165,  11319, 151643],
        [ 35946, 100165, 104993,   3837, 104198,  56568],
        [102048,  12857,  44928, 151643, 151643, 151643]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0]], device='cuda:0')}
torch.Size([3, 6, 896])


BaseModelOutputWithPast(last_hidden_state=tensor([[[ -1.1582,  -0.6299,   1.4277,  ...,   6.9180,   0.4622,   1.4111],
         [  2.8145,   5.9492,   4.4766,  ...,   2.1406,   3.9102,  -3.5078],
         [  5.2148,   6.3672,   4.6055,  ...,   8.7422,   0.4136,  -8.5625],
         [ -2.2637,   3.3730,  -1.2910,  ...,   2.7227,   0.6890,  -7.5312],
         [  3.9941,   4.8789,   2.0703,  ...,   2.5059,  -3.2148,  -8.0234],
         [ -2.3848,  -0.6816,   5.1914,  ...,   4.0547,   2.9180, -15.1484]],

        [[  1.8525,   4.4531,   1.5273,  ...,   7.7227,   1.2178,   0.0757],
         [ -4.9336,  -1.3486,  -6.9180,  ...,   2.4961,  -2.9961,  11.0469],
         [ -0.1310,   3.7656,   0.7329,  ...,   4.9414,   0.6860,  -4.6328],
         [  1.7012,  -0.8330,  -1.4219,  ...,   2.9258,   0.3335,   0.7085],
         [  0.9697,   0.5176,  -0.3689,  ...,   0.5933,  -0.9019,  -7.4922],
         [ -6.7305,  -0.1810,  -0.9126,  ...,   3.7031,  -0.3247,   7.4336]],

        [[ -0.9585,  -3.6055, 

In [54]:
import torch.nn as nn

class ClassificationModel(nn.Module):
    """Two-class linear head on top of a pretrained base language model.

    In this notebook the base model is Qwen/Qwen2.5-0.5B-Instruct. A single
    linear layer maps every hidden state produced by the base model to two
    logits.

    NOTE(review): the head is applied to ``last_hidden_state`` as a whole, so
    the output has one pair of logits per token position. For one prediction
    per sequence the caller still has to select or pool a position (for
    example the last non-padding token) — TODO confirm the intended usage.
    """

    def __init__(self, base_model: nn.Module, hidden_size: int) -> None:
        """Store the backbone and create the classification head.

        Args:
            base_model (nn.Module): Backbone whose forward output exposes a
                ``last_hidden_state`` attribute.
            hidden_size (int): Size of the last dimension of
                ``last_hidden_state``; used as the head's input width.
        """
        super().__init__()
        self.base_model = base_model
        # Create the head in the same dtype as the backbone so that the
        # hidden states can be fed to it without a dtype mismatch. A backbone
        # without parameters falls back to float16, the previous fixed choice.
        first_param = next(base_model.parameters(), None)
        head_dtype = first_param.dtype if first_param is not None else torch.half
        self.net = nn.Sequential(nn.Linear(in_features=hidden_size, out_features=2, dtype=head_dtype))

    def forward(self, inputs_ids: torch.Tensor, attention_mask: torch.Tensor = None) -> torch.Tensor:
        """Run the backbone and apply the classification head.

        Args:
            inputs_ids (torch.Tensor): Input token ids, forwarded to the
                backbone as its first positional argument.
            attention_mask (torch.Tensor, optional): Padding mask forwarded to
                the backbone. Defaults to None.

        Returns:
            torch.Tensor: Logits with the same leading dimensions as the
            backbone's ``last_hidden_state`` and a last dimension of 2.
        """
        # Pass the mask by keyword so it cannot land in a different positional
        # slot on backbones whose second parameter is not the attention mask.
        base_model_outs = self.base_model(inputs_ids, attention_mask=attention_mask)
        return self.net(base_model_outs.last_hidden_state)
# Wrap the loaded base model and move the whole classifier, including the new head, to the GPU.
classification_model = ClassificationModel(base_model=model, hidden_size=model.config.hidden_size).to("cuda")
# Smoke-test a forward pass; the result is discarded.
# NOTE(review): no attention_mask is passed here, so padding positions are presumably not masked — confirm before reusing this call.
classification_model(inputs_ids=inputs_ids)
# Last expression of the cell: renders the module tree of the classifier.
classification_model

ClassificationModel(
  (base_model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb

In [None]:
# 以下即可进行PyTorch的分类微调