In [1]:
# 导入AutoModel类，该类允许自动从预训练模型库加载模型
from modelscope import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
import torch.nn as nn
# 设置预训练模型的检查点名称，这里使用THUDM维护的ChatGLM3-6B模型
check_point = "Qwen/Qwen2.5-0.5B-Instruct"
# 将可选的本地模型路径注释掉，如果需要从本地加载模型，则取消注释并指定正确的本地路径
# model_path = "/home/egcs/models/chatglm3-6b"
# 使用AutoModel的from_pretrained方法加载模型表示信任远程代码，允许从模型仓库执行未验证的代码
model: nn.Module = AutoModel.from_pretrained(pretrained_model_name_or_path=check_point, trust_remote_code=True, dtype="auto").half().cuda()
# 加载模型对应的分词器
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=check_point)
# 加载模型对应的配置
conf = AutoConfig.from_pretrained(pretrained_model_name_or_path=check_point)
model

2026-01-30 10:07:34,768 - modelscope - INFO - Target directory already exists, skipping creation.


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2026-01-30 10:07:35.671910: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-30 10:07:36.269869: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-30 10:07:38.097978: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-30 10:07:41,456 - modelscope - INFO - Target directory already exists, skipping cre

Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct


2026-01-30 10:07:42,460 - modelscope - INFO - Target directory already exists, skipping creation.


Qwen2Model(
  (embed_tokens): Embedding(151936, 896)
  (layers): ModuleList(
    (0-23): 24 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (q_proj): Linear(in_features=896, out_features=896, bias=True)
        (k_proj): Linear(in_features=896, out_features=128, bias=True)
        (v_proj): Linear(in_features=896, out_features=128, bias=True)
        (o_proj): Linear(in_features=896, out_features=896, bias=False)
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
        (up_proj): Linear(in_features=896, out_features=4864, bias=False)
        (down_proj): Linear(in_features=4864, out_features=896, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
    )
  )
  (norm): Qwen2RMSNorm((896,), eps=1e-06)
  (rotary_emb): Qwen2RotaryEmbedding()
)

In [2]:
# model.add_module(module=nn.Linear(896, 2), name="classification_layer") # 兼容PyTorch操作
model_from_conf = AutoModel.from_config(config=conf)
model_from_conf

Qwen2Model(
  (embed_tokens): Embedding(151936, 896)
  (layers): ModuleList(
    (0-23): 24 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (q_proj): Linear(in_features=896, out_features=896, bias=True)
        (k_proj): Linear(in_features=896, out_features=128, bias=True)
        (v_proj): Linear(in_features=896, out_features=128, bias=True)
        (o_proj): Linear(in_features=896, out_features=896, bias=False)
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
        (up_proj): Linear(in_features=896, out_features=4864, bias=False)
        (down_proj): Linear(in_features=4864, out_features=896, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
    )
  )
  (norm): Qwen2RMSNorm((896,), eps=1e-06)
  (rotary_emb): Qwen2RotaryEmbedding()
)

In [3]:
tokenizer.decode(tokenizer.encode("“你好”"))
# 编码使用
inputs = tokenizer(["你好，你是谁？", "我谁也不是，我是你", "出口成章"], padding=True, truncation=True, return_tensors="pt")
inputs_ids = inputs['input_ids'].to("cuda")
inputs_ids.shape
inputs_ids
# 解码使用
tokenizer.decode(inputs_ids[2])
inputs

{'input_ids': tensor([[108386,   3837, 105043, 100165,  11319, 151643],
        [ 35946, 100165, 104993,   3837, 104198,  56568],
        [102048,  12857,  44928, 151643, 151643, 151643]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0]])}

# 检视模型（Qwen/Qwen2.5-0.5B-Instruct）

## embed_tokens
Embedding(151936, 896)，词嵌入层，输入维度为vocab_size=151936，转为896维向量
## layers（ModuleList）
一个标准的Decoder Only Transformers架构模型，这里Layer为TransformersBlocks；
1. 多头自注意力层重复24次：
    1. query：表示权重，维度为896*896，使用Linear层便于权重和input序列的矩阵乘法和权重参数的初始化；
    2. key：维度为896*128，query @ key.T，query的行数=key.T的列数；
    3. value：与key的维度相同，这里k和v的维度为128是因为当前为多头注意力，将128*7分为了7个头；
    4. output：最后一个线性输出将输出维度重新设置为896*896；
2. MLP（Qwen2MLP，多层感知机层）：是一个多层感知机，负责对经过自注意力机制处理后的向量表示进行非线性变换，以便捕捉更复杂的语义模式。
3. layernorm：包含两个两个层归一化，与GPT一致，均为前层归一化和后层归一化，确保反向传播过程中的数值的稳定性；
## norm
模型的最后一层层归一化
## rotary_emb
### 概述
Rotary Position Embedding, RoPE模块，是一种位置编码技术，用于在Transformer模型中引入序列中token的相对位置信息，与传统绝对位置编码不同，RoPE通过旋转变换将位置信息融入到Query和Key向量中。
### 原理
将Query和Key的每一对维度（q2i, q2i+1）视作一个二维坐标；
根据位置索引pos和一个频率参数，将这个坐标旋转一个角度；
旋转角度θ=pos * base^(-2i/d)，角度随位置线性增长，i越大，角度越大。

In [4]:
# 检视模型
model

Qwen2Model(
  (embed_tokens): Embedding(151936, 896)
  (layers): ModuleList(
    (0-23): 24 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (q_proj): Linear(in_features=896, out_features=896, bias=True)
        (k_proj): Linear(in_features=896, out_features=128, bias=True)
        (v_proj): Linear(in_features=896, out_features=128, bias=True)
        (o_proj): Linear(in_features=896, out_features=896, bias=False)
      )
      (mlp): Qwen2MLP(
        (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
        (up_proj): Linear(in_features=896, out_features=4864, bias=False)
        (down_proj): Linear(in_features=4864, out_features=896, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
    )
  )
  (norm): Qwen2RMSNorm((896,), eps=1e-06)
  (rotary_emb): Qwen2RotaryEmbedding()
)

In [5]:
# 模型输出
inputs_on_gpu = {key: value.to("cuda") for key, value in inputs.items()}
print(inputs_on_gpu)
outputs = model(**inputs_on_gpu) # **表示解包字典为关键字参数形式，传入函数
print(outputs.last_hidden_state.shape) # batch_size, sequence_length, hidden_size
outputs

{'input_ids': tensor([[108386,   3837, 105043, 100165,  11319, 151643],
        [ 35946, 100165, 104993,   3837, 104198,  56568],
        [102048,  12857,  44928, 151643, 151643, 151643]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0]], device='cuda:0')}
torch.Size([3, 6, 896])


BaseModelOutputWithPast(last_hidden_state=tensor([[[ -1.1582,  -0.6299,   1.4277,  ...,   6.9180,   0.4622,   1.4111],
         [  2.8145,   5.9492,   4.4766,  ...,   2.1406,   3.9102,  -3.5078],
         [  5.2148,   6.3672,   4.6055,  ...,   8.7422,   0.4136,  -8.5625],
         [ -2.2637,   3.3730,  -1.2910,  ...,   2.7227,   0.6890,  -7.5312],
         [  3.9941,   4.8789,   2.0703,  ...,   2.5059,  -3.2148,  -8.0234],
         [ -2.3848,  -0.6816,   5.1914,  ...,   4.0547,   2.9180, -15.1484]],

        [[  1.8525,   4.4531,   1.5273,  ...,   7.7227,   1.2178,   0.0757],
         [ -4.9336,  -1.3486,  -6.9180,  ...,   2.4961,  -2.9961,  11.0469],
         [ -0.1310,   3.7656,   0.7329,  ...,   4.9414,   0.6860,  -4.6328],
         [  1.7012,  -0.8330,  -1.4219,  ...,   2.9258,   0.3335,   0.7085],
         [  0.9697,   0.5176,  -0.3689,  ...,   0.5933,  -0.9019,  -7.4922],
         [ -6.7305,  -0.1810,  -0.9126,  ...,   3.7031,  -0.3247,   7.4336]],

        [[ -0.9585,  -3.6055, 

In [6]:
import torch.nn as nn

class ClassificationModel(nn.Module):
    """LLM 二分类任务模型（基于Base Model：Qwen/Qwen2.5-0.5B-Instruct）

    Args:
        nn (_type_): _description_
    """
    def __init__(self, base_model: nn.Module, hidden_size: int) -> None:
        """_summary_

        Args:
            base_model (torch.Module): 基础模型
            hidden_size (int): 基础Transformers层hidden size
        """
        super().__init__()
        self.base_model = base_model
        self.net = nn.Sequential(nn.Linear(in_features=hidden_size, out_features=1, dtype=torch.half))
        
    def forward(self, inputs_ids: torch.Tensor, attention_mask: torch.Tensor=None) -> torch.Tensor:
        """前向传播

        Args:
            inputs_ids (torch.Tensor): 输入Token IDS
            attention_mask (torch.Tensor, optional): 填充向量表示. Defaults to None.

        Returns:
            torch.Tensor: 分类输出
        """
        base_model_outs = self.base_model(inputs_ids, attention_mask)
        return self.net(base_model_outs.last_hidden_state)
classification_model = ClassificationModel(base_model=model, hidden_size=model.config.hidden_size).to("cuda")
classification_model(inputs_ids=inputs_ids)
classification_model

ClassificationModel(
  (base_model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb

In [7]:
# 首先加载Kaggle数据集
!kaggle competitions download -c llm-classification-finetuning
import zipfile
import os
import pandas as pd
path = 'llm-classification-finetuning'
if not os.path.exists(path=path):
    os.makedirs(name=path)
    fp = zipfile.ZipFile(file='llm-classification-finetuning.zip', mode='r')
    fp.extractall(path)

train_csv_data = pd.read_csv(filepath_or_buffer='llm-classification-finetuning/train.csv')
test_csv_data = pd.read_csv(filepath_or_buffer='llm-classification-finetuning/test.csv')

HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /kaggle-competitions-data/kaggle-v2/86518/9809560/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1769826625&Signature=gAC3WlQ3n%2FdYz%2FYRcxTC%2B3%2FShRxfftaiiw2DUO3JHCOvnmfKzCBZ2hzxy%2FIp1MPfBItsfYdYDtf1rTOHCyAnC3yWP1FyKTS58PXWurSLL%2BK4BdYZMHH9OgBTA3x6806JeZpsANqcwyDS3RupZWI434wA7eC8Cy6uB10ALBsvAAnUYroMk6h%2B5URmYAtAKcM9lS%2BVPEnvETPLWV6CaesjLHpWtQYW43%2FPQUvzPp1RXq4EW9wfgkQxVvY0gkkZ%2FGRw5Z1XUt6wlNAOKqBq7gct4%2FZ1RvD8d6EGjR0eCF402Vz6uF7QYpKikxclRWHLVt%2Fo1jYo8Q7pID%2BYdjbYcLKv5w%3D%3D&response-content-disposition=attachment%3B+filename%3Dllm-classification-finetuning.zip (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:1147)')))


In [27]:
import pandas as pd
# 特征工程，将数据集划分为text：prompt+response，labels：1/0
train_data = []

for _, row in train_csv_data.iterrows():
    if row['winner_model_a'] == 1:
        train_data.append([row['prompt'] + row['response_a'], 1])
    else:
        train_data.append([row['prompt'] + row['response_b'], 0])
        
    if row['winner_model_b'] == 1:
        train_data.append([row['prompt'] + row['response_b'], 1])
    else:
        train_data.append([row['prompt'] + row['response_a'], 0])
        
    if row['winner_tie'] == 1:
        train_data.append([row['prompt'] + row['response_a'], 1])
        train_data.append([row['prompt'] + row['response_b'], 1])

all_data = pd.DataFrame(train_data, columns=["text", "label"])
# train_data.to_csv("llm-classification-finetuning/train_data.csv", index=False)

In [29]:
# 划分数据集7:2:1（训练、测试、验证）
train_end_idx = int(len(all_data) * 0.7)
test_end_idx = train_end_idx + int(len(all_data) * 0.2)
train_data = all_data[:train_end_idx]
test_data = all_data[train_end_idx:test_end_idx]
val_data = all_data[test_end_idx:]

print(f"Size of train、test、val dataset is {len(train_data)}, {len(test_data)}, {len(val_data)}")

Size of train、test、val dataset is 105333, 30095, 15048


In [30]:
# 保存数据集
train_data.to_csv("llm-classification-finetuning/train_data.csv", index=False)
test_data.to_csv("llm-classification-finetuning/test_data.csv", index=False)
val_data.to_csv("llm-classification-finetuning/val_data.csv", index=False)

In [31]:
train_data

Unnamed: 0,text,label
0,"[""Is it morally right to try to have a certain...",1
1,"[""Is it morally right to try to have a certain...",0
2,"[""What is the difference between marriage lice...",0
3,"[""What is the difference between marriage lice...",1
4,"[""explain function calling. how would you call...",0
...,...,...
105328,"[""What did Bilbo have in his pocket?"",""When di...",0
105329,"[""What did Bilbo have in his pocket?"",""When di...",1
105330,"[""Reescreva as fun\u00e7\u00f5es, agrupando-as...",0
105331,"[""Reescreva as fun\u00e7\u00f5es, agrupando-as...",0


In [32]:
# Datasets库（兼容处理Kaggle数据集）
from datasets import load_dataset
# 读取数据集（并且可以同时加载多个数据集，并且对不同数据集划分指定文件）
# train_dataset = load_dataset("csv", data_files="llm-classification-finetuning/train.csv")
# all_dataset = load_dataset("csv", data_files=["llm-classification-finetuning/train.csv", "llm-classification-finetuning/test.csv"])
all_dataset = load_dataset("csv", data_files={
    "train": "llm-classification-finetuning/train_data.csv",
    "test" : "llm-classification-finetuning/test_data.csv",
    "val"  : "llm-classification-finetuning/val_data.csv"
})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

In [33]:
all_dataset['train'][1]

{'text': '["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]["The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\\n\\nHere are some arguments in favor of and against such policies:\\n\\n**Arguments in favor:**\\n\\n1. **Correcting Historical Inequities:** Women have historically been underrepresented in leadership roles due to various cultural, institutional, and social barriers. Aiming for a specific percentage can be seen as a corrective measure to address past and ongoing discrimination.\\n\\n2. **Promoting Diversity:** Diverse leadership teams can enhance decision-making and represent a broader range of perspectives. This can lead to better outcomes for organizations and society as a whole.\\n\\n3. **E

In [34]:
from typing_extensions import Any

def tokenize_function(dataset: dict[str, Any]):
    """对每个输入进行tokenize操作

    Args:
        dataset (dict[str, Any]): 数据集

    Returns:
        DatasetDict: 数据集Dict
    """
    return tokenizer(dataset['text'], truncation=True, padding=True)

tokenized_datasets = all_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

Map:   0%|          | 0/105333 [00:00<?, ? examples/s]

Map:   0%|          | 0/30095 [00:00<?, ? examples/s]

Map:   0%|          | 0/15048 [00:00<?, ? examples/s]

In [13]:
tokenized_datasets['train'][0]

{'label': 1,
 'input_ids': [1183,
  3872,
  432,
  56223,
  1290,
  311,
  1430,
  311,
  614,
  264,
  3654,
  11414,
  315,
  27485,
  389,
  90150,
  9892,
  30,
  2198,
  3925,
  11,
  1558,
  77082,
  9173,
  389,
  264,
  22502,
  30,
  67585,
  323,
  2968,
  752,
  2464,
  4226,
  1189,
  9868,
  785,
  3405,
  315,
  3425,
  432,
  374,
  56223,
  1290,
  311,
  9213,
  369,
  264,
  3654,
  11414,
  315,
  27485,
  304,
  90150,
  9892,
  374,
  264,
  6351,
  30208,
  4265,
  429,
  17601,
  37764,
  315,
  50741,
  11,
  21777,
  11,
  19492,
  11,
  323,
  21240,
  7110,
  77,
  1699,
  8420,
  525,
  1045,
  5977,
  304,
  4694,
  315,
  323,
  2348,
  1741,
  10186,
  7190,
  77,
  1699,
  334,
  19139,
  304,
  4694,
  66963,
  59,
  77,
  1699,
  16,
  13,
  3070,
  33092,
  287,
  40043,
  758,
  25310,
  1361,
  66963,
  10973,
  614,
  34801,
  1012,
  1212,
  52759,
  304,
  11438,
  12783,
  4152,
  311,
  5257,
  12752,
  11,
  32132,
  11,
  323,
  3590,
  29640

In [35]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors="pt")
train_dataloader = DataLoader(dataset=tokenized_datasets['train'], batch_size=500, shuffle=True, collate_fn=data_collator)
test_dataloader = DataLoader(dataset=tokenized_datasets['test'], batch_size=500, shuffle=True, collate_fn=data_collator)
val_dataloader = DataLoader(dataset=tokenized_datasets['val'], batch_size=500, shuffle=True, collate_fn=data_collator)

In [15]:
for batch in train_dataloader:
    print(batch)
    break

{'input_ids': tensor([[  1183,  36337,    311,  ..., 151643, 151643, 151643],
        [  1183,    531,    438,  ..., 151643, 151643, 151643]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 1])}


In [16]:
from transformers import TrainingArguments, AutoModelForSequenceClassification
# 模型保存路径
model_dir = "/root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct"
# 初始化一个TrainingArguments对象，用于存储和管理训练参数配置（实例构造函数参数说明）
training_args = TrainingArguments(
    output_dir=f"{model_dir}/trainer",
    logging_dir=f"{model_dir}/trainer/runs",
    logging_steps=10,
    learning_rate=0.001,
    weight_decay=0.1
)

In [24]:
checkpoint = "/root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct"
# 对于分类任务，亦可通过AutoModelForSequenceClassification指定num_labels（分类头数量）来进行分类任务
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=checkpoint, num_labels=1).cuda()
model

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Qwen2ForSequenceClassification(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rota

In [None]:
# 创建Trainer
from transformers import Trainer
# 初始化TrainingArguments对象，传入一个参数“test-trainer”作为输出目录的基础名称
training_args = TrainingArguments(
    output_dir="test-trainer",        # 检查点保存目录
    save_strategy="epoch",            # 保存策略（每次epoch后保存）
    save_total_limit=3,               # 仅保存最新3个检查点（自动删除旧检查点）
    load_best_model_at_end=True,      # 训练结束自动加载验证集最佳模型
    metric_for_best_model="accuracy", # 选择最佳模型的指标
    greater_is_better=True,           # 指标越大越好（如：准确率）False表示越小越好，用于loss
    num_train_epochs=3                # 设置迭代次数
)
# 冻结模型Transformers层
for param in model.parameters():
    param.requires_grad = False
# 解冻分类层
for param in model.score.parameters():
    param.requires_grad = True
    
# 创建Trainer实例
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    data_collator=data_collator,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [26]:
# 调用Trainer对象的train()方法启动模型的训练过程，设置自动加载output_dir中最新检查点
trainer.train(resume_from_checkpoint=True)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


RuntimeError: Found dtype Long but expected Float