In [1]:
# 导入AutoModel类，该类允许自动从预训练模型库加载模型
from modelscope import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
import torch.nn as nn
checkpoint = "openai-community/gpt2"

In [2]:
# 加载GPT-2模型 & 分词器
model: nn.Module = AutoModel.from_pretrained(pretrained_model_name_or_path=checkpoint, num_labels=1).cuda()
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/openai-community/gpt2


2026-02-05 09:51:31.830438: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-05 09:51:32.988260: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-05 09:51:36.658900: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/openai-community/gpt2


In [3]:
model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [None]:
# transformers模型标准输出
from transformers.modeling_outputs import SequenceClassifierOutput
# 自定义模型分类层
class GPT2Classification(nn.Module):
    """基于GPT2的分类层

    Args:
        nn (Module): PyTorch Module
    """
    def __init__(self, pretrained_model: nn.Module, label_num: int):
        super().__init__()
        self.label_num = label_num
        # 兼容transformers库的自定义PyTorch分类层实现
        self.transformer = pretrained_model
        self.classifier  = nn.Linear(in_features=pretrained_model.ln_f.weight.shape[0], out_features=label_num)
        
    def forward(self,
                input_ids: torch.Tensor=None,
                attention_mask: torch.Tensor=None,
                token_type_ids: torch.Tensor=None,
                labels: torch.Tensor=None,
                **kwargs
        ) -> torch.Tensor:
        """前向传播

        Args:
            x (torch.Tensor): 输入张量

        Returns:
            torch.Tensor: 分类结果
        """
        pretrained_model_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs
        )
        
        # 取出最后一个有效token的隐藏状态（GPT-2无[CLS]）
        hidden_states = pretrained_model_outputs.last_hidden_state
        if attention_mask is not None:
            # 安全获取每个序列最后一个非padding token位置
            last_token_idx = attention_mask.sum(dim=1)-1
            bacth_indices = torch.arange(hidden_states.size(0), device=hidden_states.device)
            pooled_output = hidden_states[bacth_indices, last_token_idx] # 过滤非padding token位置
        else:
            # 安全获取每个序列最后一个非padding token位置
            pooled_output = hidden_states[:, -1, :]
        
        # 分类层输出
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            # 表示需要进行反向传播，需要定义损失函数
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.label_num), labels.view(-1))
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=pretrained_model_outputs.hidden_states,
            attentions=pretrained_model_outputs.attentions
        )
classification_model = GPT2Classification(pretrained_model=model, label_num=2).to("cuda")

text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt').to("cuda")
output = classification_model(**encoded_input)
print(output)
classification_model

SequenceClassifierOutput(loss=None, logits=tensor([[ 5.0728, -1.6670]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


GPT2Classification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)

In [20]:
# 加载数据集
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
from typing_extensions import Any

all_dataset = load_dataset("csv", data_files={
    "train": "llm-classification-finetuning/train_data.csv",
    "test" : "llm-classification-finetuning/test_data.csv",
    "val"  : "llm-classification-finetuning/val_data.csv"
}) # 从CSV中加载数据

# 基于load_dataset的DatasetDict.map对数据集进行分词
def tokenize_function(dataset: dict[str, Any]):
    """分词方法

    Args:
        dataset (dict[str, Any]): 数据集
    """
    return tokenizer(dataset['text'], truncation=True, padding=True)

tokenizer.pad_token = tokenizer.eos_token # 设置填充token为eos_token

tokenized_datasets = all_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
tokenized_datasets

Map:   0%|          | 0/105333 [00:00<?, ? examples/s]

Map:   0%|          | 0/30095 [00:00<?, ? examples/s]

Map:   0%|          | 0/15048 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 105333
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 30095
    })
    val: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 15048
    })
})

In [9]:
# 配置参数
from transformers import TrainingArguments
model_dir = "/root/.cache/modelscope/hub/models/openai-community/gpt2" # 模型目录
checkpoint_output_dir = "gpt2-finetuning-checkpoints"

train_args = TrainingArguments(
    output_dir=checkpoint_output_dir, # 检查点保存目录
    save_strategy="epoch",            # 保存策略（每次epoch后保存）
    eval_strategy="epoch",            # 评估策略（每次epoch后评估）
    save_total_limit=3,               # 仅保存最新3个检查点（自动删除旧检查点）
    load_best_model_at_end=True,      # 训练结束自动加载验证集最佳模型
    metric_for_best_model="accuracy", # 选择最佳模型的指标
    greater_is_better=True,           # 指标越大越好（如：准确率）False表示越小越好，用于loss
    num_train_epochs=3,               # 设置迭代次数
    learning_rate=2e-5,               # 学习率
    weight_decay=0.01                 # 权重衰减（L2正则化：在训练过程中对模型权重施加惩罚）
)

In [39]:
# 冻结策略（逐步解冻）
## 1.先冻结大部分层，仅解冻分类层
for param in classification_model.parameters():
    param.requires_grad = False
## 2.解冻分类层
for param in classification_model.classifier.parameters():
    param.requires_grad = True

In [21]:
## 开启训练
from transformers import Trainer
from transformers import DataCollatorWithPadding
# 创建Trainer
trainer = Trainer(
    model=classification_model,
    args=train_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    # data_collator=DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt", padding=False),
    # tokenizer=tokenizer
)

In [41]:
# 调用Trainer对象的train()方法启动模型的训练过程，设置自动加载output_dir中最新检查点（注意：如果一开始没有检查点，则无法进行训练）
trainer.train(resume_from_checkpoint=False)

TypeError: forward() got an unexpected keyword argument 'labels'