In [3]:
# 导入AutoModel类，该类允许自动从预训练模型库加载模型
from modelscope import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
import torch.nn as nn
# Hub identifier of the pretrained checkpoint; passed to the from_pretrained() loaders for the model and the tokenizer.
checkpoint = "openai-community/gpt2"

In [None]:
# Load the GPT-2 backbone and its tokenizer.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on machines without CUDA.
# NOTE(review): AutoModel returns the headless GPT2Model, so num_labels=1 does not
# create a classification head here, and it disagrees with label_num=2 used for the
# custom head later in the notebook -- confirm the intent.
model: nn.Module = AutoModel.from_pretrained(pretrained_model_name_or_path=checkpoint, num_labels=1).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)
# GPT-2 ships without a padding token; reuse EOS so padded batches can be built.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/openai-community/gpt2
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/openai-community/gpt2


LayerNorm((768,), eps=1e-05, elementwise_affine=True)

In [21]:
# Display the loaded backbone's module tree (embeddings, transformer blocks, final LayerNorm).
model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [23]:
# Custom classification head on top of a pretrained backbone
class GPT2Classification(nn.Module):
    """Sequence classifier: a pretrained backbone followed by one linear layer.

    The sub-modules are stored in ``self.net`` (index 0: backbone, index 1:
    linear head). ``forward`` drives them explicitly, because the backbone may
    return an output object or tuple rather than a plain tensor, which
    ``nn.Linear`` cannot consume directly.

    Args:
        pretrained_model (nn.Module): Backbone producing hidden states. Its
            width is read from ``pretrained_model.ln_f`` when present,
            otherwise from ``pretrained_model.config.hidden_size``.
        label_num (int): Number of output classes.

    Raises:
        ValueError: If the hidden size cannot be determined from the backbone.
    """

    def __init__(self, pretrained_model: nn.Module, label_num: int):
        super().__init__()
        # Size the head from the backbone that was passed in, not from a notebook global.
        hidden_size = self._hidden_size_of(pretrained_model)
        head = nn.Linear(in_features=hidden_size, out_features=label_num)
        # Create the head on the backbone's device (and float dtype) so the
        # wrapper never mixes CPU and GPU parameters.
        reference = next(pretrained_model.parameters(), None)
        if reference is not None:
            if reference.is_floating_point():
                head = head.to(device=reference.device, dtype=reference.dtype)
            else:
                head = head.to(device=reference.device)
        # Single linear layer with label_num outputs on top of the backbone.
        self.net = nn.Sequential(pretrained_model, head)

    @staticmethod
    def _hidden_size_of(backbone: nn.Module) -> int:
        """Return the width of the hidden states produced by ``backbone``."""
        final_norm = getattr(backbone, "ln_f", None)
        if final_norm is not None and getattr(final_norm, "weight", None) is not None:
            return int(final_norm.weight.shape[0])
        hidden_size = getattr(getattr(backbone, "config", None), "hidden_size", None)
        if hidden_size is None:
            raise ValueError("Cannot infer the hidden size of the pretrained model")
        return int(hidden_size)

    def forward(self, x: torch.Tensor, attention_mask=None) -> torch.Tensor:
        """Classify each input sequence.

        Args:
            x (torch.Tensor): Token ids, assumed to be shaped (batch, seq_len).
            attention_mask (torch.Tensor, optional): Mask shaped like ``x`` with
                1 for real tokens and 0 for padding. When given, it is passed
                to the backbone and the hidden state of the last unmasked token
                of every sequence is classified (works for left or right
                padding). When omitted, the last position is used.

        Returns:
            torch.Tensor: Logits of shape (batch, label_num).
        """
        backbone, head = self.net[0], self.net[1]
        if attention_mask is None:
            outputs = backbone(x)
        else:
            outputs = backbone(x, attention_mask=attention_mask)
        # Output objects and tuples expose the last hidden state at index 0.
        hidden = outputs if isinstance(outputs, torch.Tensor) else outputs[0]

        if attention_mask is None:
            pooled = hidden[:, -1, :]
        else:
            # Weight each kept position by its 1-based index; the arg-max is
            # then the last unmasked token regardless of the padding side.
            positions = torch.arange(1, hidden.size(1) + 1, device=hidden.device)
            last_index = (attention_mask.to(hidden.device).long() * positions).argmax(dim=1)
            batch_index = torch.arange(hidden.size(0), device=hidden.device)
            pooled = hidden[batch_index, last_index]
        return head(pooled)
# Build the 2-class classifier on top of the loaded backbone.
classification_model = GPT2Classification(pretrained_model=model, label_num=2)
# A freshly created linear head lives on the CPU; move the whole wrapper onto the
# backbone's device so every parameter sits on one device.
classification_model = classification_model.to(next(model.parameters()).device)
# Show the resulting module tree.
classification_model

GPT2Classification(
  (net): Sequential(
    (0): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (1): Linear(in_features=768, out_features=2, bias=True)
  

In [5]:
# 加载数据集
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Read the train / test / validation splits from their CSV files into one dataset dict.
all_dataset = load_dataset(
    path="csv",
    data_files=dict(
        train="llm-classification-finetuning/train_data.csv",
        test="llm-classification-finetuning/test_data.csv",
        val="llm-classification-finetuning/val_data.csv",
    ),
)

In [None]:
# Training configuration
import inspect

from transformers import TrainingArguments
model_dir = "/root/.cache/modelscope/hub/models/openai-community/gpt2" # Local model directory
checkpoint_output_dir = "gpt2-finetuning-checkpoints"

# load_best_model_at_end requires evaluation to run on the same schedule as
# checkpoint saving; without it TrainingArguments raises a ValueError.
# The keyword was renamed from `evaluation_strategy` to `eval_strategy` in newer
# transformers releases, so pick whichever name this installation accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

train_args = TrainingArguments(
    output_dir=checkpoint_output_dir, # Directory where checkpoints are written
    save_strategy="epoch",            # Save a checkpoint at the end of every epoch
    save_total_limit=3,               # Keep at most 3 checkpoints; older ones are deleted
    load_best_model_at_end=True,      # Reload the best checkpoint when training finishes
    metric_for_best_model="accuracy", # Metric used to rank checkpoints
                                      # NOTE(review): assumes the Trainer's compute_metrics reports "accuracy" -- confirm
    greater_is_better=True,           # True: larger metric is better; False: smaller is better
    num_train_epochs=3,               # Number of training epochs
    learning_rate=2e-5,               # Learning rate
    weight_decay=0.01,                # Weight decay (penalty applied to the model weights during training)
    **{_eval_strategy_key: "epoch"},  # Evaluate every epoch so it matches save_strategy
)

In [None]:
## 冻结策略（逐步解冻）
### 1.先冻结大部分层，仅解冻分类层