In [11]:
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
import torch
from datasets import load_dataset, Features, Value

# 测试预训练模型

In [2]:
model_name = "IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)  # 预训练模型
texta = "鲸鱼是哺乳动物，所有哺乳动物都是恒温动物"
textb = "鲸鱼也是恒温动物"
output = model(torch.tensor([tokenizer.encode(texta, textb)]))
print(torch.nn.functional.softmax(output.logits, dim=-1))  # 测试一下

tensor([[0.0645, 0.9355]], grad_fn=<SoftmaxBackward0>)


# 数据预处理

In [3]:
label_to_id = {  # 分类
    "升学": 0,
    "志愿": 1,
    "教务": 2,
    "思政": 3,
    "心理": 4,
    "灾害": 5,
    "作业与考试": 6,
    "竞赛与机会": 7,
    "企业参观与就业": 8,
    "生活": 9,
    "重要通知": 10,
    "垃圾与乐子": 11,
}

# 将label_to_id进行反转
id_to_label = {value: key for key, value in label_to_id.items()}
print(id_to_label)

{0: '升学', 1: '志愿', 2: '教务', 3: '思政', 4: '心理', 5: '灾害', 6: '作业与考试', 7: '竞赛与机会', 8: '企业参观与就业', 9: '生活', 10: '重要通知', 11: '垃圾与乐子'}


In [4]:
# 明确地定义CSV数据的特征描述
features = Features({"类别": Value("string"), "通知内容": Value("string")})

# 使用提供的特征描述加载数据集
dataset = load_dataset("csv", data_files="../data/combined_data.csv", features=features)

# 预览数据集
print(dataset)

Found cached dataset csv (/Users/wangfiox/.cache/huggingface/datasets/csv/default-5873a662a0c59a4b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['类别', '通知内容'],
        num_rows: 307
    })
})


In [5]:
def preprocess_function(batch):
    # 对通知内容进行分词，并返回结果
    encoding = tokenizer(
        batch["通知内容"], truncation=True, padding="max_length", max_length=128
    )  # 分词，截断，填充
    encoding["labels"] = [
        label_to_id[label] for label in batch["类别"]
    ]  # 使用label_to_id将类别名转换为ID
    return encoding


# 使用map函数进行预处理
encoded_dataset = (
    dataset["train"]
    .map(preprocess_function, batched=True)
    .train_test_split(test_size=0.05)
)

train_dataset = encoded_dataset["train"]
test_dataset = encoded_dataset["test"]
print(train_dataset, test_dataset)

Loading cached processed dataset at /Users/wangfiox/.cache/huggingface/datasets/csv/default-5873a662a0c59a4b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-6887013febe0af5a.arrow


Dataset({
    features: ['类别', '通知内容', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 291
}) Dataset({
    features: ['类别', '通知内容', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 16
})


# 修改输出层

In [6]:
# 修改模型输出
num_labels = len(label_to_id)
model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, ignore_mismatched_sizes=True
)

# 打印模型的最后一层，验证是12分类
print(model.classifier)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([12, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([12]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Linear(in_features=768, out_features=12, bias=True)


# 激动人心的trainer.train()

In [15]:
from transformers import Trainer, TrainingArguments
import evaluate

# 定义评估函数
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = evaluate.load("accuracy").compute(predictions=preds, references=labels)
    return {"accuracy": acc}


# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    save_steps=50,
    logging_steps=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
)

# 创建Trainer对象
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


# 开始训练
trainer.train()

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.224534034729004, 'eval_accuracy': {'accuracy': 0.6875}, 'eval_runtime': 1.9915, 'eval_samples_per_second': 8.034, 'eval_steps_per_second': 0.502, 'epoch': 2.0}
{'train_runtime': 29.7401, 'train_samples_per_second': 29.354, 'train_steps_per_second': 0.504, 'train_loss': 0.5012121518452962, 'epoch': 3.0}


TrainOutput(global_step=15, training_loss=0.5012121518452962, metrics={'train_runtime': 29.7401, 'train_samples_per_second': 29.354, 'train_steps_per_second': 0.504, 'train_loss': 0.5012121518452962, 'epoch': 3.0})