# 导包

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding    
import torch
from datasets import load_dataset

# 加载数据

In [None]:
dataset = load_dataset('csv',data_files='../dataset/ChnSentiCorp_htl_all.csv',split='train')
dataset = dataset.filter(lambda x: x['review'] is not None) #数据清洗
dataset = dataset.train_test_split(test_size=0.2)  #划分数据集
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 1553
    })
})

# 数据预处理

In [9]:
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-large")

def preprocess_function(examples):  #数据处理
    tokenized_examples = tokenizer(examples["review"], truncation=True, max_length=128, padding="max_length")
    tokenized_examples["labels"] = examples["label"]
    return tokenized_examples

tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["review", "label"])
tokenized_datasets 

Map:   0%|          | 0/6212 [00:00<?, ? examples/s]

Map:   0%|          | 0/1553 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1553
    })
})

# 创建模型

In [18]:
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-large").to("cuda")
print(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

# 创建评估函数

In [11]:
import evaluate as eva
acc_metric = eva.load("accuracy")
f1_metric  = eva.load("f1")

In [12]:
def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1  = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc

# Trainer的使用
使用文档：https://huggingface.co/docs/transformers/trainer

## 显存优化方案：
    1. 减小batch_size 并设置梯度累计来模拟更大的batch_size
    2. 设置梯度检查点
    3. 选择参数更少的优化器
    4. 冻结部分层

In [19]:
train_args = TrainingArguments(
    output_dir='./check_points',     #结果输出路径
    per_device_train_batch_size=4,   #训练批次大小

    #显存优化
    gradient_accumulation_steps=32,  #梯度累积
    gradient_checkpointing=True,     #梯度检查点 前向传播时只存储部分关键激活值，反向传播时重新计算其余的激活值
    optim='adafactor',                #优化器设置

    per_device_eval_batch_size=16,    #验证批次大小
    num_train_epochs=1,
    logging_steps=10,                #日志记录间隔
    logging_strategy='steps',
    eval_strategy='epoch',           #按epoch间隔打印验证结果
    save_strategy='epoch',
    learning_rate=1e-5,
    weight_decay=0.01,
    metric_for_best_model='f1',  #指定评估指标
    load_best_model_at_end=True,       #训练结束是否加载最佳模型
    run_name='runs'
)

In [None]:
# 冻结BERT参数
for name, param in model.bert.named_parameters():
    param.requires_grad = False
    
trainer = Trainer(model=model, 
                args=train_args, 
                train_dataset=tokenized_datasets["train"], 
                eval_dataset=tokenized_datasets["test"],
                data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                compute_metrics=eval_metric,
                )

# 模型训练

In [16]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7382,0.726313,0.344495,0.045028


TrainOutput(global_step=49, training_loss=0.7475773461010992, metrics={'train_runtime': 96.808, 'train_samples_per_second': 64.168, 'train_steps_per_second': 0.506, 'total_flos': 1447289407248384.0, 'train_loss': 0.7475773461010992, 'epoch': 1.0})

# 模型评估

In [17]:
trainer.evaluate()

{'eval_loss': 0.7263129353523254,
 'eval_accuracy': 0.34449452672247266,
 'eval_f1': 0.0450281425891182,
 'eval_runtime': 16.6381,
 'eval_samples_per_second': 93.34,
 'eval_steps_per_second': 5.89,
 'epoch': 1.0}

# 模型预测

In [None]:
seq = '我觉得这家酒店不错'
model.eval()
with torch.inference_mode():
    inputs = tokenizer(seq, return_tensors="pt")
    inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    prdict = torch.argmax(logits, dim=-1)
    print(f'输入：{seq} \n 模型预测结果：{prdict}')

In [None]:
from transformers import  pipeline
model.config.id2label = {0: 'negative', 1: 'positive'}
pipe = pipeline("text-classification", model=model ,tokenizer=tokenizer, device=0)
pipe(seq)