# 判断两个句子是否相似

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,TrainingArguments,Trainer,DataCollatorWithPadding
from datasets import load_dataset

# 1. 加载数据

In [3]:
datas = load_dataset("shibing624/sts-sohu2021",'dda')
# datas.save_to_disk("./data/sts-sohu2021/dda")

Saving the dataset (0/1 shards):   0%|          | 0/10512 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
# 数据预览
datas,datas['train'][0]

(DatasetDict({
     train: Dataset({
         features: ['sentence1', 'sentence2', 'label'],
         num_rows: 1000
     })
     test: Dataset({
         features: ['sentence1', 'sentence2', 'label'],
         num_rows: 1000
     })
 }),
 {'sentence1': '独有英雄：大臣以权谋私，竟把上等煤换成次煤，皇太后气得要晕倒',
  'sentence2': '独有英雄：马爷帮小孩解围，哪料孩子母亲却让他喊爹，有好戏看了',
  'label': 1})

In [6]:
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

In [7]:
# 定义数据处理函数
def process_fun(examples):
    tokenized_examples=tokenizer(examples['sentence1'],examples['sentence2'],padding=True,max_length=64,return_tensors='pt')
    tokenized_examples["labels"] = [label for label in examples["label"]]
    return tokenized_examples

In [8]:
data_tokenizer = datas.map(process_fun,batched=True,remove_columns=datas["train"].column_names)

# 2. 加载模型


In [9]:
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /data1/model/chinese-macbert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3.创建评估函数

In [10]:
import evaluate

acc_metric = evaluate.load("accuracy")
f1_metirc = evaluate.load("f1")

def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metirc.compute(predictions=predictions, references=labels,average='macro')
    acc.update(f1)
    return acc

## 4. 设置训练参数

In [11]:
train_args = TrainingArguments(output_dir="./similarity_model",      # 输出文件夹
                               per_device_train_batch_size=32,  # 训练时的batch_size
                               per_device_eval_batch_size=32,  # 验证时的batch_size
                               logging_steps=10,                # log 打印的频率
                               evaluation_strategy="epoch",     # 评估策略
                               save_strategy="epoch",           # 保存策略
                               save_total_limit=3,              # 最大保存数
                               learning_rate=2e-5,              # 学习率
                               weight_decay=0.01,               # weight_decay
                               metric_for_best_model="f1",      # 设定评估指标
                               load_best_model_at_end=True)     # 训练完成后加载最优模型

## 5. 定义训练器

In [12]:
trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=data_tokenizer["train"],
                  eval_dataset=data_tokenizer["test"],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)

## 6. 训练

In [13]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4809,0.442214,0.798,0.71485
2,0.3576,0.364873,0.833,0.787432
3,0.2961,0.360661,0.837,0.801068


TrainOutput(global_step=96, training_loss=0.3915957373877366, metrics={'train_runtime': 125.9323, 'train_samples_per_second': 23.822, 'train_steps_per_second': 0.762, 'total_flos': 464041568340000.0, 'train_loss': 0.3915957373877366, 'epoch': 3.0})

## 7. 评估

In [14]:
eval_result = trainer.evaluate(data_tokenizer["test"])
eval_result

{'eval_loss': 0.36066102981567383,
 'eval_accuracy': 0.837,
 'eval_f1': 0.8010678871090771,
 'eval_runtime': 9.8404,
 'eval_samples_per_second': 101.622,
 'eval_steps_per_second': 3.252,
 'epoch': 3.0}

## 8.推理

In [16]:
from transformers import pipeline, TextClassificationPipeline
model.config.id2label = {0: "不相似", 1: "相似"}
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer,device=0)
result = pipe({"text": "我喜欢北京", "text_pair": "北京是个好地方"}, function_to_apply="none")
result

{'label': '相似', 'score': 0.049160078167915344}