# 交互策略
input: `[CLS] [Sentence1] [SEP] [Sentence2] [SEP]`  
输出：  `Similarity(0/1)`

## Step1 导入相关包

In [1]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset
import evaluate

# step2 加载数据集

In [2]:
ds = load_dataset('json', data_files='../dataset/SimCLUE-dev.json', split='train[0:10000]')
ds

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

# step3 划分数据集

In [3]:
ds = ds.train_test_split(test_size=0.2)
ds

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

In [4]:
ds['train'][0]

{'sentence1': '三人；一人一边打鼓，一人弹钢琴，另一人骑着踏板和方向盘，在街上移动一种移动乐队。',
 'sentence2': '三个人在外面玩',
 'label': '1'}

# step4 数据预处理

In [5]:
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
tokenizer

BertTokenizerFast(name_or_path='hfl/chinese-macbert-base', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [6]:
def propress_function(examples):
    tokenized_examples = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=128)
    tokenized_examples["label"] = [float(label) for label in examples["label"]]
    return tokenized_examples

In [7]:
tokenized_ds = ds.map(propress_function, remove_columns=ds['train'].column_names, batched=True)
print(tokenized_ds['train'][0])

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'label': 1.0, 'input_ids': [101, 676, 782, 8039, 671, 782, 671, 6804, 2802, 7961, 8024, 671, 782, 2486, 7167, 4433, 8024, 1369, 671, 782, 7744, 4708, 6672, 3352, 1469, 3175, 1403, 4669, 8024, 1762, 6125, 677, 4919, 1220, 671, 4905, 4919, 1220, 727, 7339, 511, 102, 676, 702, 782, 1762, 1912, 7481, 4381, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

# step5 创建模型

In [9]:
model = AutoModelForSequenceClassification.from_pretrained('hfl/chinese-macbert-base', num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# step6 创建评估函数

In [10]:
acc_metirc = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

In [11]:
def eval_metric(pred):
    predictions, labels = pred
    predictions = [int( p>0.5) for p in predictions]
    labels = [int(l) for l in labels]
    acc = acc_metirc.compute(predictions=predictions, references=labels)
    f1  = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc

# step7 创建训练参数

In [12]:
args = TrainingArguments(output_dir="./cross_model",      # 输出文件夹
                        per_device_train_batch_size=32,  # 训练时的batch_size
                        per_device_eval_batch_size=32,   # 验证时的batch_size
                        logging_steps=10,                # log 打印的频率
                        eval_strategy="epoch",           # 评估策略
                        save_strategy="epoch",           # 保存策略
                        save_total_limit=3,              # 最大保存数
                        learning_rate=2e-5,              # 学习率
                        weight_decay=0.01,               # weight_decay
                        metric_for_best_model="f1",      # 设定评估指标
                        load_best_model_at_end=True)     # 训练完成后加载最优模型
args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
fp16=False,
fp16

# step8 创建训练器

In [13]:
trainer = Trainer(
    model=model,
    args = args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric
)

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.177,0.143002,0.7875,0.783495
2,0.1166,0.130868,0.809,0.78491
3,0.1192,0.139981,0.813,0.78653


  predictions = [int( p>0.5) for p in predictions]
  predictions = [int( p>0.5) for p in predictions]
  predictions = [int( p>0.5) for p in predictions]


TrainOutput(global_step=750, training_loss=0.14015340844790142, metrics={'train_runtime': 227.6163, 'train_samples_per_second': 105.441, 'train_steps_per_second': 3.295, 'total_flos': 1210365764936448.0, 'train_loss': 0.14015340844790142, 'epoch': 3.0})

# step9 模型评估

In [15]:
trainer.evaluate(tokenized_ds['test'])

  predictions = [int( p>0.5) for p in predictions]


{'eval_loss': 0.13998089730739594,
 'eval_accuracy': 0.813,
 'eval_f1': 0.7865296803652968,
 'eval_runtime': 4.9521,
 'eval_samples_per_second': 403.867,
 'eval_steps_per_second': 12.722,
 'epoch': 3.0}

# step10 模型预测

In [16]:
from transformers import pipeline

In [24]:
model.config.id2label = {0: "不相似", 1: "相似"}

In [25]:
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

Device set to use cuda:0


In [28]:
result = pipe({"text": "你人很好", "text_pair": "你非常nice"}, function_to_apply="None")
result['label'] = "相似" if result['score'] > 0.5 else "不相似"
result

{'label': '相似', 'score': 0.9205779433250427}