## 文本相似度

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the local JSON file of sentence pairs as a single "train" split.
dataset = load_dataset(
    "json",
    data_files="./train_pair_1w.json",
    split="train",
)
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [4]:
# Hold out 20% of the rows for evaluation. A fixed seed makes the shuffle,
# and therefore the train/test partition, identical on every
# Restart & Run All, so metrics are comparable between runs.
# NOTE(review): the variable name `datasets` is the same as the library's
# module name; it is kept because later cells reference it.
datasets = dataset.train_test_split(test_size=0.2, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

In [11]:
import torch

# Load the tokenizer for the "hfl/chinese-macbert-base" checkpoint; process_func below calls it.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

def process_func(examples):
    """Tokenize a batch of sentence pairs and attach float labels.

    Args:
        examples: Batch mapping with the keys "sentence1", "sentence2" and
            "label", each holding one sequence of values per column.

    Returns:
        The object returned by the module-level ``tokenizer`` (called with
        truncation enabled and ``max_length=128``), with an extra "labels"
        entry holding every label converted to ``float``.
    """
    encoded = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=128,
    )
    # Labels are stored as floats because the model is trained on a single output value.
    encoded["labels"] = list(map(float, examples["label"]))
    return encoded

# Tokenize both splits in batches and drop the original text/label columns,
# leaving only what process_func returns.
tokenized_dataset = datasets.map(
    process_func,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_dataset

Map: 100%|██████████| 8000/8000 [00:01<00:00, 4886.56 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 4952.84 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [12]:
# Sequence-classification head with a single output value.
# NOTE(review): with num_labels=1 the library is expected to treat the task as
# regression (MSE loss on the float labels) — confirm against the Transformers docs.
model = AutoModelForSequenceClassification.from_pretrained(
    "hfl/chinese-macbert-base",
    num_labels=1,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import evaluate

# Load the accuracy and F1 metric objects used by eval_metric.
# The spelling `f1_metirc` is kept as-is because eval_metric refers to that name.
acc_metric, f1_metirc = (evaluate.load(metric_id) for metric_id in ("accuracy", "f1"))

In [13]:
def eval_metric(pred, threshold=0.5):
    """Compute accuracy and F1 by thresholding the model's raw scores.

    Args:
        pred: A ``(predictions, labels)`` pair, as handed to ``compute_metrics``
            by the Trainer. The model is created with ``num_labels=1``, so the
            predictions are expected as an (N, 1) array; an (N,) array is
            accepted as well.
        threshold: Scores strictly greater than this value are counted as
            class 1, everything else as class 0. Defaults to 0.5.

    Returns:
        A dict combining the results of the module-level ``acc_metric`` and
        ``f1_metirc`` metric objects.
    """
    import numpy as np  # local import so the cell does not depend on another cell's imports

    predictions, labels = pred
    # Flatten before comparing: calling int() on a one-element array (one row of
    # an (N, 1) prediction matrix) is deprecated in recent NumPy releases.
    scores = np.asarray(predictions).reshape(-1)
    predictions = (scores > threshold).astype(int).tolist()
    # Labels arrive as floats (see process_func); the metrics need integer classes.
    labels = np.asarray(labels).reshape(-1).astype(int).tolist()

    # Build a fresh dict instead of mutating the one returned by the metric.
    metrics = dict(acc_metric.compute(predictions=predictions, references=labels))
    metrics.update(f1_metirc.compute(predictions=predictions, references=labels))
    return metrics


In [14]:
# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best F1 when training finishes.
train_args = TrainingArguments(
    output_dir="./cross_model",        # output directory
    per_device_train_batch_size=32,    # batch size during training
    per_device_eval_batch_size=32,     # batch size during evaluation
    logging_steps=10,                  # how often (in steps) logs are printed
    eval_strategy="epoch",             # evaluation strategy
    save_strategy="epoch",             # checkpoint saving strategy
    save_total_limit=3,                # maximum number of checkpoints kept
    learning_rate=2e-5,                # learning rate
    weight_decay=0.01,                 # weight decay
    metric_for_best_model="f1",        # metric used to pick the best model
    load_best_model_at_end=True,       # load the best model once training completes
)
train_args

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
fp16=False,
fp1

In [15]:
from transformers import DataCollatorWithPadding
# Assemble the Trainer. `processing_class` replaces the deprecated `tokenizer`
# keyword, so the tokenizer is still saved alongside checkpoints without
# triggering the deprecation warning.
# DataCollatorWithPadding pads each batch dynamically, since process_func
# truncates but does not pad.
trainer = Trainer(
    model=model,
    args=train_args,
    processing_class=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)

  trainer = Trainer(model=model,


In [16]:
# Run fine-tuning with the Trainer configured above.
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [17]:
# Score the current model on the held-out split using eval_metric.
trainer.evaluate(eval_dataset=tokenized_dataset["test"])



KeyboardInterrupt: 

In [18]:
from transformers import pipeline

# Human-readable names for the two outcomes. The label that is finally reported
# is assigned in the inference cell by thresholding the raw score.
model.config.id2label = {0: "不相似", 1: "相似"}

# Wrap the in-memory model and tokenizer in a text-classification pipeline.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

Device set to use cpu


In [27]:
# Ask for the raw model output (no sigmoid/softmax) for one sentence pair,
# then map it to a label with the same 0.5 cut-off used in eval_metric.
result = pipe(
    {"text": "北京不错", "text_pair": "北京真不错"},
    function_to_apply="none",
)
result["label"] = "相似" if result["score"] > 0.5 else "不相似"
result

{'label': '相似', 'score': 0.6953061819076538}