In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

In [8]:
dataset = load_dataset("json", data_files="./train_pair_1w.json", split="train")

In [9]:
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [10]:
dataset[0]


{'sentence1': '找一部小时候的动画片', 'sentence2': '求一部小时候的动画片。谢了', 'label': '1'}

In [11]:
datasets = dataset.train_test_split(test_size=0.2)

In [12]:
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

In [13]:
import torch

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

In [14]:
def process_func(examples):
    sentences = []
    labels = []
    for sent1, sent2, label in zip(examples["sentence1"], examples["sentence2"], examples["label"]):
        sentences.append(sent1)
        sentences.append(sent2)
        labels.append(1 if int(label) == 1 else -1)
    tokenized_examples = tokenizer(sentences, truncation=True, max_length=128, padding="max_length")
    # (batch_size*2, 128) -> (batch_size, 2, 128)
    # 不同于拼接的方案，这里是直接将两个sent分别进行tokenizer化, 然后分组
    tokenized_examples = {k: [v[i: i + 2] for i in range(0, len(v), 2)]for k, v in tokenized_examples.items()}
    tokenized_examples["labels"] = labels
    return tokenized_examples



In [15]:
tokenized_datasets = datasets.map(process_func, batched=True, remove_columns=datasets["train"].column_names)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [16]:
print(tokenized_datasets["train"][0])

{'input_ids': [[101, 2533, 1568, 8024, 2533, 1568, 8024, 2769, 1714, 6887, 8024, 1353, 3633, 2769, 812, 4500, 679, 4708, 5314, 5632, 2346, 2823, 7937, 4172, 8039, 1420, 4708, 8024, 2207, 1995, 100, 100, 6381, 857, 8024, 2769, 3221, 6432, 1168, 976, 1168, 4638, 131, 1963, 3362, 872, 2802, 5050, 1086, 1343, 1461, 1580, 2255, 2411, 8024, 3300, 2769, 7373, 4708, 738, 1962, 8024, 3766, 3300, 2769, 7373, 4708, 738, 5387, 8024, 2769, 2218, 6206, 1343, 1440, 6401, 3360, 3142, 1044, 4495, 8039, 7370, 7478, 2533, 1168, 800, 4638, 1398, 2692, 8024, 872, 6656, 872, 6134, 2475, 4638, 6929, 4905, 779, 2166, 1068, 5143, 2218, 679, 6387, 1086, 2612, 1908, 1568, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1962, 1568, 8024, 1962, 1568, 8024, 2769, 1373, 4708, 8024, 1353, 3633, 2769, 812, 4500, 679, 4708, 5314, 5632, 2346, 2823, 7937, 4172, 8039, 872, 1420, 4708, 8024, 2207, 1995, 671, 6381, 857, 8024, 2769, 6432, 6413, 1377, 3221, 5050, 3144, 4638, 671, 1963, 3362, 872, 2802, 505

In [17]:
import numpy as np
np.array(tokenized_datasets["train"][:10]["input_ids"]).shape

(10, 2, 128)

In [18]:
from transformers import BertForSequenceClassification, BertPreTrainedModel, BertModel
from transformers.configuration_utils import PretrainedConfig
from typing import Optional
import torch.nn as nn
import torch

In [19]:
# BertForxxxx, 都是继承自BertPreTrainedModel
class DualModel(BertPreTrainedModel):
    def __init__(self, config: PretrainedConfig, *input, **kwargs):
        super().__init__(config, *input, **kwargs)
        self.bert = BertModel(config)
        # Initialize weights and apply final processing
        self.post_init()
    
    def forward(self, 
                input_ids: Optional[torch.Tensor] = None,
                attention_mask: Optional[torch.Tensor] = None,
                token_type_ids: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.Tensor] = None,
                head_mask: Optional[torch.Tensor] = None,
                inputs_embeds: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None,
                output_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                return_dict: Optional[bool] = None,):
        
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 1.分别获得sentA和sentB的token表达
        sentA_input_ids = input_ids[:, 0, :]  # (b, 128)
        sentB_input_ids = input_ids[:, 1, :]
        sentA_attention_mask = attention_mask[:, 0, :]
        sentB_attention_mask = attention_mask[:, 1, :] 
        sentA_token_type_ids = token_type_ids[:, 0, :]
        sentB_token_type_ids = token_type_ids[:, 1, :]        

        # 2.获得sentA和sentB的向量表征
        sentA_outputs = self.bert(
            sentA_input_ids,
            attention_mask=sentA_attention_mask,
            token_type_ids=sentA_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sentA_pooled_output = sentA_outputs[1]


        sentB_outputs = self.bert(sentB_input_ids,
                                    attention_mask=sentB_attention_mask,
                                    token_type_ids=sentB_token_type_ids,
                                    position_ids=position_ids,
                                    head_mask=head_mask,
                                    inputs_embeds=inputs_embeds,
                                    output_attentions=output_attentions,
                                    output_hidden_states=output_hidden_states,
                                    return_dict=return_dict,
        )
        sentB_pooled_output = sentB_outputs[1]
        # 3.计算余弦相似度(logits)
        cos_func = nn.CosineSimilarity()
        cos_sim = cos_func(sentA_pooled_output, sentB_pooled_output)  # (batch_size, )

        # 4.计算loss
        loss = None
        if labels is not None:
            loss_func = nn.CosineEmbeddingLoss(margin=0.3)
            loss = loss_func(sentA_pooled_output, sentB_pooled_output, labels)
        output = (cos_sim, )
        # 如果存在标签则同时返回similarities和loss, 否则只返回similarities(推理用)
        # 需要优先返回loss
        return (loss, ) + output if loss is not None else output

model = DualModel.from_pretrained("hfl/chinese-macbert-base")

In [20]:
import evaluate

acc_metric = evaluate.load("accuracy")
f1_metirc = evaluate.load("f1")

In [1]:
def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = [int(p > 0.7) for p in predictions]
    labels = [int(l > 0) for l in labels]
    # predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metirc.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc

In [23]:
train_args = TrainingArguments(output_dir="./dual_model",      # 输出文件夹
                               per_device_train_batch_size=32,  # 训练时的batch_size
                               per_device_eval_batch_size=32,  # 验证时的batch_size
                               logging_steps=10,                # log 打印的频率
                               evaluation_strategy="epoch",     # 评估策略
                               save_strategy="epoch",           # 保存策略
                               save_total_limit=3,              # 最大保存数
                               learning_rate=2e-5,              # 学习率
                               weight_decay=0.01,               # weight_decay
                               metric_for_best_model="f1",      # 设定评估指标
                               load_best_model_at_end=True)     # 训练完成后加载最优模型

In [24]:
trainer = Trainer(model=model, 
                  args=train_args, 
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"], 
                  compute_metrics=eval_metric)

In [None]:
trainer.train()

In [25]:
trainer.evaluate(tokenized_datasets["test"])

  0%|          | 0/63 [00:00<?, ?it/s]

ValueError: Mismatch in the number of predictions (63) and references (2000)

In [3]:
class SentenceSimilarityPipeline:

    def __init__(self, model, tokenizer) -> None:
        self.model = model.bert
        self.tokenizer = tokenizer
        self.device = model.device

    def preprocess(self, senA, senB):
        return self.tokenizer([senA, senB], max_length=128, truncation=True, return_tensors="pt", padding=True)

    def predict(self, inputs):
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        return self.model(**inputs)[1]  # [2, 768]

    def postprocess(self, logits):
        cos = nn.CosineSimilarity()(logits[None, 0, :], logits[None,1, :]).squeeze().cpu().item()
        return cos

    def __call__(self, senA, senB, return_vector=False):
        inputs = self.preprocess(senA, senB)
        logits = self.predict(inputs)
        result = self.postprocess(logits)
        if return_vector:
            return result, logits
        else:
            return result

In [4]:
pipe = SentenceSimilarityPipeline(model, tokenizer)

NameError: name 'model' is not defined

In [5]:
pipe("我喜欢北京", "明天不行", return_vector=True)

NameError: name 'pipe' is not defined