## 文本相似度

In [44]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

In [45]:
# Read the sentence-pair corpus from a local JSON file as a single "train" split.
dataset = load_dataset(
    "json",
    data_files="./train_pair_1w.json",
    split="train",
)
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [46]:
# Hold out 20% of the rows for evaluation. The seed is pinned so that the
# train/test membership (and therefore every metric reported further down)
# is the same on each Restart & Run All.
datasets = dataset.train_test_split(test_size=0.2, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

In [47]:
import torch

# Tokenizer for the "hfl/chinese-macbert-base" checkpoint; used by process_func
# below and later handed to the Trainer and the inference pipeline.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

def process_func(examples):
    """Tokenize a batch of sentence pairs and attach integer labels.

    Args:
        examples: A batch with the columns "sentence1", "sentence2" and
            "label" (as produced by ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the pairs, truncated to at most 128 tokens,
        with an extra "labels" entry holding each label converted to ``int``.
        No padding is applied here.
    """
    encoded = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=128,
    )
    encoded["labels"] = list(map(int, examples["label"]))
    return encoded

# Tokenize both splits in batches and drop the raw text/label columns so that
# only the fields produced by process_func remain.
tokenized_dataset = datasets.map(
    process_func,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_dataset

loading configuration file config.json from cache at C:\Users\32721\.cache\huggingface\hub\models--hfl--chinese-macbert-base\snapshots\a986e004d2a7f2a1c2f5a3edef4e20604a974ed1\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}



DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [48]:
# Load the pretrained encoder with a sequence-classification head.
# num_labels is stated explicitly so the head size does not depend on the
# checkpoint config's default; this notebook uses two classes (0 / 1).
model = AutoModelForSequenceClassification.from_pretrained(
    "hfl/chinese-macbert-base",
    num_labels=2,
)

loading configuration file config.json from cache at C:\Users\32721\.cache\huggingface\hub\models--hfl--chinese-macbert-base\snapshots\a986e004d2a7f2a1c2f5a3edef4e20604a974ed1\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}



Attempting to create safetensors variant


In [49]:
import evaluate

# Metric objects looked up as globals by eval_metric.
acc_metric = evaluate.load("accuracy")
# NOTE(review): "f1_metirc" is a misspelling of "f1_metric". The name is kept
# because eval_metric references it under this exact spelling; rename both
# together if it is ever cleaned up.
f1_metirc = evaluate.load("f1")

In [50]:
def eval_metric(pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        pred: A pair that unpacks to ``(predictions, labels)``; the
            predictions are reduced to class ids with an argmax over the
            last axis.

    Returns:
        The result of the accuracy metric, updated in place with the
        entries returned by the F1 metric.
    """
    scores_per_class, references = pred
    predicted_ids = scores_per_class.argmax(axis=-1)
    results = acc_metric.compute(predictions=predicted_ids, references=references)
    results.update(f1_metirc.compute(predictions=predicted_ids, references=references))
    return results


In [65]:
train_args = TrainingArguments(
    # Checkpointing
    output_dir="./cross_model",
    save_strategy="steps",
    # Batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Optimizer
    learning_rate=2e-5,
    weight_decay=0.01,
    # Logging
    logging_strategy="steps",  # emit training logs on a step schedule only
    logging_steps=5,           # one log entry every 5 steps
    log_level="info",          # make sure log messages are actually shown
    disable_tqdm=False,
    # Set explicitly to avoid the version warning.
    # NOTE(review): "all" presumably turns on every installed tracking
    # integration - confirm that is intended rather than "none".
    report_to="all",
    # Key setting: no evaluation is run during training.
    eval_strategy="no",
)
train_args

PyTorch: setting up devices


TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=False,
fp16=False,
fp16_

In [66]:
from transformers import DataCollatorWithPadding
# Assemble the Trainer from the model, arguments, tokenized splits and metrics.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated in the transformers version used here and
# emits a FutureWarning.
trainer = Trainer(
    model=model,
    args=train_args,
    processing_class=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # process_func does not pad, so each batch is padded by this collator.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)

  trainer = Trainer(model=model,


In [67]:
# Fine-tune the model with the Trainer built in the previous cell.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the
# cells after it may reflect a partially trained model - re-run to completion
# before trusting the numbers below.
trainer.train()

***** Running training *****
  Num examples = 8,000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 102,269,186


Step,Training Loss
5,0.0709
10,0.0419
15,0.2451
20,0.5909


KeyboardInterrupt: 

In [19]:
# Evaluate on the held-out split; the result includes the loss together with
# the accuracy and F1 values produced by eval_metric.
trainer.evaluate(tokenized_dataset["test"])



{'eval_loss': 0.2509779930114746,
 'eval_accuracy': 0.8985,
 'eval_f1': 0.8701215611004478}

In [20]:
from transformers import pipeline

# Human-readable class names for the two label ids. label2id is set alongside
# id2label so the two mappings in the config stay consistent with each other
# (previously only id2label was replaced).
model.config.id2label = {0: "不相似", 1: "相似"}
model.config.label2id = {name: idx for idx, name in model.config.id2label.items()}

# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

Device set to use cpu


In [68]:
# Sanity-check the pipeline on one sentence pair, passed as "text" / "text_pair".
pipe({"text":"我喜欢北京","text_pair":"北京真不错"})

{'label': '不相似', 'score': 0.9936758875846863}