In [1]:
from importlib.metadata import version

## 当前代码需要用到的包
pkgs = ['datasets', 'sentence_transformers']

for pkg in pkgs:
    print(f"{pkg}:", version(pkg))


import os
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
os.environ['TRANSFORMERS_CACHE'] = "/root/autodl-tmp/LLMs/.cache/huggingface"
os.environ['HF_HOME'] = "/root/autodl-tmp/LLMs/.cache/huggingface"

import subprocess

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

datasets: 4.0.0
sentence_transformers: 5.1.0


# Supervised

In [4]:
from datasets import Dataset, load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer


# prepare datasets
train_dataset = load_dataset(
    'glue', 'mnli', split='train'
).select(range(50000))
train_dataset = train_dataset.remove_columns('idx')
# (neutral/contradiction)=0 and (entailment)=1
mapping = {2: 0, 1: 0, 0: 1}
train_dataset = Dataset.from_dict({
    'sentence1': train_dataset['premise'],
    'sentence2': train_dataset['hypothesis'],
    'label': [float(mapping[label]) for label in train_dataset['label']]
})

# define model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# define loss function
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# create an embedding similarity evaluator for STSB
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=[score/5 for score in val_sts['label']],
    main_similarity="cosine",
)

# define training arguments
args = SentenceTransformerTrainingArguments(
    output_dir='finetuned_embedding_model',
    num_train_epochs=5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.2113
200,0.1686
300,0.1609
400,0.162
500,0.1663
600,0.1578
700,0.1599
800,0.1461
900,0.0847
1000,0.089


TrainOutput(global_step=3910, training_loss=0.0772078822869474, metrics={'train_runtime': 319.7799, 'train_samples_per_second': 781.788, 'train_steps_per_second': 12.227, 'total_flos': 0.0, 'train_loss': 0.0772078822869474, 'epoch': 5.0})

In [5]:
evaluator(embedding_model)

{'pearson_cosine': 0.8433166340810299, 'spearman_cosine': 0.8424048535945733}

# Augmented SBERT

In [9]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader
from sentence_transformers.cross_encoder import CrossEncoder

# prepare a small set of 10000 documents for the cross-encoder
dataset = load_dataset('glue', 'mnli', split='train').select(range(10000))
mapping = {2: 0, 1: 0, 0: 1}

# data loader
gold_examples = [
    InputExample(texts=[row['premise'], row['hypothesis']], label=mapping[row['label']]) for row in tqdm(dataset)
]
gold_dataloader = NoDuplicatesDataLoader(gold_examples, batch_size=64)
gold = pd.DataFrame(
    {
        "sentence1": dataset['premise'],
        'sentence2': dataset['hypothesis'],
        'label': [mapping[label] for label in dataset['label']]
    }
)

# train a cross-encoder on the gold dataset
cross_encoder = CrossEncoder('bert-base-uncased', num_labels=2)
cross_encoder.fit(
    train_dataloader=gold_dataloader,
    epochs=5,
    show_progress_bar=True,
    warmup_steps=100,
    use_amp=False
)

100%|██████████| 10000/10000 [00:00<00:00, 26164.92it/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.3538


In [10]:
import numpy as np

# prepare the silver dataset by predicting labels with cross-encoder
silver = load_dataset('glue', 'mnli', split='train').select(range(10000, 50000))
pairs = list(zip(silver['premise'], silver['hypothesis']))

# label the sentence pairs using our fine-tuned cross-encoder
output = cross_encoder.predict(
    pairs, apply_softmax=True,
    show_progress_bar=True
)
silver = pd.DataFrame(
    {
        'sentence1': silver['premise'],
        'sentence2': silver['hypothesis'],
        'label': np.argmax(output, axis=1)
    }
)

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

In [15]:
from datasets import Dataset, load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer

# define model
embedding_model = SentenceTransformer('bert-base-uncased')

# define loss function
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# combine gold + silver
data = pd.concat([gold, silver], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=['sentence1', 'sentence2'], keep='first')
train_dataset = Dataset.from_pandas(data, preserve_index=False)

# create an embedding similarity evaluator for STSB
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=[score/5 for score in val_sts['label']],
    main_similarity="cosine",
)

# define training arguments
args = SentenceTransformerTrainingArguments(
    output_dir='argmented_embedding_model',
    num_train_epochs=5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train(
    resume_from_checkpoint=True
)

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
3100,0.0194
3200,0.0192
3300,0.0188
3400,0.0192
3500,0.0187
3600,0.0188
3700,0.0188
3800,0.0182
3900,0.0185


TrainOutput(global_step=3910, training_loss=0.0043829644656242315, metrics={'train_runtime': 303.4046, 'train_samples_per_second': 823.949, 'train_steps_per_second': 12.887, 'total_flos': 0.0, 'train_loss': 0.0043829644656242315, 'epoch': 5.0})

In [16]:
evaluator(embedding_model)

{'pearson_cosine': 0.6587175066431749, 'spearman_cosine': 0.6576858933778339}

In [17]:
import gc
import torch
del embedding_model

# Flush memory gc.collect() torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()