In [1]:
import os
import subprocess
from importlib.metadata import version

# Packages this notebook depends on; print the installed version of each.
pkgs = ['datasets', 'sentence_transformers']
for pkg in pkgs:
    print(f"{pkg}:", version(pkg))

# Hugging Face configuration through environment variables: the hub endpoint
# plus one shared cache directory.
# NOTE(review): TRANSFORMERS_CACHE and HF_HOME point at the same path; newer
# transformers releases may flag TRANSFORMERS_CACHE as deprecated - confirm
# before removing it, since dropping it can change where models are cached.
hf_cache_dir = "/root/autodl-tmp/LLMs/.cache/huggingface"
os.environ.update({
    'HF_ENDPOINT': "https://hf-mirror.com",
    'TRANSFORMERS_CACHE': hf_cache_dir,
    'HF_HOME': hf_cache_dir,
})

# Source /etc/network_turbo inside a bash subshell and list the environment
# entries whose line contains "proxy" (presumably the host's proxy helper
# script - verify on the target machine).
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout

# Copy each NAME=value line into this process's environment, splitting on the
# first '=' only so values that contain '=' stay intact.
os.environ.update(
    entry.split('=', 1) for entry in output.splitlines() if '=' in entry
)

datasets: 4.0.0
sentence_transformers: 5.1.0


# Generating Contrastive Examples

In [3]:
from datasets import load_dataset

# First 50,000 examples of the MNLI training split, without the 'idx' column.
train_dataset = (
    load_dataset('glue', 'mnli', split='train')
    .select(range(50000))
    .remove_columns('idx')
)

# Label ids: 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset[0]

{'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'label': 1}

# Train Model

In [4]:
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Base checkpoint that will be fine-tuned into an embedding model.
embedding_model = SentenceTransformer('bert-base-uncased')

# Softmax classification loss over the three label ids of the training data.
embedding_dim = embedding_model.get_sentence_embedding_dimension()
train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_dim,
    num_labels=3,
)

# Similarity evaluator built from the STSB validation pairs. The labels are
# divided by 5 before being handed over as target scores.
val_sts = load_dataset('glue', 'stsb', split='validation')
sts_scores = [score / 5 for score in val_sts['label']]
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=sts_scores,
    main_similarity="cosine",
)

# Training configuration.
# NOTE(review): eval_steps is set but no evaluation strategy is configured, so
# periodic evaluation is probably not running during training - confirm
# whether that was intended.
training_options = {
    'output_dir': 'base_embedding_model',
    'num_train_epochs': 10,
    'per_device_train_batch_size': 64,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 100,
    'fp16': True,
    'eval_steps': 100,
    'logging_steps': 100,
}
args = SentenceTransformerTrainingArguments(**training_options)

# Wire model, data, loss and evaluator together, then run training.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)

trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Column 'hypothesis' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,1.0564
200,0.8926
300,0.8411
400,0.8088
500,0.7856
600,0.7599
700,0.7476
800,0.7344
900,0.5637
1000,0.5537


TrainOutput(global_step=7820, training_loss=0.19865243578486888, metrics={'train_runtime': 2845.9517, 'train_samples_per_second': 175.688, 'train_steps_per_second': 2.748, 'total_flos': 0.0, 'train_loss': 0.19865243578486888, 'epoch': 10.0})

In [47]:
# Run the similarity evaluator on the model and display the returned metrics.
evaluator(embedding_model)

{'pearson_cosine': 0.5078312671547078, 'spearman_cosine': 0.5599198383639518}

# Loss Functions

## Cosine Similarity

In [51]:
from datasets import Dataset, load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments


# Training data: first 50,000 MNLI training examples without the 'idx' column.
train_dataset = (
    load_dataset('glue', 'mnli', split='train')
    .select(range(50000))
    .remove_columns('idx')
)

# Collapse the three label ids into a binary float target:
# entailment (0) -> 1.0, neutral (1) and contradiction (2) -> 0.0
mapping = {2: 0, 1: 0, 0: 1}
binary_labels = [float(mapping[label]) for label in train_dataset['label']]
train_dataset = Dataset.from_dict({
    'sentence1': train_dataset['premise'],
    'sentence2': train_dataset['hypothesis'],
    'label': binary_labels,
})

# Fresh base checkpoint for this experiment.
embedding_model = SentenceTransformer('bert-base-uncased')

# Cosine-similarity loss against the binary float labels.
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Similarity evaluator built from the STSB validation pairs. The labels are
# divided by 5 before being handed over as target scores.
val_sts = load_dataset('glue', 'stsb', split='validation')
sts_scores = [score / 5 for score in val_sts['label']]
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=sts_scores,
    main_similarity="cosine",
)

# Training configuration.
# NOTE(review): eval_steps is set but no evaluation strategy is configured, so
# periodic evaluation is probably not running during training - confirm
# whether that was intended.
training_options = {
    'output_dir': 'cosineloss_embedding_model',
    'num_train_epochs': 5,
    'per_device_train_batch_size': 64,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 100,
    'fp16': True,
    'eval_steps': 100,
    'logging_steps': 100,
}
args = SentenceTransformerTrainingArguments(**training_options)

# Wire model, data, loss and evaluator together, then run training.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator,
)

trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.2228
200,0.1671
300,0.1594
400,0.1557
500,0.1498
600,0.15
700,0.146
800,0.1363
900,0.1029
1000,0.1015


TrainOutput(global_step=3910, training_loss=0.07639600438687502, metrics={'train_runtime': 1435.6475, 'train_samples_per_second': 174.137, 'train_steps_per_second': 2.724, 'total_flos': 0.0, 'train_loss': 0.07639600438687502, 'epoch': 5.0})

In [52]:
# Run the similarity evaluator on the model and display the returned metrics.
evaluator(embedding_model)

{'pearson_cosine': 0.6641990742941372, 'spearman_cosine': 0.6654706923445127}

## Multiple Negatives Ranking (MNR) Loss

In [73]:
import random

from datasets import Dataset, load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from tqdm import tqdm

# Keep only the entailment pairs (label 0) from the first 50,000 MNLI training
# examples; each premise/hypothesis pair becomes an anchor/positive pair.
mnli = load_dataset(
    'glue', 'mnli', split='train'
).select(range(50000))
mnli = mnli.remove_columns('idx')
mnli = mnli.filter(lambda x: x['label'] == 0)

# Soft negatives: the same hypotheses in shuffled order, so every anchor is
# paired with a hypothesis taken from another position in the pool. A dedicated
# seeded generator keeps the pairing reproducible across runs without touching
# the global random state.
# NOTE(review): a shuffled hypothesis can occasionally land back on its own
# row (negative == positive); rare, but worth filtering if it matters.
soft_negatives = list(mnli['hypothesis'])
random.Random(42).shuffle(soft_negatives)

# Column order matters for this loss: anchor, positive, negative.
triplets = {'anchor': [], 'positive': [], 'negative': []}
for row, soft_negative in tqdm(zip(mnli, soft_negatives), total=len(mnli)):
    triplets['anchor'].append(row['premise'])
    triplets['positive'].append(row['hypothesis'])
    triplets['negative'].append(soft_negative)
train_dataset = Dataset.from_dict(triplets)

# define model
embedding_model = SentenceTransformer('bert-base-uncased')

# define loss function
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# create an embedding similarity evaluator for STSB
# (labels are divided by 5 before being passed in as target scores)
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=[score/5 for score in val_sts['label']],
    main_similarity="cosine",
)

# define training arguments
# The output directory is specific to this experiment so its checkpoints do
# not land in (or overwrite) the cosine-loss run's directory.
# NOTE(review): eval_steps is set but no evaluation strategy is configured, so
# periodic evaluation is probably not running during training - confirm
# whether that was intended.
args = SentenceTransformerTrainingArguments(
    output_dir='mnrloss_embedding_model',
    num_train_epochs=5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

16875it [00:00, 17279.32it/s]
No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.4345
200,0.118
300,0.0856
400,0.0332
500,0.0295
600,0.018
700,0.011
800,0.0137
900,0.0094
1000,0.0088


TrainOutput(global_step=1320, training_loss=0.05968279396042679, metrics={'train_runtime': 612.6874, 'train_samples_per_second': 137.713, 'train_steps_per_second': 2.154, 'total_flos': 0.0, 'train_loss': 0.05968279396042679, 'epoch': 5.0})

In [74]:
# Run the similarity evaluator on the model and display the returned metrics.
evaluator(embedding_model)

{'pearson_cosine': 0.8138034604286173, 'spearman_cosine': 0.8161864547180582}

In [75]:
import gc
import torch

# Drop every notebook-level reference to the model before collecting. The
# trainer and the loss object were both constructed with
# `model=embedding_model`, so deleting only `embedding_model` would leave the
# model reachable through them and nothing could be reclaimed.
del trainer, train_loss, embedding_model

# Flush memory: collect unreachable Python objects first, then ask PyTorch to
# release its cached, unoccupied CUDA memory.
gc.collect()
torch.cuda.empty_cache()