In [None]:
!pip install -q datasets sentence-transformers accelerate

In [1]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.losses import CoSENTLoss, MultipleNegativesRankingLoss, SoftmaxLoss


# 1. Load the base model to fine-tune, attaching the (optional) model card data to it
model = SentenceTransformer(
    "distilbert/distilbert-base-uncased",
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        # The name describes the real training mix used in this notebook
        # (AllNLI, STSB, Quora duplicates, Natural Questions), not only AllNLI triplets.
        model_name="DistilBERT base uncased model trained on AllNLI, STSB, Quora duplicates and Natural Questions",
    ),
)

2024-07-13 10:07:22.246310: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-13 10:07:22.246365: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-13 10:07:22.247833: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# 2. Load several Datasets to train with
# Sample budget per dataset. Quora and Natural Questions have their dev rows cut from the
# "train" split directly after the training slice, so both slice strings are derived from
# these two constants: changing a size can never make train and dev rows overlap.
TRAIN_SAMPLES = 10_000
DEV_SAMPLES = 1_000
train_split = f"train[:{TRAIN_SAMPLES}]"
heldout_split = f"train[{TRAIN_SAMPLES}:{TRAIN_SAMPLES + DEV_SAMPLES}]"

# (anchor, positive)
all_nli_pair_train = load_dataset("sentence-transformers/all-nli", "pair", split=train_split)

# (sentence1, sentence2) + score
all_nli_pair_score_train = load_dataset("sentence-transformers/all-nli", "pair-score", split=train_split)

# (anchor, positive, negative)
all_nli_triplet_train = load_dataset("sentence-transformers/all-nli", "triplet", split=train_split)

# (sentence1, sentence2) + score
stsb_pair_score_train = load_dataset("sentence-transformers/stsb", split=train_split)

# (anchor, positive)
quora_pair_train = load_dataset("sentence-transformers/quora-duplicates", "pair", split=train_split)

# (query, answer)
natural_questions_train = load_dataset("sentence-transformers/natural-questions", split=train_split)

# All training datasets go into one dictionary keyed by dataset name;
# the same names are used as keys when mapping datasets to loss functions.
train_dataset = {
    "all-nli-pair": all_nli_pair_train,
    "all-nli-pair-score": all_nli_pair_score_train,
    "all-nli-triplet": all_nli_triplet_train,
    "stsb": stsb_pair_score_train,
    "quora": quora_pair_train,
    "natural-questions": natural_questions_train,
}

# 3. Load several Datasets to evaluate with
# (anchor, positive, negative)
all_nli_triplet_dev = load_dataset("sentence-transformers/all-nli", "triplet", split="dev")
# (sentence1, sentence2, score)
stsb_pair_score_dev = load_dataset("sentence-transformers/stsb", split="validation")
# (anchor, positive) -- held-out rows taken right after the training slice
quora_pair_dev = load_dataset("sentence-transformers/quora-duplicates", "pair", split=heldout_split)
# (query, answer) -- held-out rows taken right after the training slice
natural_questions_dev = load_dataset("sentence-transformers/natural-questions", split=heldout_split)

# A dictionary works for the evaluation datasets too, but it is not required:
# a single dataset, or no evaluation dataset at all, would also be accepted.
eval_dataset = {
    "all-nli-triplet": all_nli_triplet_dev,
    "stsb": stsb_pair_score_dev,
    "quora": quora_pair_dev,
    "natural-questions": natural_questions_dev,
}

# 4. Load several loss functions to train with
# Used for the (anchor, positive) and (anchor, positive, negative) datasets
mnrl_loss = MultipleNegativesRankingLoss(model)
# Used for the (sentence_A, sentence_B) + score datasets
cosent_loss = CoSENTLoss(model)

# Tell the trainer which loss belongs to which dataset name. A single loss object
# (instead of a mapping) is enough when every dataset should use the same loss.
losses = dict(
    [
        ("all-nli-pair", mnrl_loss),
        ("all-nli-pair-score", cosent_loss),
        ("all-nli-triplet", mnrl_loss),
        ("stsb", cosent_loss),
        ("quora", mnrl_loss),
        ("natural-questions", mnrl_loss),
    ]
)



In [3]:
# 5. Specify the training arguments
args = SentenceTransformerTrainingArguments(
    # Required: where checkpoints and the final model are written
    output_dir="models/distilbert-base-uncased-all-nli",
    # Schedule and optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    auto_find_batch_size=False,
    # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Precision: set fp16 to False if your GPU can't run on FP16; set bf16 to True if your GPU supports BF16
    fp16=True,
    bf16=False,
    # Tracking/debugging: evaluate, checkpoint and log once per epoch, keeping at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_strategy="epoch",
    # Optionally pass run_name="..." here; it will be used in W&B if `wandb` is installed.
)

In [4]:
# Put Weights & Biases in disabled mode so this run does not log to it.
# wandb is not among the packages installed at the top of this notebook, so the import
# is guarded: on an environment without wandb there is simply nothing to disable.
try:
    import wandb
except ImportError:
    wandb = None

if wandb is not None:
    wandb.init(mode="disabled")



In [5]:
# 6. Create a trainer & train
# The trainer ties together the model, the training arguments, the named train/eval
# datasets and the per-dataset loss mapping. No separate `evaluator` is passed here.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=losses,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Left as the last expression so the notebook displays the returned training summary
trainer.train()



Epoch,Training Loss,Validation Loss,All-nli-triplet Loss,Stsb Loss,Quora Loss,Natural-questions Loss
1,1.7873,No log,0.775467,6.950783,0.038007,0.058657


Computing widget examples:   0%|          | 0/3 [00:00<?, ?example/s]

TrainOutput(global_step=3485, training_loss=1.787275824964132, metrics={'train_runtime': 995.4569, 'train_samples_per_second': 56.003, 'train_steps_per_second': 3.501, 'total_flos': 0.0, 'train_loss': 1.787275824964132, 'epoch': 1.0})

In [6]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction


# Evaluate on the test split of STSB (https://huggingface.co/datasets/sentence-transformers/stsb)
test_dataset = load_dataset("sentence-transformers/stsb", split="test")

# The evaluator receives both sentence columns and the gold scores;
# cosine similarity is selected as the main similarity function.
test_evaluator = EmbeddingSimilarityEvaluator(
    test_dataset["sentence1"],
    test_dataset["sentence2"],
    test_dataset["score"],
    name="distilbert-base-uncased-all-nli-sts-test",
    main_similarity=SimilarityFunction.COSINE,
)

# Left as the last expression so the notebook displays the resulting metrics
test_evaluator(model)

{'distilbert-base-uncased-all-nli-sts-test_pearson_cosine': 0.7774674252504533,
 'distilbert-base-uncased-all-nli-sts-test_spearman_cosine': 0.7978656569955651,
 'distilbert-base-uncased-all-nli-sts-test_pearson_manhattan': 0.8002165724113339,
 'distilbert-base-uncased-all-nli-sts-test_spearman_manhattan': 0.7983410609487319,
 'distilbert-base-uncased-all-nli-sts-test_pearson_euclidean': 0.8007164914288766,
 'distilbert-base-uncased-all-nli-sts-test_spearman_euclidean': 0.7991835974428595,
 'distilbert-base-uncased-all-nli-sts-test_pearson_dot': 0.4716203511085343,
 'distilbert-base-uncased-all-nli-sts-test_spearman_dot': 0.4535374154660375,
 'distilbert-base-uncased-all-nli-sts-test_pearson_max': 0.8007164914288766,
 'distilbert-base-uncased-all-nli-sts-test_spearman_max': 0.7991835974428595}