<a href="https://colab.research.google.com/github/LxYuan0420/nlp/blob/main/notebooks/Finetuning_Embedding_Model_Matryoshka_768_64_NLI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a Matryoshka Embedding Model 🪆

This notebook uses `MultipleNegativesRankingLoss` wrapped in `MatryoshkaLoss` to train an embedding model that stays strong when its output is truncated to the dimensions `[768, 512, 256, 128, 64]`, using Natural Language Inference data (`AllNLI` in this case).



> Colab by: [mrm8488](https://twitter.com/mrm8488) adapted from [Sentence-Transformers](https://www.sbert.net/examples) script

In [1]:
! nvidia-smi

Sat Aug 10 10:21:55 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Install required dependencies 📦

In [2]:
! pip install -q sentence-transformers datasets "accelerate>=0.21.0"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Imports

In [3]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction
from sentence_transformers.training_args import BatchSamplers

### Set main variables ⚙️

In [4]:
# Base checkpoint to fine-tune -- swap in the model you want
model_name = "distilroberta-base"
# Larger batches usually give better results, but need more GPU memory
batch_size = 128
num_train_epochs = 1
# Output dimensions the embeddings are trained and evaluated at
matryoshka_dims = [768, 512, 256, 128, 64]

In [5]:
# Where the model is saved; any "/" in the model id is replaced by "-" so the
# id does not introduce extra directory levels.
output_dir = "output/matryoshka_nli_{}_{}_bs_{}_e".format(
    model_name.replace("/", "-"), batch_size, num_train_epochs
)

In [6]:
# 1. Build the SentenceTransformer model. When the checkpoint is not already a
# Sentence Transformer model, one is created automatically with "mean" pooling.
model = SentenceTransformer(model_name_or_path=model_name)
# Optionally cap the maximum sequence length of the model:
# model.max_seq_length = 75



config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### Load the Dataset 📚

In [25]:
# 2. Load the AllNLI triplets with explicit negatives: https://huggingface.co/datasets/gowitheflow/allnli-withnegs
# and rename the columns to the anchor / positive / negative names used in the rest of the notebook.
dataset = load_dataset("gowitheflow/allnli-withnegs", split="train").rename_columns(
    {"sentence1": "anchor", "sentence2": "positive", "sentence3": "negative"}
)

# Fixed seed so the train / eval / test partitions are identical on every run
# (21 is the seed this notebook also uses when shuffling the training subset).
split_seed = 21

# 40% goes to training; the held-out 60% is then halved into eval and test.
train_holdout = dataset.train_test_split(test_size=0.6, seed=split_seed)
eval_test = train_holdout["test"].train_test_split(test_size=0.5, seed=split_seed)

# Read the splits by key rather than relying on the ordering of `.values()`.
train_dataset = train_holdout["train"]
eval_test_dataset = train_holdout["test"]
eval_dataset = eval_test["train"]
test_dataset = eval_test["test"]

In [26]:
# Show the training split summary together with its first example
train_dataset, train_dataset[0]

(Dataset({
     features: ['anchor', 'positive', 'negative'],
     num_rows: 110910
 }),
 {'anchor': 'A female skier holding two ski poles wearing a white knit hat.',
  'positive': 'The skier held two ski poles.',
  'negative': 'The skiier fell down the slope.'})

In [33]:
# Cap the held-out test split at NUM_TEST_EXAMPLES rows.
# min() keeps the selection in range when the split has fewer rows than the cap.
NUM_TEST_EXAMPLES = 1000
test_dataset = test_dataset.select(range(min(NUM_TEST_EXAMPLES, len(test_dataset))))
test_dataset, test_dataset[0]

(Dataset({
     features: ['anchor', 'positive', 'negative'],
     num_rows: 1000
 }),
 {'anchor': 'a man wears a red bandanna',
  'positive': 'A person wearing a piece of clothing.',
  'negative': 'A man wearing a blue bandanna.'})

#### (Optional) Training on the entire dataset can take a long time, so for demonstration purposes, let's use only a small portion.



In [9]:
MAX_EXAMPLES = 10000
# Shuffle with a fixed seed, then keep at most MAX_EXAMPLES rows. min() avoids an
# out-of-range selection if MAX_EXAMPLES is raised above the size of the training split.
train_dataset = train_dataset.shuffle(seed=21).select(range(min(MAX_EXAMPLES, len(train_dataset))))

### Define our training loss functions 📉

In [10]:
# Base objective: MultipleNegativesRankingLoss on the model
inner_train_loss = losses.MultipleNegativesRankingLoss(model=model)
# MatryoshkaLoss wraps the base loss and is given every size in `matryoshka_dims`
train_loss = losses.MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dims,
)

### Set up an evaluator to track STS performance alongside the evaluation loss

In [11]:
# STS-B validation split, used as the development benchmark
stsb_eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")

# One similarity evaluator per Matryoshka dimension, in the order of `matryoshka_dims`;
# each is passed its dimension as `truncate_dim`.
evaluators = [
    EmbeddingSimilarityEvaluator(
        sentences1=stsb_eval_dataset["sentence1"],
        sentences2=stsb_eval_dataset["sentence2"],
        scores=stsb_eval_dataset["score"],
        main_similarity=SimilarityFunction.COSINE,
        name=f"sts-dev-{dim}",
        truncate_dim=dim,
    )
    for dim in matryoshka_dims
]

Downloading readme:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/108k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [12]:
# Run all per-dimension evaluators in sequence; the overall score is the one from
# the first evaluator in the list.
dev_evaluator = SequentialEvaluator(
    evaluators,
    main_score_function=lambda dim_scores: dim_scores[0],
)

### Define the training args ⚙️

In [13]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=output_dir,
    # Optional training parameters:
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=30,
    save_strategy="steps",
    save_steps=30,
    save_total_limit=2,
    logging_steps=30,
    # Built from the config variables so the run name always matches the actual
    # batch size and epoch count (with the defaults: "matryoshka-nli_128_bs_1e").
    run_name=f"matryoshka-nli_{batch_size}_bs_{num_train_epochs}e",  # Will be used in W&B if `wandb` is installed
)

### Create the Trainer and run it 🏋️‍♀️

In [14]:
# Assemble the trainer from the pieces defined in the cells above
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=train_loss,  # MatryoshkaLoss wrapping MultipleNegativesRankingLoss
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # source of the validation loss
    evaluator=dev_evaluator,  # STS-B dev evaluators, one per Matryoshka dimension
)

In [15]:
# Start fine-tuning; evaluation, checkpointing and logging each happen every 30 steps as set in `args`
trainer.train()

Step,Training Loss,Validation Loss,Sts-dev-768 Pearson Cosine,Sts-dev-768 Spearman Cosine,Sts-dev-768 Pearson Manhattan,Sts-dev-768 Spearman Manhattan,Sts-dev-768 Pearson Euclidean,Sts-dev-768 Spearman Euclidean,Sts-dev-768 Pearson Dot,Sts-dev-768 Spearman Dot,Sts-dev-768 Pearson Max,Sts-dev-768 Spearman Max,Sts-dev-512 Pearson Cosine,Sts-dev-512 Spearman Cosine,Sts-dev-512 Pearson Manhattan,Sts-dev-512 Spearman Manhattan,Sts-dev-512 Pearson Euclidean,Sts-dev-512 Spearman Euclidean,Sts-dev-512 Pearson Dot,Sts-dev-512 Spearman Dot,Sts-dev-512 Pearson Max,Sts-dev-512 Spearman Max,Sts-dev-256 Pearson Cosine,Sts-dev-256 Spearman Cosine,Sts-dev-256 Pearson Manhattan,Sts-dev-256 Spearman Manhattan,Sts-dev-256 Pearson Euclidean,Sts-dev-256 Spearman Euclidean,Sts-dev-256 Pearson Dot,Sts-dev-256 Spearman Dot,Sts-dev-256 Pearson Max,Sts-dev-256 Spearman Max,Sts-dev-128 Pearson Cosine,Sts-dev-128 Spearman Cosine,Sts-dev-128 Pearson Manhattan,Sts-dev-128 Spearman Manhattan,Sts-dev-128 Pearson Euclidean,Sts-dev-128 Spearman Euclidean,Sts-dev-128 Pearson Dot,Sts-dev-128 Spearman Dot,Sts-dev-128 Pearson Max,Sts-dev-128 Spearman Max,Sts-dev-64 Pearson Cosine,Sts-dev-64 Spearman Cosine,Sts-dev-64 Pearson Manhattan,Sts-dev-64 Spearman Manhattan,Sts-dev-64 Pearson Euclidean,Sts-dev-64 Spearman Euclidean,Sts-dev-64 Pearson Dot,Sts-dev-64 Spearman Dot,Sts-dev-64 Pearson Max,Sts-dev-64 Spearman Max,Sequential Score
30,13.7037,5.005939,0.808005,0.813105,0.805539,0.803816,0.805224,0.803399,0.534185,0.558204,0.808005,0.813105,0.814092,0.817793,0.804632,0.803051,0.805322,0.803628,0.60509,0.619524,0.814092,0.817793,0.810619,0.815705,0.801649,0.800376,0.801958,0.800734,0.596054,0.61443,0.810619,0.815705,0.79974,0.808969,0.795182,0.795964,0.793577,0.794621,0.575518,0.593552,0.79974,0.808969,0.793102,0.806407,0.785254,0.789185,0.784401,0.789097,0.542469,0.56869,0.793102,0.806407,0.813105
60,5.3279,3.985119,0.826958,0.82981,0.813885,0.813229,0.814282,0.813257,0.611474,0.636086,0.826958,0.82981,0.828411,0.831144,0.813806,0.813093,0.814893,0.814065,0.65367,0.673051,0.828411,0.831144,0.824915,0.82913,0.8112,0.810496,0.81159,0.811226,0.653226,0.673059,0.824915,0.82913,0.814829,0.822203,0.804301,0.805439,0.802773,0.804568,0.630822,0.656137,0.814829,0.822203,0.803642,0.815347,0.792133,0.795439,0.790291,0.794202,0.583674,0.601792,0.803642,0.815347,0.82981


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=79, training_loss=8.321745377552661, metrics={'train_runtime': 350.0401, 'train_samples_per_second': 28.568, 'train_steps_per_second': 0.226, 'total_flos': 0.0, 'train_loss': 8.321745377552661, 'epoch': 1.0})

### Evaluate on the STS Benchmark test dataset 🧪

In [16]:
# Keep the STS-B test split under its own name: `test_dataset` already holds the AllNLI
# triplet test split, which the triplet evaluation further down still needs. Reusing the
# name here would break that cell when the notebook is run top to bottom.
stsb_test_dataset = load_dataset("sentence-transformers/stsb", split="test")

# One similarity evaluator per Matryoshka dimension, each passed its dimension as `truncate_dim`.
evaluators = []
for dim in matryoshka_dims:
    evaluators.append(
        EmbeddingSimilarityEvaluator(
            sentences1=stsb_test_dataset["sentence1"],
            sentences2=stsb_test_dataset["sentence2"],
            scores=stsb_test_dataset["score"],
            main_similarity=SimilarityFunction.COSINE,
            name=f"sts-test-{dim}",
            truncate_dim=dim,
        )
    )

In [17]:
# Chain the per-dimension STS-B test evaluators into a single evaluator
test_evaluator = SequentialEvaluator(evaluators=evaluators)

In [18]:
# Run the chained evaluators on the model; the cell output is the returned metrics dict
test_evaluator(model)

{'sts-test-768_pearson_cosine': 0.7973892061443318,
 'sts-test-768_spearman_cosine': 0.7919350448068558,
 'sts-test-768_pearson_manhattan': 0.7764969490647854,
 'sts-test-768_spearman_manhattan': 0.7594746543730431,
 'sts-test-768_pearson_euclidean': 0.777772824605156,
 'sts-test-768_spearman_euclidean': 0.7605872898275478,
 'sts-test-768_pearson_dot': 0.5414371752679614,
 'sts-test-768_spearman_dot': 0.5192348145099092,
 'sts-test-768_pearson_max': 0.7973892061443318,
 'sts-test-768_spearman_max': 0.7919350448068558,
 'sts-test-512_pearson_cosine': 0.8035608197577276,
 'sts-test-512_spearman_cosine': 0.7922073818020418,
 'sts-test-512_pearson_manhattan': 0.7764124264466753,
 'sts-test-512_spearman_manhattan': 0.7596300248427789,
 'sts-test-512_pearson_euclidean': 0.7786492355233788,
 'sts-test-512_spearman_euclidean': 0.7619251329751368,
 'sts-test-512_pearson_dot': 0.6197771400220254,
 'sts-test-512_spearman_dot': 0.600599277811478,
 'sts-test-512_pearson_max': 0.8035608197577276,
 '

### Evaluate on the self-generated test set from `gowitheflow/allnli-withnegs`

In [34]:
from sentence_transformers.evaluation import TripletEvaluator

# One triplet evaluator per Matryoshka dimension, built from the anchor / positive /
# negative columns of `test_dataset`.
evaluators = [
    TripletEvaluator(
        anchors=test_dataset["anchor"],
        positives=test_dataset["positive"],
        negatives=test_dataset["negative"],
        name=f"allnli-withnegs-test-dim-{dim}",
        truncate_dim=dim,
    )
    for dim in matryoshka_dims
]


In [35]:
# Chain the per-dimension triplet evaluators and run them on the model
test_evaluator = SequentialEvaluator(evaluators=evaluators)
test_evaluator(model)

{'allnli-withnegs-test-dim-768_cosine_accuracy': 0.886,
 'allnli-withnegs-test-dim-768_dot_accuracy': 0.133,
 'allnli-withnegs-test-dim-768_manhattan_accuracy': 0.864,
 'allnli-withnegs-test-dim-768_euclidean_accuracy': 0.859,
 'allnli-withnegs-test-dim-768_max_accuracy': 0.886,
 'allnli-withnegs-test-dim-512_cosine_accuracy': 0.886,
 'allnli-withnegs-test-dim-512_dot_accuracy': 0.11,
 'allnli-withnegs-test-dim-512_manhattan_accuracy': 0.864,
 'allnli-withnegs-test-dim-512_euclidean_accuracy': 0.862,
 'allnli-withnegs-test-dim-512_max_accuracy': 0.886,
 'allnli-withnegs-test-dim-256_cosine_accuracy': 0.893,
 'allnli-withnegs-test-dim-256_dot_accuracy': 0.108,
 'allnli-withnegs-test-dim-256_manhattan_accuracy': 0.861,
 'allnli-withnegs-test-dim-256_euclidean_accuracy': 0.856,
 'allnli-withnegs-test-dim-256_max_accuracy': 0.893,
 'allnli-withnegs-test-dim-128_cosine_accuracy': 0.884,
 'allnli-withnegs-test-dim-128_dot_accuracy': 0.119,
 'allnli-withnegs-test-dim-128_manhattan_accuracy': 

### Save the model locally

In [None]:
# Save the model in a "final" sub-folder of the training output directory
final_output_dir = output_dir + "/final"
model.save(final_output_dir)

Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

### Push to the Hugging Face Hub 🤗
You may need a token. Get it here: https://huggingface.co/settings/tokens

In [None]:
import os

# Read the access token from the HF_TOKEN environment variable so it is never
# written into (or committed with) the notebook. When the variable is unset,
# None is passed as the token.
hf_token = os.environ.get("HF_TOKEN")

# Use only the last component of the model id so a checkpoint such as "org/name" is
# not pushed under "org/"; for ids without "/" the repo name is unchanged.
hub_repo_name = f"{model_name.split('/')[-1]}-nli-matryoshka"
model.push_to_hub(hub_repo_name, token=hf_token)