In [3]:
%%capture
# Install or upgrade the libraries used by the rest of this notebook.
# The %%capture cell magic above swallows pip's (long) console output.
# NOTE(review): no versions are pinned, so a rerun may pull different releases.
!pip install --upgrade sentence-transformers datasets transformers torch tensorboard

In [4]:
from huggingface_hub import login
from google.colab import userdata

# Authenticate against the Hugging Face Hub. The token is read from the Colab
# secret named 'HF_TOKEN' rather than being hardcoded in the notebook.
# add_to_git_credential=True additionally asks `login` to register the token
# with git's credential helper.
login(token=userdata.get('HF_TOKEN'), add_to_git_credential=True)

In [5]:
from datasets import load_dataset


# Pull the "train" split of the financial RAG question/context dataset.
# NOTE: the repository id really is spelled "finanical" as loaded here; do not "fix" it.
dataset = load_dataset("philschmid/finanical-rag-embedding-dataset", split="train")

# Rename the columns to the (anchor, positive) names used by the training cells.
dataset = dataset.rename_column("question", "anchor")
dataset = dataset.rename_column("context", "positive")

# Give every row a stable integer id; the evaluation cell uses it to map each
# query to its relevant passage.
dataset = dataset.add_column("id", range(len(dataset)))

# Hold out 10% of the rows for evaluation. A fixed seed keeps the split (and
# therefore every metric reported further down) reproducible across reruns.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Persist both splits as JSON so later cells can reload them from disk.
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

README.md:   0%|          | 0.00/882 [00:00<?, ?B/s]

Creating json from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

240574

In [7]:
!pip uninstall torch torchvision
!pip install torch torchvision --upgrade

Found existing installation: torch 2.6.0
Uninstalling torch-2.6.0:
  Would remove:
    /usr/local/bin/torchfrtrace
    /usr/local/bin/torchrun
    /usr/local/lib/python3.11/dist-packages/functorch/*
    /usr/local/lib/python3.11/dist-packages/torch-2.6.0.dist-info/*
    /usr/local/lib/python3.11/dist-packages/torch/*
    /usr/local/lib/python3.11/dist-packages/torchgen/*
Proceed (Y/n)? y
  Successfully uninstalled torch-2.6.0
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/torchvision-0.21.0+cu124.dist-info/*
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libcudart.41118559.so.12
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libjpeg.1c1c4b09.so.8
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libnvjpeg.02b6d700.so.12
    /usr/local/lib/python3.11/dist-packages/torchvision.libs/libpng16.0364a1db.so.16
    /usr/local/lib/python3.11/dist-packa

In [1]:
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim
from datasets import load_dataset, concatenate_datasets

model_id = "BAAI/bge-base-en-v1.5"
# Embedding sizes to evaluate, from the full width down to the smallest truncation.
matryoshka_dimensions = [768, 512, 256, 128, 64]

# Baseline model (before fine-tuning), placed on the GPU when one is available.
model = SentenceTransformer(
    model_id,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Reload both splits from the JSON files written by the data-preparation cell.
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")
test_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
# The retrieval corpus is every passage from train + test (in that order).
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

# id -> passage text for the whole corpus, id -> question text for the test queries.
corpus = {
    doc_id: passage
    for doc_id, passage in zip(corpus_dataset["id"], corpus_dataset["positive"])
}
queries = {
    query_id: question
    for query_id, question in zip(test_dataset["id"], test_dataset["anchor"])
}

# Each query has exactly one relevant document: the passage carrying the same id.
relevant_docs = {query_id: [query_id] for query_id in queries}

# One retrieval evaluator per embedding size; `truncate_dim` is set to that size
# and the evaluator name ("dim_<size>") becomes the prefix of its metric keys.
matryoshka_evaluators = [
    InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,
        score_functions={"cosine": cos_sim},
    )
    for dim in matryoshka_dimensions
]

# Bundle the per-dimension evaluators so they can be run with a single call.
evaluator = SequentialEvaluator(matryoshka_evaluators)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [2]:
# Score the baseline model with every per-dimension retrieval evaluator.
results = evaluator(model)

# Report NDCG@10 for each embedding size; keys follow the
# "dim_<size>_cosine_ndcg@10" pattern derived from the evaluator names.
for dim in matryoshka_dimensions:
    key = f"dim_{dim}_cosine_ndcg@10"
    print(f"{key}: {results[key]}")

dim_768_cosine_ndcg@10: 0.7171344776427323
dim_512_cosine_ndcg@10: 0.7181327615047775
dim_256_cosine_ndcg@10: 0.7087020303406385
dim_128_cosine_ndcg@10: 0.6924167201423836
dim_64_cosine_ndcg@10: 0.6279222320906258


In [3]:
from sentence_transformers import SentenceTransformerModelCardData, SentenceTransformer


model_id = "BAAI/bge-base-en-v1.5"

# Fresh copy of the base model for training. Compared with the baseline load,
# this one requests the "sdpa" attention implementation and attaches model-card
# metadata (language, license, display name). No explicit device is passed here.
model = SentenceTransformer(
    model_id,
    model_card_data=SentenceTransformerModelCardData(
        model_name="BGE base Financial Matryoshka",
        license="apache-2.0",
        language="en",
    ),
    model_kwargs=dict(attn_implementation="sdpa"),
)

In [4]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Embedding sizes the loss is configured with (same list the evaluators use).
matryoshka_dimensions = [768, 512, 256, 128, 64]

# Base ranking loss over (anchor, positive) pairs ...
inner_train_loss = MultipleNegativesRankingLoss(model=model)
# ... wrapped by MatryoshkaLoss, which receives the list of target dimensions.
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

In [14]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers


# Re-read the training split from the JSON file produced by the data-preparation cell.
train_dataset = load_dataset("json", split="train", data_files="train_dataset.json")

args = SentenceTransformerTrainingArguments(
    # --- output and checkpointing ---
    output_dir="bge-base-financial-matryoshka",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # "eval_" + evaluator name "dim_128" + "cosine_ndcg@10"
    metric_for_best_model="eval_dim_128_cosine_ndcg@10",
    # --- optimisation schedule ---
    num_train_epochs=4,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="adamw_torch_fused",
    # --- batching ---
    per_device_train_batch_size=16,
    gradient_accumulation_steps=8,  # 16 * 8 = 128 samples per optimizer step per device
    per_device_eval_batch_size=8,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # --- numeric precision (TF32 and bf16 both explicitly switched off) ---
    tf32=False,
    bf16=False,
    # --- evaluation and logging cadence ---
    eval_strategy="epoch",
    logging_steps=10,
)

In [15]:
from sentence_transformers import SentenceTransformerTrainer

# Assemble the trainer. Only the "anchor" and "positive" columns are handed to
# it, so the extra "id" column in the JSON file is left out of training.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=train_loss,
    evaluator=evaluator,
    train_dataset=train_dataset.select_columns(["anchor", "positive"]),
)

In [16]:
# Run the training loop with the arguments, loss and evaluator configured above.
trainer.train()


# Persist the trainer's model. The evaluation cell that follows reloads it from
# args.output_dir, so this is expected to write there.
trainer.save_model()


# Optional: publish the model to the Hugging Face Hub (left disabled).
# trainer.model.push_to_hub("bge-base-financial-matryoshka")

Epoch,Training Loss,Validation Loss,Dim 768 Cosine Accuracy@1,Dim 768 Cosine Accuracy@3,Dim 768 Cosine Accuracy@5,Dim 768 Cosine Accuracy@10,Dim 768 Cosine Precision@1,Dim 768 Cosine Precision@3,Dim 768 Cosine Precision@5,Dim 768 Cosine Precision@10,Dim 768 Cosine Recall@1,Dim 768 Cosine Recall@3,Dim 768 Cosine Recall@5,Dim 768 Cosine Recall@10,Dim 768 Cosine Ndcg@10,Dim 768 Cosine Mrr@10,Dim 768 Cosine Map@100,Dim 512 Cosine Accuracy@1,Dim 512 Cosine Accuracy@3,Dim 512 Cosine Accuracy@5,Dim 512 Cosine Accuracy@10,Dim 512 Cosine Precision@1,Dim 512 Cosine Precision@3,Dim 512 Cosine Precision@5,Dim 512 Cosine Precision@10,Dim 512 Cosine Recall@1,Dim 512 Cosine Recall@3,Dim 512 Cosine Recall@5,Dim 512 Cosine Recall@10,Dim 512 Cosine Ndcg@10,Dim 512 Cosine Mrr@10,Dim 512 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100,Dim 64 Cosine Accuracy@1,Dim 64 Cosine Accuracy@3,Dim 64 Cosine Accuracy@5,Dim 64 Cosine Accuracy@10,Dim 64 Cosine Precision@1,Dim 64 Cosine Precision@3,Dim 64 Cosine Precision@5,Dim 64 Cosine Precision@10,Dim 64 Cosine Recall@1,Dim 64 Cosine Recall@3,Dim 64 Cosine Recall@5,Dim 64 Cosine Recall@10,Dim 64 Cosine Ndcg@10,Dim 64 Cosine Mrr@10,Dim 64 Cosine Map@100,Sequential Score
1,2.0067,No log,0.658571,0.798571,0.84,0.881429,0.658571,0.26619,0.168,0.088143,0.658571,0.798571,0.84,0.881429,0.772262,0.737018,0.741559,0.66,0.791429,0.837143,0.882857,0.66,0.26381,0.167429,0.088286,0.66,0.791429,0.837143,0.882857,0.772345,0.736759,0.741396,0.658571,0.784286,0.831429,0.872857,0.658571,0.261429,0.166286,0.087286,0.658571,0.784286,0.831429,0.872857,0.765674,0.731284,0.736253,0.655714,0.785714,0.824286,0.86,0.655714,0.261905,0.164857,0.086,0.655714,0.785714,0.824286,0.86,0.759618,0.727189,0.732527,0.627143,0.747143,0.791429,0.838571,0.627143,0.249048,0.158286,0.083857,0.627143,0.747143,0.791429,0.838571,0.73194,0.697927,0.703557,0.73194
2,1.163,No log,0.665714,0.808571,0.845714,0.892857,0.665714,0.269524,0.169143,0.089286,0.665714,0.808571,0.845714,0.892857,0.780375,0.744244,0.74814,0.671429,0.804286,0.841429,0.891429,0.671429,0.268095,0.168286,0.089143,0.671429,0.804286,0.841429,0.891429,0.782132,0.747041,0.751093,0.66,0.797143,0.831429,0.874286,0.66,0.265714,0.166286,0.087429,0.66,0.797143,0.831429,0.874286,0.768712,0.734673,0.739663,0.667143,0.791429,0.822857,0.868571,0.667143,0.26381,0.164571,0.086857,0.667143,0.791429,0.822857,0.868571,0.768423,0.736306,0.741357,0.628571,0.761429,0.792857,0.851429,0.628571,0.25381,0.158571,0.085143,0.628571,0.761429,0.792857,0.851429,0.738394,0.702503,0.707703,0.738394
3,1.0817,No log,0.664286,0.811429,0.84,0.894286,0.664286,0.270476,0.168,0.089429,0.664286,0.811429,0.84,0.894286,0.780346,0.743837,0.747764,0.671429,0.808571,0.841429,0.891429,0.671429,0.269524,0.168286,0.089143,0.671429,0.808571,0.841429,0.891429,0.781459,0.746191,0.75052,0.661429,0.794286,0.83,0.878571,0.661429,0.264762,0.166,0.087857,0.661429,0.794286,0.83,0.878571,0.770336,0.735583,0.740385,0.67,0.792857,0.821429,0.878571,0.67,0.264286,0.164286,0.087857,0.67,0.792857,0.821429,0.878571,0.772981,0.739443,0.743787,0.631429,0.758571,0.804286,0.855714,0.631429,0.252857,0.160857,0.085571,0.631429,0.758571,0.804286,0.855714,0.742326,0.70616,0.711133,0.742326


In [17]:
from sentence_transformers import SentenceTransformer

# Load the saved fine-tuned model from the training output directory,
# on the GPU when one is available.
fine_tuned_model = SentenceTransformer(
    args.output_dir,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Run the same per-dimension evaluators that scored the baseline model.
results = evaluator(fine_tuned_model)

# Print NDCG@10 for each embedding size, largest first.
for key in (f"dim_{dim}_cosine_ndcg@10" for dim in matryoshka_dimensions):
    print(f"{key}: {results[key]}")

dim_768_cosine_ndcg@10: 0.7811946385240341
dim_512_cosine_ndcg@10: 0.7810758678697126
dim_256_cosine_ndcg@10: 0.772515498289343
dim_128_cosine_ndcg@10: 0.7738686027744143
dim_64_cosine_ndcg@10: 0.7427577241007849
