1. Create & Prepare embedding dataset

In [1]:
from datasets import load_dataset

# One seed shared by every random step so the saved train/test files are
# identical across "Restart & Run All" executions.
SEED = 42

# Load the raw question/answer pairs from the local CSV file
dataset = load_dataset("csv", data_files="../dataset/QA_data.csv", split="train")

# Shuffle first so that the subsample below is a random draw
dataset = dataset.shuffle(seed=SEED)

# Keep 1/100 of the rows
dataset = dataset.select(range(len(dataset) // 100))

# Drop rows whose "Answer" value is falsy (missing or empty)
print("Before:", len(dataset))
dataset = dataset.filter(lambda x: x["Answer"])
print("After:", len(dataset))

# Rename columns to the (anchor, positive) pair naming used by the later cells
dataset = dataset.rename_column("Question", "anchor")
dataset = dataset.rename_column("Answer", "positive")

# Add an id column; later cells use it as both the query id and the corpus id
dataset = dataset.add_column("id", list(range(len(dataset))))

# Split off a 10% test set. The split is seeded: without a seed the test set
# (and therefore every reported metric) would change on each run.
dataset = dataset.train_test_split(test_size=0.1, seed=SEED)

# Save both splits to disk as JSON lines (force_ascii=False keeps Vietnamese text readable)
dataset["train"].to_pandas().to_json("train_dataset.json", orient="records", lines=True, force_ascii=False)
dataset["test"].to_pandas().to_json("test_dataset.json", orient="records", lines=True, force_ascii=False)


  from .autonotebook import tqdm as notebook_tqdm


Before: 1973
After: 1579


2. Create baseline and evaluate pretrained model

In [2]:

from datasets import load_dataset, concatenate_datasets

# Load the two splits written to disk by the preparation cell
test_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")

# The retrieval corpus contains the answers of BOTH splits, so each test
# question has to find its answer among all available documents.
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

# Our corpus (cid => document)
corpus = dict(zip(corpus_dataset["id"], corpus_dataset["positive"]))

# Our queries (qid => question); only test questions are evaluated
queries = dict(zip(test_dataset["id"], test_dataset["anchor"]))

# Query ID to relevant documents (qid => set of relevant cids).
# Each question has exactly one relevant document: the answer sharing its id.
# Values are sets, matching the evaluator's `relevant_docs` contract.
relevant_docs = {q_id: {q_id} for q_id in queries}
 
 


Generating train split: 158 examples [00:00, 12730.52 examples/s]
Generating train split: 1421 examples [00:00, 156289.65 examples/s]


In [3]:
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim

# Hugging Face model ID of the pretrained baseline
model_id = "hiieu/halong_embedding"
# Embedding sizes to evaluate, kept in large-to-small order
matryoshka_dimensions = [768, 512, 256, 128, 64]

# Load the pretrained model on the GPU when one is available, otherwise on the CPU
model = SentenceTransformer(
    model_id,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# One information-retrieval evaluator per dimension; each one truncates the
# embeddings to `dim` before scoring query/document pairs with cosine similarity.
matryoshka_evaluators = [
    InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,
        score_functions={"cosine": cos_sim},
    )
    for dim in matryoshka_dimensions
]

# Run all per-dimension evaluators one after another
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [4]:
# Baseline evaluation of the pretrained model (disabled; uncomment to run).
# Uses `evaluator`, `model` and `matryoshka_dimensions` from the previous cell.
# # Evaluate the model
# results = evaluator(model)
# for k,v in results.items():
#     print(k, v)

# print("=======================")
# for dim in matryoshka_dimensions:
#     key = f"dim_{dim}_cosine_ndcg@10"
#     print(f"{key}: {results[key]}")

3. Define loss function with Matryoshka Representation

In [5]:
from sentence_transformers import SentenceTransformerModelCardData, SentenceTransformer

# Hugging Face model ID: https://huggingface.co/hiieu/halong_embedding
model_id = "hiieu/halong_embedding"

# Reload a fresh copy of the base model for fine-tuning, asking transformers to
# use PyTorch's scaled-dot-product attention ("sdpa") implementation.
# The model card metadata describes the model being produced here: a Vietnamese
# medical QA embedding model (the data is Vietnamese and the output directory
# is "medical_embedding_vietnamese").
model = SentenceTransformer(
    model_id,
    model_kwargs={"attn_implementation": "sdpa"},
    model_card_data=SentenceTransformerModelCardData(
        language="vi",
        license="apache-2.0",  # NOTE(review): confirm this matches the base model's license
        model_name="Halong Embedding Medical Vietnamese Matryoshka",
    ),
)

# Training split written by the preparation cell (`load_dataset` is imported in an earlier cell)
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")
train_dataset

Dataset({
    features: ['anchor', 'Context', 'positive', 'Answer_Start', 'Answer_End', 'id'],
    num_rows: 1421
})

In [6]:
# Show the first training record to check the column names and the text content
train_dataset[0]

{'anchor': 'Làm cách nào để ngăn ngừa chứng loạn sản thận?',
 'Context': 'Thật không may, chứng loạn sản thận không thể ngăn ngừa được vì đây là một tình trạng bẩm sinh xảy ra trong quá trình phát triển của thai nhi. Tuy nhiên, việc phát hiện và điều trị sớm có thể giúp kiểm soát tình trạng và ngăn ngừa các biến chứng.',
 'positive': 'Thật không may, chứng loạn sản thận không thể ngăn ngừa được vì đây là một tình trạng bẩm sinh xảy ra trong quá trình phát triển của thai nhi. Tuy nhiên, việc phát hiện và điều trị sớm có thể giúp kiểm soát tình trạng và ngăn ngừa các biến chứng.',
 'Answer_Start': 0,
 'Answer_End': 246,
 'id': 76}

In [7]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Embedding sizes the loss is applied at, kept in large-to-small order
matryoshka_dimensions = [768, 512, 256, 128, 64]

# Base ranking loss computed on the (anchor, positive) pairs
inner_train_loss = MultipleNegativesRankingLoss(model=model)

# Wrap the base loss so it is applied at every Matryoshka dimension
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

4. Fine-tune embedding model with SentenceTransformersTrainer

In [8]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# Training configuration, grouped by concern
args = SentenceTransformerTrainingArguments(
    # --- output and checkpointing ---
    output_dir="medical_embedding_vietnamese",  # output directory (also the Hugging Face model ID)
    save_strategy="epoch",                      # write a checkpoint after each epoch
    # save_steps = 500,
    save_total_limit=3,                         # keep at most 3 checkpoints
    load_best_model_at_end=True,                # reload the best checkpoint when training ends
    metric_for_best_model="eval_dim_128_cosine_ndcg@10",  # "best" = highest ndcg@10 at dimension 128
    # --- schedule and batching ---
    num_train_epochs=1,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=8,              # 16 x 8 = 128 samples per optimizer step per device
    per_device_eval_batch_size=8,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    #gradient_checkpointing=True,
    # --- optimization ---
    learning_rate=2e-5,
    lr_scheduler_type="cosine",                 # cosine learning-rate schedule
    warmup_ratio=0.1,                           # warm up over the first 10% of training
    optim="adamw_torch_fused",                  # fused AdamW optimizer
    # tf32=True,
    bf16=True,                                  # bf16 precision
    # --- evaluation and logging ---
    eval_strategy="epoch",                      # evaluate after each epoch
    logging_steps=10,                           # log every 10 steps
    report_to="none",                           # do not report to any external tracker
)

In [9]:
from sentence_transformers import SentenceTransformerTrainer

# Only the (anchor, positive) text pair is passed to the loss; the remaining
# columns of the training file are left out.
pair_columns = ["anchor", "positive"]

trainer = SentenceTransformerTrainer(
    model=model,          # the model loaded for fine-tuning
    args=args,            # training arguments
    loss=train_loss,      # Matryoshka-wrapped ranking loss
    evaluator=evaluator,  # per-dimension retrieval evaluators
    train_dataset=train_dataset.select_columns(pair_columns),
)

                                                                     

In [10]:
# Start training. Checkpoints are written to args.output_dir once per epoch;
# nothing is pushed to the hub here (the push below is commented out).
trainer.train()
 
# Save the model held by the trainer after training
# (the training arguments set load_best_model_at_end=True)
trainer.save_model()
 
# # push model to hub
# trainer.model.push_to_hub("bge-base-financial-matryoshka")

Epoch,Training Loss,Validation Loss,Dim 768 Cosine Accuracy@1,Dim 768 Cosine Accuracy@3,Dim 768 Cosine Accuracy@5,Dim 768 Cosine Accuracy@10,Dim 768 Cosine Precision@1,Dim 768 Cosine Precision@3,Dim 768 Cosine Precision@5,Dim 768 Cosine Precision@10,Dim 768 Cosine Recall@1,Dim 768 Cosine Recall@3,Dim 768 Cosine Recall@5,Dim 768 Cosine Recall@10,Dim 768 Cosine Ndcg@10,Dim 768 Cosine Mrr@10,Dim 768 Cosine Map@100,Dim 512 Cosine Accuracy@1,Dim 512 Cosine Accuracy@3,Dim 512 Cosine Accuracy@5,Dim 512 Cosine Accuracy@10,Dim 512 Cosine Precision@1,Dim 512 Cosine Precision@3,Dim 512 Cosine Precision@5,Dim 512 Cosine Precision@10,Dim 512 Cosine Recall@1,Dim 512 Cosine Recall@3,Dim 512 Cosine Recall@5,Dim 512 Cosine Recall@10,Dim 512 Cosine Ndcg@10,Dim 512 Cosine Mrr@10,Dim 512 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100,Dim 64 Cosine Accuracy@1,Dim 64 Cosine Accuracy@3,Dim 64 Cosine Accuracy@5,Dim 64 Cosine Accuracy@10,Dim 64 Cosine Precision@1,Dim 64 Cosine Precision@3,Dim 64 Cosine Precision@5,Dim 64 Cosine Precision@10,Dim 64 Cosine Recall@1,Dim 64 Cosine Recall@3,Dim 64 Cosine Recall@5,Dim 64 Cosine Recall@10,Dim 64 Cosine Ndcg@10,Dim 64 Cosine Mrr@10,Dim 64 Cosine Map@100,Sequential Score
0,15.0991,No log,0.689873,0.734177,0.765823,0.791139,0.689873,0.244726,0.153165,0.079114,0.689873,0.734177,0.765823,0.791139,0.735996,0.718688,0.723143,0.696203,0.727848,0.753165,0.78481,0.696203,0.242616,0.150633,0.078481,0.696203,0.727848,0.753165,0.78481,0.736524,0.721534,0.726238,0.664557,0.727848,0.734177,0.772152,0.664557,0.242616,0.146835,0.077215,0.664557,0.727848,0.734177,0.772152,0.717246,0.69992,0.704983,0.620253,0.727848,0.740506,0.765823,0.620253,0.242616,0.148101,0.076582,0.620253,0.727848,0.740506,0.765823,0.695815,0.673011,0.675894,0.563291,0.651899,0.683544,0.71519,0.563291,0.2173,0.136709,0.071519,0.563291,0.651899,0.683544,0.71519,0.637231,0.612508,0.618377,0.637231


5. Evaluate fine-tuned model against baseline

In [11]:
# Evaluation of the fine-tuned model (disabled; uncomment to run).
# Relies on `torch`, `args`, `evaluator` and `matryoshka_dimensions` from earlier cells.
# from sentence_transformers import SentenceTransformer
 
# fine_tuned_model = SentenceTransformer(
#     args.output_dir, device="cuda" if torch.cuda.is_available() else "cpu"
# )
# # Evaluate the model
# results = evaluator(fine_tuned_model)
 
# # # Uncomment for full results
# for k,v in results.items():
#     print(k, v)
    
# print("=======================")
 
# # Print the main score
# for dim in matryoshka_dimensions:
#     key = f"dim_{dim}_cosine_ndcg@10"
#     print(f"{key}: {results[key]}")