In [1]:
!pip install sentence_transformers datasets accelerate
!pip install gdown



In [5]:
import random
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from datasets import load_dataset, Dataset
import gdown
import pandas as pd

In [None]:
gdown.download('','/kaggle/working/corpus.csv',fuzzy=True)
gdown.download('','/kaggle/working/train.csv',fuzzy=True)

In [8]:
cp = pd.read_csv('/kaggle/working/corpus.csv')
cp['content_id'] = cp['document'] + ' ' + cp['article']
train = pd.read_csv('/kaggle/working/train.csv')
train['content_id'] = train['document'] + ' ' + train['article']


In [9]:
# Convert the datasets to dictionaries
corpus = dict(zip(cp["content_id"], cp["context"]))  # Our corpus (cid => document)
queries = dict(zip(train["question"], train["question"]))  # Our queries (qid => question)
relevant_docs = {}  # Query ID to relevant documents (qid => set([relevant_cids])
for qid, corpus_ids in zip(train["question"], train["content_id"]):
    qid = str(qid)
    corpus_ids = str(corpus_ids)
    if qid not in relevant_docs:
        relevant_docs[qid] = set()
    relevant_docs[qid].add(corpus_ids)

In [16]:
import json
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim
from datasets import load_dataset, concatenate_datasets

model = SentenceTransformer("hiieu/halong_embedding")
matryoshka_dimensions = [768, 512] # Important: large to small
matryoshka_evaluators = []
# Iterate over the different dimensions
for dim in matryoshka_dimensions:
    ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,  # Truncate the embeddings to a certain dimension
        score_functions={"cosine": cos_sim},
    )
    matryoshka_evaluators.append(ir_evaluator)

# Create a sequential evaluator
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [17]:
# Evaluate the model
results = evaluator(model)
for k,v in results.items():
    print(k, v)

dim_768_cosine_accuracy@1 0.574
dim_768_cosine_accuracy@3 0.75
dim_768_cosine_accuracy@5 0.818
dim_768_cosine_accuracy@10 0.86
dim_768_cosine_precision@1 0.574
dim_768_cosine_precision@3 0.24999999999999997
dim_768_cosine_precision@5 0.1636
dim_768_cosine_precision@10 0.086
dim_768_cosine_recall@1 0.574
dim_768_cosine_recall@3 0.75
dim_768_cosine_recall@5 0.818
dim_768_cosine_recall@10 0.86
dim_768_cosine_ndcg@10 0.7206348883663962
dim_768_cosine_mrr@10 0.6754904761904761
dim_768_cosine_map@100 0.6812806681618039
dim_512_cosine_accuracy@1 0.568
dim_512_cosine_accuracy@3 0.746
dim_512_cosine_accuracy@5 0.804
dim_512_cosine_accuracy@10 0.86
dim_512_cosine_precision@1 0.568
dim_512_cosine_precision@3 0.24866666666666662
dim_512_cosine_precision@5 0.1608
dim_512_cosine_precision@10 0.086
dim_512_cosine_recall@1 0.568
dim_512_cosine_recall@3 0.746
dim_512_cosine_recall@5 0.804
dim_512_cosine_recall@10 0.86
dim_512_cosine_ndcg@10 0.7154724096290774
dim_512_cosine_mrr@10 0.6689261904761903
di

In [18]:
import pandas as pd
from datasets import Dataset

def prepare_training_dataset(queries, corpus, relevant_docs):
    anchors = []
    positives = []
    for query_id, docs in relevant_docs.items():
        for doc_id in docs:
          anchors.append(queries[query_id])
          positives.append(corpus[doc_id] )
    df = {
        "anchor": anchors,
        "positive": positives
    }

    return Dataset.from_dict(df)

pairs = prepare_training_dataset(queries, corpus, relevant_docs)
pairs

Dataset({
    features: ['anchor', 'positive'],
    num_rows: 500
})

In [19]:
pairs[0]

{'anchor': 'Sinh viên dự bị không trở thành sinh viên chính thức bao nhiêu học kỳ sẽ bị loại khỏi CTTN?',
 'positive': 'Điều  9.\tTuyển bổ sung và loại ra khỏi chương trình, xét chính thức và dự bị\nĐối tượng tham gia CTTN là những sinh viên có năng lực xuất sắc, do đó, sau mỗi học kỳ BĐH quyết định việc loại sinh viên khỏi lớp tài năng, tuyển bổ sung sinh viên từ chương trình chuẩn vào lớp tài năng, xét chuyển đổi sinh viên chính thức và dự bị.\nĐầu mỗi học kỳ, Khoa xét và đề nghị lên BĐH các danh sách sinh viên tuyển bổ sung, bị loại ra khỏi các lớp CTTN hoặc danh sách sinh viên chính thức và dự bị theo các tiêu chuẩn như sau:\n1.\tLoại khỏi chương trình\nTại thời điểm xem xét, sinh viên rơi vào một trong các trường hợp sau:\n-\tChưa tốt nghiệp khi đã quá thời gian thiết kế của khóa học và không có lý do đặc biệt.\n-\tKhông đăng ký học đầy đủ các môn học CTTN bắt buộc trong học kỳ.\n-\tĐTBTL nhỏ hơn 6,5 – tính tương ứng sau học kỳ 1 và sau học kỳ hè (kết quả học tập của học kỳ hè sẽ 

In [20]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512]  # Important: large to small
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [21]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# define training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="sample", # output directory and hugging face model ID
    num_train_epochs=10,                         # number of epochs
    per_device_train_batch_size=8,             # train batch size
    gradient_accumulation_steps=4,             # for a global batch size of 512
    per_device_eval_batch_size=4,              # evaluation batch size
    #gradient_checkpointing=True,
    warmup_ratio=0.1,                           # warmup ratio
    learning_rate=2e-5,                         # learning rate, 2e-5 is a good value
    lr_scheduler_type="cosine",                 # use constant learning rate scheduler
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    #tf32=True,                                  # use tf32 precision
    bf16=True,                                  # use bf16 precision
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="steps",                      # evaluate after each epoch
    #save_strategy="epoch",                      # save after each epoch
    save_steps = 500,
    logging_steps=10,                           # log every 10 steps
    save_total_limit=3,                         # save only the last 3 models
    load_best_model_at_end=True,                # load the best model when training ends
    metric_for_best_model="eval_dim_768_cosine_ndcg@10",  # Optimizing for the best ndcg@10 score for the 128 dimension
     report_to = "none"
)

In [22]:
from sentence_transformers import SentenceTransformerTrainer
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,  # training arguments
    train_dataset=pairs,
    loss=train_loss,
    evaluator=evaluator,
)

In [23]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save the best model
trainer.save_model()

Step,Training Loss,Validation Loss,Dim 768 Cosine Accuracy@1,Dim 768 Cosine Accuracy@3,Dim 768 Cosine Accuracy@5,Dim 768 Cosine Accuracy@10,Dim 768 Cosine Precision@1,Dim 768 Cosine Precision@3,Dim 768 Cosine Precision@5,Dim 768 Cosine Precision@10,Dim 768 Cosine Recall@1,Dim 768 Cosine Recall@3,Dim 768 Cosine Recall@5,Dim 768 Cosine Recall@10,Dim 768 Cosine Ndcg@10,Dim 768 Cosine Mrr@10,Dim 768 Cosine Map@100,Dim 512 Cosine Accuracy@1,Dim 512 Cosine Accuracy@3,Dim 512 Cosine Accuracy@5,Dim 512 Cosine Accuracy@10,Dim 512 Cosine Precision@1,Dim 512 Cosine Precision@3,Dim 512 Cosine Precision@5,Dim 512 Cosine Precision@10,Dim 512 Cosine Recall@1,Dim 512 Cosine Recall@3,Dim 512 Cosine Recall@5,Dim 512 Cosine Recall@10,Dim 512 Cosine Ndcg@10,Dim 512 Cosine Mrr@10,Dim 512 Cosine Map@100,Sequential Score
10,0.6452,No log,0.608,0.824,0.876,0.92,0.608,0.274667,0.1752,0.092,0.608,0.824,0.876,0.92,0.773902,0.726056,0.72961,0.608,0.816,0.864,0.926,0.608,0.272,0.1728,0.0926,0.608,0.816,0.864,0.926,0.772289,0.722507,0.725676,0.772289


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Could not locate the best model at sample/checkpoint-10/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


In [24]:
from sentence_transformers import SentenceTransformer
import torch
fine_tuned_model = SentenceTransformer(
    args.output_dir, device="cuda" if torch.cuda.is_available() else "cpu"
)
# Evaluate the model
results = evaluator(fine_tuned_model)

for k,v in results.items():
    print(k, v)

dim_768_cosine_accuracy@1 0.618
dim_768_cosine_accuracy@3 0.828
dim_768_cosine_accuracy@5 0.878
dim_768_cosine_accuracy@10 0.922
dim_768_cosine_precision@1 0.618
dim_768_cosine_precision@3 0.276
dim_768_cosine_precision@5 0.1756
dim_768_cosine_precision@10 0.09219999999999999
dim_768_cosine_recall@1 0.618
dim_768_cosine_recall@3 0.828
dim_768_cosine_recall@5 0.878
dim_768_cosine_recall@10 0.922
dim_768_cosine_ndcg@10 0.779638998514592
dim_768_cosine_mrr@10 0.732915079365079
dim_768_cosine_map@100 0.7366734442306533
dim_512_cosine_accuracy@1 0.61
dim_512_cosine_accuracy@3 0.822
dim_512_cosine_accuracy@5 0.876
dim_512_cosine_accuracy@10 0.93
dim_512_cosine_precision@1 0.61
dim_512_cosine_precision@3 0.274
dim_512_cosine_precision@5 0.1752
dim_512_cosine_precision@10 0.093
dim_512_cosine_recall@1 0.61
dim_512_cosine_recall@3 0.822
dim_512_cosine_recall@5 0.876
dim_512_cosine_recall@10 0.93
dim_512_cosine_ndcg@10 0.776701905339929
dim_512_cosine_mrr@10 0.726838095238095
dim_512_cosine_map@

In [None]:
from huggingface_hub import login

login(token="", add_to_git_credential=True)  

Token has not been saved to git credential helper.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


In [26]:
# push model to hub
trainer.model.push_to_hub("improve_halong")

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

'https://huggingface.co/johnweak132/test_hehe/commit/fa67d082562068f56016181f49591cfe44c2c5b0'