In [1]:
%pip install sentence-transformers
%pip install llama_index

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0
Note: you may need to restart the kernel to use updated packages.
Collecting llama_index
  Downloading llama_index-0.11.18-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-agent-openai<0.4.0,>=0.3.4 (from llama_index)
  Downloading llama_index_agent_openai-0.3.4-py3-none-any.whl.metadata (728 bytes)
Collecting llama-index-cli<0.4.0,>=0.3.1 (from llama_index)
  Downloading llama_index_cli-0.3.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.12.0,>=0.11.18 (from llama_index)
  Downloading llama_index_core-0.11.18-py3-none-any.whl.metadata (2.4 kB)
Col

In [2]:
import nltk
nltk.download('punkt_tab')
import os
os.environ["WANDB_MODE"] = "disabled"
import torch
from sentence_transformers import SentenceTransformer, models, datasets, losses
from torch.utils.data import DataLoader
import pickle
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
  from tqdm.autonotebook import tqdm, trange


In [3]:
csv_path = "/kaggle/input/qa-testset/Hydrogen_LLM.csv"
data = pd.read_csv(csv_path)

def calculate_mrr(question_embeddings, answer_embeddings):
    reciprocal_ranks = []

    for i, question_embedding in enumerate(question_embeddings):
        similarities = cosine_similarity([question_embedding], answer_embeddings)[0]
        sorted_indices = np.argsort(-similarities)
        rank_of_correct_answer = np.where(sorted_indices == i)[0][0] + 1 
        reciprocal_ranks.append(1 / rank_of_correct_answer)
        
    mrr = np.mean(reciprocal_ranks)
    return mrr


def calculate_recall_at_k(question_embeddings, answer_embeddings, K):
    recall_scores = []
    for i, question_embedding in enumerate(question_embeddings):
        similarities = cosine_similarity([question_embedding], answer_embeddings)[0]
        sorted_indices = np.argsort(-similarities)
        if i in sorted_indices[:K]:
            recall_scores.append(1)  # Câu trả lời đúng nằm trong top K
        else:
            recall_scores.append(0)  # Câu trả lời đúng không nằm trong top K
    
    recall_at_k = np.mean(recall_scores)
    return recall_at_k


def test_model (model):
    questions_embeddings = model.encode(data['Questions'].tolist(), convert_to_tensor=True)

    answers_embeddings = model.encode(data['Answer'].tolist(), convert_to_tensor=True)

    questions_embeddings = questions_embeddings.cpu().numpy()
    answers_embeddings = answers_embeddings.cpu().numpy()

    mrr = calculate_mrr(questions_embeddings, answers_embeddings)
    recall_1 = calculate_recall_at_k(questions_embeddings, answers_embeddings, 1)
    recall_3 = calculate_recall_at_k(questions_embeddings, answers_embeddings, 3)
    recall_5 = calculate_recall_at_k(questions_embeddings, answers_embeddings, 5)

    print (f"MRR: {mrr}")
    print(f"Recall@5: {recall_5}")
    print(f"Recall@3: {recall_3}")
    print(f"Recall@1: {recall_1}")

In [4]:
def main():
    # Set up argparse to receive command-line arguments
    input_file = "/kaggle/input/rag-hydrogen/chunkers_final.pkl"
    output_dir = "/kaggle/working/output_model"

    epochs = 15
    learning_rate = 3e-5
    batch_size = 2
    # Check for GPU and use it
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Training on {device}...")

    # Read input file containing training sentences
    with open(input_file, 'rb') as file:
        documents = pickle.load(file)
        train_sentences = []
        for doc in documents:
            train_sentences.append(doc.text)

    # Define TSDAE model with "CLS" pooling and use fp16
    model_name = "BAAI/bge-base-en"
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Test base model
    test_model(model)
    
    # Create dataset with DenoisingAutoEncoder
    train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
    # DataLoader to split data into batches
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    # Loss function for TSDAE
    train_loss = losses.DenoisingAutoEncoderLoss(
        model, decoder_name_or_path=model_name, tie_encoder_decoder=True
    )
    

    for epoch in range(epochs):
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=1,
            weight_decay=0,
            scheduler="warmupcosine",
            optimizer_params={"lr": learning_rate},
            show_progress_bar=True,
            use_amp=True,
            checkpoint_path = "/kaggle/working/checkpoints",
        )
        # Test the model with Q&A testset
        test_model(model)
        # Save the model after each epoch
        model.save(f"{output_dir}/epoch_{epoch + 1}")
        print(f"Model after epoch {epoch + 1} saved to {output_dir}/epoch_{epoch + 1}")
        torch.cuda.empty_cache()
        

if __name__ == "__main__":
    main()

Training on cuda...


config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.5665369661008146
Recall@5: 0.6902654867256637
Recall@3: 0.6283185840707964
Recall@1: 0.45132743362831856


Some weights of BertLMHeadModel were not initialized from the model checkpoint at BAAI/bge-base-en and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'encoder.layer.0.crossattention.output.LayerNorm.bias', 'encoder.layer.0.crossattention.output.LayerNorm.weight', 'encoder.layer.0.crossattention.output.dense.bias', 'encoder.layer.0.crossattention.output.dense.weight', 'encoder.layer.0.crossattention.self.key.bias', 'encoder.layer.0.crossattention.self.key.weight', 'encoder.layer.0.crossattention.self.query.bias', 'encoder.layer.0.crossattention.self.query.weight', 'encoder.layer.0.crossattention.self.value.bias', 'encoder.layer.0.crossattention.self.value.weight', 'encoder.layer.1.crossattention.output.LayerNorm.bias', 'encoder.layer.1.crossattention.output.LayerNorm.weight', 'encoder.

Step,Training Loss
500,8.9358
1000,7.0686
1500,6.8356
2000,6.5893
2500,6.2424


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.3793369753085386
Recall@5: 0.5309734513274337
Recall@3: 0.36283185840707965
Recall@1: 0.25663716814159293
Model after epoch 1 saved to /kaggle/working/output_model/epoch_1


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,5.8511
1000,5.8099
1500,5.7222
2000,5.68
2500,5.5977


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.37689794283641787
Recall@5: 0.5752212389380531
Recall@3: 0.4247787610619469
Recall@1: 0.23008849557522124
Model after epoch 2 saved to /kaggle/working/output_model/epoch_2


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,5.4345
1000,5.4034
1500,5.3326
2000,5.3115
2500,5.2564


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.3805345491022564
Recall@5: 0.5752212389380531
Recall@3: 0.4247787610619469
Recall@1: 0.23008849557522124
Model after epoch 3 saved to /kaggle/working/output_model/epoch_3


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,5.1581
1000,5.1294
1500,5.063
2000,5.0467
2500,5.0048


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.37354630194199906
Recall@5: 0.584070796460177
Recall@3: 0.4336283185840708
Recall@1: 0.21238938053097345
Model after epoch 4 saved to /kaggle/working/output_model/epoch_4


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.9521
1000,4.9227
1500,4.8561
2000,4.8417
2500,4.8064


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.3861216089567209
Recall@5: 0.5752212389380531
Recall@3: 0.4336283185840708
Recall@1: 0.23008849557522124
Model after epoch 5 saved to /kaggle/working/output_model/epoch_5


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.7943
1000,4.7609
1500,4.6928
2000,4.6769
2500,4.6435


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.40026254486640356
Recall@5: 0.6283185840707964
Recall@3: 0.4336283185840708
Recall@1: 0.24778761061946902
Model after epoch 6 saved to /kaggle/working/output_model/epoch_6


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.6683
1000,4.6289
1500,4.5574
2000,4.5378
2500,4.5034


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.4051723221008959
Recall@5: 0.6371681415929203
Recall@3: 0.45132743362831856
Recall@1: 0.24778761061946902
Model after epoch 7 saved to /kaggle/working/output_model/epoch_7


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.5645
1000,4.5185
1500,4.4427
2000,4.4194
2500,4.3823


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.4113667815999733
Recall@5: 0.6637168141592921
Recall@3: 0.4778761061946903
Recall@1: 0.24778761061946902
Model after epoch 8 saved to /kaggle/working/output_model/epoch_8


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.4783
1000,4.4255
1500,4.3441
2000,4.316
2500,4.2751


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.41655480907511966
Recall@5: 0.6637168141592921
Recall@3: 0.4778761061946903
Recall@1: 0.25663716814159293
Model after epoch 9 saved to /kaggle/working/output_model/epoch_9


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.4055
1000,4.3448
1500,4.257
2000,4.2233
2500,4.1776


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.4173009222823665
Recall@5: 0.6637168141592921
Recall@3: 0.48672566371681414
Recall@1: 0.25663716814159293
Model after epoch 10 saved to /kaggle/working/output_model/epoch_10


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.3432
1000,4.2734
1500,4.1786
2000,4.1383
2500,4.0868


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.41724648735180825
Recall@5: 0.6637168141592921
Recall@3: 0.4690265486725664
Recall@1: 0.25663716814159293
Model after epoch 11 saved to /kaggle/working/output_model/epoch_11


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.289
1000,4.2094
1500,4.1068
2000,4.0596
2500,4.0016


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.42351198207200813
Recall@5: 0.6637168141592921
Recall@3: 0.48672566371681414
Recall@1: 0.26548672566371684
Model after epoch 12 saved to /kaggle/working/output_model/epoch_12


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.2401
1000,4.1513
1500,4.0404
2000,3.9856
2500,3.9208


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.42427386899464553
Recall@5: 0.6548672566371682
Recall@3: 0.504424778761062
Recall@1: 0.26548672566371684
Model after epoch 13 saved to /kaggle/working/output_model/epoch_13


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.197
1000,4.0981
1500,3.9781
2000,3.9155
2500,3.8445


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.42186447092772505
Recall@5: 0.6548672566371682
Recall@3: 0.48672566371681414
Recall@1: 0.26548672566371684
Model after epoch 14 saved to /kaggle/working/output_model/epoch_14


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,4.1597
1000,4.0491
1500,3.9195
2000,3.8485
2500,3.7711


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

MRR: 0.42652682164097044
Recall@5: 0.6283185840707964
Recall@3: 0.504424778761062
Recall@1: 0.2743362831858407
Model after epoch 15 saved to /kaggle/working/output_model/epoch_15
