In [1]:
import os
import json
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer, InputExample, losses, LoggingHandler
from torch.utils.data import DataLoader
import logging
import math
import torch


logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])


DATA_PROCESSED_PATH = "../data/processed"
TRAINING_DATA_FILE = "retriever_training_data.jsonl"
MODEL_BASE_NAME = "BAAI/bge-base-en-v1.5" # retriver for fine-tuning
FINETUNED_MODEL_SAVE_PATH = "../models/retriever_finetuned_bge_base"


NUM_EPOCHS = 3
TRAIN_BATCH_SIZE = 16

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")


training_data_path = os.path.join(DATA_PROCESSED_PATH, TRAINING_DATA_FILE)
os.makedirs(FINETUNED_MODEL_SAVE_PATH, exist_ok=True)

if not os.path.exists(training_data_path):
    raise FileNotFoundError(f"Training file not found: {training_data_path}")

Using device: cuda


In [2]:
train_samples = []
with open(training_data_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Loading training data"):
        data = json.loads(line)
        train_samples.append(InputExample(texts=[data['query'], data['passage']], label=float(data['label'])))

if not train_samples:
    raise ValueError("Training data not loaded")

Loading training data: 0it [00:00, ?it/s]

In [3]:
model = SentenceTransformer(MODEL_BASE_NAME, device=DEVICE)
print("Base model loaded.")

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=TRAIN_BATCH_SIZE)

train_loss = losses.CosineSimilarityLoss(model=model)


2025-06-02 21:41:08 - Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5
Base model loaded.


In [4]:
warmup_steps = math.ceil(len(train_dataloader) * NUM_EPOCHS * 0.1) 
logging.info(f"Warm up steps: {warmup_steps}")

print("\nStarted retriver fine-tuningu")
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=NUM_EPOCHS,
          warmup_steps=warmup_steps,
          output_path=FINETUNED_MODEL_SAVE_PATH,
          show_progress_bar=True,
          # optimizer_class=torch.optim.AdamW # default
          weight_decay=0.01,
          save_best_model=True,
          checkpoint_path=os.path.join(FINETUNED_MODEL_SAVE_PATH, "checkpoints"),
          checkpoint_save_steps=int(len(train_dataloader) * 0.5),
          checkpoint_save_total_limit=3
         )

print("Fine-tuning finished")

2025-06-02 21:41:12 - Warm up steps: 106

Started retriver fine-tuningu


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1336
1000,0.077


2025-06-02 21:41:57 - Saving model checkpoint to ../models/retriever_finetuned_bge_base/checkpoints/checkpoint-175
2025-06-02 21:41:57 - Save model to ../models/retriever_finetuned_bge_base/checkpoints/checkpoint-175
2025-06-02 21:42:42 - Saving model checkpoint to ../models/retriever_finetuned_bge_base/checkpoints/checkpoint-350
2025-06-02 21:42:42 - Save model to ../models/retriever_finetuned_bge_base/checkpoints/checkpoint-350
2025-06-02 21:43:27 - Saving model checkpoint to ../models/retriever_finetuned_bge_base/checkpoints/checkpoint-525
2025-06-02 21:43:27 - Save model to ../models/retriever_finetuned_bge_base/checkpoints/checkpoint-525
2025-06-02 21:44:12 - Saving model checkpoint to ../models/retriever_finetuned_bge_base/checkpoints/checkpoint-700
2025-06-02 21:44:12 - Save model to ../models/retriever_finetuned_bge_base/checkpoints/checkpoint-700
2025-06-02 21:44:56 - Saving model checkpoint to ../models/retriever_finetuned_bge_base/checkpoints/checkpoint-875
2025-06-02 21:44:

In [5]:
final_model_path = os.path.join(FINETUNED_MODEL_SAVE_PATH, "final_model")
model.save(final_model_path)
print(f"Last epoch model saved to: {FINETUNED_MODEL_SAVE_PATH}")

2025-06-02 21:45:46 - Save model to ../models/retriever_finetuned_bge_base/final_model
Last epoch model saved to: ../models/retriever_finetuned_bge_base
