In [None]:
! pip install -U FlagEmbedding[finetune]

## Fine-tune

Below are the arguments for fine-tuning:

The following arguments are for model:
- `model_name_or_path`: The model checkpoint for initialization.
- `config_name`: Pretrained config name or path if not the same as model_name.
- `tokenizer_name`: Pretrained tokenizer name or path if not the same as model_name.
- `cache_dir`: Where do you want to store the pre-trained models downloaded from s3.
- `trust_remote_code`: Trust remote code
- `token`: The token to use when accessing the model.

The following arguments are for data:
- `train_data`: One or more paths to training data. `query: str`, `pos: List[str]`, `neg: List[str]` are required in the training data. Argument type: multiple.
- `cache_path`: Where do you want to store the cached data.
- `train_group_size`: The number of passages used per query in training: one positive plus `train_group_size - 1` negatives.
- `query_max_len`: The maximum total input sequence length after tokenization for query. Sequences longer than this will be truncated.
- `passage_max_len`: The maximum total input sequence length after tokenization for passage. Sequences longer than this will be truncated.
- `pad_to_multiple_of`: If set will pad the sequence to be a multiple of the provided value.
- `max_example_num_per_dataset`: The max number of examples for each dataset.
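- `query_instruction_for_retrieval`: Instruction for query. (The reranker fine-tuning command in this notebook passes the instruction as `--query_instruction_for_rerank`.)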
- `query_instruction_format`: Format for query instruction.
- `knowledge_distillation`: Use knowledge distillation when `pos_scores: List[float]` and `neg_scores: List[float]` are in features of training data.
- `passage_instruction_for_retrieval`: Instruction for passage.
- `passage_instruction_format`: Format for passage instruction.
- `shuffle_ratio`: The ratio of shuffling the text.
- `same_dataset_within_batch`: All samples in the same batch come from the same dataset.
- `small_threshold`: The threshold for a small dataset. All small datasets in the same directory will be merged into one dataset.
- `drop_threshold`: The threshold for dropping the merged small dataset. If the number of examples in the merged small dataset is less than this threshold, it will be dropped.

And the following extra arguments:
- `negatives_cross_device`: Share negatives across devices.
- `temperature`: Temperature used for similarity score.
- `fix_position_embedding`: Freeze the parameters of position embeddings.
- `sentence_pooling_method`: The pooling method. Available options: cls, mean, last_token. Default: cls.
- `normalize_embeddings`: Whether to normalize the embeddings.
- `sub_batch_size`: Sub batch size for training.
- `kd_loss_type`: The loss type for knowledge distillation. Available options: kl_div, m3_kd_loss. Default: kl_div.

In [None]:
import getpass
import os

import wandb

# Authenticate with Weights & Biases without storing the secret in the notebook.
# The key is taken from the WANDB_API_KEY environment variable; when that is not
# set, it is requested interactively (the input is not echoed or saved in the cell).
# NOTE: an API key that was ever committed in a notebook should be revoked and
# regenerated in the W&B account settings.
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")
wandb.login(key=wandb_api_key)

In [None]:
# 1) Completely uninstall flash-attn (-y skips the confirmation prompt)
!pip uninstall -y flash-attn

# 2) Set an environment variable intended to disable the Flash-Attention import.
#    NOTE(review): confirm that the installed libraries actually read
#    HF_NO_FLASH_ATTN; if they do not, this line has no effect.
%env HF_NO_FLASH_ATTN=1


In [None]:
%%bash
# Fine-tune the BAAI/bge-reranker-v2-m3 reranker with FlagEmbedding on 2 processes
# (one per GPU) launched through torchrun.
#
# Effective batch per optimizer step:
#   2 processes x 8 samples per device x 16 gradient-accumulation steps = 256 samples,
#   each sample being a group of 12 passages (--train_group_size).
#
# The output directory is given as an absolute path so that checkpoints always land
# in the same place regardless of the working directory the cell is run from.
# NOTE(review): --bf16 requires GPUs with bfloat16 support — confirm for the
# accelerator attached to this session.
OUTPUT_DIR=/kaggle/working/test_encoder_only_base_bge-reranker-v2-m3

torchrun --nproc_per_node 2 \
  -m FlagEmbedding.finetune.reranker.encoder_only.base \
  --model_name_or_path BAAI/bge-reranker-v2-m3 \
  --cache_dir /kaggle/working/cache/model \
  --train_data /kaggle/input/ft-data/training_512.json \
  --cache_path /kaggle/working/cache/data \
  --train_group_size 12 \
  --query_max_len 512 \
  --passage_max_len 1024 \
  --pad_to_multiple_of 8 \
  --query_instruction_for_rerank 'Với vai trò là một chuyên gia pháp luật, hãy tìm kiếm các điều khoản, quy định pháp luật có liên quan trực tiếp đến vấn đề: ' \
  --query_instruction_format '{}{}' \
  --knowledge_distillation False \
  --output_dir "$OUTPUT_DIR" \
  --overwrite_output_dir \
  --learning_rate 5e-6 \
  --bf16 \
  --num_train_epochs 20 \
  --per_device_train_batch_size 8 \
  --gradient_accumulation_steps 16 \
  --dataloader_drop_last True \
  --warmup_ratio 0.1 \
  --deepspeed /kaggle/input/ft-data/ds_stage0.json \
  --logging_steps 1 \
  --save_steps 500 \
  --gradient_checkpointing \
  --save_total_limit 3 \
  --report_to wandb \
  --run_name "finetune-bge-$(date +%Y%m%d_%H%M)"


In [None]:
import glob
import os
import shutil

# Absolute path of the directory that holds the checkpoint-* folders.
OUTPUT_DIR = '/kaggle/working/test_encoder_only_base_bge-reranker-v2-m3'


def _checkpoint_sort_key(path):
    """Order 'checkpoint-<step>' paths by numeric step; non-numeric suffixes go last."""
    suffix = os.path.basename(path).rsplit('-', 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), '')
    return (1, 0, suffix)


def find_checkpoint_dirs(output_dir):
    """Return the checkpoint-* directories directly inside ``output_dir``.

    Paths are absolute whenever ``output_dir`` is, so the kernel's working
    directory never has to be changed. Plain files matching the pattern (for
    example previously created ``checkpoint-*.zip`` archives) are ignored.

    Args:
        output_dir: Directory to search.

    Returns:
        List of directory paths sorted by checkpoint step; empty when
        ``output_dir`` does not exist or contains no checkpoint folders.
    """
    pattern = os.path.join(output_dir, 'checkpoint-*')
    matches = [path for path in glob.glob(pattern) if os.path.isdir(path)]
    return sorted(matches, key=_checkpoint_sort_key)


def zip_checkpoints(checkpoint_dirs):
    """Zip every checkpoint folder into a sibling ``<folder>.zip`` archive.

    Args:
        checkpoint_dirs: Paths of the checkpoint folders to archive.

    Returns:
        List with the path of each archive created, in input order.
    """
    archives = []
    for checkpoint_dir in checkpoint_dirs:
        # base_name == folder path -> the archive is written next to the folder;
        # root_dir == folder path  -> entries inside the zip are relative to it.
        zip_name = shutil.make_archive(checkpoint_dir, 'zip', root_dir=checkpoint_dir)
        print(f"Đã tạo archive: {zip_name}")
        archives.append(zip_name)
    return archives


# List the checkpoint-* folders that were found
checkpoint_dirs = find_checkpoint_dirs(OUTPUT_DIR)
if not checkpoint_dirs:
    print("Không tìm thấy folder nào thỏa pattern 'checkpoint-*'")
else:
    print(f"Đã tìm thấy {len(checkpoint_dirs)} checkpoint(s):")
    for checkpoint_dir in checkpoint_dirs:
        print("  -", os.path.basename(checkpoint_dir))

# 1) Zip each checkpoint-xxx folder into its own archive
checkpoint_archives = zip_checkpoints(checkpoint_dirs)
