In [1]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader
from peft import LoraConfig, get_peft_model, TaskType
import torch
from datasets import load_dataset
from tqdm import tqdm
import pickle
import random
from datasets import Dataset
import os
from matplotlib import pyplot as plt
from IPython.display import clear_output
import torch
import gc
from uuid import uuid4
import json
import numpy as np
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerModelCardData,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.evaluation import NanoBEIREvaluator
from sentence_transformers.losses import CachedMultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

# Set the TOKENIZERS_PARALLELISM environment variable to 'false' for this process.
# NOTE(review): presumably this avoids the tokenizers fork/parallelism warning when
# DataLoader worker processes are used (dataloader_num_workers=8 below) - confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [2]:
# Single experiment configuration; later cells fill in the 'data', 'LoRA' and
# 'training' sections and the whole dict is dumped to config.json at the end.
config = {}

### Подготовка данных

##### Конфигурация данных

In [3]:
# Data-selection settings. Every value except USE_ALL_DATA becomes part of the
# pickle file names built in the loading cell, so the key spellings
# ('...MARGINE', 'HARG_...') are kept exactly as the rest of the notebook reads them.
config['data'] = dict(
    SYNT_TYPE='single',
    POSITIVE_THRESHOLD=83,
    NEGATIVE_MINING_MARGINE='001',
    NUM_NEGATIVES=1,
    HARG_NEGATIVES_THRESHOLD=75,
    # True -> train on hard + soft negatives; False -> hard negatives only.
    USE_ALL_DATA=True,
)

##### Загрузка данных

In [4]:
data_config = config['data']


def _load_negatives(kind):
    """Load one pickled list of training examples.

    Args:
        kind: File-name prefix, 'soft' or 'hard'.

    Returns:
        The object stored in ``data/<kind>_negatives_<settings>.pkl``, where
        ``<settings>`` is built from the values in ``data_config``.
    """
    name_parts = (
        data_config['SYNT_TYPE'],
        data_config['POSITIVE_THRESHOLD'],
        data_config['NEGATIVE_MINING_MARGINE'],
        data_config['NUM_NEGATIVES'],
        data_config['HARG_NEGATIVES_THRESHOLD'],
    )
    suffix = '_'.join(str(part) for part in name_parts)
    path = 'data/{}_negatives_{}.pkl'.format(kind, suffix)
    # NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
    # The context manager closes the file handle as soon as loading finishes.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


soft_negatives = _load_negatives('soft')
hard_negatives = _load_negatives('hard')

if data_config['USE_ALL_DATA']:
    train_data = hard_negatives + soft_negatives
else:
    # Copy, so the in-place shuffle below does not reorder hard_negatives itself.
    train_data = list(hard_negatives)

random.shuffle(train_data)

# Each example is indexed as (anchor, positive, negative).
train_dataset = Dataset.from_dict({
    'anchor': [example[0] for example in train_data],
    'positive': [example[1] for example in train_data],
    'negative': [example[2] for example in train_data],
})


##### Конфигурация LoRA

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# LoRA hyperparameters are stored in the shared config so they end up in config.json.
config['LoRA'] = dict(
    MODEL_NAME="e5_large",
    LORA_R=16,
    LORA_ALPHA=32,
    LORA_DP=0.05,
)

# Adapter configuration built from the values stored above.
LORA_CONFIG = LoraConfig(
    r=config['LoRA']['LORA_R'],
    lora_alpha=config['LoRA']['LORA_ALPHA'],
    lora_dropout=config['LoRA']['LORA_DP'],
    bias="none",
    task_type=TaskType.FEATURE_EXTRACTION,
    inference_mode=False,
    # modules_to_save=["pooling"]
)

In [6]:
def _count_parameters(module, trainable_only=False):
    """Return the number of scalar parameters in ``module``.

    Args:
        module: Any object exposing ``parameters()`` that yields tensors.
        trainable_only: When True, count only parameters with ``requires_grad`` set.
    """
    return sum(
        p.numel()
        for p in module.parameters()
        if p.requires_grad or not trainable_only
    )


model = SentenceTransformer(config['LoRA']['MODEL_NAME'], device=DEVICE)
before_num_params = _count_parameters(model)
model.add_adapter(LORA_CONFIG)
after_num_params = _count_parameters(model)
# Count trainable parameters directly instead of assuming they equal the number of
# parameters added by the adapter; the two differ whenever base weights stay trainable
# or the adapter adds frozen parameters.
trainable_num_params = _count_parameters(model, trainable_only=True)
print('Параметров до добавления LoRA:', before_num_params)
print('Параметров после добавления LoRA:', after_num_params)
print('Всего обучаемых параметров:', trainable_num_params)


Параметров до добавления LoRA: 559890432
Параметров после добавления LoRA: 561463296
Всего обучаемых параметров: 1572864


##### Параметры обучения

In [7]:
# Optimisation settings; read when the loss, dataloader and trainer arguments are built.
config['training'] = dict(
    LR=2e-5,
    WEIGHT_DECAY=0.01,
    MARGIN=0.2,
    EPOCHS=5,
    # ALPHA=0.1,
    BATCH_SIZE=128,
    ACCUMULATION_STEPS=2,
)

In [8]:
def prepare_batch_for_triplet_loss(batch):
    """Tokenize the three text columns of a batch for a triplet loss.

    Args:
        batch: Mapping with 'anchor', 'positive' and 'negative' entries, each holding
            the texts passed to ``model.tokenize``.

    Returns:
        A list of three dicts (anchor, positive, negative order), each holding the
        'input_ids' and 'attention_mask' produced by the global ``model``'s tokenizer
        and moved to ``model.device``.
    """
    def _encode(texts):
        # Tokenize, move every returned value to the model's device, then keep only
        # the two entries that are returned to the caller.
        encoded = model.tokenize(texts)
        on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}
        return {
            'input_ids': on_device['input_ids'],
            'attention_mask': on_device['attention_mask'],
        }

    # Read all three columns up front so a missing key fails before any tokenization.
    columns = [batch[key] for key in ('anchor', 'positive', 'negative')]
    return [_encode(texts) for texts in columns]

In [9]:
# Triplet objective using Euclidean distance and the margin from the training config.
triplet_loss = losses.TripletLoss(
    model=model,
    distance_metric=losses.TripletDistanceMetric.EUCLIDEAN,
    triplet_margin=config['training']['MARGIN'],
)
# Cosine-similarity objective bound to the same model.
cosine_loss = losses.CosineSimilarityLoss(model=model)

# Shuffling loader over the triplet dataset, batched per the training config.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=config['training']['BATCH_SIZE'],
)


In [10]:
# Fresh output directory per run: a random UUID keeps runs from overwriting each other.
model_path = "LoRA_st_training/" + str(uuid4())

In [11]:
# Trainer arguments: values that are tuned per experiment come from config['training'];
# the remaining ones are fixed for this notebook.
args = SentenceTransformerTrainingArguments(
    # Where checkpoints are written and how many are kept.
    output_dir=model_path,
    save_steps=3000,
    save_total_limit=2,
    # Optimisation schedule.
    num_train_epochs=config['training']['EPOCHS'],
    learning_rate=config['training']['LR'],
    warmup_ratio=0.1,
    per_device_train_batch_size=config['training']['BATCH_SIZE'],
    gradient_accumulation_steps=config['training']['ACCUMULATION_STEPS'],
    # Runtime behaviour: fp16 is requested only when CUDA is available.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=8,
    disable_tqdm=False,
)

Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.


In [12]:
# Bind the adapter-equipped model, the triplet dataset and the triplet loss into a trainer.
trainer = SentenceTransformerTrainer(
    args=args,
    model=model,
    loss=triplet_loss,
    train_dataset=train_dataset,
)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2025-07-10 18:08:04,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


df: /home/datalab/.triton/autotune: No such file or directory
/usr/local/cuda/lib64/libcufile.so: undefined reference to `dlvsym'
/usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status


In [None]:
# Write the run configuration before training starts, so it is kept on disk even if
# training is interrupted part-way. The directory is created explicitly because
# nothing earlier is guaranteed to have created it.
os.makedirs(model_path, exist_ok=True)
with open(model_path + '/config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, ensure_ascii=False, indent=4)

trainer.train()

# NOTE(review): there is no path separator before 'final', so the model is saved to a
# sibling directory named '<run-id>final' rather than '<run-id>/final'. Left unchanged
# in case other code loads from this exact path - confirm and fix if unintended.
model.save_pretrained(model_path + 'final')


Step,Training Loss


KeyboardInterrupt: 