# Train Transformer Models V1

Used to train transformer models by creating own metric for similarity between abstracts and using those to backpropagate for abstract-title pairs.

Simply use a multiple negative triplet loss and train it, giving only positive pairs beeing title-abstract.

In [1]:
from importlib import reload

import numpy as np
from datasets import Dataset

import save_load_models as save_load
import vector_search as vs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
reload(save_load)

sPubMedBert = './Models/SPubMedBERT'

model = save_load.load_model_sentencetransformer(sPubMedBert)

In [3]:
save_dir = './Models/Finetunedv1/SPubMedBERT/'

# Train_Data_Subset
# Train_Data_10k
dataDir = './Data/Train_Data_Subset/'

## Training-Dataset

In [4]:
reload(vs)

# Get Text passages from subset
text_passages, ref_list = vs.get_title_abstract_from_dir(dataDir)
# Take only title text
title_text, title_text_ref = vs.filter_embedding_types(text_passages, ref_list, embedding_types=['Title'])
# Take only abstract text
abstract_text, abstract_text_ref = vs.filter_embedding_types(text_passages, ref_list, embedding_types=['Abstract'])


dataset = Dataset.from_dict({
    "anchor": title_text,
    "positive": abstract_text,
})

# Index of end of train
idx_1 = int(len(dataset)*0.8)
idx_2 = int(len(dataset)*0.9)
train_dataset = Dataset.from_dict({
    "anchor": title_text[:idx_1],
    "positive": abstract_text[:idx_1],
})
eval_dataset = Dataset.from_dict({
    "anchor": title_text[idx_1:idx_2],
    "positive": abstract_text[idx_1:idx_2],
})
test_dataset = Dataset.from_dict({
    "anchor": title_text[idx_2:],
    "positive": abstract_text[idx_2:],
})
'''
'''
train_dataset = train_dataset.shuffle()

In [5]:
print(dataset)

Dataset({
    features: ['anchor', 'positive'],
    num_rows: 20000
})


## Eval/Test-Dataset

In [39]:
from datasets import load_dataset

eval_dataset = load_dataset("tabilab/biosses")

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'score'],
        num_rows: 100
    })
})


In [54]:
def divide_score(example):
    example['score'] = example['score'] / 4
    return example
print(eval_dataset['train'][0]['score'])
new_dataset = eval_dataset.map(divide_score)
print(new_dataset['train'][0:10]['score'])

2.200000047683716
[0.550000011920929, 0.800000011920929, 0.5, 0.699999988079071, 0.6000000238418579, 0.75, 0.05000000074505806, 1.0, 0.75, 0.800000011920929]


## Train

In [6]:
import os
from sentence_transformers.losses import MultipleNegativesSymmetricRankingLoss
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

loss = MultipleNegativesSymmetricRankingLoss(model)

args = SentenceTransformerTrainingArguments(
    output_dir=save_dir+"/checkpoints",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay= 0.01,
    adam_epsilon=5e-06,
    warmup_ratio=0.1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=300,
)
args = args.set_dataloader(train_batch_size=32)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    loss=loss,
)
trainer.train()


# Save the trained model
if not os.path.isdir(save_dir+"/end_model"):
        os.makedirs(save_dir+"/end_model")
model.save_pretrained(save_dir+"/end_model")

Step,Training Loss
300,0.3217
600,0.0927
900,0.0011
1200,0.0248


                                                                                                                             