# Train Transformer Models V2

Fine-tunes transformer models with a custom similarity metric: similarities between abstracts are computed first and then used as target scores that are backpropagated through abstract-title pairs.

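For now: for a given abstract, pair it with its own title and use a target similarity close to 1.
Then pick 2-3 other abstracts (either at random, or one close, one mid-range and one far away) and use each one's similarity to the given abstract as the target score between that abstract's title and the given abstract (the code below samples `n` other abstracts at random).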

In [1]:
from importlib import reload

import numpy as np
import random
from datasets import Dataset

import save_load_models as save_load
import vector_search as vs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Re-import the helper module so that edits made to it on disk take effect
# without restarting the kernel.
reload(save_load)

# Base checkpoint that the training cell further down fine-tunes.
base_model_dir = './Models/SPubMedBERT'
model_finetune = save_load.load_model_sentencetransformer(base_model_dir)

## Create Dataset

Uses the already existing data to derive target similarities for the model.
For now the same model provides the targets (this might not change much, but it is easier because the similarities are in the same numeric range).

In [3]:
# Re-import the helper module so that edits made to it on disk take effect.
reload(vs)

# The embeddings come from the same model that is being fine-tuned.
# Available data directories: Train_Data_Subset, Train_Data_10k
embeddings_dir = './Embeddings/SPubMedBERT/Train_Data_Subset/'
loaded = vs.add_embeddings_and_text_to_nparray_from_dir(embeddings_dir)
embeddings_label, text_label, ref_list_label = loaded


def _keep_only(values, kind):
    """Call vs.filter_embedding_types on `values` for the single type `kind`."""
    return vs.filter_embedding_types(values, ref_list_label, embedding_types=[kind])


# NOTE(review): the pairing loop further down indexes title_text and
# abstract_text with the same position, so it presumably relies on
# filter_embedding_types keeping both outputs in the same order - confirm.
title_text, title_text_ref = _keep_only(text_label, 'Title')
abstract_text, abstract_text_ref = _keep_only(text_label, 'Abstract')
embeddings_abstr, ref_list_abstr = _keep_only(embeddings_label, 'Abstract')

# Similarity between the abstract embeddings (unscaled); later indexed as
# all_similarity[i][j].
all_similarity = vs.calc_all_embedding_similarity(embeddings_abstr, scaled=False)


# Number of randomly drawn "other" abstracts paired with every abstract.
n = 5
# A fixed seed keeps the sampled pairs (and therefore the training data)
# identical between runs; a private RNG leaves the global `random` state alone.
sampling_seed = 42
rng = random.Random(sampling_seed)

titles = []
abstracts = []
sim_scores = []

num_abstracts = all_similarity.shape[0]
# Titles, abstracts and similarity rows are matched purely by position, so a
# length mismatch would otherwise yield wrong rows or a late IndexError.
if not (len(title_text) == len(abstract_text) == num_abstracts):
    raise ValueError(
        f"Misaligned inputs: {len(title_text)} titles, "
        f"{len(abstract_text)} abstracts, {num_abstracts} similarity rows"
    )

# Loop through abstracts
for abstr_idx in range(num_abstracts):
    # Own title paired with the abstract, scored with the self-similarity
    # (could just use 1 or 0.999 as sim_score here).
    titles.append(title_text[abstr_idx])
    abstracts.append(abstract_text[abstr_idx])
    sim_scores.append(all_similarity[abstr_idx][abstr_idx])

    # With a single abstract there is no other abstract to sample from
    # (randint(0, -1) would raise ValueError).
    if num_abstracts < 2:
        continue

    # Sample n other abstracts and pair their titles with this abstract,
    # using the abstract-abstract similarity as the target score.
    for _ in range(n):
        # Uniform draw over all indices except abstr_idx: sample from the
        # num_abstracts - 1 remaining slots, then shift everything at or
        # after abstr_idx up by one.
        rand_idx = rng.randint(0, num_abstracts - 2)
        rand_idx += int(rand_idx >= abstr_idx)
        titles.append(title_text[rand_idx])
        abstracts.append(abstract_text[abstr_idx])
        sim_scores.append(all_similarity[abstr_idx][rand_idx])


def _rows_to_dataset(start=None, stop=None):
    """Build a Dataset from the (title, abstract, score) rows in [start, stop)."""
    return Dataset.from_dict({
        "sentence_A": titles[start:stop],
        "sentence_B": abstracts[start:stop],
        "label": sim_scores[start:stop],
    })


# Fixed seed so the row order is the same on every run.
shuffle_seed = 42

# All rows; note that this also contains the eval and test rows below.
dataset = _rows_to_dataset()

# Split boundaries: first 80 % train, next 10 % eval, last 10 % test.
# The split is taken on the rows in their original (unshuffled) order.
idx_1 = int(len(dataset)*0.8)
idx_2 = int(len(dataset)*0.9)
train_dataset = _rows_to_dataset(stop=idx_1)
eval_dataset = _rows_to_dataset(idx_1, idx_2)
test_dataset = _rows_to_dataset(start=idx_2)

# Only the datasets used for training are shuffled.
train_dataset = train_dataset.shuffle(seed=shuffle_seed)
dataset = dataset.shuffle(seed=shuffle_seed)

In [5]:
# Quick sanity check: shows the dataset's column names and number of rows.
print(dataset)

Dataset({
    features: ['sentence_A', 'sentence_B', 'label'],
    num_rows: 113754
})


## Implement Training

In [8]:
import os
from sentence_transformers.losses import CosineSimilarityLoss, CoSENTLoss
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

save_dir = './Models/Finetunedv2/SPubMedBERT_batch/'
# save_dir already ends with "/"; os.path.join avoids the doubled separator
# that plain string concatenation would produce.
checkpoint_dir = os.path.join(save_dir, "checkpoints")
end_model_dir = os.path.join(save_dir, "end_model")

# CoSENTLoss is the active objective; CosineSimilarityLoss is the alternative.
#loss = CosineSimilarityLoss(model_finetune)
loss = CoSENTLoss(model_finetune)


args = SentenceTransformerTrainingArguments(
    output_dir=checkpoint_dir,
    num_train_epochs=2,
    learning_rate=1e-5,
    weight_decay=0.01,
    adam_epsilon=5e-06,
    warmup_ratio=0.1,

    # Keep only the most recent per-epoch checkpoint on disk.
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=300,
)
args = args.set_dataloader(train_batch_size=16)

# NOTE(review): this trains on the full `dataset`, which also contains the
# rows of eval_dataset and test_dataset. Those splits therefore cannot serve
# as held-out data for a model trained here - confirm this is intended.
trainer = SentenceTransformerTrainer(
    model=model_finetune,
    args=args,
    train_dataset=dataset,
    loss=loss,
)
trainer.train()


# Save the trained model
os.makedirs(end_model_dir, exist_ok=True)
model_finetune.save_pretrained(end_model_dir)

# Disabled variant: trains on train_dataset only, evaluates on eval_dataset
# during training and on test_dataset afterwards. It is kept inside a string
# so that it does not run.
"""
args = SentenceTransformerTrainingArguments(
    output_dir=save_dir+"/checkpoints",
    num_train_epochs=2,
    learning_rate=2e-5,
    adam_epsilon=5e-06,
    warmup_ratio=0.1,
    weight_decay= 0.01,

    
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=100,
)

dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=eval_dataset["sentence_A"],
    sentences2=eval_dataset["sentence_B"],
    scores=eval_dataset["label"],
    main_similarity=SimilarityFunction.COSINE,
)
dev_evaluator(model_finetune)

trainer = SentenceTransformerTrainer(
    model=model_finetune,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

# Evaluate the trained model on the test set
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["sentence_A"],
    sentences2=test_dataset["sentence_B"],
    scores=test_dataset["label"],
    main_similarity=SimilarityFunction.COSINE,
)
test_evaluator(model_finetune)

# Save the trained model
if not os.path.isdir(save_dir+"/end_model"):
        os.makedirs(save_dir+"/end_model")
model_finetune.save_pretrained(save_dir+"/end_model")
"""

OutOfMemoryError: CUDA out of memory. Tried to allocate 66.00 MiB. GPU 0 has a total capacity of 44.35 GiB of which 17.00 MiB is free. Process 3293934 has 30.75 GiB memory in use. Including non-PyTorch memory, this process has 13.56 GiB memory in use. Of the allocated memory 13.13 GiB is allocated by PyTorch, and 116.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)