# finetuning for similarity tasks
- STS Benchmark, widely used for semantic textual similarity, 8628 sentence pairs with similarity scores
- CrisisTransformers datasets: MNR loss with GooAQ (question/answer), QQP (Quora Question pairs, anchor/positive/hard negative), AllNLI (SNLI+MultiNLI anchor/entailment/contradiction)
- AllNLI and STSb datasets accessible with sentence-transformers, https://www.sbert.net/examples/datasets/README.html
- GooAQ and QQP accessible with HuggingFace, https://huggingface.co/datasets/gooaq, https://huggingface.co/datasets/embedding-data/QQP_triplets
- Natural Language Inference as the first fine-tuning step for sentence embedding methods, https://www.sbert.net/examples/training/nli/README.html
- note the distribution of labels???

In [1]:
import os, logging, torch
import gzip, csv
import random
from scipy import stats
from scipy.spatial.distance import cosine
from sentence_transformers import models, losses, datasets, util
from sentence_transformers import LoggingHandler, InputExample, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# for i in range(4):
#     torch.cuda.set_device(i)
#     torch.cuda.empty_cache()   # clear cache for each cuda
    
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()])

## download datasets 
# util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', 'data/allnli')
# util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', 'data/stsb')

# from datasets import load_dataset
# qqp = load_dataset('embedding-data/QQP_triplets', split='train').shuffle(seed=100)   
# iterable = iter(qqp)
# first_item = next(iterable)
# print(first_item)

# with open('data/qqp.txt', 'w', encoding='utf8') as file_out: 
#     for item in iter(qqp):
#         anchor = item['set']['query']
#         pos = random.choice(item['set']['pos'])
#         neg = random.choice(item['set']['neg'])
#         file_out.write('{},{},{}\n'.format(anchor, pos, neg))

In [3]:
## read data from allnli and qqp

def add_to_samples(sent1, sent2, label):
    if sent1 not in training_data.keys():
        training_data[sent1] = {"contradiction":set(), "entailment":set(), "neutral":set()}
    training_data[sent1][label].add(sent2)

training_data = {}
with gzip.open("../data/allnli", "rt", encoding="utf8") as file:
    reader = csv.DictReader(file, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        if row["split"] == "train":
            sent1 = row["sentence1"].strip()
            sent2 = row["sentence2"].strip()
            add_to_samples(sent1, sent2, row["label"])

training_samples = []
for sent1,value in training_data.items():
    if len(value['entailment']) > 0 and len(value['contradiction']) > 0:
        training_samples.append(InputExample(texts = [sent1, random.choice(list(value['entailment'])), random.choice(list(value['contradiction']))]))

with open('../data/qqp.txt', 'rt', encoding='utf8') as file_in:     
    lines = file_in.readlines()
    for line in lines:
        sents = line.split(',')
        training_samples.append(InputExample(texts=[sents[0], sents[1], sents[2]]))
    
print(len(training_samples))

378992


In [5]:
## read stsb as development set and test set

dev_samples = []
test_samples = []
with gzip.open('../data/stsb', "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row["score"]) / 5.0
        if row["split"] == "dev":
            dev_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
        elif row["split"] == "test":
            test_samples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))

print(len(dev_samples), len(test_samples))

1500 1379


In [None]:
matbert = models.Transformer('matbert-base-cased', max_seq_length=75)
pooling = models.Pooling(matbert.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[matbert, pooling], device='cuda:2')

dataloader = datasets.NoDuplicatesDataLoader(training_samples, batch_size=256)   # usually larger batch size leads to better result
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=256, name="sts-dev")
loss = losses.MultipleNegativesRankingLoss(model)

num_epochs = 20
warmup_steps = int(len(dataloader) * num_epochs * 0.1)
output_path = 'outputs/matbert_mnr_triplet'

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
model.fit(
    train_objectives = [(dataloader, loss)],
    evaluator = dev_evaluator,   # evaluates the model after each epoch during training on development set to determine the best model to be saved
    evaluation_steps = int(len(dataloader)),   # evaluated after 1480 steps, len(dataloader) is the number of batches
    save_best_model = True,
    epochs = num_epochs,
    warmup_steps = warmup_steps,
    output_path = output_path,
    use_amp = True
)

In [None]:
## evaluation

tuned_model = SentenceTransformer(output_path)

# stsb test samples
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=512, name="sts-test")
test_evaluator(tuned_model, output_path=output_path)   # results written in csv

# spearman
with open('data/zt_ori_84.txt', 'r') as f:
    lines = f.readlines()
    names = [line.strip().split('\t')[0] for line in lines]
    zt_scores = [line.strip().split('\t')[-1] for line in lines]
    
center_embedding = tuned_model.encode('thermoelectric')
name_embeddings = tuned_model.encode(names)
cos_sims = [(1-cosine(center_embedding,name_embedding)) for name_embedding in name_embeddings]
corr, pvalue = stats.spearmanr(cos_sims, zt_scores)
print('spearman correlation', corr)

In [None]:
# stopped after epoch 10, best result (stsb dev 0.85) at epoch 5 saved to matbert_mnr_triplet