In [None]:
import pandas as pd
from tqdm import tqdm
from sentence_transformers import InputExample, datasets
from sentence_transformers import models, losses, SentenceTransformer

# import torch
# for i in range(4):
#     torch.cuda.set_device(i)
#     torch.cuda.empty_cache()

## model
# Checkpoint written by the first fine-tuning step; it is the starting point
# for the further fine-tuning below.
model_path = 'outputs/matbert_mnr_triplet'
model = SentenceTransformer(
    model_name_or_path=model_path,
    device='cuda:2',
)

## data
df = pd.read_csv('data/formulae_synonyms_descriptions.csv')
# Drop rows whose description is the 'NO DESCRIPTION' placeholder.
data = df[df['description'] != 'NO DESCRIPTION']
form_syn = data[['formula', 'synonym']]
syn_desc = data[['synonym', 'description']]
form_desc = data[['formula', 'description']]
# pd.concat aligns frames on column *labels*. Concatenating the three frames
# as-is yields a three-column frame (formula, synonym, description) padded
# with NaN, so reading positions 0 and 1 of each row would pair real text
# with a missing value for two thirds of the rows. Give every frame the same
# two positional labels first so each row holds exactly one (a, b) text pair.
pair_frames = []
for frame in (form_syn, syn_desc, form_desc):
    relabeled = frame.copy()
    relabeled.columns = ['text_a', 'text_b']
    pair_frames.append(relabeled)
data_pairs = pd.concat(pair_frames)
def extract_pairs(row):
    """Build an InputExample from the first two positional values of `row`.

    Both values are converted with ``str`` before being stored as the
    example's ``texts``.
    """
    first, second = row.iloc[0], row.iloc[1]
    return InputExample(texts=[str(first), str(second)])
batch_size = 32
# One InputExample per row of data_pairs.
pair_examples = data_pairs.apply(extract_pairs, axis=1)
samples = list(pair_examples)
dataloader = datasets.NoDuplicatesDataLoader(
    samples,
    batch_size=batch_size,
)

## training
output_path = 'outputs/matbert_further_finetuning'
epochs = 5
loss = losses.MultipleNegativesRankingLoss(model)
# warmup_steps is 10% of (batches per epoch * number of epochs).
total_steps = len(dataloader) * epochs
warmup_steps = int(total_steps * 0.1)
model.fit(
    train_objectives=[(dataloader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path=output_path,
    show_progress_bar=True,
)

## spearman
from scipy import stats
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

# Each non-blank line is tab-separated: the first field is the name to embed
# and the last field is its score.
with open('data/zt_ori_84.txt', 'r') as f:
    lines = f.readlines()
rows = [line.strip().split('\t') for line in lines if line.strip()]
names = [row[0] for row in rows]
# The scores are read as text; convert them to float so the rank correlation
# orders them by numeric value instead of treating them as strings.
zt_scores = [float(row[-1]) for row in rows]

# Reload the fine-tuned model from disk and compare every name embedding
# with the embedding of the query word.
tuned_model = SentenceTransformer(output_path)
center_embedding = tuned_model.encode('thermoelectric')
name_embeddings = tuned_model.encode(names)
# scipy's `cosine` is a distance, so similarity = 1 - distance.
cos_sims = [1 - cosine(center_embedding, embedding) for embedding in name_embeddings]
corr, pvalue = stats.spearmanr(cos_sims, zt_scores)
print('spearman correlation', corr)

## spearman correlation recorded from a previous run: 0.3144679558570416