# Fine-tuning

In [1]:
import pandas as pd
from tqdm import tqdm
from sentence_transformers import InputExample, datasets
from sentence_transformers import models, losses, SentenceTransformer
from sentence_transformers import LoggingHandler
import logging 

D:\Anaconda3\envs\hugging\lib\site-packages\numpy\.libs\libopenblas.QVLO2T66WEPI7JZ63PS3HMOHFEY472BC.gfortran-win_amd64.dll
D:\Anaconda3\envs\hugging\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


In [2]:
# Check CUDA: report the installed PyTorch build and whether it can see a GPU.
import torch
print(torch.__version__, torch.cuda.is_available(), sep="\n")

1.10.0+cpu
False


In [3]:
# Configure root logging at INFO level with a "YYYY-MM-DD HH:MM:SS - message"
# layout, routed through the LoggingHandler imported from sentence_transformers.
# This is the format of the "Save model to ..." line shown after training.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()]
)

## data - input_samples - dataloader

In [4]:
# Load the raw table and keep only rows that carry a real description.
df = pd.read_csv('formulae_synonyms_descriptions.csv')
data = df[df['description'] != 'NO DESCRIPTION']

# Three views of the same rows, each one a (text_a, text_b) positive pair.
form_syn = data[['formula', 'synonym']]
syn_desc = data[['synonym', 'description']]
form_desc = data[['formula', 'description']]

# pd.concat aligns frames on column NAMES. Concatenating the three views as-is
# produces a 3-column frame (formula, synonym, description) padded with NaN, so
# reading "the first two columns" afterwards yields ('nan', synonym) and
# (formula, 'nan') pairs and never uses the description text at all.
# Give every view the same two column names first so rows stack positionally.
pair_columns = ['text_a', 'text_b']
data_pairs = pd.concat(
    [
        frame.rename(columns=dict(zip(frame.columns, pair_columns)))
        for frame in (form_syn, syn_desc, form_desc)
    ],
    ignore_index=True,
).dropna()   # a pair with a missing side would otherwise become the literal text 'nan'
print(len(data_pairs))   # TODO: split the data into train / validation / test sets

10329


In [5]:
def extract_pairs(row):
    """Build one training example from a row of the pair table.

    Only the first two positional values of ``row`` are read (via ``.iloc``);
    each is converted with ``str`` before being stored. No label is passed.

    Args:
        row: A positionally indexable row, e.g. the pandas Series handed over
            by ``DataFrame.apply(..., axis=1)``.

    Returns:
        An ``InputExample`` whose ``texts`` is
        ``[str(row.iloc[0]), str(row.iloc[1])]``.
    """
    left, right = (str(row.iloc[position]) for position in (0, 1))
    return InputExample(texts=[left, right])

# One InputExample per row of data_pairs, in row order.
samples = [extract_pairs(pair_row) for _, pair_row in data_pairs.iterrows()]

In [6]:
batch_size = 32
# NoDuplicatesDataLoader is the loader meant to be used with MultipleNegativesRankingLoss.
# NOTE(review): per the sentence-transformers docs it keeps duplicate texts out of a
# batch, which matters because that loss uses the other in-batch pairs as negatives —
# confirm for the installed version.
dataloader = datasets.NoDuplicatesDataLoader(samples, batch_size=batch_size)
# Other loader in the same module: ParallelSentencesDataset, which reads
# tab-separated parallel sentences (not used here).

## matbert + pooler = model

In [7]:
# Local checkpoint of the pretrained MatBERT (cased) weights.
# NOTE(review): absolute, machine-specific path — point it at your own copy of the checkpoint.
model_path = 'C:/Users/Lenovo/Desktop/EnergyBERT/mat_bert_cased'
max_seq_length = 128
matbert = models.Transformer(model_path, max_seq_length)   # word (token) embedding model

# Pooling turns the per-token embeddings into one fixed-size sentence vector;
# it aggregates the transformer's output tokens rather than picking a layer.
# mean_sqrt_len_tokens: sum of the token embeddings divided by the square ROOT of
# the token count, which damps the effect of sequence length on the result.
# Other modes: cls / max / mean / lasttoken / weightedmean (position-weighted mean);
# which ones are available depends on the installed sentence-transformers version.
embedding_dimension = matbert.get_word_embedding_dimension()
pooler = models.Pooling(embedding_dimension, pooling_mode="mean_sqrt_len_tokens")

# Pick the device at runtime instead of hard-coding 'cuda': the CUDA check cell
# above reports a CPU-only PyTorch build, on which device='cuda' cannot work.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(modules=[matbert, pooler], device=device)

Some weights of BertModel were not initialized from the model checkpoint at C:/Users/Lenovo/Desktop/EnergyBERT/mat_bert_cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## training with MNR loss

In [8]:
# Training objective bound to the model built above.
# NOTE(review): MultipleNegativesRankingLoss needs only positive (a, b) pairs and
# presumably treats the other pairs in the batch as negatives — confirm in the library docs.
loss = losses.MultipleNegativesRankingLoss(model)   # TODO: compare against other loss functions

In [9]:
# --- Training schedule -------------------------------------------------------
# TODO: use a validation set to choose the number of epochs, so training stops
# before overfitting and keeps the best-performing checkpoint.
output_path = 'outputs/matbert_mnr'
epochs = 5
total_steps = len(dataloader) * epochs
warmup_steps = int(total_steps * 0.1)   # warm up over the first 10% of all steps

# Notes on model.fit (see help(model.fit)):
#   evaluator        - a sentence_transformers.evaluation object that scores the
#                      model on held-out dev data during training
#   save_best_model  - if true, the best model according to the evaluator is
#                      stored at output_path
#   optimizer_class / optimizer_params - optimizer and its settings; the default
#                      optimizer_params is {'lr': 2e-05}
#   lr (learning rate) controls how far the parameters move at each training
#   step; typical values lie between 1e-6 and 1e-4, and smaller values tend to
#   be more stable, so start small.
model.fit(
    train_objectives=[(dataloader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path=output_path,
    show_progress_bar=True,
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/322 [00:00<?, ?it/s]

Iteration:   0%|          | 0/322 [00:00<?, ?it/s]

Iteration:   0%|          | 0/322 [00:00<?, ?it/s]

Iteration:   0%|          | 0/322 [00:00<?, ?it/s]

Iteration:   0%|          | 0/322 [00:00<?, ?it/s]

2024-04-22 17:10:14 - Save model to outputs/matbert_mnr


# Downstream task

In [3]:
from scipy import stats
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

In [4]:
# Each non-empty line is tab-separated: the first field is the material name,
# the last field is its zT score.
with open('../for_spearman/zt_ori_84.txt', 'r') as f:
    # Skip blank lines (e.g. a trailing newline) so they do not become an empty
    # name paired with an unparsable score.
    lines = [line for line in f.readlines() if line.strip()]

fields = [line.strip().split('\t') for line in lines]
names = [parts[0] for parts in fields]
# Parse scores as numbers: left as strings they would be ranked as text
# (or rejected) by the Spearman correlation computed later.
zt_scores = [float(parts[-1]) for parts in fields]

In [5]:
output_path = 'outputs/matbert_mnr'
tuned_model = SentenceTransformer(output_path)

center_embedding = tuned_model.encode('thermoelectric')
name_embeddings = tuned_model.encode(names)

cos_sims = [(1-cosine(center_embedding,name_embedding)) for name_embedding in name_embeddings]

You try to use a model that was created with version 2.7.0, however, your version is 2.2.2. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





TypeError: __init__() got an unexpected keyword argument 'pooling_mode_weightedmean_tokens'

In [None]:
# Spearman rank correlation between each name's cosine similarity to
# 'thermoelectric' and its zT score.
# NOTE(review): assumes zt_scores holds numeric values, not the raw strings read
# from the file — confirm before trusting the result.
corr, pvalue = stats.spearmanr(cos_sims, zt_scores)
print('spearman correlation', corr)   # (vega) result: 0.3367