In [None]:
import os 


# Expose only physical GPUs 1, 2 and 3 to this process. This cell sits ahead of
# the cell that imports torch, so the variable is already set at import time.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1,2,3"})

In [None]:
import math
import logging
from datetime import datetime

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

# Root logger setup: INFO level, timestamped messages, routed through the
# LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(message)s",
)

In [None]:
# Run configuration: backbone checkpoint, number of epochs and batch size.
pretrained_model_name = 'klue/roberta-large'
sts_num_epochs = 4
train_batch_size = 32

# Output directory, made unique per run by a timestamp suffix:
#   output/training_sts-<model name with "/" replaced by "-">-<YYYY-MM-DD_HH-MM-SS>
sts_model_save_path = 'output/training_sts-{}-{}'.format(
    pretrained_model_name.replace("/", "-"),
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
)

In [None]:

# Load the STS sheet into a DataFrame. Later cells read the columns
# 's1', 's2' and 'cosSim' from it.
# NOTE(review): the path contains a literal '*'; it looks like a placeholder
# for the real file name - confirm and replace before running.
openfile = pd.read_excel(io='STS_dataset file  *.xlsx')
openfile

In [None]:
def make_sts_input_example(openfile):
    """Split an STS table into train/validation/test ``InputExample`` lists.

    Rows are consumed in their existing order (no shuffling): the first
    ``round(0.8 * total)`` rows form the training split, the following
    ``round(0.15 * total)`` rows the validation split, and whatever remains
    the test split. The three slices are contiguous and half-open, so every
    row ends up in exactly one split.

    Args:
        openfile: DataFrame holding the sentence pair in columns ``s1`` and
            ``s2`` and the target similarity score in column ``cosSim``.

    Returns:
        A tuple ``(train_examples, val_examples, test_examples)``; each item
        is a list of ``InputExample`` with ``texts=[s1, s2]`` and the score
        cast to ``np.float32`` as ``label``.
    """
    total = openfile.shape[0]

    train_num = round(total * 0.8)
    val_num = round(total * 0.15)
    val_end = train_num + val_num  # first row index of the test split

    def _to_examples(start, stop):
        # Convert rows [start, stop) into InputExample objects. Slicing once
        # and iterating the columns avoids a per-row ``iloc`` lookup.
        rows = openfile.iloc[start:stop]
        return [
            InputExample(texts=[sentence1, sentence2], label=np.float32(score))
            for sentence1, sentence2, score in zip(rows['s1'], rows['s2'], rows['cosSim'])
        ]

    # Printed bounds are half-open: [start, stop).
    print('train range:', train_num)
    train_examples = _to_examples(0, train_num)

    print('val range:', train_num, val_end)
    val_examples = _to_examples(train_num, val_end)

    print('test range:', val_end, total)
    test_examples = _to_examples(val_end, total)

    return train_examples, val_examples, test_examples



In [None]:
# Turn the loaded sheet into the three InputExample lists (train / validation /
# test) that the DataLoader and the evaluators below are built from.
train_examples, val_examples, test_examples = make_sts_input_example(openfile)


In [None]:
# Shuffled mini-batches over the training examples.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=train_batch_size,
    shuffle=True,
)

# Similarity evaluator over the validation split; passed to model.fit below.
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name="sts-dev")

# Similarity evaluator over the held-out test split; run once after training.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples, name="sts-test")

### 2. Load Embedding Model 

In [None]:
# Transformer backbone that yields one embedding per token
# (inputs limited to 256 tokens, lower-casing enabled).
embedding_model = models.Transformer(
    pretrained_model_name,
    max_seq_length=256,
    do_lower_case=True,
)

# Sentence vector = mean over the token embeddings; CLS and max pooling are off.
pooling_model = models.Pooling(
    word_embedding_dimension=embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain the two modules: token embeddings -> pooled sentence embedding.
model = SentenceTransformer(modules=[embedding_model, pooling_model])

### 3. STS Training

In [None]:
# Training objective: cosine-similarity loss on the sentence-pair embeddings.
train_loss = losses.CosineSimilarityLoss(model=model)

# Warm up over 10% of the total optimisation steps
# (examples * epochs / batch size), rounded up.
warmup_steps = math.ceil(0.1 * (len(train_examples) * sts_num_epochs / train_batch_size))
logging.info("Warmup-steps: {}".format(warmup_steps))

# Fine-tune; the dev evaluator runs every 10% of an epoch's batches and the
# model is written to sts_model_save_path.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=sts_num_epochs,
    warmup_steps=warmup_steps,
    evaluator=dev_evaluator,
    evaluation_steps=int(0.1 * len(train_dataloader)),
    output_path=sts_model_save_path,
)

In [None]:
# Score the fine-tuned model on the held-out test split ("sts-test" evaluator);
# the run's output directory is passed as output_path. As the last expression
# of the cell, the call's return value is displayed.
test_evaluator(model, output_path=sts_model_save_path)

In [None]:

import pickle

# Serialise the in-memory model object to a single pickle file in the working
# directory. This is separate from the output_path handed to model.fit above.
# NOTE(review): only unpickle this file from a trusted source - loading a
# pickle can execute arbitrary code.
with open('klue_large_stsfinetune.pickle', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(model)