## 0. Import

In [None]:
! pip install transformers
! pip install -U sentence-transformers
! pip install sentencepiece

In [None]:
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader, InputExample

import torch
from torch.utils.data import DataLoader

import pandas as pd
import math
import sys
import logging

In [None]:
# Pick the compute device: use the GPU when PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f">> Using {device}")

>> Using cuda


## 1. Make Sentence-BERT from KoLawBERT

In [None]:
# Project helpers used by the cells below, imported by name (instead of `import *`)
# so it is clear where each one comes from and nothing imported earlier is shadowed.
from models.kor_sentence_bert import (
    make_sentenceBERT,
    drop_kornli,
    make_kornli_dataset,
    drop_korsts,
    make_korsts_dataset,
)

In [None]:
# Pretrained masked-language-model checkpoints that can be wrapped into Sentence-BERT.
# Each entry maps a backbone kind to (checkpoint directory, tokenizer directory).
pretrained_checkpoints = {
    # 1. BERT style masked language modeling
    "bert": ("pretrained-KorLawDistil_2/checkpoint-11000", "pretrained-KorLawDistil_2"),
    # 2. RoBERTa style masked language modeling
    "roberta": ("pretrained-KorLawRoberta_2/checkpoint-7000", "pretrained-KorLawRoberta_2"),
    # 3. ALBERT style masked language modeling
    "albert": ("pretrained-KorLawAlBERT_2/checkpoint-5000", "pretrained-KorLawAlBERT_2"),
}

# Choose the backbone here. Previously all three configurations were assigned one
# after another, so only the last one (ALBERT) ever took effect; the selection is
# now explicit and yields the same values.
model_name = "albert"
model_path, tokenizer_path = pretrained_checkpoints[model_name]

# Build the Sentence-BERT model with the project helper make_sentenceBERT.
# NOTE(review): the helper's internals (e.g. pooling strategy) are not visible in
# this notebook; check its definition before relying on them.
model = make_sentenceBERT(model_path=model_path,
                          tokenizer_path=tokenizer_path,
                          model_name=model_name,
                          device=device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 2. KorNLI Fine-Tuning Task

In [None]:
# Shared parsing options for every TSV file read in this cell:
# tab-separated, and quoting=3 (csv.QUOTE_NONE) so double quotes are treated as
# ordinary characters instead of field delimiters.
tsv_options = dict(sep='\t', quoting=3)

# Training data: the two Korean NLI training files stacked into a single frame
# with a fresh 0..n-1 index.
train_snli = pd.read_csv("../S_bert/data/snli_1.0_train.ko.tsv", **tsv_options)
train_xnli = pd.read_csv("../S_bert/data/multinli.train.ko.tsv", **tsv_options)
train_data = pd.concat([train_snli, train_xnli], ignore_index=True)
print(">> Total Train Dataset size :", len(train_data))

# Evaluation data: the STS dev and test files.
val_data = pd.read_csv("../S_bert/data/sts-dev.tsv", **tsv_options)
test_data = pd.read_csv("../S_bert/data/sts-test.tsv", **tsv_options)
print(">> Total Validataion Dataset size :", len(val_data))
print(">> Total Test Dataset size :", len(test_data))

# Integer id for each NLI label string; its length is later used as the number
# of classes for the training loss.
label_dict = {"contradiction": 0, "entailment": 1, "neutral": 2}

>> Total Train Dataset size : 942854
>> Total Validataion Dataset size : 1500
>> Total Test Dataset size : 1379


In [None]:
# Run the KorNLI frame through the project helper drop_kornli and preview the result.
# NOTE(review): what exactly drop_kornli removes is not visible in this notebook —
# confirm against its definition. Re-running this cell feeds the already-processed
# frame back into the helper.
train_data = train_data.pipe(drop_kornli)
train_data.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 경쟁을 위해 말을 훈련시키고 있다.,neutral
1,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 식당에서 오믈렛을 주문하고 있다.,contradiction
2,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,사람은 야외에서 말을 타고 있다.,entailment
3,카메라에 웃고 손을 흔드는 아이들,그들은 부모님을 보고 웃고 있다,neutral
4,카메라에 웃고 손을 흔드는 아이들,아이들이 있다,entailment


In [None]:
# --- Training side: KorNLI frame -> examples -> dataset -> shuffled mini-batches ---
train_batch_size = 16  # also used as the evaluator batch size at the end of this cell

train_samples = make_kornli_dataset(train_data)
train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=train_batch_size,
)

# --- Evaluation side: process the STS dev/test frames, then convert them to examples ---
# NOTE(review): drop_korsts / make_korsts_dataset are project helpers whose internals
# are not visible here; re-running this cell passes the processed frames through
# drop_korsts again.
val_data, test_data = drop_korsts(val_data), drop_korsts(test_data)
dev_samples, test_samples = make_korsts_dataset(val_data), make_korsts_dataset(test_data)

# Evaluator built from the dev examples; it is handed to model.fit in the training cell.
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    batch_size=train_batch_size,
    name='sts-dev',
)

In [None]:
# Training objective for the NLI task: SoftmaxLoss, a classification loss with one
# output class per entry in label_dict (this is not an MSE loss).
nli_label_count = len(label_dict)
embedding_dim = model.get_sentence_embedding_dimension()
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=embedding_dim,
    num_labels=nli_label_count,
)

# Schedule: warm up over the first 10% of all training steps, where the total is
# (number of training examples * epochs) / batch size.
num_epochs = 3
total_train_steps = len(train_dataset) * num_epochs / train_batch_size
warmup_steps = math.ceil(total_train_steps * 0.1)
# NOTE(review): no logging configuration is visible in this notebook, so this
# INFO-level record may not be displayed.
logging.info("Warmup-steps: {}".format(warmup_steps))

### 2-1. Train

In [None]:
# Output directory for the NLI fine-tuned model, named after the backbone chosen in
# section 1 ("bert", "roberta" or "albert"). Deriving it from model_name replaces the
# hand-toggled commented-out paths and keeps the save path in step with the model
# that is actually being trained. (The "nil" spelling is kept because it is the
# on-disk directory name.)
assert model_name in ("bert", "roberta", "albert"), (
    "model_name must be the backbone kind selected in section 1; "
    "re-run that cell if it has been overwritten"
)
model_save_path = 'output/nil_task_{}'.format(model_name)

# Fine-tune on KorNLI with the softmax objective. The dev evaluator is run every
# 1000 training steps (evaluation_steps) and the model is written to model_save_path.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

### 2-2. Evaluation

In [None]:
# Evaluate the NLI model of the backbone selected in section 1. The path used to be
# hard-coded to the "bert" run even though the training cell saved the "albert" run,
# so a top-to-bottom execution scored a different (or missing) checkpoint.
assert model_name in ("bert", "roberta", "albert"), (
    "model_name must be the backbone kind selected in section 1; "
    "re-run that cell if it has been overwritten"
)
# Only the `0_Transformer` sub-directory of the training output is loaded, as before.
model_save_path = 'output/nil_task_{}/0_Transformer'.format(model_name)
model = SentenceTransformer(model_save_path)

# Score the reloaded model on the STS test examples; the evaluator is also given
# model_save_path as its output_path.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test')
print(">> Best TEST Score is : {:.4f}".format(test_evaluator(model, output_path=model_save_path)))



>> Best TEST Score is : 0.6405


## 3. KorSTS Fine-Tuning Task

In [None]:
# Report which device PyTorch can use. In this cell the value is only printed;
# it is not passed to the SentenceTransformer constructor below.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f">> Using {device}")

# Starting point for the STS task: the `0_Transformer` directory of an NLI-trained
# model (the original note describes it as a distillation model trained for 5 epochs).
model_name = 'output/nil_law/0_Transformer'
# Directory the STS fine-tuned model will be written to.
model_save_path = 'output/sts_law'

# Load the NLI-trained weights as the model to fine-tune.
model = SentenceTransformer(model_name)

>> Using cpu




In [None]:
# KorSTS splits (dataset by Kakao Brain), read as tab-separated files with
# quoting=3 (csv.QUOTE_NONE) so double quotes are kept as plain characters.
sts_paths = (
    "../S_bert/data/sts-train.tsv",
    "../S_bert/data/sts-dev.tsv",
    "../S_bert/data/sts-test.tsv",
)
train_data, val_data, test_data = [
    pd.read_csv(path, sep='\t', quoting=3) for path in sts_paths
]

# Pass each split through the project helper drop_korsts.
# NOTE(review): the helper's exact filtering is not visible in this notebook.
train_data, val_data, test_data = [
    drop_korsts(frame) for frame in (train_data, val_data, test_data)
]

# Convert each processed split into examples (train / dev / test, in that order).
train_samples, dev_samples, test_samples = [
    make_korsts_dataset(frame) for frame in (train_data, val_data, test_data)
]

# Shuffled mini-batches over the training examples; train_batch_size is the value
# defined earlier in the NLI section.
train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=train_batch_size,
)

In [None]:
# Training objective for the STS task: cosine-similarity loss on the model's
# sentence embeddings.
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluator built from the dev examples; it is handed to model.fit in the training cell.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
)

# Schedule: warm up over the first 10% of all training steps, where the total is
# (number of training examples * epochs) / batch size.
num_epochs = 10
total_train_steps = len(train_dataset) * num_epochs / train_batch_size
warmup_steps = math.ceil(total_train_steps * 0.1)
# NOTE(review): no logging configuration is visible in this notebook, so this
# INFO-level record may not be displayed.
logging.info("Warmup-steps: {}".format(warmup_steps))

### 3-1. Train

In [None]:
# Fine-tune on KorSTS: a single objective pairing the training batches with the
# cosine-similarity loss. The dev evaluator is run every 1000 training steps and
# the model is written to model_save_path.
sts_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=sts_objectives,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

### 3-2. Evaluation

In [None]:
# Reload the STS fine-tuned model from the directory that was used as output_path
# during training. The path is restated here so the cell can be read on its own.
model_save_path = 'output/sts_law'
print(">> Trained BERT Model Name is :", model_save_path)
model = SentenceTransformer(model_save_path)

# Score the reloaded model on the STS test examples; the evaluator is also given
# model_save_path as its output_path. The printed label previously read "Socre";
# it now matches the "Best TEST Score" label used by the NLI evaluation cell.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
print(">> Best TEST Score is : {:.4f}".format(test_evaluator(model, output_path=model_save_path)))