In [3]:
!pip install sentence-transformers datasets

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m441.9/441.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torchvision
  Downloading torchvision-0.14.0-cp37-cp37m-manylinux1_x86_64.whl (24.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.3/24.3 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.8/24.8 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[

In [4]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [None]:
# Configure root logging: INFO level and above, each message prefixed with a
# "YYYY-MM-DD HH:MM:SS" timestamp, emitted through sentence-transformers'
# LoggingHandler.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [None]:
# Checkpoint identifier: passed to models.Transformer and embedded in the
# output path (with "/" replaced by "-").
model_name = "klue/roberta-base"

In [None]:
# Training settings (presumably consumed by training cells outside this view).
train_batch_size = 32
num_epochs = 4

# Output directory: fixed prefix + model name with "/" turned into "-" + the
# current timestamp, so the path differs from run to run.
model_save_path = (
    f"output/training_klue_sts_{model_name.replace('/', '-')}"
    f"-{datetime.now():%Y-%m-%d_%H-%M-%S}"
)

In [None]:
# Build the transformer module from the checkpoint named by `model_name`.
embedding_model = models.Transformer(model_name)

In [None]:
# Pooling module sized to the transformer's word-embedding dimension.
# Only mean-over-tokens pooling is enabled; CLS-token and max-token pooling
# are switched off.
pooler = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

In [None]:
# Assemble the sentence encoder from its two modules, in order:
# transformer first, then pooling.
model = SentenceTransformer(modules=[embedding_model, pooler])

In [5]:
# Load the "sts" configuration of the "klue" dataset, then display its train split.
# NOTE(review): the variable name `datasets` is the same as the package that
# `load_dataset` is imported from - easy to confuse when reading.
datasets = load_dataset("klue", "sts")
datasets['train']

Downloading builder script: 100%|██████████| 23.3k/23.3k [00:00<00:00, 12.5MB/s]
Downloading metadata: 100%|██████████| 22.7k/22.7k [00:00<00:00, 112kB/s] 
Downloading readme: 100%|██████████| 15.9k/15.9k [00:00<00:00, 77.9kB/s]


Downloading and preparing dataset klue/sts (download: 1.29 MiB, generated: 2.82 MiB, post-processed: Unknown size, total: 4.11 MiB) to /opt/ml/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e...


Downloading data: 100%|██████████| 1.35M/1.35M [00:00<00:00, 26.3MB/s]
                                                                                      

Dataset klue downloaded and prepared to /opt/ml/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 552.54it/s]


Dataset({
    features: ['guid', 'source', 'sentence1', 'sentence2', 'labels'],
    num_rows: 11668
})

In [7]:
# Load the "sts" configuration of the "kor_nlu" dataset; its "test" split is
# used as the test set when building the sample lists.
testsets = load_dataset("kor_nlu", "sts")

Downloading builder script: 100%|██████████| 6.49k/6.49k [00:00<00:00, 3.89MB/s]
Downloading metadata: 100%|██████████| 4.49k/4.49k [00:00<00:00, 2.56MB/s]
Downloading readme: 100%|██████████| 2.98k/2.98k [00:00<00:00, 1.98MB/s]


Downloading and preparing dataset kor_nlu/sts (download: 1.53 MiB, generated: 1.54 MiB, post-processed: Unknown size, total: 3.07 MiB) to /opt/ml/.cache/huggingface/datasets/kor_nlu/sts/1.0.0/4facbba77df60b0658056ced2052633e681a50187b9428bd5752ebd59d332ba8...


Downloading data: 1.05MB [00:00, 38.8MB/s]                  
Downloading data: 307kB [00:00, 9.81MB/s]                    
Downloading data: 250kB [00:00, 43.1MB/s]                    
                                                                                          

Dataset kor_nlu downloaded and prepared to /opt/ml/.cache/huggingface/datasets/kor_nlu/sts/1.0.0/4facbba77df60b0658056ced2052633e681a50187b9428bd5752ebd59d332ba8. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 541.36it/s]


In [8]:
train_samples = []
dev_samples = []
test_samples = []

# Convert the KLUE STS train / validation splits into InputExample objects.
for phase in ["train", "validation"]:
    # Validation rows feed the dev set; train rows feed the training set.
    target_samples = dev_samples if phase == "validation" else train_samples

    for example in datasets[phase]:
        # Divide the similarity label by 5 to normalise it to the 0.0 ~ 1.0 scale.
        score = float(example["labels"]["label"]) / 5.0

        target_samples.append(
            InputExample(
                texts=[example["sentence1"], example["sentence2"]],
                label=score,
            )
        )

# Convert the KorSTS test split into InputExample objects.
for example in testsets["test"]:
    # Skip rows where either sentence is missing or empty. Previously the
    # append ran unconditionally, so such a row silently re-added the previous
    # example (or, for the very first row, the last validation example).
    if not (example["sentence1"] and example["sentence2"]):
        continue

    test_samples.append(
        InputExample(
            texts=[example["sentence1"], example["sentence2"]],
            label=float(example["score"]) / 5.0,
        )
    )

In [11]:
# Display the converted training examples.
# NOTE(review): this renders the repr of every element in the list; a slice
# such as train_samples[:5] would keep the output readable.
train_samples

[<sentence_transformers.readers.InputExample.InputExample at 0x7f918d44e590>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f91962cfed0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f918de96090>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f91a2490890>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f921c13f810>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f9225fbf990>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f918d44e6d0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f918d44e4d0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f91877b2ad0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f918d405a10>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f918d44e750>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f9225ff3410>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7

In [10]:
# Show the sentence pair stored in the first training example.
train_samples[0].texts

['숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.',
 '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.']