In [None]:
# Install sentence-transformers, datasets, huggingface_hub, faiss-cpu and accelerate into the runtime.
!pip install sentence-transformers datasets huggingface_hub faiss-cpu accelerate

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)


Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-1.1.5-py3-none-any.whl.metadata (13 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting tqdm (from sentence-transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transfo

In [3]:
# Install/upgrade (-U) sentence-transformers, transformers, huggingface_hub and hf_transfer; -qqq suppresses pip's output.
!pip install -U "sentence-transformers" "transformers" "huggingface_hub" hf_transfer -qqq

In [1]:
# Build a sentence-embedding model on top of a pretrained language model.
from sentence_transformers import SentenceTransformer, models

# Token-level encoder loaded from the klue/roberta-base checkpoint.
transformer_model = models.Transformer('klue/roberta-base')

# Pooling layer sized to the encoder's word-embedding dimension, with mean-token pooling enabled.
word_embedding_dim = transformer_model.get_word_embedding_dimension()
pooling_layer = models.Pooling(word_embedding_dim, pooling_mode_mean_tokens=True)

# Chain encoder -> pooling into a single SentenceTransformer.
sentence_modules = [transformer_model, pooling_layer]
embedding_model = SentenceTransformer(modules=sentence_modules)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Download the practice dataset and inspect one record.
from datasets import load_dataset

# Load the 'train' split, then the 'validation' split; the latter is kept as the test set.
klue_sts_train, klue_sts_test = (
    load_dataset('klue', 'sts', split=split_name)
    for split_name in ('train', 'validation')
)
print(klue_sts_train[0])

README.md: 0.00B [00:00, ?B/s]

sts/train-00000-of-00001.parquet:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

sts/validation-00000-of-00001.parquet:   0%|          | 0.00/68.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/519 [00:00<?, ? examples/s]

{'guid': 'klue-sts-v1_train_00000', 'source': 'airbnb-rtt', 'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.', 'sentence2': '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.', 'labels': {'label': 3.7, 'real-label': 3.714285714285714, 'binary-label': 1}}


In [4]:
# Hold out 10% of the training data as a validation set (fixed seed for a repeatable split).
klue_sts_split = klue_sts_train.train_test_split(test_size=0.1, seed=42)
klue_sts_train = klue_sts_split['train']
klue_sts_eval = klue_sts_split['test']

# Report the size of the training and validation sets.
print(f"학습 데이터의 크기 : {len(klue_sts_train)}")
print(f"검증 데이터의 크기 : {len(klue_sts_eval)}")

학습 데이터의 크기 : 10501
검증 데이터의 크기 : 1167


In [5]:
# label 정규화하기
from sentence_transformers import InputExample

# Normalize the similarity score to the 0-1 range and wrap each pair in an InputExample.
def prepare_sts_examples(dataset):
    """Convert STS rows into a list of InputExample objects.

    Args:
        dataset: Iterable of rows, each providing 'sentence1', 'sentence2'
            and a nested labels['label'] score.

    Returns:
        A list with one InputExample per row, whose texts are the two
        sentences and whose label is the row's score divided by 5.0.
    """
    return [
        InputExample(
            texts=[row['sentence1'], row['sentence2']],
            label=row['labels']['label'] / 5.0,
        )
        for row in dataset
    ]

# Raw label dict of the first training row, before normalization.
print(klue_sts_train[0]['labels'])

# Convert the training split into InputExample objects.
klue_sts_train_examples = prepare_sts_examples(klue_sts_train)

# Label of the first converted example (the raw 'label' divided by 5.0).
print(klue_sts_train_examples[0].label)

{'label': 3.4, 'real-label': 3.428571428571428, 'binary-label': 1}
0.6799999999999999


In [6]:
# Convert the train, validation and test splits (in that order) into InputExample lists.
train_examples, eval_examples, test_examples = map(
    prepare_sts_examples,
    (klue_sts_train, klue_sts_eval, klue_sts_test),
)

In [7]:
# Build the batched data loader used for training.
from torch.utils.data import DataLoader
# Batches of 16 training examples, with shuffling enabled (shuffle=True).
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

In [8]:
# Prepare the evaluator objects used for validation and testing.
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# One evaluator per held-out set, each built from its list of InputExample objects.
eval_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(eval_examples)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples)

In [9]:
# Baseline: score of the sentence-embedding model when the language model is used as-is (before training).
test_evaluator(embedding_model)

{'pearson_cosine': 0.347707041961158, 'spearman_cosine': 0.35560473197486514}

In [10]:
# Train the embedding model.
from sentence_transformers import losses

num_epochs = 4
model_name = 'klue/roberta-base'
# Output directory: the model name with "/" replaced by "-" is appended to the path prefix.
# NOTE(review): the path lives under /content/drive — presumably Google Drive must be mounted first; confirm.
model_save_path = '/content/drive/MyDrive/LLM_RAG_Application/models/training_sts_' + model_name.replace("/", "-")
# Cosine-similarity loss bound to embedding_model; the targets are the labels produced by prepare_sts_examples.
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Run training on (train_dataloader, train_loss) for num_epochs epochs,
# passing eval_evaluator with evaluation_steps=1000 and 100 warmup steps.
# model_save_path is handed over as output_path and is the location the model is later reloaded from.
embedding_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=eval_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=100,
    output_path=model_save_path
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
657,0.028,No log,0.956285,0.914005
1000,0.0081,No log,0.958002,0.91662
1314,0.0081,No log,0.957974,0.915358
1971,0.005,No log,0.960485,0.920018
2000,0.0035,No log,0.960389,0.920263
2628,0.0026,No log,0.961328,0.921347


In [23]:
# Evaluate the trained embedding model.
# Reload it from model_save_path (the output_path given to fit) and score it with the test evaluator.
trained_embedding_model = SentenceTransformer(model_save_path)
test_evaluator(trained_embedding_model)


{'pearson_cosine': 0.8905083131274608, 'spearman_cosine': 0.8905864813839348}

In [None]:
# Save the model to the Hugging Face Hub.
import os

from huggingface_hub import login
from huggingface_hub import HfApi

# Never paste the access token into the notebook: read it from the HF_TOKEN
# environment variable. When it is unset, token=None makes login() prompt for it.
login(token=os.environ.get("HF_TOKEN"))
api = HfApi()
repo_id = "klue-roberta-base-klue-sts"
# exist_ok=True keeps this cell re-runnable once the repository already exists.
repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)

# Upload to the fully-qualified "<namespace>/<name>" id reported by create_repo,
# so the upload target always matches the repository that was just created
# instead of relying on a hard-coded user name.
api.upload_folder(
    folder_path=model_save_path,
    repo_id=repo_url.repo_id,
    repo_type="model"
)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/Laseung/klue-roberta-base-klue-sts/commit/dffba55a8bfc0851f03d09f031c09a8638343bc1', commit_message='Upload folder using huggingface_hub', commit_description='', oid='dffba55a8bfc0851f03d09f031c09a8638343bc1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Laseung/klue-roberta-base-klue-sts', endpoint='https://huggingface.co', repo_type='model', repo_id='Laseung/klue-roberta-base-klue-sts'), pr_revision=None, pr_num=None)

In [26]:
# Load the model stored on the Hugging Face Hub and run the evaluation again.
from sentence_transformers import SentenceTransformer

# Load the uploaded repository by its Hub id.
load_model = SentenceTransformer("Laseung/klue-roberta-base-klue-sts")

# Score it with the same test evaluator used for the locally loaded model.
test_evaluator(load_model)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/283 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/312 [00:00<?, ?B/s]

{'pearson_cosine': 0.8905083131274608, 'spearman_cosine': 0.8905864813839348}