# Creating a language model as an embedding model

## Prepare training

**Creating embedding model using pre-trained language model**

In [1]:
from sentence_transformers import SentenceTransformer, models

# Load the pre-trained 'klue/roberta-base' checkpoint as the transformer module.
transformer_model = models.Transformer('klue/roberta-base')

# Pooling module with mean pooling over tokens enabled; its input size is the
# transformer's word-embedding dimension.
pooling_layer = models.Pooling(
    transformer_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)
# Chain transformer -> pooling into a single SentenceTransformer model.
embedding_model = SentenceTransformer(modules=[transformer_model, pooling_layer])

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
from datasets import load_dataset

# KLUE STS: the 'train' split is used for training, and the 'validation' split
# is bound to klue_sts_test and used as the held-out test set below.
klue_sts_train = load_dataset('klue', 'sts', split='train')
klue_sts_test = load_dataset('klue', 'sts', split='validation')

In [6]:
# Inspect the first training record to see the field layout.
klue_sts_train[0]

{'guid': 'klue-sts-v1_train_00000',
 'source': 'airbnb-rtt',
 'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.',
 'sentence2': '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.',
 'labels': {'label': 3.7, 'real-label': 3.714285714285714, 'binary-label': 1}}

In [3]:
# Hold out 10% of the training data as a validation set (seed=42).
# NOTE(review): klue_sts_train is rebound from the full split to its 'train'
# part here, so re-running this cell on its own splits the already-reduced
# data again. Re-run the loading cell first.
klue_sts_train = klue_sts_train.train_test_split(test_size=0.1, seed=42)
klue_sts_train, klue_sts_eval = klue_sts_train['train'], klue_sts_train['test']

In [9]:
# Normalize the labels
from sentence_transformers import InputExample # format for managing data in Sentence-Transformers

# Normalize the similarity score to the 0 ~ 1 range and wrap each pair in an InputExample
def prepare_sts_examples(dataset):
    """Convert STS records into Sentence-Transformers ``InputExample`` objects.

    Each record contributes one example holding its ``sentence1`` /
    ``sentence2`` pair, with ``labels['label']`` divided by 5.0 as the label.

    Args:
        dataset: Iterable of records exposing ``sentence1``, ``sentence2`` and
            a nested ``labels['label']`` score.

    Returns:
        list: One ``InputExample`` per record, in input order.
    """
    return [
        InputExample(
            texts=[record['sentence1'], record['sentence2']],
            label=record['labels']['label'] / 5.0,
        )
        for record in dataset
    ]

# Build normalized examples for the train, validation and test splits.
train_examples = prepare_sts_examples(klue_sts_train)
eval_examples = prepare_sts_examples(klue_sts_eval)
test_examples = prepare_sts_examples(klue_sts_test)

In [13]:
# Batch the training examples for training (shuffled, 16 examples per batch)
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

In [14]:
# Build similarity evaluators from the validation and test examples
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# eval_evaluator is passed to fit() during training; test_evaluator is called
# directly before and after training.
eval_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(eval_examples)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples)

In [15]:
# Baseline: score the embedding model on the test examples before fit() is called.
test_evaluator(embedding_model)

0.36460670798564826

## Training an embedding model with similar sentence data

**Training embedding model**

In [16]:
from sentence_transformers import losses

num_epochs = 4
model_name = 'klue/roberta-base'
# '/' in the model name is replaced so the name forms a single directory component.
model_save_path = 'output/training_sts_' + model_name.replace('/', '-')
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# NOTE(review): the progress output below reports 657 iterations per epoch,
# which is fewer than evaluation_steps=1000 — confirm whether any mid-epoch
# evaluation is actually triggered with this setting.
embedding_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=eval_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=100,
    output_path=model_save_path
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/657 [00:00<?, ?it/s]

Iteration:   0%|          | 0/657 [00:00<?, ?it/s]

Iteration:   0%|          | 0/657 [00:00<?, ?it/s]

Iteration:   0%|          | 0/657 [00:00<?, ?it/s]

**Evaluate the trained embedding model's performance**

In [17]:
# Reload the model written to model_save_path and score it on the same test examples.
trained_embedding_model = SentenceTransformer(model_save_path)
test_evaluator(trained_embedding_model)

0.8891355260276683

**Save the model to the Hugging Face Hub**

In [18]:
import os

from huggingface_hub import login
from huggingface_hub import HfApi

# Never paste an access token into the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, token=None makes
# login() prompt for the token.
login(token=os.environ.get('HF_TOKEN'))
api = HfApi()

repo_id = 'klue-roberta-base-klue-sts'
# Create and upload to the same fully-qualified repository so both calls
# agree on the namespace.
full_repo_id = f"Noahyun/{repo_id}"
# exist_ok=True keeps this cell re-runnable once the repository exists.
api.create_repo(repo_id=full_repo_id, exist_ok=True)

api.upload_folder(
    folder_path=model_save_path,
    repo_id=full_repo_id,
    repo_type="model"
)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Noahyun/klue-roberta-base-klue-sts/commit/6b35913b12e2af8469380986cf8cdf7aa45e1e12', commit_message='Upload folder using huggingface_hub', commit_description='', oid='6b35913b12e2af8469380986cf8cdf7aa45e1e12', pr_url=None, pr_revision=None, pr_num=None)

# Fine-tuning Embedding Model

## Prepare training

**Dataset check**

In [4]:
from datasets import load_dataset

# Load the 'train' split of KLUE MRC.
klue_mrc_train = load_dataset('klue', 'mrc', split='train')

In [21]:
# Inspect the first MRC record to see the field layout.
klue_mrc_train[0]

{'title': '제주도 장마 시작 … 중부는 이달 말부터',
 'context': '올여름 장마가 17일 제주도에서 시작됐다. 서울 등 중부지방은 예년보다 사나흘 정도 늦은 이달 말께 장마가 시작될 전망이다.17일 기상청에 따르면 제주도 남쪽 먼바다에 있는 장마전선의 영향으로 이날 제주도 산간 및 내륙지역에 호우주의보가 내려지면서 곳곳에 100㎜에 육박하는 많은 비가 내렸다. 제주의 장마는 평년보다 2~3일, 지난해보다는 하루 일찍 시작됐다. 장마는 고온다습한 북태평양 기단과 한랭 습윤한 오호츠크해 기단이 만나 형성되는 장마전선에서 내리는 비를 뜻한다.장마전선은 18일 제주도 먼 남쪽 해상으로 내려갔다가 20일께 다시 북상해 전남 남해안까지 영향을 줄 것으로 보인다. 이에 따라 20~21일 남부지방에도 예년보다 사흘 정도 장마가 일찍 찾아올 전망이다. 그러나 장마전선을 밀어올리는 북태평양 고기압 세력이 약해 서울 등 중부지방은 평년보다 사나흘가량 늦은 이달 말부터 장마가 시작될 것이라는 게 기상청의 설명이다. 장마전선은 이후 한 달가량 한반도 중남부를 오르내리며 곳곳에 비를 뿌릴 전망이다. 최근 30년간 평균치에 따르면 중부지방의 장마 시작일은 6월24~25일이었으며 장마기간은 32일, 강수일수는 17.2일이었다.기상청은 올해 장마기간의 평균 강수량이 350~400㎜로 평년과 비슷하거나 적을 것으로 내다봤다. 브라질 월드컵 한국과 러시아의 경기가 열리는 18일 오전 서울은 대체로 구름이 많이 끼지만 비는 오지 않을 것으로 예상돼 거리 응원에는 지장이 없을 전망이다.',
 'news_category': '종합',
 'source': 'hankyung',
 'guid': 'klue-mrc-v1_train_12759',
 'is_impossible': False,
 'question_type': 1,
 'question': '북태평양 기단과 오호츠크해 기단이 만나 국내에 머무르는 기간은?',
 'answers': {'answer_start': [478, 478]

**Load basic embedding model**

In [15]:
from sentence_transformers import SentenceTransformer

# Start from the STS-trained checkpoint that was uploaded to the Hub above.
sentence_model = SentenceTransformer('Noahyun/klue-roberta-base-klue-sts')

**Preprocess data**

In [16]:
# NOTE(review): klue_mrc_train is loaded again here although an earlier cell
# already loaded the same split; this cell also relies on that cell's
# `load_dataset` import.
klue_mrc_train = load_dataset('klue', 'mrc', split='train')
klue_mrc_test = load_dataset('klue', 'mrc', split='validation')

# Convert both splits to pandas DataFrames for row-wise processing.
df_train = klue_mrc_train.to_pandas()
df_test = klue_mrc_test.to_pandas()

# Keep only the columns used by the following cells.
df_train = df_train[['title', 'question', 'context']]
df_test = df_test[['title', 'question', 'context']]

In [17]:
# Add irrelevant context

def add_ir_context(df, random_state=None):
    """Attach a randomly drawn irrelevant context to every row of ``df``.

    For each row, one ``context`` is sampled from the rows whose ``title``
    differs from that row's title and stored in a new ``irrelevant_context``
    column.

    Titles are compared with a boolean mask instead of an f-string
    ``DataFrame.query`` expression, so titles containing quotes or other
    characters that are meaningful in query syntax no longer break the lookup.

    Args:
        df: DataFrame with ``title`` and ``context`` columns. It is modified
            in place: the ``irrelevant_context`` column is added or overwritten.
        random_state: Optional integer seed for reproducible sampling. With
            the default ``None``, ``Series.sample`` is called without a seed.

    Returns:
        The same DataFrame object, with ``irrelevant_context`` added.

    Raises:
        ValueError: If some row has no other row with a different title.
    """
    import numpy as np  # local import: this cell does not import numpy itself

    # One generator shared across rows, so a seed fixes the whole sequence of
    # draws rather than forcing the same position for every row.
    rng = None if random_state is None else np.random.RandomState(random_state)
    titles = df['title']
    contexts = df['context']

    irrelevant_contexts = []
    for title in titles:
        # Positional boolean mask avoids index alignment between the two columns.
        candidates = contexts[(titles != title).to_numpy()]
        if candidates.empty:
            raise ValueError(
                f"No context with a title different from {title!r} is available"
            )
        irrelevant_contexts.append(
            candidates.sample(n=1, random_state=rng).values[0]
        )
    df['irrelevant_context'] = irrelevant_contexts
    return df

# add_ir_context returns the DataFrame it was given, so df_train_ir / df_test_ir
# are df_train / df_test with the extra 'irrelevant_context' column.
df_train_ir = add_ir_context(df_train)
df_test_ir = add_ir_context(df_test)

In [18]:
# Build labeled examples for evaluating performance

from sentence_transformers import InputExample

# Two examples per test row: the question paired with its own context
# (label 1), then the same question paired with the sampled irrelevant
# context (label 0).
examples = []
for idx, row in df_test_ir.iterrows():
    examples.extend([
        InputExample(texts=[row['question'], row['context']], label=1),
        InputExample(texts=[row['question'], row['irrelevant_context']], label=0),
    ])

In [None]:
# Evaluation

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Evaluator over the labeled (question, context) examples built above; it is
# reused after fine-tuning to score the same model again.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    examples
)
evaluator(sentence_model)

## Fine-tuning using MNR loss

In [None]:
# Datasets
# One unlabeled (question, context) pair per training row; the
# 'irrelevant_context' column is not used here.
train_samples = []
for idx, row in df_train_ir.iterrows():
    train_samples.append(InputExample(texts=[row['question'], row['context']]))

In [None]:
# Remove duplicates
from sentence_transformers import datasets

batch_size = 16
# NoDuplicatesDataLoader is used in place of a plain DataLoader
# (presumably to keep duplicate texts out of a batch — confirm in the library docs).
loader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=batch_size)

In [None]:
# Load MNR loss
from sentence_transformers import losses

# The loss is bound to sentence_model, the model being fine-tuned.
loss = losses.MultipleNegativesRankingLoss(sentence_model)

In [42]:
# Fine-tuning
epochs = 1
# Output directory for the fine-tuned model; uploaded to the Hub in a later cell.
save_path = './klue_mrc_mnr'

# No evaluator is passed here; the model is scored separately in the next cell.
sentence_model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=100,
    output_path=save_path,
    show_progress_bar=True
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1097 [00:00<?, ?it/s]

In [43]:
# Evaluate
# Score the fine-tuned model with the same evaluator that was built before training.
evaluator(sentence_model)

0.8594708084199976

In [44]:
from huggingface_hub import HfApi

# No login() call here: this cell relies on credentials already stored by an
# earlier login.
api = HfApi()

repo_id = 'klue-roberta-base-klue-sts-mrc'
# Create and upload to the same fully-qualified repository so both calls
# agree on the namespace.
full_repo_id = f"Noahyun/{repo_id}"
# exist_ok=True keeps this cell re-runnable once the repository exists.
api.create_repo(repo_id=full_repo_id, exist_ok=True)

api.upload_folder(
    folder_path=save_path,
    repo_id=full_repo_id,
    repo_type="model"
)

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Noahyun/klue-roberta-base-klue-sts-mrc/commit/deb9069a385962e8bcce2645804ba99c24a8a08d', commit_message='Upload folder using huggingface_hub', commit_description='', oid='deb9069a385962e8bcce2645804ba99c24a8a08d', pr_url=None, pr_revision=None, pr_num=None)

# Reorder the rankings

**cross-encoder**

In [12]:
from sentence_transformers import CrossEncoder

# Cross-encoder built on 'klue/roberta-small' with a single output label
# (num_labels=1); the warning below shows its classifier head is newly initialized.
cross_model = CrossEncoder('klue/roberta-small', num_labels=1)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# evaluation cross encoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

# Reuses `examples` (the labeled question/context pairs built for the
# bi-encoder evaluation) to score the cross-encoder before training.
ce_evaluator = CECorrelationEvaluator.from_input_examples(examples)
ce_evaluator(cross_model)

In [10]:
# prepare dataset
# NOTE(review): this rebinds train_samples, replacing the unlabeled pairs built
# for the MNR fine-tuning with labeled positive/negative pairs.
train_samples = []
for idx, row in df_train_ir.iterrows():
    # Positive pair: the question with its own context.
    train_samples.append(InputExample(texts=[row['question'], row['context']], label=1))
    # Negative pair: the same question with the sampled irrelevant context.
    train_samples.append(InputExample(texts=[row['question'], row['irrelevant_context']], label=0))

In [11]:
# train cross encoder
from torch.utils.data import DataLoader

# NOTE(review): num_epochs, model_save_path and train_dataloader below rebind
# names that the STS training cells defined earlier with different values.
train_batch_size = 16
num_epochs = 1
model_save_path = 'output/training_mrc'

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

# Train the cross-encoder on the labeled pairs and write it to model_save_path.
cross_model.fit(
    train_dataloader=train_dataloader,
    epochs=num_epochs,
    warmup_steps=100,
    output_path=model_save_path
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2195 [00:00<?, ?it/s]

In [12]:
# Evaluate
# Score the trained cross-encoder with the evaluator built before training.
ce_evaluator(cross_model)

0.8648947632389092

In [13]:
import os

from huggingface_hub import login
from huggingface_hub import HfApi

# Never paste an access token into the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, token=None makes
# login() prompt for the token.
login(token=os.environ.get('HF_TOKEN'))
api = HfApi()

repo_id = 'klue-roberta-small-cross-encoder'
# Create and upload to the same fully-qualified repository so both calls
# agree on the namespace.
full_repo_id = f"Noahyun/{repo_id}"
# exist_ok=True keeps this cell re-runnable once the repository exists.
api.create_repo(repo_id=full_repo_id, exist_ok=True)

api.upload_folder(
    folder_path=model_save_path,
    repo_id=full_repo_id,
    repo_type="model"
)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


CommitInfo(commit_url='https://huggingface.co/Noahyun/klue-roberta-small-cross-encoder/commit/de67dae4310034eb397a934e651a5ca1d97c4eac', commit_message='Upload folder using huggingface_hub', commit_description='', oid='de67dae4310034eb397a934e651a5ca1d97c4eac', pr_url=None, pr_revision=None, pr_num=None)

# RAG Implementation with bi-encoder and cross-encoder

In [1]:
# dataset sampling for test
from datasets import load_dataset

klue_mrc_test = load_dataset('klue', 'mrc', split='validation')
# Keep only the 'test' part of a seeded split (test_size=1000, seed=42) as the
# retrieval benchmark; the hit rates below are therefore out of 1000 questions.
klue_mrc_test = klue_mrc_test.train_test_split(test_size=1000, seed=42)['test']

In [2]:
# Implement function to store and retrieve embeddings
import faiss

def make_embedding_index(sentence_model, corpus):
    """Encode ``corpus`` and store the embeddings in a flat L2 FAISS index.

    Args:
        sentence_model: Model whose ``encode`` method turns ``corpus`` into an
            array of embeddings (the second axis is used as the dimension).
        corpus: Texts to encode and index.

    Returns:
        A ``faiss.IndexFlatL2`` holding one embedding per corpus entry.
    """
    corpus_embeddings = sentence_model.encode(corpus)
    embedding_dim = corpus_embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(embedding_dim)
    flat_index.add(corpus_embeddings)
    return flat_index

def find_embedding_top_k(query, sentence_model, index, k):
    """Return the indices of the ``k`` nearest indexed entries for ``query``.

    Args:
        query: Query text; it is encoded as a one-element batch.
        sentence_model: Model providing ``encode``.
        index: Index providing ``search(embeddings, k)`` that returns a
            ``(distances, indices)`` pair.
        k: Number of neighbours to request.

    Returns:
        The ``indices`` part of the search result, one row per encoded query
        (so a single row here); the distances are discarded.
    """
    query_embedding = sentence_model.encode([query])
    search_result = index.search(query_embedding, k)
    neighbor_ids = search_result[1]
    return neighbor_ids

In [3]:
# Rerank orders
import numpy as np  # used by rerank_top_k; imported here so this cell runs on its own


def make_question_context_pairs(question_idx, indices, dataset=None):
    """Pair one question with each candidate context.

    Args:
        question_idx: Position of the question in ``dataset['question']``.
        indices: Iterable of positions into ``dataset['context']``.
        dataset: Mapping-like object with ``question`` and ``context``
            columns. Defaults to the notebook-level ``klue_mrc_test``.

    Returns:
        list: ``[question, context]`` pairs, in the order of ``indices``.
    """
    if dataset is None:
        dataset = klue_mrc_test
    question = dataset['question'][question_idx]
    # Look the column up once instead of once per candidate.
    contexts = dataset['context']
    return [[question, contexts[idx]] for idx in indices]


def rerank_top_k(cross_model, question_idx, indices, k, dataset=None):
    """Rerank candidate contexts with a cross-encoder and keep the best ``k``.

    Args:
        cross_model: Object whose ``predict`` method scores a list of
            ``[question, context]`` pairs (higher means more relevant).
        question_idx: Position of the question in the dataset.
        indices: Candidate context positions, e.g. from the bi-encoder search.
        k: Number of reranked candidates to keep. Previously this argument
            was ignored and every candidate was returned.
        dataset: Optional dataset override, forwarded to
            ``make_question_context_pairs``.

    Returns:
        numpy.ndarray: At most ``k`` candidate positions, ordered from the
        highest to the lowest relevance score.
    """
    indices = np.asarray(indices)
    if indices.size == 0:
        # Nothing to score; avoid calling the model with an empty batch.
        return indices
    input_examples = make_question_context_pairs(question_idx, indices, dataset)
    relevance_scores = np.asarray(cross_model.predict(input_examples))
    descending_order = np.argsort(relevance_scores)[::-1]
    return indices[descending_order][:k]

In [4]:
# Metric: hit rate
import time

def evaluate_hit_rate(datasets, embedding_model, index, k=10):
    """Measure the top-``k`` retrieval hit rate and the time taken.

    Every question is searched against ``index``; a question counts as a hit
    when any retrieved context has the same text as the question's own
    context.

    Args:
        datasets: Mapping-like object with aligned ``question`` and
            ``context`` columns.
        embedding_model: Model used to encode each question.
        index: Search index built over ``datasets['context']``.
        k: Number of neighbours retrieved per question.

    Returns:
        tuple: ``(hit_rate, elapsed_seconds)``. ``hit_rate`` is ``0.0`` for an
        empty dataset instead of raising ``ZeroDivisionError``.
    """
    start_time = time.time()
    questions = datasets['question']
    contexts = datasets['context']

    predictions = [
        find_embedding_top_k(question, embedding_model, index, k)[0]
        for question in questions
    ]

    hit_count = 0
    for idx, prediction in enumerate(predictions):
        # Compare context text rather than position; any() stops at the first match.
        if any(contexts[pred] == contexts[idx] for pred in prediction):
            hit_count += 1

    total_prediction_count = len(predictions)
    hit_rate = hit_count / total_prediction_count if total_prediction_count else 0.0
    end_time = time.time()
    return hit_rate, end_time - start_time

## Retrieval using base embedding model

In [5]:
from sentence_transformers import SentenceTransformer

# Index the test contexts with the STS-trained model and measure the hit rate at k=10.
base_embedding_model = SentenceTransformer('Noahyun/klue-roberta-base-klue-sts')
base_index = make_embedding_index(base_embedding_model, klue_mrc_test['context'])

evaluate_hit_rate(klue_mrc_test, base_embedding_model, base_index, 10)

(0.87, 3.5601565837860107)

## Retrieval using fine-tuned embedding model

In [6]:
# Same measurement with the MRC fine-tuned model; the index is rebuilt because
# the embeddings come from a different model.
finetuned_embedding_model = SentenceTransformer('Noahyun/klue-roberta-base-klue-sts-mrc')
finetuned_index = make_embedding_index(finetuned_embedding_model, klue_mrc_test['context'])

evaluate_hit_rate(klue_mrc_test, finetuned_embedding_model, finetuned_index, 10)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.86k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

(0.946, 3.6130285263061523)

## Retrieval combining the fine-tuned embedding model with a cross-encoder

**Evaluation metric including the reranking step**

In [7]:
import time
import numpy as np
from tqdm.auto import tqdm

def evaluate_hit_rate_with_rerank(datasets, embedding_model, cross_model, index, bi_k=30, cross_k=10):
    """Measure the hit rate of bi-encoder retrieval followed by reranking.

    For every question, ``bi_k`` candidates are retrieved from ``index`` and
    reranked with ``cross_model``; only the best ``cross_k`` are kept. A
    question counts as a hit when any kept context has the same text as the
    question's own context.

    Args:
        datasets: Mapping-like object with aligned ``question`` and
            ``context`` columns.
        embedding_model: Bi-encoder used to encode each question.
        cross_model: Cross-encoder passed to ``rerank_top_k``.
        index: Search index built over ``datasets['context']``.
        bi_k: Number of candidates retrieved by the bi-encoder.
        cross_k: Number of reranked candidates kept for the hit check.

    Returns:
        tuple: ``(hit_rate, elapsed_seconds, predictions)``. ``hit_rate`` is
        ``0.0`` for an empty dataset instead of raising ``ZeroDivisionError``.
    """
    start_time = time.time()
    questions = datasets['question']
    contexts = datasets['context']

    predictions = []
    for question_idx, question in enumerate(tqdm(questions)):
        indices = find_embedding_top_k(question, embedding_model, index, bi_k)[0]
        reranked = rerank_top_k(cross_model, question_idx, indices, k=cross_k)
        # Enforce the cut-off here as well: without it the hit check would run
        # over all bi_k candidates and the reranking could not affect the metric.
        predictions.append(reranked[:cross_k])

    hit_count = 0
    for idx, prediction in enumerate(predictions):
        if any(contexts[pred] == contexts[idx] for pred in prediction):
            hit_count += 1

    total_prediction_count = len(predictions)
    hit_rate = hit_count / total_prediction_count if total_prediction_count else 0.0
    end_time = time.time()
    return hit_rate, end_time - start_time, predictions

In [None]:
from sentence_transformers import CrossEncoder

# The reranker must be a CrossEncoder: rerank_top_k calls
# cross_model.predict(...) on (question, context) pairs, which a
# SentenceTransformer does not provide.
# NOTE(review): this loads 'shangrilar/klue-roberta-small-cross-encoder', not the
# 'Noahyun/klue-roberta-small-cross-encoder' uploaded earlier — confirm which one is intended.
cross_model = CrossEncoder('shangrilar/klue-roberta-small-cross-encoder')

hit_rate, cosumed_time, predictions = evaluate_hit_rate_with_rerank(klue_mrc_test, finetuned_embedding_model, cross_model, finetuned_index)
hit_rate, cosumed_time