In [None]:
# Load the pretrained model that will be used as the cross-encoder.
from sentence_transformers import CrossEncoder
# num_labels=1: presumably yields a single relevance score per (question, passage) pair -- confirm against the CrossEncoder docs.
cross_model = CrossEncoder('klue/roberta-small', num_labels=1)

In [None]:
# Data preprocessing: load the KLUE-MRC splits and keep only the columns
# needed to build (question, passage) pairs.
from datasets import load_dataset
from sentence_transformers import InputExample

# The 'validation' split is used as the held-out test set.
klue_mrc_train = load_dataset('klue', 'mrc', split='train')
klue_mrc_test = load_dataset('klue', 'mrc', split='validation')

# Convert each split to pandas and select the three columns in one step.
df_train = klue_mrc_train.to_pandas()[['title', 'question', 'context']]
df_test = klue_mrc_test.to_pandas()[['title', 'question', 'context']]

def add_ir_context(df):
    """Attach one randomly chosen irrelevant context to every row.

    For each row, one context is sampled from the rows whose ``title``
    differs from that row's title, and the result is stored in a new
    ``irrelevant_context`` column.

    Rows are filtered with a boolean mask instead of ``DataFrame.query`` on
    an interpolated string, so titles containing quotes (or any other
    character that is special inside a query expression) work correctly.

    Args:
        df: DataFrame with at least ``title`` and ``context`` columns.
            It is modified in place.

    Returns:
        The same ``df`` object with the ``irrelevant_context`` column added.

    Raises:
        ValueError: If a row has no other row with a different title to
            sample from.
    """
    # Hoist the column lookups out of the loop; they do not change per row.
    titles = df['title']
    contexts = df['context']

    irrelevant_contexts = []
    for title in titles:
        candidates = contexts[titles != title]
        if candidates.empty:
            raise ValueError(
                f"no row with a title different from {title!r} to sample from"
            )
        irrelevant_contexts.append(candidates.sample(n=1).values[0])

    df['irrelevant_context'] = irrelevant_contexts
    return df

# Add a randomly sampled irrelevant context to every row of both splits.
df_train_ir = add_ir_context(df_train)
df_test_ir = add_ir_context(df_test)

# Build the evaluation pairs: each question is paired once with its own
# context (label 1) and once with the irrelevant context (label 0).
examples = []
for idx, row in df_test_ir.iterrows():  # fixed typo: was `iteerows`
    examples.append(InputExample(texts=[row['question'], row['context']], label=1))
    examples.append(InputExample(texts=[row['question'], row['irrelevant_context']], label=0))

In [None]:
# Evaluation result of the cross-encoder before any fine-tuning (baseline).
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
# Build the evaluator from the labelled (question, passage) pairs in `examples`.
ce_evaluator = CECorrelationEvaluator.from_input_examples(examples)
# Last expression of the cell: its return value is what the notebook displays.
ce_evaluator(cross_model)

In [None]:
# Prepare the cross-encoder training set: for every training row emit the
# (question, own context) pair with label 1 followed by the
# (question, irrelevant context) pair with label 0.
train_samples = [
    InputExample(texts=[record['question'], passage], label=target)
    for _, record in df_train_ir.iterrows()
    for passage, target in (
        (record['context'], 1),
        (record['irrelevant_context'], 0),
    )
]

In [None]:
# Train (fine-tune) the cross-encoder.
# DataLoader was used without being imported anywhere in the notebook.
from torch.utils.data import DataLoader

train_batch_size = 16
num_epochs = 1
model_save_path = 'output/training_mrc'

# Shuffle so that the positive/negative pair of a question, which sit next
# to each other in train_samples, do not always land in the same batch.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

cross_model.fit(
    train_dataloader=train_dataloader,
    epochs=num_epochs,
    warmup_steps=100,  # fixed typo: was `warmpu_steps`, which fit() rejects
    output_path=model_save_path
)

In [None]:
# Evaluation result of the trained cross-encoder (calls the same ce_evaluator again on the model after fit()).
ce_evaluator(cross_model)

In [None]:
# Upload the trained cross-encoder to the Hugging Face Hub.
from huggingface_hub import HfApi
from huggingface_hub import login

# Placeholder: replace with your own Hugging Face access token.
# Do not commit a real token to version control.
login(token = '본인의 허깅 페이스 KEY 값')

api = HfApi()
repo_id = "klue-roberta-small-klue-mrc-cross-encoder-finetuned"
# exist_ok=True makes the cell safe to re-run once the repo already exists.
# The returned RepoUrl carries the full "<namespace>/<name>" id of the repo
# that was created under the logged-in account.
repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)

api.upload_folder(
    folder_path=model_save_path,
    # Upload to the repo that was just created rather than to a hard-coded
    # user namespace, which only works for that one account.
    repo_id=repo_url.repo_id,
    repo_type="model",
)
