In [None]:
# Mount Google Drive at /content/drive (Colab only) so that the checkpoint and
# dataset paths under /content/drive/MyDrive used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the libraries imported by this notebook. Versions are not pinned; the
# recorded run resolved sentence-transformers 2.2.2, datasets 2.7.0 and
# transformers 4.24.0.
! pip install sentence_transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.0-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 25.9 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 52.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 57.6 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 71.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manyl

In [None]:
import math
import logging
from datetime import datetime
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [None]:
# Checkpoint to evaluate. Exactly one assignment is active; the alternatives stay
# commented out so that a different model can be selected by toggling one line.
# pretrained_model_path = '/content/drive/MyDrive/기종설/output/training_sts-bert-base-cased-2022-11-16_05-39-31/'  # trained on Suneung Teukgang/Wanseong (CSAT prep workbooks)
pretrained_model_path = '/content/drive/MyDrive/기종설/output/training_sts-bert-base-cased-2022-11-16_15-13-43/'  # trained on Suneung Teukgang/Wanseong + external datasets
# pretrained_model_path = 'bert-base-cased'
# pretrained_model_path = 't5-base'

# Training hyperparameters; the evaluation cells in this section do not read them.
sts_num_epochs = 10
train_batch_size = 16

# NOTE(review): `pretrained_model_name` is not defined in the cells shown here, so
# this line would raise NameError if re-enabled as written.
# sts_model_save_path = '/content/drive/MyDrive/기종설/output/training_sts-'+pretrained_model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
# Load the evaluation dataset from Google Drive.
# The evaluation functions in this notebook read the columns summary_text,
# passage1..passage5 and answer from this frame.
# NOTE(review): the file name is spelled "datset.csv" — presumably this matches the
# actual file on Drive; confirm before "fixing" the spelling.

dataset = pd.read_csv('/content/drive/MyDrive/기종설/dataset/datset.csv')

In [None]:
# Build the sentence encoder: a transformer module followed by a pooling module.
# NOTE(review): do_lower_case=True lowercases the input text, while the checkpoint
# paths contain "bert-base-cased" — confirm this matches how the model was trained.
embedding_model = models.Transformer(
    pretrained_model_path,
    max_seq_length=256,
    do_lower_case=True,
)

# Sentence vector = mean over all token embeddings; CLS-token and max pooling are off.
pooling_model = models.Pooling(
    word_embedding_dimension=embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

model = SentenceTransformer(modules=[embedding_model, pooling_model])

In [None]:
from sentence_transformers import util

def get_answer(model, summary_text, passages, return_score=False):
    """Pick the passage whose embedding is most similar to the summary.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=True)``. It is called
            once with the summary string and once with the list of passages.
        summary_text: Text that every candidate passage is compared against.
        passages: Non-empty iterable of candidate passage strings (list, tuple or
            pandas Series). Only the values are used, in iteration order; index
            labels of a Series are ignored.
        return_score: When True, also return the per-passage cosine similarities.

    Returns:
        The 0-based position of the best-scoring passage, or the tuple
        ``(position, scores)`` when ``return_score`` is True, where ``scores`` is a
        list of 0-dim CPU tensors, one per passage.

    Raises:
        ValueError: If ``passages`` is empty.
    """
    # Materialise the values first so that a label-indexed pandas Series is never
    # subscripted with integer positions (deprecated positional fallback).
    candidates = list(passages)
    if not candidates:
        raise ValueError("passages must contain at least one candidate")

    summary_embedding = model.encode(summary_text, convert_to_tensor=True)
    # One batched call instead of one encode() call per passage.
    passage_embeddings = model.encode(candidates, convert_to_tensor=True)

    # cos_sim yields a (1, len(candidates)) matrix; row 0 holds the summary's scores.
    similarities = util.cos_sim(summary_embedding, passage_embeddings)[0].detach().cpu()
    best_index = np.argmax(similarities.numpy())

    if return_score:
        return best_index, list(similarities)
    return best_index

def evaulate_answer(dataset, problem_id, verbose=False):
    """Check whether the predicted passage for one problem equals its answer.

    Uses the module-level ``model`` together with ``get_answer`` to choose among the
    row's ``passage1`` .. ``passage5`` columns given its ``summary_text``.

    Args:
        dataset: DataFrame holding ``summary_text``, ``passage1`` .. ``passage5`` and
            ``answer`` columns.
        problem_id: Index label of the row to evaluate.
        verbose: When True, print the predicted passage and the reference answer.

    Returns:
        The result of comparing the predicted passage text with the ``answer`` text.
    """
    candidate_columns = [f'passage{number}' for number in range(1, 6)]
    summary = dataset.loc[problem_id, 'summary_text']
    candidates = dataset.loc[problem_id, candidate_columns]

    # get_answer returns a 0-based position; the passage columns are numbered from 1.
    predicted_position = get_answer(model, summary, candidates)
    predicted_text = dataset.loc[problem_id, f'passage{predicted_position + 1}']
    expected_text = dataset.loc[problem_id, 'answer']

    if verbose:
        print(f"pred : {predicted_text} \nans : {expected_text}")
    return predicted_text == expected_text

def calculate_accuracy(dataset, verbose=False):
    """Return the fraction of rows for which ``evaulate_answer`` reports a match.

    Args:
        dataset: DataFrame to evaluate; every label in ``dataset.index`` is passed to
            ``evaulate_answer`` in index order.
        verbose: Forwarded to ``evaulate_answer`` for per-row printing.

    Returns:
        Accuracy as a float in the range [0, 1].

    Raises:
        ValueError: If ``dataset`` has no rows, since accuracy is undefined then.
    """
    outcomes = [
        bool(evaulate_answer(dataset, problem_id, verbose))
        for problem_id in dataset.index
    ]
    if not outcomes:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("Cannot compute accuracy: dataset has no rows.")
    return sum(outcomes) / len(outcomes)

# Score the whole dataset, printing each prediction next to its reference answer.
acc = calculate_accuracy(dataset, verbose=True)
print(f"Accuracy : {acc * 100} %")

# Recorded results per checkpoint:
# bert-base-cased : Accuracy : 43.56880359330339 %
# t5-base : Accuracy : 49.77541853817885 %
# After training on Suneung Teukgang/Wanseong : Accuracy : 100.0 %
# After training on the additional datasets : Accuracy : 99.75500204164966 %
# NOTE(review): the near-perfect scores presumably mean this dataset overlaps the
# fine-tuning data — confirm before reporting them as held-out accuracy.

pred : Sympathy and kindness play a decisive role in forming a good impression. 
ans : Sympathy and kindness play a decisive role in forming a good impression.
pred : It is hard to be sure whether you have noticed a lie properly. 
ans : It is hard to be sure whether you have noticed a lie properly.
pred : Show your parents themselves to use digital devices. 
ans : Show your parents themselves to use digital devices.
pred : Once reviewed in a positive attitude before refusing an idea, It is good to decide. 
ans : Once reviewed in a positive attitude before refusing an idea, It is good to decide.
pred : You must clearly reveal your own characteristics and strengths and differentiate it from others. do. 
ans : You must clearly reveal your own characteristics and strengths and differentiate it from others. do.
pred : probabilities as a key cause of faulty reasoning 
ans : probabilities as a key cause of faulty reasoning
pred : evolutionary process in which pesticide resistance develops 
an