In [1]:
# Mount Google Drive at /content/drive; the dataset CSV and the model output
# directory used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install into the running kernel's environment (%pip, not !pip) and pin the
# versions this notebook was developed against so a re-run stays reproducible.
%pip install -q sentence_transformers==2.2.2 datasets==2.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.9 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 61.4 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 29.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 57.6 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 79.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manyl

In [3]:
import math
import logging
from datetime import datetime
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [4]:
# Logging: timestamped INFO-level messages routed through the
# sentence-transformers LoggingHandler.
logging_options = {
    "level": logging.INFO,
    "format": "%(asctime)s - %(message)s",
    "datefmt": "%Y-%m-%d %H:%M:%S",
    "handlers": [LoggingHandler()],
}
logging.basicConfig(**logging_options)

In [22]:
# Base checkpoint to fine-tune.
# Other candidates: t5-base, distilbert-base-cased, roberta-base
pretrained_model_name = 'bert-base-cased'
sts_num_epochs = 2
train_batch_size = 16

# Output directory on Drive, tagged with the checkpoint name ("/" made
# path-safe) and the wall-clock time at which this cell was run.
model_tag = pretrained_model_name.replace("/", "-")
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_model_save_path = '/content/drive/MyDrive/기종설/output/training_sts-{}-{}'.format(model_tag, run_timestamp)

# 외부 데이터 추가

In [6]:
# Fetch the GLUE download helper. The directory check keeps the cell re-runnable
# (a second plain `git clone` aborts with "destination path already exists"),
# and --depth 1 skips the repository history, which is not needed here.
![ -d GLUE-baselines ] || git clone --depth 1 https://github.com/nyu-mll/GLUE-baselines.git
# Download the GLUE task data into ./glue_data.
!python /content/GLUE-baselines/download_glue_data.py --data_dir glue_data --tasks all

Cloning into 'GLUE-baselines'...
remote: Enumerating objects: 891, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 891 (delta 1), reused 3 (delta 1), pack-reused 886[K
Receiving objects: 100% (891/891), 1.48 MiB | 22.92 MiB/s, done.
Resolving deltas: 100% (610/610), done.
Downloading and extracting CoLA...
	Completed!
Downloading and extracting SST...
	Completed!
Processing MRPC...
	Error downloading standard development IDs for MRPC. You will need to manually split your data.
Downloading and extracting QQP...
	Completed!
Downloading and extracting STS...
	Completed!
Downloading and extracting MNLI...
	Note (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.
	Completed!
Downloading and extracting QNLI...
	Completed!
Downloading and extracting RTE...
	Completed!
Downloading and extracting WNLI...
	Completed!
Downloading and extracting diagnostic...
	

In [7]:
# STS-B: read the train/dev/test TSV splits downloaded into glue_data.
import pandas as pd
import csv

# QUOTE_NONE makes the parser treat quote characters as ordinary text.
sts_b_read_options = dict(delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
sts_b_train = pd.read_csv('/content/glue_data/STS-B/train.tsv', **sts_b_read_options)
sts_b_valid = pd.read_csv('/content/glue_data/STS-B/dev.tsv', **sts_b_read_options)
sts_b_test = pd.read_csv('/content/glue_data/STS-B/test.tsv', **sts_b_read_options)

In [8]:
# MRPC: read the train/test paraphrase files downloaded into glue_data.
import pandas as pd
import csv

# QUOTE_NONE makes the parser treat quote characters as ordinary text.
mrpc_read_options = dict(delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
mrpc_train = pd.read_csv('/content/glue_data/MRPC/msr_paraphrase_train.txt', **mrpc_read_options)
mrpc_test = pd.read_csv('/content/glue_data/MRPC/msr_paraphrase_test.txt', **mrpc_read_options)

In [9]:
# Row counts of the STS-B train / dev / test splits.
tuple(len(split) for split in (sts_b_train, sts_b_valid, sts_b_test))

(5749, 1500, 1379)

In [10]:
# Row counts of the MRPC train / test splits.
tuple(len(split) for split in (mrpc_train, mrpc_test))

(4076, 1725)

In [11]:
# Preview the first rows of the STS-B training split (sentence pair + score columns).
sts_b_train.head()

Unnamed: 0,index,genre,filename,year,old_index,source1,source2,sentence1,sentence2,score
0,0,main-captions,MSRvid,2012test,1,none,none,A plane is taking off.,An air plane is taking off.,5.0
1,1,main-captions,MSRvid,2012test,4,none,none,A man is playing a large flute.,A man is playing a flute.,3.8
2,2,main-captions,MSRvid,2012test,5,none,none,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8
3,3,main-captions,MSRvid,2012test,6,none,none,Three men are playing chess.,Two men are playing chess.,2.6
4,4,main-captions,MSRvid,2012test,9,none,none,A man is playing the cello.,A man seated is playing the cello.,4.25


In [12]:
# Preview the first rows of the MRPC training split (Quality label + sentence pair columns).
mrpc_train.head()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother , whom he called "" ...","Referring to him as only "" the witness "" , Amr..."
1,0,2108705,2108831,Yucaipa owned Dominick 's before selling the c...,Yucaipa bought Dominick 's in 1995 for $ 693 m...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10 , the ship 's owners had published ..."
3,0,3344667,3344648,"Around 0335 GMT , Tab shares were up 19 cents ...","Tab shares jumped 20 cents , or 4.6 % , to set..."
4,1,1236820,1236712,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...


In [14]:
def make_sts_b_input_example(dataset):
    """Convert an STS-B dataframe into a list of ``InputExample`` pairs.

    Args:
        dataset: DataFrame with 'sentence1', 'sentence2' and 'score' columns.

    Returns:
        One ``InputExample`` per row, in row order, with
        ``texts=[sentence1, sentence2]`` and a float label in [-1, 1].

    The gold score is rescaled linearly from STS-B's 0-5 range to [-1, 1]
    (0 -> -1, 2.5 -> 0, 5 -> 1). That is the range a cosine similarity can
    take, and it matches the +/-1 labels used for the other training sources.
    The previous ``(score - 3) / 2`` mapping produced targets as low as -1.5,
    which a cosine similarity can never reach.
    """
    # Iterate the columns directly instead of doing per-row `.loc` lookups:
    # faster, and independent of whether the index labels are unique.
    return [
        InputExample(texts=[sentence1, sentence2], label=float(score) / 2.5 - 1.0)
        for sentence1, sentence2, score in zip(
            dataset['sentence1'], dataset['sentence2'], dataset['score']
        )
    ]

def make_mrpc_input_example(dataset):
    """Convert an MRPC dataframe into a list of ``InputExample`` pairs.

    Args:
        dataset: DataFrame with '#1 String', '#2 String' and 'Quality' columns.

    Returns:
        One ``InputExample`` per row, in row order, with
        ``texts=[#1 String, #2 String]`` and a float label computed as
        ``Quality * 2 - 1`` (so 1 -> 1.0 and 0 -> -1.0).
    """
    # Iterate the columns directly instead of doing per-row `.loc` lookups:
    # faster, and it no longer fails when index labels are not unique
    # (`.loc[i, col]` returns a Series there, which `float()` rejects).
    return [
        InputExample(texts=[sentence1, sentence2], label=float(quality) * 2 - 1)
        for sentence1, sentence2, quality in zip(
            dataset['#1 String'], dataset['#2 String'], dataset['Quality']
        )
    ]

In [15]:
# External sentence-pair pools: STS-B examples followed by MRPC examples.
# MRPC's test file is used on the validation side.
sts_train = [
    *make_sts_b_input_example(sts_b_train),
    *make_mrpc_input_example(mrpc_train),
]
sts_valid = [
    *make_sts_b_input_example(sts_b_valid),
    *make_mrpc_input_example(mrpc_test),
]

In [16]:
# Sizes of the combined train / validation example pools.
tuple(map(len, (sts_train, sts_valid)))

(9825, 3225)

# 수능특강/완성 데이터 추가

In [17]:
my_dataset = pd.read_csv('/content/drive/MyDrive/기종설/dataset/datset.csv')

# Pair every row's summary with each of its five candidate passages.
# samples[1] collects pairs whose passage equals the row's answer (label 1.0);
# samples[0] collects every other pair (label -1.0).
passages = ['passage1', 'passage2', 'passage3', 'passage4', 'passage5']
negative_samples, positive_samples = [], []
samples = [negative_samples, positive_samples]
for i in my_dataset.index:
    # These two values are fixed per row, so look them up once per row.
    summary_text = my_dataset.loc[i, "summary_text"]
    answer = my_dataset.loc[i, 'answer']
    for j in passages:
        passage = my_dataset.loc[i, j]
        if answer == passage:
            positive_samples.append({"summary_text" : summary_text, "passage" : passage, "label" : 1.0})
        else:
            negative_samples.append({"summary_text" : summary_text, "passage" : passage, "label" : -1.0})

# Split each label group separately (first 80% -> train, remainder -> valid),
# keeping file order; no shuffling is applied.
train_ratio = 0.8
negative_cut = int(len(samples[0]) * train_ratio)
positive_cut = int(len(samples[1]) * train_ratio)
sts_train_samples = samples[0][:negative_cut] + samples[1][:positive_cut]
sts_valid_samples = samples[0][negative_cut:] + samples[1][positive_cut:]

In [18]:
def make_sts_input_example(dataset):
    """Wrap summary/passage records as ``InputExample`` objects.

    Args:
        dataset: Integer-indexable sequence of mappings, each providing the
            keys 'summary_text', 'passage' and 'label'.

    Returns:
        A list with one ``InputExample`` per record, in the same order, where
        ``texts=[summary_text, passage]`` and ``label`` is passed through as-is.
    """
    records = (dataset[position] for position in range(len(dataset)))
    return [
        InputExample(
            texts=[record['summary_text'], record['passage']],
            label=record['label'],
        )
        for record in records
    ]

# Convert both custom splits (train first, then valid) into InputExample lists.
sts_train_examples, sts_valid_examples = (
    make_sts_input_example(split)
    for split in (sts_train_samples, sts_valid_samples)
)

In [19]:
# Assemble the final train/validation pools from their sources instead of
# extending `sts_train` / `sts_valid` in place. The previous `+=` appended the
# custom examples again on every re-run of this cell, silently duplicating
# them in both pools.
sts_train = (
    make_sts_b_input_example(sts_b_train)
    + make_mrpc_input_example(mrpc_train)
    + sts_train_examples
)
sts_valid = (
    make_sts_b_input_example(sts_b_valid)
    + make_mrpc_input_example(mrpc_test)
    + sts_valid_examples
)

In [20]:
# Shuffled mini-batches over the combined training pool.
train_dataloader = DataLoader(dataset=sts_train, batch_size=train_batch_size, shuffle=True)

# Evaluator built from the combined validation pool; reported under "sts-dev".
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_valid, name="sts-dev")

In [21]:
# Load the transformer encoder that produces per-token embeddings.
# do_lower_case stays False: the configured checkpoint ('bert-base-cased')
# and the listed alternatives are case-sensitive, so lowercasing the input
# before tokenization would feed the model text that does not match its
# vocabulary and discard the casing signal.
embedding_model = models.Transformer(
    model_name_or_path=pretrained_model_name,
    max_seq_length=256,
    do_lower_case=False,
)

# Mean pooling only: average all token embeddings into one sentence vector.
pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Sentence encoder = transformer followed by the pooling layer.
model = SentenceTransformer(modules=[embedding_model, pooling_model])

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [23]:
# Use CosineSimilarityLoss on the sentence-pair labels.
train_loss = losses.CosineSimilarityLoss(model=model)

# Warm up over the first 10% of ALL optimizer steps. The step count comes from
# the dataloader, which covers the full training pool; the previous formula
# used only len(sts_train_examples), i.e. the custom subset, so the warm-up was
# shorter than the stated 10%.
total_training_steps = len(train_dataloader) * sts_num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Training: evaluate on the dev evaluator every 10% of an epoch and write the
# model to the timestamped output directory.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=sts_num_epochs,
    evaluation_steps=int(len(train_dataloader)*0.1),
    optimizer_params={'lr': 2e-5},
    warmup_steps=warmup_steps,
    output_path=sts_model_save_path
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1227 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1227 [00:00<?, ?it/s]