In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip install sentence_transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.0-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 61.2 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 64.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 24.7 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 69.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manyl

In [3]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [4]:
# logger
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [11]:
# pretrained_model_name = 't5-base'
pretrained_model_name = 'bert-base-cased' # distilbert-base-cased, roberta-base
sts_num_epochs = 2
train_batch_size = 16

sts_model_save_path = '/content/drive/MyDrive/기종설/output/training_sts-'+pretrained_model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [7]:
import pandas as pd

dataset = pd.read_csv('/content/drive/MyDrive/기종설/dataset/datset.csv')

In [8]:
dataset.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,text,passage1,passage2,passage3,passage4,passage5,answer,summary_text
0,0,0,Compassion and kindness so color the messages ...,To understand the situation of the other perso...,Service workers often show kind and friendly t...,Misunderstandings and hostility have a negativ...,"The longer you meet, the greater the feeling o...",Sympathy and kindness play a decisive role in ...,Sympathy and kindness play a decisive role in ...,Nice guys don’t finish last. They’re forgiven...
1,1,1,Many people believe that they are very good at...,The argument without evidence does not exert i...,"After identifying the situation, you can make ...",It is hard to be sure whether you have noticed...,Honestly revealing your thoughts helps to form...,It takes a long time to recover the trust that...,It is hard to be sure whether you have noticed...,Many people believe that they are very good a...
2,2,2,Recognize that you may very well have unhealth...,Let your child realize the problem of using sm...,Actively use digital devices to communicate wi...,Post in a place where you can see the family r...,Show your parents themselves to use digital de...,When you suspect your child's smartphone addic...,Show your parents themselves to use digital de...,60 percent of parents worried that their kids...
3,3,3,A positive attitude does not mean that every i...,Analyze the reason why the idea is not adopted...,"From the beginning, I suggest an idea with the...",Once reviewed in a positive attitude before re...,"In order to discover creative ideas, positive ...","Rather than criticizing the proposed ideas, co...",Once reviewed in a positive attitude before re...,A positive attitude does not mean that every ...
4,4,4,Creating your personal brand is a way of clari...,"Rather than riding a large number of opinions,...",You must clearly reveal your own characteristi...,You have to know the interests of others for s...,Before choosing a job Should be.,"Rather than loyal to the organization, it emph...",You must clearly reveal your own characteristi...,Your personal brand can be an anchor in a sea...


In [9]:
samples = [[], []]
passages = ['passage1', 'passage2', 'passage3', 'passage4', 'passage5']
for i in dataset.index:
    for j in passages:
        summary_text = dataset.loc[i, "summary_text"]
        if dataset.loc[i, 'answer'] == dataset.loc[i, j]:
            samples[1].append({"summary_text" : summary_text, "passage" : dataset.loc[i, j], "label" : 1.0})
        else:
            samples[0].append({"summary_text" : summary_text, "passage" : dataset.loc[i, j], "label" : 0.0})
balanced_samples = [samples[0], samples[1] * 4]

In [10]:
train_ratio = 0.9
sts_train = balanced_samples[0][:int(len(balanced_samples[0]) * train_ratio)] \
            + balanced_samples[1][:int(len(balanced_samples[1]) * train_ratio)]
sts_valid = balanced_samples[0][int(len(balanced_samples[0]) * train_ratio):] \
            + balanced_samples[1][int(len(balanced_samples[1]) * train_ratio):]

In [12]:
sts_train[15000]

{'summary_text': ' Technology has a great possibility to improve efficiency and efficiency . As a result of optimization, the costs associated with activities may be significantly reduced . Reducing costs, existing consumers can pay more consumers for new consumers .',
 'passage': 'As the consumption rises, the improvement of efficiency disappears.',
 'label': 1.0}

In [13]:
import numpy as np

def make_sts_input_example(dataset):
    input_examples = []
    for i in range(len(dataset)):
        text = dataset[i]['summary_text']
        passage = dataset[i]['passage']
        label = dataset[i]['label']
        input_examples.append(InputExample(texts=[text, passage], label=label))
    return input_examples

In [14]:
sts_train_examples = make_sts_input_example(sts_train)
sts_valid_examples = make_sts_input_example(sts_valid)

In [15]:
# Train Dataloader
train_dataloader = DataLoader(
    sts_train_examples,
    shuffle=True,
    batch_size=train_batch_size,
)

# Evaluator by sts-validation
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_valid_examples,
    name="sts-dev",
)

In [16]:
# Load Embedding Model
embedding_model = models.Transformer(
    model_name_or_path=pretrained_model_name, 
    max_seq_length=256,
    do_lower_case=True
)

# Only use Mean Pooling -> Pooling all token embedding vectors of sentence.
pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

model = SentenceTransformer(modules=[embedding_model, pooling_model])

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [17]:
# Use CosineSimilarityLoss
train_loss = losses.CosineSimilarityLoss(model=model)

# warmup steps
warmup_steps = math.ceil(len(sts_train_examples) * sts_num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Training
model.fit(
    train_objectives=[(train_dataloader, train_loss)], # CosineSimilarity
    evaluator=dev_evaluator, 
    epochs=sts_num_epochs, 
    evaluation_steps=int(len(train_dataloader)*0.1), 
    warmup_steps=warmup_steps, # train data 10%
    output_path=sts_model_save_path
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1102 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1102 [00:00<?, ?it/s]

In [18]:
drive.flush_and_unmount()