In [17]:
import torch
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import PreTrainedModel, BertTokenizer, BertConfig, BertForSequenceClassification
from transformers.optimization import AdamW
from Korpora import Korpora
import os, csv, re
from dataclasses import dataclass
from typing import List, Optional
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import ExponentialLR
from sklearn.model_selection import train_test_split
import pandas as pd 

In [18]:
!pwd

/tf/notebooks/NLP


In [19]:
trainDataDir = './train_data'
trainDataPath = os.path.join(trainDataDir, 'total_train.csv')
trainDataPath

'./train_data/total_train.csv'

In [20]:
trainCSVData = list(csv.reader(open(trainDataPath, "r", encoding="utf-8"), delimiter=",", quotechar='"'))[1:-1]
trainCSVData[:5]

[['지금 배달되나요?', '1', '0', '0', '0', '0', '0', '0', '0', '0'],
 ['아 네 배달됩니다', '1', '0', '0', '0', '0', '0', '0', '0', '0'],
 ['짬뽕류는 어떤 게 있나요? 잘 나가는 짬뽕 있나요?', '1', '0', '0', '0', '0', '0', '0', '0', '0'],
 ['특해물 짬뽕도 있고 전복 새우 짬뽕도 있고 해물 종류도 새우 홍합 전복 없는 게 없습니다',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['전복 들어가는 거는 특해물 짬뽕 시켜야 돼요?', '1', '0', '0', '0', '0', '0', '0', '0', '0']]

In [21]:
trainCSVData[228363]

['아까 요 제품 이 저 는 좀 나은 거 같은 데 가격 은 똑같 아예 ?',
 '0',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0']

위는 쉼표가 포함된 문장도 잘 불러와졌는지 한번 보려고 출력해본 것

In [22]:
train, test = train_test_split(trainCSVData, test_size=0.3)
'train', train[:2], len(train), 'test', test[:2], len(test)

('train',
 [['혹시 여기 떡국떡 따로 구매 가능한가요', '0', '0', '0', '1', '0', '0', '0', '0', '0'],
  ['보증금을 걸지 않으면 계약을 하지 않으려고 해요', '0', '0', '0', '0', '0', '0', '0', '0', '1']],
 253155,
 'test',
 [['이 향수 는 달콤한 향 이에요', '0', '0', '0', '1', '0', '0', '0', '0', '0'],
  ['네 결제 시같이 주세요', '1', '0', '0', '0', '0', '0', '0', '0', '0']],
 108496)

In [23]:
@dataclass
class ClassificationDataFormat:
    text_a: str
    label: Optional[int] = None

In [24]:
ClassificationDataFormat(text_a=train[0][0], label=train[0][1:].index('1'))

ClassificationDataFormat(text_a='혹시 여기 떡국떡 따로 구매 가능한가요', label=3)

In [25]:
[int(label) for label in train[0][1:]]

[0, 0, 0, 1, 0, 0, 0, 0, 0]

In [26]:
trainDataFormatList = []

In [27]:
for data in train:
    trainDataFormatList.append(ClassificationDataFormat(text_a=data[0], label=data[1:].index('1')))
trainDataFormatList[:5]

[ClassificationDataFormat(text_a='혹시 여기 떡국떡 따로 구매 가능한가요', label=3),
 ClassificationDataFormat(text_a='보증금을 걸지 않으면 계약을 하지 않으려고 해요', label=8),
 ClassificationDataFormat(text_a='#이름#', label=2),
 ClassificationDataFormat(text_a='네', label=1),
 ClassificationDataFormat(text_a='네 , 있습니다 .', label=1)]

In [28]:
testDataFormatList = []

In [29]:
for data in test:
    testDataFormatList.append(ClassificationDataFormat(text_a=data[0], label=data[1:].index('1')))
testDataFormatList[:5]

[ClassificationDataFormat(text_a='이 향수 는 달콤한 향 이에요', label=3),
 ClassificationDataFormat(text_a='네 결제 시같이 주세요', label=0),
 ClassificationDataFormat(text_a='감기약 은 한 번 에 1 알 씩 아침 저녁 하루 두 번 먹고 갈근탕 은 따뜻하게 데워 서 식전 에 드 세 요', label=4),
 ClassificationDataFormat(text_a='기본 가격 에 포함 되 어 있 어요', label=6),
 ClassificationDataFormat(text_a='네 무슨 메뉴로 주문하시나요', label=0)]

In [30]:
@dataclass
class ARGS:
    pretrained_model_name: str = 'beomi/kcbert-base'
    batch_size: int = 16
    learning_rate: float = 5e-5
    max_seq_length: int = 64
    epochs: int = 3
    tpu_cores: int = 0
    downstream_task_name: str = "chat-type-classification"
    cpu_workers: int = 7
    downstream_model_dir: str = './chatbot_model'
args = ARGS()

In [31]:
tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_name, #pre-trained model
    do_lower_case=False
)
tokenizer

PreTrainedTokenizer(name_or_path='beomi/kcbert-base', vocab_size=30000, model_max_len=300, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [32]:
batchEncoding = tokenizer(
    [data.text_a for data in trainDataFormatList],    
    max_length=args.max_seq_length, # 128
    padding="max_length", # 최대 길이 만큼 패딩
    truncation=True # 길이 오버시 자름 
)

In [33]:
batchEncoding.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [34]:
input_sample = {k: batchEncoding[k][0] for k in batchEncoding}

In [35]:
@dataclass
class ClassificationFeatures:
    input_ids: List[int]
    attention_mask: Optional[List[int]] = None
    token_type_ids: Optional[List[int]] = None
    label: Optional[List[int]] = None

In [36]:
feature_sample = ClassificationFeatures(**input_sample, label=trainDataFormatList[0].label)
feature_sample

ClassificationFeatures(input_ids=[2, 9703, 8225, 1023, 4123, 5137, 9718, 11514, 12397, 8877, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], label=3)

In [37]:
train_feature = []

In [38]:
for idx in range(len(trainDataFormatList)):
    input_tokenized = {k: batchEncoding[k][idx] for k in batchEncoding}
    feature = ClassificationFeatures(**input_tokenized, label=trainDataFormatList[idx].label)
    train_feature.append(feature)
train_feature[:3]

[ClassificationFeatures(input_ids=[2, 9703, 8225, 1023, 4123, 5137, 9718, 11514, 12397, 8877, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], label=3),
 ClassificationFeatures(input_ids=[2, 22598, 10304, 254, 4102, 10612, 12446, 4027, 8094, 2175, 9567, 13847, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [39]:
for idx, data in enumerate(trainDataFormatList[:3]):
    print(f"#{idx}: {data.text_a}")
    token = " /".join(tokenizer.convert_ids_to_tokens(train_feature[idx].input_ids))
    print(f"token: {token}\n========\n")

#0: 혹시 여기 떡국떡 따로 구매 가능한가요
token: [CLS] /혹시 /여기 /떡 /##국 /##떡 /따로 /구매 /가능한 /##가요 /[SEP] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD]

#1: 보증금을 걸지 않으면 계약을 하지 않으려고 해요
token: [CLS] /보증 /##금을 /걸 /##지 /않으면 /계약 /##을 /하지 /않 /##으려고 /해요 /[SEP] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD]

#2: #이름#
token: [CLS] /# /이름 /# /[SEP] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[

In [40]:
batchEncoding_for_test = tokenizer(
    [data.text_a for data in testDataFormatList],    
    max_length=args.max_seq_length, # 128
    padding="max_length", # 최대 길이 만큼 패딩
    truncation=True # 길이 오버시 자름 
)

In [41]:
test_feature = []

In [42]:
for idx in range(len(testDataFormatList)):
    input_tokenized = {k: batchEncoding_for_test[k][idx] for k in batchEncoding}
    feature = ClassificationFeatures(**input_tokenized, label=testDataFormatList[idx].label)
    test_feature.append(feature)
test_feature[:3]

[ClassificationFeatures(input_ids=[2, 2451, 3402, 4110, 750, 29286, 4047, 3402, 18368, 4040, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], label=3),
 ClassificationFeatures(input_ids=[2, 654, 25545, 2002, 8299, 10309, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [43]:
for idx, data in enumerate(testDataFormatList[:3]):
    print(f"#{idx}: {data.text_a}")
    token = " /".join(tokenizer.convert_ids_to_tokens(test_feature[idx].input_ids))
    print(f"token: {token}\n========\n")

#0: 이 향수 는 달콤한 향 이에요
token: [CLS] /이 /향 /##수 /는 /달콤 /##한 /향 /이에 /##요 /[SEP] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD]

#1: 네 결제 시같이 주세요
token: [CLS] /네 /결제 /시 /##같이 /주세요 /[SEP] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD]

#2: 감기약 은 한 번 에 1 알 씩 아침 저녁 하루 두 번 먹고 갈근탕 은 따뜻하게 데워 서 식전 에 드 세 요
token: [CLS] /감기 /##약 /은 /한 /

In [44]:
def data_collator(features):
    batch = {}
    batch["input_ids"] = torch.tensor([feature.input_ids for feature in features], dtype=torch.long)
    batch["attention_mask"] = torch.tensor([feature.attention_mask for feature in features], dtype=torch.long)
    batch["token_type_ids"] = torch.tensor([feature.token_type_ids for feature in features], dtype=torch.long)
    batch["labels"] = torch.tensor([feature.label for feature in features], dtype=torch.long)
    return batch

In [45]:
data_collator(train_feature)

{'input_ids': tensor([[    2,  9703,  8225,  ...,     0,     0,     0],
         [    2, 22598, 10304,  ...,     0,     0,     0],
         [    2,     7,  8745,  ...,     0,     0,     0],
         ...,
         [    2,  2451,  2699,  ...,     0,     0,     0],
         [    2,  3354,  8485,  ...,     0,     0,     0],
         [    2, 23832,  8374,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'labels': tensor([3, 8, 2,  ..., 8, 4, 4])}

In [46]:
trainDataLoader = DataLoader(
    train_feature,
    batch_size=args.batch_size,
    sampler=RandomSampler(train_feature, replacement=False),
    collate_fn=data_collator,
    drop_last=False,
    num_workers=args.cpu_workers
)

In [47]:
testDataLoader = DataLoader(
    test_feature,
    batch_size=args.batch_size,
    sampler=RandomSampler(test_feature, replacement=False),
    collate_fn=data_collator,
    drop_last=False,
    num_workers=args.cpu_workers
)

In [48]:
pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
    num_labels=9,
)

In [49]:
model = BertForSequenceClassification.from_pretrained(
    args.pretrained_model_name,
    config=pretrained_model_config
)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [50]:
checkpointPath = args.downstream_model_dir
os.makedirs(checkpointPath, exist_ok=True)

In [51]:
checkpoint_callback = ModelCheckpoint(
        dirpath=checkpointPath,
        save_top_k=1,
        monitor="val_loss",
        mode="min",
        filename='{epoch}-{val_loss:.2f}',
    )



In [52]:
trainer = Trainer(
    max_epochs=args.epochs,
    fast_dev_run=False,
    num_sanity_val_steps=0,
    callbacks=[checkpoint_callback],
    default_root_dir=checkpointPath,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [53]:
def accuracy(preds, labels, ignore_index=None):
    with torch.no_grad():
        assert preds.shape[0] == len(labels)
        correct = torch.sum(preds == labels)
        total = torch.sum(torch.ones_like(labels))
        if ignore_index is not None:
            # 모델이 맞춘 것 가운데 ignore index에 해당하는 것 제외
            correct -= torch.sum(torch.logical_and(preds == ignore_index, preds == labels))
            # accuracy의 분모 가운데 ignore index에 해당하는 것 제외
            total -= torch.sum(labels == ignore_index)
    return correct.to(dtype=torch.float) / total.to(dtype=torch.float)

In [54]:
class ClassificationTrainTask(LightningModule):
    def __init__(self, model: PreTrainedModel, args):
        super().__init__()
        self.model = model
        self.args = args
        
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.args.learning_rate)
        scheduler = ExponentialLR(optimizer, gamma=0.9)
        return {
            'optimizer': optimizer,
            'scheduler': scheduler
        }
    
    def training_step(self, inputs, batch_idx):
#         print('train input', inputs['labels'].size())
        print(inputs)
        outputs = self.model(**inputs)
#         print('train input', inputs['labels'].size(), outputs)
        predict = outputs.logits.argmax(dim=-1)
        labels = inputs['labels']
        acc = accuracy(predict, labels)
        self.log("loss", outputs.loss, prog_bar=False, logger=True, on_step=True, on_epoch=False)
        self.log("acc", acc, prog_bar=True, logger=True, on_step=True, on_epoch=False)
        return outputs.loss
        
    def validation_step(self, inputs, batch_idx):
#         print('inputs?', inputs)
        outputs = self.model(**inputs)
        predict = outputs.logits.argmax(dim=-1)
        labels = inputs["labels"]
        acc = accuracy(predict, labels)
        self.log("val_loss", outputs.loss, prog_bar=True, logger=True, on_step=False, on_epoch=True)
        self.log("val_acc", acc, prog_bar=True, logger=True, on_step=False, on_epoch=True)
        return outputs.loss

In [55]:
task = ClassificationTrainTask(model, args)

In [None]:
trainer.fit(
    task,
    train_dataloader=trainDataLoader,
    val_dataloaders=testDataLoader
)

In [56]:
train_feature[:1], trainDataFormatList[:1]

([ClassificationFeatures(input_ids=[2, 9703, 8225, 1023, 4123, 5137, 9718, 11514, 12397, 8877, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], label=3)],
 [ClassificationDataFormat(text_a='혹시 여기 떡국떡 따로 구매 가능한가요', label=3)])

In [74]:
dataCollactorSample = data_collator(train_feature[12:15])
dataCollactorSample

{'input_ids': tensor([[    2,  2980,  5088,  7968,  2972,  4169,  9718, 11514,  8021, 14046,
              3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0],
         [    2,  1373,  3288,  2851, 15255, 21953,    32,     3,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,   

In [75]:
outputs = model(**dataCollactorSample)
outputs

SequenceClassifierOutput(loss=tensor(2.0194, grad_fn=<NllLossBackward0>), logits=tensor([[-0.6597,  0.0737, -0.3774, -0.6632,  0.0953, -0.2092,  0.1090, -0.8112,
         -0.1802],
        [-0.5069,  0.1797, -0.2550, -0.5503,  0.3207, -0.2133,  0.1562, -0.6542,
         -0.0902],
        [-0.4643,  0.1082,  0.1898, -0.1973,  0.4369, -0.0938, -0.1061, -0.6765,
         -0.0262]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [76]:
pred = outputs.logits.argmax(dim=-1)
pred, dataCollactorSample['labels']

(tensor([6, 4, 4]), tensor([6, 8, 1]))

In [77]:
accuracy(outputs.logits.argmax(dim=-1), dataCollactorSample['labels'])

tensor(0.3333)

In [78]:
pred.shape[0], len(dataCollactorSample['labels'])

(3, 3)

In [83]:
correct = torch.sum(pred == dataCollactorSample['labels'])
correct

tensor(1)

맞은 결과들의 합

In [82]:
total = torch.sum(torch.ones_like(dataCollactorSample['labels']))
total, torch.ones_like(dataCollactorSample['labels'])

(tensor(3), tensor([1, 1, 1]))

ones_like: 텐서 형태 안에 값을 모두 1로 변경 후 

In [84]:
correct / total

tensor(0.3333)

In [85]:
outputs.loss

tensor(2.0194, grad_fn=<NllLossBackward0>)