## 문서 분류기 만들기

In [1]:
import torch
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import PreTrainedModel, BertTokenizer, BertConfig, BertForSequenceClassification
from transformers.optimization import AdamW
from Korpora import Korpora
import os, csv
from dataclasses import dataclass
from typing import List, Optional
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import ExponentialLR

In [2]:
# IPython shell escape: captures the output lines of `pwd`; element 0 is the
# notebook's working directory, used below as the root for all paths.
rootDir = !pwd
rootDir[0]

'/tf/notebooks/NLP'

In [3]:
# Hyper-parameters and filesystem locations for the fine-tuning run.
# Both directories are rooted at the notebook's working directory.
args = dict(
    pretrained_model_name="beomi/kcbert-base",
    downstream_corpus_name="nsmc",
    downstream_model_dir=f"{rootDir[0]}/nsmc",
    batch_size=4,
    learning_rate=5e-5,
    max_seq_length=128,
    epochs=3,
    tpu_cores=8,
    seed=7,
    downstream_corpus_root_dir=f"{rootDir[0]}/corpus",
    downstream_task_name="document-classification",
    cpu_workers=5,
)
args

{'pretrained_model_name': 'beomi/kcbert-base',
 'downstream_corpus_name': 'nsmc',
 'downstream_model_dir': '/tf/notebooks/NLP/nsmc',
 'batch_size': 4,
 'learning_rate': 5e-05,
 'max_seq_length': 128,
 'epochs': 3,
 'tpu_cores': 8,
 'seed': 7,
 'downstream_corpus_root_dir': '/tf/notebooks/NLP/corpus',
 'downstream_task_name': 'document-classification',
 'cpu_workers': 5}

In [4]:
# Fetch the corpus named in args into the corpus root directory.
# NOTE(review): force_download=True presumably re-downloads the files on every
# run of this cell — confirm, and consider False once the corpus is cached.
Korpora.fetch(
    corpus_name=args["downstream_corpus_name"],
    root_dir=args["downstream_corpus_root_dir"],
    force_download=True
)

[nsmc] download ratings_train.txt: 14.6MB [00:01, 13.2MB/s]                            
[nsmc] download ratings_test.txt: 4.90MB [00:00, 11.3MB/s]                            


In [5]:
# Load the tokenizer that belongs to the pre-trained checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    args["pretrained_model_name"], # name of the pre-trained checkpoint
    do_lower_case=False
)
tokenizer

PreTrainedTokenizer(name_or_path='beomi/kcbert-base', vocab_size=30000, model_max_len=300, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

## 데이터 셋 구축

In [6]:
# Path of the feature cache for the training split. The file name encodes the
# split, tokenizer class, max sequence length, corpus name and task name.
cached_features_file = os.path.join(
    args["downstream_corpus_root_dir"],
    args["downstream_corpus_name"],
    f"cached_train_{tokenizer.__class__.__name__}_{args['max_seq_length']}"
    f"_{args['downstream_corpus_name']}_{args['downstream_task_name']}",
)
cached_features_file

'/tf/notebooks/NLP/corpus/nsmc/cached_train_BertTokenizer_128_nsmc_document-classification'

In [7]:
lock_path = cached_features_file + ".lock"
lock_path

'/tf/notebooks/NLP/corpus/nsmc/cached_train_BertTokenizer_128_nsmc_document-classification.lock'

In [8]:
# Directory that holds the downloaded corpus files.
corpus_path = os.path.join(args["downstream_corpus_root_dir"], args["downstream_corpus_name"])
corpus_path

'/tf/notebooks/NLP/corpus/nsmc'

In [9]:
# Training split of the corpus (plain literal: there is nothing to interpolate).
data_fpath = os.path.join(corpus_path, "ratings_train.txt")
data_fpath

'/tf/notebooks/NLP/corpus/nsmc/ratings_train.txt'

In [10]:
# Read the tab-separated training file into a list of rows; row 0 is the
# header (id, document, label).
# The context manager closes the file handle as soon as the rows are read
# (the bare open() call never closed it). newline="" is the mode the csv
# module asks for so that it can handle line endings itself.
with open(data_fpath, "r", encoding="utf-8", newline="") as train_file:
    lines = list(csv.reader(train_file, delimiter="\t", quotechar='"'))
lines[:5]

[['id', 'document', 'label'],
 ['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0'],
 ['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'],
 ['10265843', '너무재밓었다그래서보는것을추천한다', '0'],
 ['9045019', '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0']]

In [11]:
@dataclass
class ClassificationExample:
    """One raw text-classification example, before tokenization."""

    text_a: str  # the text to classify
    text_b: Optional[str] = None  # optional second text; always None in this notebook
    label: Optional[str] = None  # label exactly as read from the corpus file (a string such as '0' or '1')

In [12]:
sample = ClassificationExample(text_a=lines[1][1], text_b=None, label=lines[1][2])
sample #NsmcCorpus get_examples

ClassificationExample(text_a='아 더빙.. 진짜 짜증나네요 목소리', text_b=None, label='0')

In [13]:
trainRawData = []

In [14]:
# Wrap every data row in a ClassificationExample and add it to the training
# examples. lines[0] is the header row, so it is sliced off; the id column is
# not needed.
trainRawData.extend(
    ClassificationExample(text_a=document, text_b=None, label=rating)
    for _row_id, document, rating in lines[1:]
)
trainRawData[:5]

[ClassificationExample(text_a='아 더빙.. 진짜 짜증나네요 목소리', text_b=None, label='0'),
 ClassificationExample(text_a='흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', text_b=None, label='1'),
 ClassificationExample(text_a='너무재밓었다그래서보는것을추천한다', text_b=None, label='0'),
 ClassificationExample(text_a='교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', text_b=None, label='0'),
 ClassificationExample(text_a='사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', text_b=None, label='1')]

In [15]:
labels = ['0', '1']

In [16]:
trainDataLabelList = [data.label for data in trainRawData]
trainDataLabelList[:5]

['0', '1', '0', '0', '1']

## Batch encoding

In [17]:
# Tokenize every training text in a single call. With max-length padding and
# truncation enabled, every encoded sequence has the same length.
batchEncoding = tokenizer(
    [data.text_a for data in trainRawData],
    max_length=args["max_seq_length"], # 128
    padding="max_length", # pad up to the maximum length
    truncation=True # cut off anything beyond the maximum length
)

In [18]:
batchEncoding.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

## Tokenizing

In [19]:
input_test = {k: batchEncoding[k][0] for k in batchEncoding}

In [20]:
@dataclass
class ClassificationFeatures:
    """Tokenizer output for one example together with its class label."""

    input_ids: List[int]  # token ids produced by the tokenizer
    attention_mask: Optional[List[int]] = None  # 1 for real tokens, 0 for padding positions
    token_type_ids: Optional[List[int]] = None  # segment ids returned by the tokenizer
    label: Optional[int] = None  # class index

In [21]:
# Build a single feature record for the first training example.
# The raw label is a string; it is converted to int so that it matches the
# declared type of ClassificationFeatures.label and the int(...) conversion
# applied when the full feature list is built.
feature_test = ClassificationFeatures(**input_test, label=int(trainDataLabelList[0]))
feature_test

ClassificationFeatures(input_ids=[2, 2170, 832, 5045, 17, 17, 7992, 29734, 4040, 10720, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [22]:
features = []

In [23]:
# For each training example, pick its row out of every tokenizer output column
# and pair it with the integer label.
features.extend(
    ClassificationFeatures(
        **{field: column[idx] for field, column in batchEncoding.items()},
        label=int(trainDataLabelList[idx]),
    )
    for idx in range(len(trainRawData))
)
features[:2]

[ClassificationFeatures(input_ids=[2, 2170, 832, 5045, 17, 17, 7992, 29734, 4040, 10720, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [24]:
for idx, data in enumerate(trainRawData[:3]):
    print(f"#{idx}: {data.text_a}")
    token = " /".join(tokenizer.convert_ids_to_tokens(features[idx].input_ids))
    print(f"token: {token}\n========\n")

#0: 아 더빙.. 진짜 짜증나네요 목소리
token: [CLS] /아 /더 /##빙 /. /. /진짜 /짜증나네 /##요 /목소리 /[SEP] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD]

#1: 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
token: [CLS] /흠 /. /. /. /포 /##스터 /##보고 /초딩 /##영화 /##줄 /. /. 

# Data loader 

In [25]:
# data_collator
isinstance(trainRawData[0], dict), isinstance(trainRawData[0].label, torch.Tensor), isinstance(trainRawData[0].label, int)

(False, False, False)

In [26]:
batch = {}

In [27]:
torch.tensor([int(trainRawData[0].label)], dtype=torch.float)

tensor([0.])

In [28]:
batch = torch.tensor([int(trainData.label) for trainData in trainRawData], dtype=torch.float)
batch

tensor([0., 1., 0.,  ..., 0., 1., 0.])

In [29]:
torch.tensor([features[0].input_ids], dtype=torch.float)

tensor([[2.0000e+00, 2.1700e+03, 8.3200e+02, 5.0450e+03, 1.7000e+01, 1.7000e+01,
         7.9920e+03, 2.9734e+04, 4.0400e+03, 1.0720e+04, 3.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e

In [30]:
features[0]

ClassificationFeatures(input_ids=[2, 2170, 832, 5045, 17, 17, 7992, 29734, 4040, 10720, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [31]:
torch.tensor(features[0].input_ids, dtype=torch.float)

tensor([2.0000e+00, 2.1700e+03, 8.3200e+02, 5.0450e+03, 1.7000e+01, 1.7000e+01,
        7.9920e+03, 2.9734e+04, 4.0400e+03, 1.0720e+04, 3.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+

In [32]:
def data_collator(features):
    """Collate feature records into one batch of ``torch.long`` tensors.

    Args:
        features: sequence of objects exposing ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``label``.

    Returns:
        dict with keys ``input_ids``, ``attention_mask``, ``token_type_ids``
        and ``labels``. The first three stack the per-example lists row by
        row; ``labels`` holds ``int(label)`` for every example.
    """
    def _as_long(rows):
        return torch.tensor(rows, dtype=torch.long)

    return {
        "input_ids": _as_long([item.input_ids for item in features]),
        "attention_mask": _as_long([item.attention_mask for item in features]),
        "token_type_ids": _as_long([item.token_type_ids for item in features]),
        "labels": _as_long([int(item.label) for item in features]),
    }

In [33]:
data_collator(features)

{'input_ids': tensor([[    2,  2170,   832,  ...,     0,     0,     0],
         [    2,  3521,    17,  ...,     0,     0,     0],
         [    2,  8069,  4089,  ...,     0,     0,     0],
         ...,
         [    2,  8098,  1427,  ...,     0,     0,     0],
         [    2, 23061,  9376,  ...,     0,     0,     0],
         [    2,  7997,  9376,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'labels': tensor([0, 1, 0,  ..., 0, 1, 0])}

In [34]:
# Training loader: examples are drawn in random order without replacement and
# assembled into tensor batches by data_collator.
trainDataLoader = DataLoader(
    features,
    batch_size=args['batch_size'],
    sampler=RandomSampler(features, replacement=False),
    collate_fn=data_collator,
    drop_last=False,  # keep the final, possibly smaller, batch
    num_workers=args['cpu_workers']
)

In [35]:
trainDataLoader.collate_fn(features)

{'input_ids': tensor([[    2,  2170,   832,  ...,     0,     0,     0],
         [    2,  3521,    17,  ...,     0,     0,     0],
         [    2,  8069,  4089,  ...,     0,     0,     0],
         ...,
         [    2,  8098,  1427,  ...,     0,     0,     0],
         [    2, 23061,  9376,  ...,     0,     0,     0],
         [    2,  7997,  9376,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'labels': tensor([0, 1, 0,  ..., 0, 1, 0])}

In [36]:
collateData = trainDataLoader.collate_fn(features)
collateData['input_ids'].size(), \
collateData['attention_mask'].size(), \
collateData['token_type_ids'].size(), \
collateData['labels'].size()

(torch.Size([150000, 128]),
 torch.Size([150000, 128]),
 torch.Size([150000, 128]),
 torch.Size([150000]))

In [37]:
trainDataLoader.dataset[:2]

[ClassificationFeatures(input_ids=[2, 2170, 832, 5045, 17, 17, 7992, 29734, 4040, 10720, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# Test data loader

In [38]:
# Test split of the corpus (plain literal: there is nothing to interpolate).
test_data_fpath = os.path.join(corpus_path, "ratings_test.txt")
test_data_fpath

'/tf/notebooks/NLP/corpus/nsmc/ratings_test.txt'

In [39]:
# Read the tab-separated test file into a list of rows; row 0 is the header
# (id, document, label).
# The context manager closes the file handle as soon as the rows are read
# (the bare open() call never closed it). newline="" is the mode the csv
# module asks for so that it can handle line endings itself.
with open(test_data_fpath, "r", encoding="utf-8", newline="") as test_file:
    lines_test = list(csv.reader(test_file, delimiter="\t", quotechar='"'))
lines_test[:5]

[['id', 'document', 'label'],
 ['6270596', '굳 ㅋ', '1'],
 ['9274899', 'GDNTOPCLASSINTHECLUB', '0'],
 ['8544678', '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', '0'],
 ['6825595', '지루하지는 않은데 완전 막장임... 돈주고 보기에는....', '0']]

In [40]:
testRawData = []

In [41]:
# Wrap every data row in a ClassificationExample and add it to the test
# examples. lines_test[0] is the header row, so it is sliced off; the id
# column is not needed.
testRawData.extend(
    ClassificationExample(text_a=document, text_b=None, label=rating)
    for _row_id, document, rating in lines_test[1:]
)
testRawData[:5]

[ClassificationExample(text_a='굳 ㅋ', text_b=None, label='1'),
 ClassificationExample(text_a='GDNTOPCLASSINTHECLUB', text_b=None, label='0'),
 ClassificationExample(text_a='뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', text_b=None, label='0'),
 ClassificationExample(text_a='지루하지는 않은데 완전 막장임... 돈주고 보기에는....', text_b=None, label='0'),
 ClassificationExample(text_a='3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??', text_b=None, label='0')]

In [42]:
testDataLabelList = [data.label for data in testRawData]
testDataLabelList[:5]

['1', '0', '0', '0', '0']

In [43]:
# Tokenize every test text in a single call, with the same padding and
# truncation settings as the training split.
batchEncoding_test = tokenizer(
    [data.text_a for data in testRawData],
    max_length=args["max_seq_length"], # 128
    padding="max_length", # pad up to the maximum length
    truncation=True # cut off anything beyond the maximum length
)

In [44]:
batchEncoding_test.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [45]:
features_testData = []

In [46]:
# For each test example, pick its row out of every tokenizer output column and
# pair it with the integer label.
features_testData.extend(
    ClassificationFeatures(
        **{field: column[idx] for field, column in batchEncoding_test.items()},
        label=int(testDataLabelList[idx]),
    )
    for idx in range(len(testRawData))
)
features_testData[:2]

[ClassificationFeatures(input_ids=[2, 352, 192, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [47]:
# Evaluation loader, passed to the trainer as the validation set.
# Shuffling serves no purpose for evaluation (and Lightning warns about
# shuffled validation loaders), so the examples are read in order with
# SequentialSampler instead of RandomSampler.
testDataLoader = DataLoader(
    features_testData,
    batch_size=args['batch_size'],
    sampler=SequentialSampler(features_testData),
    collate_fn=data_collator,
    drop_last=False,  # keep the final, possibly smaller, batch
    num_workers=args['cpu_workers']
)

In [48]:
testDataLoader.dataset[:2]

[ClassificationFeatures(input_ids=[2, 352, 192, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [49]:
collateData = testDataLoader.collate_fn(features_testData)
collateData['input_ids'].size(), \
collateData['attention_mask'].size(), \
collateData['token_type_ids'].size(), \
collateData['labels'].size()

(torch.Size([50000, 128]),
 torch.Size([50000, 128]),
 torch.Size([50000, 128]),
 torch.Size([50000]))

# Load pre-trained model

In [50]:
# Configuration of the pre-trained checkpoint with a classification head sized
# to this task. The class count is derived from the `labels` list defined
# earlier rather than repeated as a literal, so the two cannot drift apart.
pretrained_model_config = BertConfig.from_pretrained(
    args["pretrained_model_name"],
    num_labels=len(labels),
)

In [51]:
# Load the pre-trained checkpoint into a sequence-classification model that is
# configured by pretrained_model_config.
# NOTE(review): the load log mentions weights that were not initialized from
# the checkpoint — presumably the new classification head; confirm.
model = BertForSequenceClassification.from_pretrained(
    args["pretrained_model_name"],
    config=pretrained_model_config
)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

# Setup trainer

In [52]:
checkpointPath = args["downstream_model_dir"]
os.makedirs(checkpointPath, exist_ok=True)

In [53]:
# Keep only the single best checkpoint, judged by the lowest logged "val_loss".
checkpoint_callback = ModelCheckpoint(
    dirpath=checkpointPath,
    filename='{epoch}-{val_loss:.2f}',
    monitor="val_loss",
    mode="min",
    save_top_k=1,
)



In [54]:
# Trainer that runs for args["epochs"] epochs, uses checkpointPath as its root
# directory and saves checkpoints through checkpoint_callback.
trainer = Trainer(
    max_epochs=args["epochs"],
    fast_dev_run=False,
    num_sanity_val_steps=0,  # no validation sanity-check batches before training starts
    callbacks=[checkpoint_callback],
    default_root_dir=checkpointPath,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


## Training

- Fine-tune the pre-trained BERT model `beomi/kcbert-base`.

In [56]:
def accuracy(preds, labels, ignore_index=None):
    """Return the fraction of positions where ``preds`` equals ``labels``.

    Args:
        preds: tensor of predicted class indices.
        labels: tensor of gold class indices; its length must equal
            ``preds.shape[0]``.
        ignore_index: optional class index to leave out of the score.

    Returns:
        A float tensor holding ``correct / total``.
    """
    with torch.no_grad():
        assert preds.shape[0] == len(labels)
        hits = preds == labels
        n_correct = hits.sum()
        n_total = torch.ones_like(labels).sum()
        if ignore_index is not None:
            # Drop correct predictions whose class is the ignored index.
            n_correct = n_correct - (hits & (preds == ignore_index)).sum()
            # Drop ignored labels from the denominator as well.
            n_total = n_total - (labels == ignore_index).sum()
    return n_correct.to(dtype=torch.float) / n_total.to(dtype=torch.float)

In [57]:
class ClassificationTrainTask(LightningModule):
    """Lightning module that fine-tunes a sequence-classification model.

    Args:
        model: model invoked as ``model(**batch)``; its output must expose
            ``loss`` and ``logits``.
        args: mapping of hyper-parameters; ``args['learning_rate']`` is the
            only entry read by this class.
    """

    def __init__(self, model: PreTrainedModel, args):
        super().__init__()
        self.model = model
        self.args = args

    def configure_optimizers(self):
        """Return AdamW together with an exponential LR decay (gamma=0.9).

        The scheduler must be returned under the ``lr_scheduler`` key.
        Lightning does not recognize a ``scheduler`` key, so with that name
        the decay was never applied.
        """
        optimizer = AdamW(self.parameters(), lr=self.args['learning_rate'])
        scheduler = ExponentialLR(optimizer, gamma=0.9)
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
        }

    def _loss_and_accuracy(self, inputs):
        """Run the model on one batch and return ``(loss, accuracy)``."""
        outputs = self.model(**inputs)
        predict = outputs.logits.argmax(dim=-1)
        acc = accuracy(predict, inputs['labels'])
        return outputs.loss, acc

    def training_step(self, inputs, batch_idx):
        """Log per-step ``loss`` and ``acc``; return the loss for backprop."""
        loss, acc = self._loss_and_accuracy(inputs)
        self.log("loss", loss, prog_bar=False, logger=True, on_step=True, on_epoch=False)
        self.log("acc", acc, prog_bar=True, logger=True, on_step=True, on_epoch=False)
        return loss

    def validation_step(self, inputs, batch_idx):
        """Log epoch-level ``val_loss`` and ``val_acc``; return the loss."""
        loss, acc = self._loss_and_accuracy(inputs)
        self.log("val_loss", loss, prog_bar=True, logger=True, on_step=False, on_epoch=True)
        self.log("val_acc", acc, prog_bar=True, logger=True, on_step=False, on_epoch=True)
        return loss

In [58]:
task = ClassificationTrainTask(model, args)

In [None]:
# Start fine-tuning, validating on the test split.
# The training loader is passed positionally: its keyword was renamed from
# `train_dataloader` to `train_dataloaders` across PyTorch Lightning releases,
# while the positional slot is the same in both.
trainer.fit(
    task,
    trainDataLoader,
    val_dataloaders=testDataLoader
)


  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 108 M 
--------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.680   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]