## 문서 분류기 만들기

In [1]:
import torch
from transformers import PreTrainedModel, BertTokenizer
from Korpora import Korpora
import os, csv
from dataclasses import dataclass
from typing import List, Optional

In [2]:
# IPython shell capture: run `pwd` and keep its output lines in rootDir.
rootDir = !pwd
# First captured line, i.e. the directory the notebook is running in.
rootDir[0]

'/tf/notebooks/NLP'

In [3]:
# Experiment configuration shared by the cells below.
# Directory entries are built with os.path.join (instead of hand-written "/"
# concatenation) so they follow the same convention as the later cells that
# derive the corpus and cache paths.
args = {
    "pretrained_model_name": "beomi/kcbert-base",
    "downstream_corpus_name": "nsmc",
    "downstream_model_dir": os.path.join(rootDir[0], "nsmc"),
    "batch_size": 4,
    "learning_rate": 5e-5,
    "max_seq_length": 128,
    "epochs": 3,
    "tpu_cores": 8,
    "seed": 7,
    "downstream_corpus_root_dir": os.path.join(rootDir[0], "corpus"),
    "downstream_task_name": "document-classification",
}
args

{'pretrained_model_name': 'beomi/kcbert-base',
 'downstream_corpus_name': 'nsmc',
 'downstream_model_dir': '/tf/notebooks/NLP/nsmc',
 'batch_size': 4,
 'learning_rate': 5e-05,
 'max_seq_length': 128,
 'epochs': 3,
 'tpu_cores': 8,
 'seed': 7,
 'downstream_corpus_root_dir': '/tf/notebooks/NLP/corpus',
 'downstream_task_name': 'document-classification'}

In [4]:
# Fetch the corpus named in args into the configured corpus root directory.
# NOTE(review): force_download=True presumably re-downloads the files on every
# run even when they already exist — confirm against the Korpora docs.
Korpora.fetch(
    corpus_name=args["downstream_corpus_name"],
    root_dir=args["downstream_corpus_root_dir"],
    force_download=True
)

[nsmc] download ratings_train.txt: 14.6MB [00:01, 12.3MB/s]                            
[nsmc] download ratings_test.txt: 4.90MB [00:00, 11.9MB/s]                            


In [5]:
# Load the BERT tokenizer that ships with the pre-trained checkpoint in args.
tokenizer = BertTokenizer.from_pretrained(
    args["pretrained_model_name"], # name of the pre-trained model
    do_lower_case=False  # NOTE(review): presumably keeps the original casing — confirm
)
# Display the tokenizer's configuration.
tokenizer

PreTrainedTokenizer(name_or_path='beomi/kcbert-base', vocab_size=30000, model_max_len=300, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

## 데이터 셋 구축

In [6]:
# Path of the feature cache for the train split. The file name encodes the
# tokenizer class, the maximum sequence length, the corpus and the task.
cached_features_file = os.path.join(
    args["downstream_corpus_root_dir"],
    args["downstream_corpus_name"],
    f"cached_train_{type(tokenizer).__name__}_{args['max_seq_length']}"
    f"_{args['downstream_corpus_name']}_{args['downstream_task_name']}",
)
cached_features_file

'/tf/notebooks/NLP/corpus/nsmc/cached_train_BertTokenizer_128_nsmc_document-classification'

In [7]:
# Lock file path: the cache file path with a ".lock" suffix.
lock_path = f"{cached_features_file}.lock"
lock_path

'/tf/notebooks/NLP/corpus/nsmc/cached_train_BertTokenizer_128_nsmc_document-classification.lock'

In [8]:
# Directory holding the downloaded corpus files: <corpus root>/<corpus name>.
corpus_path = os.path.join(
    args["downstream_corpus_root_dir"], args["downstream_corpus_name"]
)
corpus_path

'/tf/notebooks/NLP/corpus/nsmc'

In [9]:
# Training split of the corpus. A plain string literal is used because there
# are no placeholders to interpolate.
data_fpath = os.path.join(corpus_path, "ratings_train.txt")
data_fpath

'/tf/notebooks/NLP/corpus/nsmc/ratings_train.txt'

In [10]:
# Read the whole tab-separated ratings file into memory as a list of rows.
# The context manager closes the file handle as soon as parsing finishes, and
# newline="" lets the csv module handle line endings itself, as the csv
# documentation recommends for files passed to csv.reader.
with open(data_fpath, "r", encoding="utf-8", newline="") as corpus_file:
    lines = list(csv.reader(corpus_file, delimiter="\t", quotechar='"'))
# Peek at the header row and the first few records.
lines[:5]

[['id', 'document', 'label'],
 ['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0'],
 ['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'],
 ['10265843', '너무재밓었다그래서보는것을추천한다', '0'],
 ['9045019', '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0']]

In [11]:
@dataclass
class ClassificationExample:
    """One raw (untokenized) classification example."""

    # Main text of the example (required).
    text_a: str
    # Optional second text; the cells in this notebook always pass None.
    text_b: Optional[str] = None
    # Raw label exactly as read from the corpus file (a string), if any.
    label: Optional[str] = None

In [12]:
# Build a single example from the first data row (row 0 is the header);
# text_b is left at its default of None.
sample = ClassificationExample(
    text_a=lines[1][1],
    label=lines[1][2],
)
sample

ClassificationExample(text_a='아 더빙.. 진짜 짜증나네요 목소리', text_b=None, label='0')

In [13]:
# Accumulator for the parsed training examples; filled by the next cell.
trainRawData = []

In [14]:
# Turn every data row into a ClassificationExample. lines[0] is the header
# row, so it is left out; the id column of each row is ignored.
trainRawData.extend(
    ClassificationExample(text_a=text, text_b=None, label=label)
    for _, text, label in lines[1:]
)
trainRawData[:5]

[ClassificationExample(text_a='아 더빙.. 진짜 짜증나네요 목소리', text_b=None, label='0'),
 ClassificationExample(text_a='흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', text_b=None, label='1'),
 ClassificationExample(text_a='너무재밓었다그래서보는것을추천한다', text_b=None, label='0'),
 ClassificationExample(text_a='교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', text_b=None, label='0'),
 ClassificationExample(text_a='사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', text_b=None, label='1')]

In [15]:
# Label values as they appear in the corpus' label column (kept as strings).
labels = ['0', '1']

In [16]:
# Raw (string) label of every training example, in corpus order.
trainDataLabelList = [example.label for example in trainRawData]
trainDataLabelList[:5]

['0', '1', '0', '0', '1']

## Batch encoding

In [30]:
# Tokenize every training text in a single call.
batchEncoding = tokenizer(
    [data.text_a for data in trainRawData],    
    max_length=args["max_seq_length"], # 128
    padding="max_length", # pad every sequence up to max_length
    truncation=True # cut off sequences longer than max_length
)

In [31]:
# Inspect which fields the tokenizer returned.
batchEncoding.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

## Tokenizing

In [32]:
# Slice the first example out of every field of the batch encoding.
input_test = {name: values[0] for name, values in batchEncoding.items()}

In [33]:
@dataclass
class ClassificationFeatures:
    """Tokenizer output for a single example together with its label."""

    # Token ids of the encoded text (required).
    input_ids: List[int]
    # Optional attention mask, parallel to input_ids.
    attention_mask: Optional[List[int]] = None
    # Optional token type ids, parallel to input_ids.
    token_type_ids: Optional[List[int]] = None
    # Optional integer label of the example.
    label: Optional[int] = None

In [34]:
# Build one feature object from the first encoded example.
# The raw label is a string ('0' / '1'); it is converted to its integer index
# in `labels` so the stored value matches ClassificationFeatures.label, which
# is declared as Optional[int].
feature_test = ClassificationFeatures(
    **input_test,
    label=labels.index(trainDataLabelList[0]),
)
feature_test

ClassificationFeatures(input_ids=[2, 2170, 832, 5045, 17, 17, 7992, 29734, 4040, 10720, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [35]:
# Accumulator for the per-example ClassificationFeatures; populated below.
features = []

In [37]:
# Map each raw string label to its integer index in `labels`, so the features
# carry int labels as ClassificationFeatures.label (Optional[int]) declares.
label_to_id = {label_name: label_id for label_id, label_name in enumerate(labels)}

# Pair the idx-th slice of every tokenizer field with the idx-th label.
for idx, raw_label in enumerate(trainDataLabelList):
    inputs = {name: values[idx] for name, values in batchEncoding.items()}
    features.append(ClassificationFeatures(**inputs, label=label_to_id[raw_label]))
features[:2]

[ClassificationFeatures(input_ids=[2, 2170, 832, 5045, 17, 17, 7992, 29734, 4040, 10720, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], token_type_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [43]:
# Show the first three reviews next to their tokenization (padding included).
for idx, data in enumerate(trainRawData[:3]):
    print(f"#{idx}: {data.text_a}")
    # Convert the stored ids back to word pieces for display.
    pieces = tokenizer.convert_ids_to_tokens(features[idx].input_ids)
    token = " /".join(pieces)
    print(f"token: {token}\n========\n")

#0: 아 더빙.. 진짜 짜증나네요 목소리
token: [CLS] /아 /더 /##빙 /. /. /진짜 /짜증나네 /##요 /목소리 /[SEP] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD] /[PAD]

#1: 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
token: [CLS] /흠 /. /. /. /포 /##스터 /##보고 /초딩 /##영화 /##줄 /. /. 