## import modules

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import random
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

## seed 고정

In [2]:
def seed_everything(seed:int = 1004):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch
    (CPU plus every CUDA device), exports ``PYTHONHASHSEED``, and
    configures cuDNN for deterministic execution.

    Args:
        seed: Value used to seed every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Exported for child processes; the running interpreter reads this
    # variable only at startup, so its own str hashing is unaffected.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    # Cover multi-GPU runs as well, not just the current device.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark mode auto-tunes and may select different kernels from run
    # to run, which defeats reproducibility; it must be off here.
    torch.backends.cudnn.benchmark = False  # type: ignore

# Fix all RNG seeds up front so every later stochastic step is repeatable.
seed_everything(seed=42)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


## Load Tokenizer
Hugging Face Hub에 있는 사전학습(pretrained) tokenizer를 불러옵니다.

그다음 `[URL]`이 여러 조각으로 쪼개지지 않고 하나의 토큰으로 유지되도록 special token으로 추가합니다.

In [8]:
# Hugging Face Hub identifier of the pretrained checkpoint used in this notebook.
MODEL_NAME = 'klue/roberta-small'

# Load the tokenizer that ships with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)

In [9]:
# Display the loaded tokenizer's repr (vocab size, special tokens) as a baseline.
tokenizer

PreTrainedTokenizerFast(name_or_path='klue/roberta-small', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [11]:
# Register '[URL]' as an additional special token on the tokenizer.
special_tokens_dict = dict(additional_special_tokens=['[URL]'])
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Number of tokens that were actually new to the tokenizer.
# NOTE(review): any model paired with this tokenizer presumably needs
# `resize_token_embeddings(len(tokenizer))` afterwards — confirm downstream.
num_added_toks

1

In [12]:
# Display the repr again to confirm '[URL]' is listed under additional_special_tokens.
tokenizer

PreTrainedTokenizerFast(name_or_path='klue/roberta-small', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[URL]']})

In [15]:
# Sanity check: tokenize a sample sentence ending in the '[URL]' token
# to see whether it survives as one piece.
tokenizer.tokenize(
    text="야 오버워치()2 해봄? 지금 이벤트도 하는데 같이 하자!! [URL]",
)

['야',
 '오버',
 '##워',
 '##치',
 '(',
 ')',
 '2',
 '해',
 '##봄',
 '?',
 '지금',
 '이벤트',
 '##도',
 '하',
 '##는데',
 '같이',
 '하자',
 '!',
 '!',
 '[URL]']