## import modules

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import random
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

## seed 고정

In [2]:
def seed_everything(seed:int = 1004):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Load Tokenizer
Hugging Face Hub에 존재하는 Pretrained Tokenizer 불러오기

[URL] 토큰 추가

In [26]:
MODEL_NAME = 'klue/roberta-small'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [27]:
tokenizer

PreTrainedTokenizerFast(name_or_path='klue/roberta-small', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [28]:
special_tokens_dict = {
    'additional_special_tokens': ['[URL]']
}

print(tokenizer.vocab_size)

num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print(num_added_toks, tokenizer.vocab_size)

32000
1 32000


In [29]:
tokenizer

PreTrainedTokenizerFast(name_or_path='klue/roberta-small', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[URL]']})

In [38]:
print(tokenizer.tokenize("야 오버워치()2 해봄? 지금 이벤트도 하는데 같이 하자!! [URL]"))
print(tokenizer("야 오버워치()2 해봄? 지금 이벤트도 하는데 같이 하자!! [URL]"))
print(tokenizer.decode(tokenizer.encode("야 오버워치()2 해봄? 지금 이벤트도 하는데 같이 하자!! [URL]")))

['야', '오버', '##워', '##치', '(', ')', '2', '해', '##봄', '?', '지금', '이벤트', '##도', '하', '##는데', '같이', '하자', '!', '!', '[URL]']
{'input_ids': [0, 1396, 10737, 2667, 2225, 12, 13, 22, 1897, 3064, 35, 3660, 5028, 2119, 1889, 13964, 3848, 20651, 5, 5, 32000, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] 야 오버워치 ( ) 2 해봄? 지금 이벤트도 하는데 같이 하자!! [URL] [SEP]


## Model 불러오기

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

model

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'class

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [31]:
# token embedding resize

model.resize_token_embeddings(tokenizer.vocab_size + num_added_toks)

Embedding(32001, 768)

In [32]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32001, 768)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

## 데이터 준비

In [45]:
train_dataset = pd.read_csv("/home/egg2018037024/Interlink_Project/Preprocess_Data/train.csv", encoding='cp949', index_col=0)
valid_dataset = pd.read_csv("/home/egg2018037024/Interlink_Project/Preprocess_Data/valid.csv", encoding='cp949', index_col=0)
test_dataset = pd.read_csv("/home/egg2018037024/Interlink_Project/Preprocess_Data/test.csv", encoding='cp949', index_col=0)

In [46]:
tokenized_train = tokenizer(
    list(train_dataset['Sentence']),
    return_tensors="pt",  # pytorch.Tensor로 리턴
    max_length=256, 
    padding=True,  # batch 중 가장 긴 시퀀스를 기준으로 pad 채움.
    truncation=True,  # max_length 넘어가면 버림
    add_special_tokens=True
)

tokenized_valid = tokenizer(
    list(valid_dataset['Sentence']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

tokenized_test = tokenizer(
    list(test_dataset['Sentence']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

In [53]:
print(tokenized_train['input_ids'][970])
print(tokenizer.decode(tokenized_train['input_ids'][970]))

tensor([    0,  3788,  1536,  2359, 13964,  4035,  2052,  1415,  2203,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 