## import modules

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import random
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

## seed 고정

In [2]:
def seed_everything(seed:int = 1004):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Load Tokenizer
Hugging Face Hub에 존재하는 Pretrained Tokenizer 불러오기

[URL] 토큰 추가

In [3]:
MODEL_NAME = 'klue/roberta-small'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [4]:
tokenizer

PreTrainedTokenizerFast(name_or_path='klue/roberta-small', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
special_tokens_dict = {
    'additional_special_tokens': ['[URL]']
}

print(tokenizer.vocab_size)

num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print(num_added_toks, tokenizer.vocab_size)

32000
1 32000


In [6]:
tokenizer

PreTrainedTokenizerFast(name_or_path='klue/roberta-small', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[URL]']})

In [7]:
print(tokenizer.tokenize("야 오버워치() 해봄? 지금 이벤트도 하는데 같이 하자!! [URL]"))
print(tokenizer("야 오버워치() 해봄? 지금 이벤트도 하는데 같이 하자!! [URL]"))
print(tokenizer.decode(tokenizer.encode("야 오버워치() 해봄? 지금 이벤트도 하는데 같이 하자!! [URL]")))

['야', '오버', '##워', '##치', '(', ')', '2', '해', '##봄', '?', '지금', '이벤트', '##도', '하', '##는데', '같이', '하자', '!', '!', '[URL]']
{'input_ids': [0, 1396, 10737, 2667, 2225, 12, 13, 22, 1897, 3064, 35, 3660, 5028, 2119, 1889, 13964, 3848, 20651, 5, 5, 32000, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] 야 오버워치 ( ) 2 해봄? 지금 이벤트도 하는데 같이 하자!! [URL] [SEP]


## Model 불러오기

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

model

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifi

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [9]:
# token embedding resize

model.resize_token_embeddings(tokenizer.vocab_size + num_added_toks)

Embedding(32001, 768)

In [10]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32001, 768)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

## 데이터 준비

In [11]:
train_dataset = pd.read_csv("/home/egg2018037024/Interlink_Project/Preprocess_Data/train.csv", encoding='cp949', index_col=0)
valid_dataset = pd.read_csv("/home/egg2018037024/Interlink_Project/Preprocess_Data/valid.csv", encoding='cp949', index_col=0)
test_dataset = pd.read_csv("/home/egg2018037024/Interlink_Project/Preprocess_Data/test.csv", encoding='cp949', index_col=0)

In [12]:
tokenized_train = tokenizer(
    list(train_dataset['Sentence']),
    return_tensors="pt",  # pytorch.Tensor로 리턴
    max_length=256, 
    padding=True,  # batch 중 가장 긴 시퀀스를 기준으로 pad 채움.
    truncation=True,  # max_length 넘어가면 버림
    add_special_tokens=True
)

tokenized_valid = tokenizer(
    list(valid_dataset['Sentence']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

tokenized_test = tokenizer(
    list(test_dataset['Sentence']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

In [13]:
print(tokenized_train['input_ids'][970])
print(tokenizer.decode(tokenized_train['input_ids'][970]))

tensor([    0,  3788,  1536,  2359, 13964,  4035,  2052,  1415,  2203,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1, 

## Dataset 클래스 정의

In [14]:
class klue_Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset, label):  # 전처리된 데이터 셋이 들어옴
        self.dataset = dataset
        self.label = label

    def __getitem__(self, idx):
        # gradient 계산에 영향을 주지 않게 clone().detach() 실행
        
        item = {key: val[idx].clone().detach() for key, val in self.dataset.items()}
        item['label'] = torch.tensor(self.label[idx])
        
        return item

    def __len__(self):  # 샘플 수
        return len(self.label)

In [15]:
train_klue_dataset = klue_Dataset(tokenized_train, train_dataset['Label'])
valid_klue_dataset = klue_Dataset(tokenized_valid, valid_dataset['Label'])
test_klue_dataset = klue_Dataset(tokenized_test, test_dataset['Label'])

print(train_klue_dataset.__len__())
print(valid_klue_dataset.__len__())
print(train_klue_dataset.__getitem__(970))
print(tokenizer.decode(train_klue_dataset.__getitem__(970)['input_ids']))

87600
10950
{'input_ids': tensor([    0,  3788,  1536,  2359, 13964,  4035,  2052,  1415,  2203,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,   

## 모델 평가 함수

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

## Trainer 정의

In [17]:
training_ars = TrainingArguments(
    output_dir='./klue_roberta_small_result',
    num_train_epochs=5,
 #   max_steps=5000,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    save_total_limit=5,
    save_strategy = "epoch",
    #save_steps=1000,
    learning_rate=5e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    load_best_model_at_end = True,
)

trainer = Trainer(
    model=model,
    args=training_ars,
    train_dataset=train_klue_dataset,
    eval_dataset=valid_klue_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 87600
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 1715


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.010227,0.996895,0.992259,0.989555,0.994977
2,0.028000,0.012208,0.99653,0.99126,0.99861,0.984018
3,0.005700,0.008621,0.998174,0.99543,0.99634,0.994521
4,0.005700,0.010101,0.997808,0.994518,0.994973,0.994064
5,0.002300,0.010473,0.998356,0.995889,0.996344,0.995434


***** Running Evaluation *****
  Num examples = 10950
  Batch size = 256
Saving model checkpoint to ./klue_roberta_small_result/checkpoint-343
Configuration saved in ./klue_roberta_small_result/checkpoint-343/config.json
Model weights saved in ./klue_roberta_small_result/checkpoint-343/pytorch_model.bin
tokenizer config file saved in ./klue_roberta_small_result/checkpoint-343/tokenizer_config.json
Special tokens file saved in ./klue_roberta_small_result/checkpoint-343/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 10950
  Batch size = 256
Saving model checkpoint to ./klue_roberta_small_result/checkpoint-686
Configuration saved in ./klue_roberta_small_result/checkpoint-686/config.json
Model weights saved in ./klue_roberta_small_result/checkpoint-686/pytorch_model.bin
tokenizer config file saved in ./klue_roberta_small_result/checkpoint-686/tokenizer_config.json
Special tokens file saved in ./klue_roberta_small_result/checkpoint-686/special_tokens_map.json
***** 

TrainOutput(global_step=1715, training_loss=0.010675710126887953, metrics={'train_runtime': 932.1822, 'train_samples_per_second': 469.865, 'train_steps_per_second': 1.84, 'total_flos': 2.9010360305664e+16, 'train_loss': 0.010675710126887953, 'epoch': 5.0})

In [19]:
model.save_pretrained('./klue_best_model')

Configuration saved in ./klue_best_model/config.json
Model weights saved in ./klue_best_model/pytorch_model.bin


## Test 셋으로 검증

In [20]:
MODEL_NAME = './klue_best_model'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

loading configuration file ./klue_best_model/config.json
Model config RobertaConfig {
  "_name_or_path": "./klue_best_model",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32001
}

loading weights file ./klue_best_model/pytorch_model.bin
All model checkpoint weights were used when in

In [22]:
print(device)

model.to(device)

cuda


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32001, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [45]:
dataloader = DataLoader(test_klue_dataset, batch_size=128, shuffle=False)

model.eval()
output_pred = []
output_prob = []
labels = []

for i, data in enumerate(tqdm(dataloader)):
    with torch.no_grad():
        outputs = model(
            input_ids=data['input_ids'].to(device),
            attention_mask=data['attention_mask'].to(device),
            token_type_ids=data['token_type_ids'].to(device)
        )
    logits = outputs[0]
    prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
    logits = logits.detach().cpu().numpy()
    result = np.argmax(logits, axis=-1)
    labels.append(data['label'].tolist())

    output_pred.append(result)
    output_prob.append(prob)

pred_answer, output_prob = np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()
print(pred_answer)
print(output_prob)

100%|███████████████████████████████████████████| 86/86 [00:12<00:00,  6.95it/s]

[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 




In [50]:
labelss = []

for i in labels:
    for j in i:
        labelss.append(j)
    
labelss

[1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [53]:
precision, recall, f1, _ = precision_recall_fscore_support(labelss, pred_answer, average='binary')

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.9967727063162748
Recall: 0.9872146118721461
F1-score: 0.9919706354668502


In [66]:
print("---------", len(labelss), "개 중 잘못 분류한 예시들----------")
print("문장번호", "정답", "예측값", sep="\t")
for i in range(len(labelss)):
    if(labelss[i] != pred_answer[i]):
        print(i, labelss[i], pred_answer[i], sep="\t")

--------- 10950 개 중 잘못 분류한 예시들----------
문장번호	정답	예측값
77	1	0
584	1	0
1233	0	1
1855	0	1
2040	1	0
2116	1	0
2723	1	0
3192	1	0
3254	1	0
4042	1	0
4155	1	0
4365	1	0
4463	1	0
4566	1	0
4782	1	0
6739	1	0
6756	1	0
6942	0	1
7033	1	0
7342	1	0
7396	1	0
7626	1	0
8569	1	0
8657	1	0
8667	1	0
8742	1	0
8891	0	1
8972	1	0
9195	1	0
9502	1	0
9579	0	1
9990	0	1
10083	1	0
10703	1	0
10862	0	1


In [105]:
test_str = ['국외발신 고객님 인증번호 해외결제 원 정상처리완료 문의']

In [106]:
test_tok = klue_Dataset(tokenizer(test_str,return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True), [1])

In [107]:
test_tok.__getitem__(0)

{'input_ids': tensor([    0, 11782,  2311,  2250,  3916,  2098,  5488,  2517,  2016,  4075,
         28605,  1478,  4073, 20111,  2365,  2071,  7421,     2]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'label': tensor(1)}

In [109]:
test_dataloader = DataLoader(test_tok, batch_size=128, shuffle=False)

model.eval()
for i, data in enumerate(tqdm(test_dataloader)):
    with torch.no_grad():
        outputs = model(
                input_ids=data['input_ids'].to(device),
                attention_mask=data['attention_mask'].to(device),
                token_type_ids=data['token_type_ids'].to(device)
            )
    logits = outputs[0]
    prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
    logits = logits.detach().cpu().numpy()
    result = np.argmax(logits, axis=-1)

print(result, prob)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 44.95it/s]

[1] [[9.9459176e-05 9.9990058e-01]]





## 스크립트로 변환

In [111]:
tokenizer

PreTrainedTokenizerFast(name_or_path='klue/roberta-small', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[URL]']})

In [113]:
tokenizer.save_pretrained('./klue_best_tok')  # 토크나이저 저장

tokenizer config file saved in ./klue_best_tok/tokenizer_config.json
Special tokens file saved in ./klue_best_tok/special_tokens_map.json


('./klue_best_tok/tokenizer_config.json',
 './klue_best_tok/special_tokens_map.json',
 './klue_best_tok/vocab.txt',
 './klue_best_tok/added_tokens.json',
 './klue_best_tok/tokenizer.json')

In [120]:
toscript_tokenizer = AutoTokenizer.from_pretrained('./klue_best_tok', torchscript=True)

loading file ./klue_best_tok/vocab.txt
loading file ./klue_best_tok/tokenizer.json
loading file ./klue_best_tok/added_tokens.json
loading file ./klue_best_tok/special_tokens_map.json
loading file ./klue_best_tok/tokenizer_config.json


In [124]:
sentences = ['국외발신 고객님 인증번호 해외결제 원 정상처리완료 문의 윮꿻 스크립트 전용 인풋']
encoded_input = toscript_tokenizer(sentences, padding=True, truncation=True, 
                                   max_length=128, return_tensors='pt', add_special_tokens=True)

# !!! tokenizer_model takes list of sentences as inputs, how should I provide tensorial dummpy inputs?
traced_tokenizer_model = torch.jit.trace(toscript_tokenizer, encoded_input)


RuntimeError: Could not get name of python class object