# 0. 들어가기 앞서

* 고객 질문: 1, 상담원 질문: 2, 고객 및 상담원 대답: 0

# 1. 라이브러리 로드

In [1]:
import sys
sys.version

'3.8.5 (default, Sep  4 2020, 07:30:14) \n[GCC 7.3.0]'

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup ##

In [5]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 2. 모델, 사전, 데이터셋 불러오기

In [6]:
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /home/adminuser/notebooks/modeling/question/[0차] KoBERT_QA_baseline/.cache/kobert_v1.zip
using cached model. /home/adminuser/notebooks/modeling/question/[0차] KoBERT_QA_baseline/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [7]:
import os
os.listdir("../[0차] 원본_preprocessing/")

['.ipynb_checkpoints',
 '질의응답_K쇼핑_질문유형분류_원본.csv',
 '[0차] 원본데이터_preprocessing.ipynb',
 '질의응답_K쇼핑_질문분류_원본.csv']

In [8]:
# Load the raw question-classification CSV, using its first column as the index.
df=pd.read_csv("../[0차] 원본_preprocessing/질의응답_K쇼핑_질문분류_원본.csv", index_col=0)

  mask |= (ar1 == a)


# 3. 질문 분류 시작

* 고객 질문: 1, 상담원 질문: 2, 고객 및 상담원 대답: 0

In [9]:
# Convert the dataframe into a nested list: one [message, label-as-string]
# pair per row, taken from the "msg" and "QA" columns.
data_list = [
    [message, str(qa_label)]
    for message, qa_label in zip(df["msg"], df["QA"])
]

## 3-1. Train / Test set 분리

* 라벨링은 이미 진행했으므로, 바로 train / test 분리 진행

In [10]:
# Hold out 25% of the samples as the test set; random_state=0 fixes the split.
dataset_train, dataset_test = train_test_split(data_list, test_size=0.25, random_state=0)

In [11]:
print(len(dataset_train))
print(len(dataset_test))

753924
251309


## 3-2. KoBERT 입력 데이터로 만들기

* 데이터를 train data와 test data로 나누었다면 각 데이터가 KoBERT 모델의 입력으로 들어갈 수 있는 형태가 되도록 토큰화, 정수 인코딩, 패딩 등을 해주어야 한다

In [12]:
class BERTDataset(Dataset):
    """Dataset of BERT-transformed sentences paired with integer labels.

    Every row of ``dataset`` is indexed with ``sent_idx`` to obtain the
    sentence and with ``label_idx`` to obtain the label. All sentences are
    transformed once, at construction time, with
    ``nlp.data.BERTSentenceTransform``; labels are converted to ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        # max_len, pad and pair are forwarded unchanged to the transform.
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # Two separate passes over ``dataset``: sentences first, then labels.
        self.sentences = [to_bert_input([row[sent_idx]]) for row in dataset]
        self.labels = [np.int32(row[label_idx]) for row in dataset]

    def __getitem__(self, i):
        """Return the transformed sentence fields followed by the label."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

In [13]:
# Setting parameters

max_len = 32 # max sequence length handed to the BERT transform; text beyond it is not seen by BERT (presumably truncated — TODO confirm)
batch_size = 32  # samples per DataLoader batch
warmup_ratio = 0.1  # fraction of the total training steps used for LR warm-up
num_epochs = 5  # number of passes over the training set
max_grad_norm = 1  # gradient-norm clipping threshold used in the training loop
log_interval = 200  # print training loss/accuracy every this many batches
learning_rate = 5e-5  # learning rate passed to AdamW

In [14]:
# Tokenization: build the SentencePiece-based BERT tokenizer from the KoBERT
# tokenizer model and vocabulary (lower=False keeps the original casing).
tokenizer= get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Column 0 of each row is the sentence and column 1 the label;
# pad=True and pair=False are forwarded to the BERT sentence transform.
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

using cached model. /home/adminuser/notebooks/modeling/question/[0차] KoBERT_QA_baseline/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [15]:
# Inspect one training sample. Matching how the training loop unpacks a batch,
# the fields are:
#   1) token ids (pad=True is requested, so presumably padded up to max_len),
#   2) valid length,
#   3) segment (token type) ids,
#   4) the label.

data_train[0]

(array([   2, 2847, 4103, 5130,  793, 5925,  517,   54, 2926, 6141, 6050,
        2822, 5330, 7287,  517, 7707, 7494,  517, 7710, 7753, 6664,  517,
        6539, 5931, 3647, 6314, 2650, 6749, 6964, 4227, 1767,    3],
       dtype=int32),
 array(32, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 2)

In [16]:
from transformers import AutoModel, AutoTokenizer

In [17]:
# Wrap the datasets in PyTorch DataLoaders for batched iteration.
# NOTE(review): the training loader does not pass shuffle=True, so batches are
# visited in the same order every epoch — confirm this is intended.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

## 3-3. KoBERT 학습모델 만들기

* 고객 질문: 1, 상담원 질문: 2, 고객 및 상담원 대답: 0 
* 3가지의 class를 분류하기 때문에 num_classes는 3으로 입력

In [18]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: Encoder module called as ``bert(input_ids=..., token_type_ids=...,
            attention_mask=...)``. Its result is unpacked as a 2-tuple whose
            second item is used as the pooled sentence representation.
        hidden_size: Width of the pooled output fed to the classifier.
        num_classes: Number of output classes (here 3 — 0: answer,
            1: customer question, 2: agent question).
        dr_rate: Dropout probability applied to the pooled output. ``None``
            or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=3,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask that is 1 for the first ``valid_length[i]`` positions of row i.

        Args:
            token_ids: 2-D tensor of token ids (one row per sample).
            valid_length: Per-row count of real (non-padding) tokens.

        Returns:
            Float tensor shaped like ``token_ids``, on the same device.
        """
        # Compare every position index against the row's length in a single
        # vectorized operation instead of filling the mask row by row.
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device)
        return (positions.unsqueeze(0) < lengths.unsqueeze(1)).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classification logits for a batch.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row count of real tokens, used for the mask.
            segment_ids: Segment ids passed to the encoder as ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask)
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left undefined in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [19]:
# Instantiate the BERT classifier (dropout 0.5) and move it to the target device
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

# Set up the optimizer and the learning-rate schedule.
# Parameters whose names contain 'bias' or 'LayerNorm.weight' get no weight
# decay; every other parameter uses a weight decay of 0.01.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total number of scheduler steps: batches per epoch times the number of epochs.
t_total = len(train_dataloader) * num_epochs
# Number of those steps spent warming the learning rate up.
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Helper for measuring accuracy
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: Tensor of target class indices, one per row of ``X``.

    Returns:
        The number of matching rows divided by the number of rows, as a
        NumPy scalar.
    """
    predicted = X.argmax(dim=1)
    n_correct = predicted.eq(Y).sum().data.cpu().numpy()
    n_rows = predicted.size(0)
    return n_correct / n_rows



## 3-4. KoBERT 모델 학습시키기

In [23]:
# Notebook progress bar; `tqdm.tqdm_notebook` is deprecated in favour of this.
from tqdm.notebook import tqdm as notebook_tqdm

# Fine-tune for num_epochs epochs, evaluating on the test set after each one.
for e in range(num_epochs):
    # Running sums of per-batch accuracy; divided by the batch count when reported.
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is passed through as-is; the model builds its attention mask from it.
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model.eval()
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook_tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/23561 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.6713322997093201 train acc 0.8125
epoch 1 batch id 201 loss 0.37738046050071716 train acc 0.8468594527363185
epoch 1 batch id 401 loss 0.22280533611774445 train acc 0.8641677057356608
epoch 1 batch id 601 loss 0.33898356556892395 train acc 0.8824875207986689
epoch 1 batch id 801 loss 0.23097948729991913 train acc 0.8920099875156055
epoch 1 batch id 1001 loss 0.167324960231781 train acc 0.8983204295704296
epoch 1 batch id 1201 loss 0.08900411427021027 train acc 0.9041944213155704
epoch 1 batch id 1401 loss 0.12853601574897766 train acc 0.908168272662384
epoch 1 batch id 1601 loss 0.23934578895568848 train acc 0.9110126483447845
epoch 1 batch id 1801 loss 0.27416399121284485 train acc 0.9137978900610771
epoch 1 batch id 2001 loss 0.18008917570114136 train acc 0.916104447776112
epoch 1 batch id 2201 loss 0.1914428323507309 train acc 0.9173671058609723
epoch 1 batch id 2401 loss 0.13977640867233276 train acc 0.9189139941690962
epoch 1 batch id 2601 loss 0.20029728

epoch 1 batch id 21601 loss 0.059612151235342026 train acc 0.9372728693116059
epoch 1 batch id 21801 loss 0.10626745223999023 train acc 0.9373236892803083
epoch 1 batch id 22001 loss 0.032149143517017365 train acc 0.9373863688014181
epoch 1 batch id 22201 loss 0.416692852973938 train acc 0.937436658258637
epoch 1 batch id 22401 loss 0.3622431457042694 train acc 0.9374665193518147
epoch 1 batch id 22601 loss 0.3928498923778534 train acc 0.9375221229149153
epoch 1 batch id 22801 loss 0.19935035705566406 train acc 0.9375849743432305
epoch 1 batch id 23001 loss 0.06947287172079086 train acc 0.9376290704751967
epoch 1 batch id 23201 loss 0.1982797086238861 train acc 0.9376589371147795
epoch 1 batch id 23401 loss 0.1719861626625061 train acc 0.9376976411264476
epoch 1 train acc 0.9377612898433852


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/7854 [00:00<?, ?it/s]

epoch 1 test acc 0.944120830150242


  0%|          | 0/23561 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.15985627472400665 train acc 0.96875
epoch 2 batch id 201 loss 0.1864076405763626 train acc 0.9345460199004975
epoch 2 batch id 401 loss 0.19038473069667816 train acc 0.9390586034912718
epoch 2 batch id 601 loss 0.12435223907232285 train acc 0.9407757903494176
epoch 2 batch id 801 loss 0.12288222461938858 train acc 0.9406601123595506
epoch 2 batch id 1001 loss 0.3048648536205292 train acc 0.939935064935065
epoch 2 batch id 1201 loss 0.03089788928627968 train acc 0.9411427976686095


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 2 batch id 3201 loss 0.30379876494407654 train acc 0.9412488284910965
epoch 2 batch id 3401 loss 0.14878815412521362 train acc 0.9415613054983828
epoch 2 batch id 3601 loss 0.5793598890304565 train acc 0.9416221188558733
epoch 2 batch id 3801 loss 0.20120035111904144 train acc 0.9417834122599316
epoch 2 batch id 4001 loss 0.3725069463253021 train acc 0.9417255061234692
epoch 2 batch id 4201 loss 0.1440453827381134 train acc 0.9416879909545346
epoch 2 batch id 4401 loss 0.05173569172620773 train acc 0.9419521131561008


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 2 batch id 5001 loss 0.22692067921161652 train acc 0.9422615476904619
epoch 2 batch id 5201 loss 0.23618099093437195 train acc 0.9421144972120746
epoch 2 batch id 5401 loss 0.13292382657527924 train acc 0.9423254952786521
epoch 2 batch id 5601 loss 0.2737630009651184 train acc 0.9422815122299589
epoch 2 batch id 5801 loss 0.1959785372018814 train acc 0.9419173418376142
epoch 2 batch id 6001 loss 0.11611108481884003 train acc 0.9416191051491418
epoch 2 batch id 6201 loss 0.05464968830347061 train acc 0.9414761731978714
epoch 2 batch id 6401 loss 0.045884281396865845 train acc 0.9415521012341822
epoch 2 batch id 6601 loss 0.08790092915296555 train acc 0.9413819875776398
epoch 2 batch id 6801 loss 0.17504191398620605 train acc 0.9413872959858844
epoch 2 batch id 7001 loss 0.25396066904067993 train acc 0.9414503285244965
epoch 2 batch id 7201 loss 0.16198013722896576 train acc 0.9414838216914317
epoch 2 batch id 7401 loss 0.09790629148483276 train acc 0.9414732806377517
epoch 2 batch

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 3 batch id 2001 loss 0.2143349051475525 train acc 0.9449650174912544
epoch 3 batch id 2201 loss 0.14813576638698578 train acc 0.9445706497046797
epoch 3 batch id 2401 loss 0.12956920266151428 train acc 0.9442159516867972
epoch 3 batch id 2601 loss 0.11064151674509048 train acc 0.9439758746635909
epoch 3 batch id 2801 loss 0.11601313948631287 train acc 0.9445622099250268
epoch 3 batch id 3001 loss 0.06880909949541092 train acc 0.9443102299233589
epoch 3 batch id 3201 loss 0.2748487889766693 train acc 0.944148313027179


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 3 batch id 5801 loss 0.24595405161380768 train acc 0.9447886140320635
epoch 3 batch id 6001 loss 0.08718191087245941 train acc 0.9449518830194967
epoch 3 batch id 6201 loss 0.017170244827866554 train acc 0.9447568940493469
epoch 3 batch id 6401 loss 0.04788504168391228 train acc 0.9448279565692861
epoch 3 batch id 6601 loss 0.17266497015953064 train acc 0.9448994470534767
epoch 3 batch id 6801 loss 0.18458138406276703 train acc 0.9449575430083811
epoch 3 batch id 7001 loss 0.23212362825870514 train acc 0.9450435652049707


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 3 batch id 9201 loss 0.3744829297065735 train acc 0.9447818171937833
epoch 3 batch id 9401 loss 0.14768704771995544 train acc 0.9447731624295288
epoch 3 batch id 9601 loss 0.22535528242588043 train acc 0.9448625143214249
epoch 3 batch id 9801 loss 0.08479530364274979 train acc 0.9449131466176921
epoch 3 batch id 10001 loss 0.18875668942928314 train acc 0.9449711278872113
epoch 3 batch id 10201 loss 0.0954781100153923 train acc 0.9450053916282717
epoch 3 batch id 10401 loss 0.12948673963546753 train acc 0.9450293241034516
epoch 3 batch id 10601 loss 0.22202101349830627 train acc 0.9450936232430903


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 3 batch id 13001 loss 0.15491464734077454 train acc 0.9451917160218445
epoch 3 batch id 13201 loss 0.05287919566035271 train acc 0.9452977047193395
epoch 3 batch id 13401 loss 0.36613574624061584 train acc 0.9453608872472203
epoch 3 batch id 13601 loss 0.28229233622550964 train acc 0.9453785567237704
epoch 3 batch id 13801 loss 0.045480385422706604 train acc 0.9453504275052532
epoch 3 batch id 14001 loss 0.23382218182086945 train acc 0.9453766695236054
epoch 3 batch id 14201 loss 0.10070531070232391 train acc 0.945296546017886
epoch 3 batch id 14401 loss 0.19735458493232727 train acc 0.9453466773140754


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 3 batch id 17001 loss 0.2882872521877289 train acc 0.9459645756131992
epoch 3 batch id 17201 loss 0.4161304533481598 train acc 0.9459297424568339
epoch 3 batch id 17401 loss 0.12423323094844818 train acc 0.9459837078328832
epoch 3 batch id 17601 loss 0.27512112259864807 train acc 0.9460328958581898
epoch 3 batch id 17801 loss 0.1268053650856018 train acc 0.9460177799000056
epoch 3 batch id 18001 loss 0.13461896777153015 train acc 0.9461193128159546
epoch 3 batch id 18201 loss 0.0945521742105484 train acc 0.9461207488599528


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 3 batch id 20601 loss 0.08127056062221527 train acc 0.9465393063443522
epoch 3 batch id 20801 loss 0.04378838464617729 train acc 0.9465169943752704
epoch 3 batch id 21001 loss 0.3311086595058441 train acc 0.9465605804485501
epoch 3 batch id 21201 loss 0.21061034500598907 train acc 0.9465237488797699
epoch 3 batch id 21401 loss 0.14313600957393646 train acc 0.9465182701742909
epoch 3 batch id 21601 loss 0.16924962401390076 train acc 0.9464738322299894
epoch 3 batch id 21801 loss 0.15525983273983002 train acc 0.9464918467042797
epoch 3 batch id 22001 loss 0.0576753243803978 train acc 0.9465152152174902


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 4 batch id 1801 loss 0.1961064487695694 train acc 0.9496980843975569
epoch 4 batch id 2001 loss 0.16780635714530945 train acc 0.9498844327836082
epoch 4 batch id 2201 loss 0.06627018004655838 train acc 0.9493838028169014
epoch 4 batch id 2401 loss 0.12716104090213776 train acc 0.9495132236568097
epoch 4 batch id 2601 loss 0.07987946271896362 train acc 0.9493584198385236
epoch 4 batch id 2801 loss 0.08761551231145859 train acc 0.9500847911460193
epoch 4 batch id 3001 loss 0.1874416321516037 train acc 0.9499125291569477
epoch 4 batch id 3201 loss 0.21166186034679413 train acc 0.9495763042799126


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 4 batch id 5601 loss 0.27470266819000244 train acc 0.950226522049634
epoch 4 batch id 5801 loss 0.1992512196302414 train acc 0.9502133252887434
epoch 4 batch id 6001 loss 0.0600331574678421 train acc 0.9503416097317113
epoch 4 batch id 6201 loss 0.05012010410428047 train acc 0.9503457103692953
epoch 4 batch id 6401 loss 0.03076469898223877 train acc 0.9503788470551476
epoch 4 batch id 6601 loss 0.17995457351207733 train acc 0.9504573170731707
epoch 4 batch id 6801 loss 0.17818957567214966 train acc 0.9504117041611527


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 4 batch id 9201 loss 0.33789899945259094 train acc 0.9503111074883165
epoch 4 batch id 9401 loss 0.0954081192612648 train acc 0.9502845441974258
epoch 4 batch id 9601 loss 0.13500037789344788 train acc 0.9503339495885845
epoch 4 batch id 9801 loss 0.0646088495850563 train acc 0.9504259769411284
epoch 4 batch id 10001 loss 0.20118360221385956 train acc 0.9504955754424558
epoch 4 batch id 10201 loss 0.08677928894758224 train acc 0.9505287471816488
epoch 4 batch id 10401 loss 0.1927189826965332 train acc 0.9505305980194212


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 4 batch id 12401 loss 0.0755591168999672 train acc 0.9505231432948956
epoch 4 batch id 12601 loss 0.30917489528656006 train acc 0.9505495595587652
epoch 4 batch id 12801 loss 0.2822939157485962 train acc 0.9505605030856964
epoch 4 batch id 13001 loss 0.14268004894256592 train acc 0.9506312014460426
epoch 4 batch id 13201 loss 0.0771171823143959 train acc 0.9507092265737445
epoch 4 batch id 13401 loss 0.5010039210319519 train acc 0.9507499440340273
epoch 4 batch id 13601 loss 0.40826714038848877 train acc 0.9507848687596501


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 4 batch id 16201 loss 0.17366282641887665 train acc 0.9509366705758904
epoch 4 batch id 16401 loss 0.13102759420871735 train acc 0.9509881257240412
epoch 4 batch id 16601 loss 0.03997732326388359 train acc 0.95102704656346
epoch 4 batch id 16801 loss 0.022381676360964775 train acc 0.9510278406047259
epoch 4 batch id 17001 loss 0.3116569221019745 train acc 0.9510433209811188
epoch 4 batch id 17201 loss 0.43769174814224243 train acc 0.951000305214813
epoch 4 batch id 17401 loss 0.0829300582408905 train acc 0.9510265214642837


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 4 batch id 19801 loss 0.02327548712491989 train acc 0.9513282157466795
epoch 4 batch id 20001 loss 0.14396169781684875 train acc 0.9513102469876507
epoch 4 batch id 20201 loss 0.06856479495763779 train acc 0.9513328548091678
epoch 4 batch id 20401 loss 0.15637752413749695 train acc 0.9513963776285477
epoch 4 batch id 20601 loss 0.07719969004392624 train acc 0.9514359133051794
epoch 4 batch id 20801 loss 0.08621272444725037 train acc 0.9514356280948031
epoch 4 batch id 21001 loss 0.18107454478740692 train acc 0.9514680848531022
epoch 4 batch id 21201 loss 0.2336631566286087 train acc 0.9514512876751097


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 4 train acc 0.9516295467085438


  0%|          | 0/7854 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 5 batch id 1001 loss 0.12903693318367004 train acc 0.9520791708291708
epoch 5 batch id 1201 loss 0.020033176988363266 train acc 0.9526436303080766
epoch 5 batch id 1401 loss 0.08010727912187576 train acc 0.9529800142755175
epoch 5 batch id 1601 loss 0.28645768761634827 train acc 0.953408026233604
epoch 5 batch id 1801 loss 0.1449361890554428 train acc 0.9539838978345364
epoch 5 batch id 2001 loss 0.22656212747097015 train acc 0.9542572463768116
epoch 5 batch id 2201 loss 0.06285908073186874 train acc 0.9538987960018174
epoch 5 batch id 2401 loss 0.06870408356189728 train acc 0.953990524781341


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 5 batch id 4401 loss 0.020313037559390068 train acc 0.9544421722335833
epoch 5 batch id 4601 loss 0.2161734253168106 train acc 0.9545750923712236
epoch 5 batch id 4801 loss 0.06827736645936966 train acc 0.9545472297438033
epoch 5 batch id 5001 loss 0.06870435178279877 train acc 0.9546090781843631
epoch 5 batch id 5201 loss 0.1827896684408188 train acc 0.9545159584695251
epoch 5 batch id 5401 loss 0.03722337633371353 train acc 0.9548116089613035
epoch 5 batch id 5601 loss 0.2754688858985901 train acc 0.9546621139082306


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 5 batch id 8001 loss 0.19949588179588318 train acc 0.9543338332708411
epoch 5 batch id 8201 loss 0.12416625767946243 train acc 0.9543996159004999
epoch 5 batch id 8401 loss 0.04310927167534828 train acc 0.9543320735626711
epoch 5 batch id 8601 loss 0.07814836502075195 train acc 0.9543476049296593
epoch 5 batch id 8801 loss 0.010730432346463203 train acc 0.9543482274741507
epoch 5 batch id 9001 loss 0.21970294415950775 train acc 0.9543488223530718
epoch 5 batch id 9201 loss 0.25325170159339905 train acc 0.9544376969894577
epoch 5 batch id 9401 loss 0.0997912809252739 train acc 0.9544297149239442


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 5 batch id 11801 loss 0.039614077657461166 train acc 0.9545721760867724
epoch 5 batch id 12001 loss 0.0361827127635479 train acc 0.9544959795017082
epoch 5 batch id 12201 loss 0.04513010010123253 train acc 0.9545529054995492
epoch 5 batch id 12401 loss 0.09025018662214279 train acc 0.9546004354487542
epoch 5 batch id 12601 loss 0.1327977180480957 train acc 0.9546414967066106
epoch 5 batch id 12801 loss 0.22040139138698578 train acc 0.9546666276072182
epoch 5 batch id 13001 loss 0.13071748614311218 train acc 0.9547126182601339


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 5 batch id 15201 loss 0.15757040679454803 train acc 0.9549865140451286
epoch 5 batch id 15401 loss 0.02415342442691326 train acc 0.9549765437309266
epoch 5 batch id 15601 loss 0.13506273925304413 train acc 0.954978847509775
epoch 5 batch id 15801 loss 0.030794866383075714 train acc 0.9550265805961649
epoch 5 batch id 16001 loss 0.1990462988615036 train acc 0.9550672614211612
epoch 5 batch id 16201 loss 0.05403732880949974 train acc 0.9550683599777792
epoch 5 batch id 16401 loss 0.1415528655052185 train acc 0.9550922962014511


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 5 batch id 18801 loss 0.21627788245677948 train acc 0.9552051752566353
epoch 5 batch id 19001 loss 0.285936564207077 train acc 0.9552276853849797
epoch 5 batch id 19201 loss 0.1019584983587265 train acc 0.955264374251341
epoch 5 batch id 19401 loss 0.3171822726726532 train acc 0.9553599041286531
epoch 5 batch id 19601 loss 0.23122118413448334 train acc 0.9553610147441457
epoch 5 batch id 19801 loss 0.02075735665857792 train acc 0.9553447426897631
epoch 5 batch id 20001 loss 0.0886542797088623 train acc 0.9553319209039548


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 5 batch id 22401 loss 0.19471345841884613 train acc 0.9553856412660149
epoch 5 batch id 22601 loss 0.14114578068256378 train acc 0.9554444493606478
epoch 5 batch id 22801 loss 0.21647141873836517 train acc 0.9554720735932635
epoch 5 batch id 23001 loss 0.0890384167432785 train acc 0.9554924242424242
epoch 5 batch id 23201 loss 0.13988149166107178 train acc 0.955494914012327
epoch 5 batch id 23401 loss 0.05011158064007759 train acc 0.9555173924191274
epoch 5 train acc 0.955555526081236


  0%|          | 0/7854 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



epoch 5 test acc 0.9491222625413802


In [22]:
torch.cuda.empty_cache()

In [30]:
import torch
import torchvision.models as models


#모델의 형태를 포함하여 저장하기
# torch.save(model, 'KoBERT_QA_v.0.0.2_sona.pth')
# torch.save(model.state_dict(), "./KoBERT_QA_v.0.0.2_sona.pt")

## 3-5. 새로운 문장 테스트

In [49]:
# Load the fine-tuned classifier weights for inference.
# Fall back to the CPU when no GPU is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: model_pt wraps the same `bertmodel` object that `model` was built from,
# so loading this checkpoint also replaces the encoder weights seen via `model`.
model_pt = BERTClassifier(bertmodel,  dr_rate=0.5)
# map_location lets a checkpoint saved on a GPU be loaded onto whichever
# device is in use here.
model_pt.load_state_dict(torch.load('KoBERT_QA_v.0.0.2_sona.pt', map_location=device))
model_pt.to(device)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [62]:
# Tokenization
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

def predict(predict_sentence):
    """Classify one sentence with ``model_pt`` and print the predicted class.

    Class indices: 0 = answer (customer or agent), 1 = customer question,
    2 = agent question.

    Args:
        predict_sentence: The raw sentence to classify.

    Returns:
        The predicted class index as an ``int`` (it is also printed).
    """
    # The dataset requires a label column; '0' is a placeholder that is ignored.
    dataset_another = [[predict_sentence, '0']]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    # A single sentence does not justify spawning worker processes.
    loader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model_pt.eval()

    predicted = None
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model_pt(token_ids, valid_length, segment_ids)

            for logits in out:
                predicted = int(np.argmax(logits.cpu().numpy()))
                print(predicted)

    return predicted

using cached model. /home/adminuser/notebooks/modeling/question/[0차] KoBERT_QA_baseline/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [63]:
# Interactive loop: classify each typed sentence until the user enters "0".
while True:
    sentence = input("하고싶은 말을 입력해주세요 : ")
    # input() always returns a string, so compare against "0" rather than the
    # integer 0 (which could never match and made the loop impossible to exit).
    if sentence.strip() == "0":
        break
    predict(sentence)
    print("\n")

하고싶은 말을 입력해주세요 : 이렇게 말하면 몇이 나올까
1


하고싶은 말을 입력해주세요 : 일단은 돌려돌려를 해볼까나
1


하고싶은 말을 입력해주세요 : 몇시까지 하나요
0




KeyboardInterrupt: Interrupted by user