# 0. 들어가기에 앞서

* 고객 질문: 1, 상담원 질문: 2, 고객 및 상담원 대답: 0
* 참고: https://velog.io/@seolini43/KOBERT%EB%A1%9C-%EB%8B%A4%EC%A4%91-%EB%B6%84%EB%A5%98-%EB%AA%A8%EB%8D%B8-%EB%A7%8C%EB%93%A4%EA%B8%B0-%ED%8C%8C%EC%9D%B4%EC%8D%ACColab

* 한국어 언어모델 다양하게 사용해보기: https://littlefoxdiary.tistory.com/81

# 1. 라이브러리 로드

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
# from tqdm.notebook import tqdm
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [2]:
pip install numpy --upgrade

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so that later `.to(device)` calls still work on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
import torch
# Report whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

True

# 2. 모델, 사전, 데이터셋 불러오기

In [5]:
# Load the KoBERT model together with its vocabulary (both are used below:
# `bertmodel` as the encoder, `vocab` for the tokenizer).
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /home/adminuser/notebooks/modeling/question/[1차] KoBERT_QA_자모음전처리/.cache/kobert_v1.zip
using cached model. /home/adminuser/notebooks/modeling/question/[1차] KoBERT_QA_자모음전처리/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [6]:
import os
# List the files in the preprocessing directory to locate the input CSV.
os.listdir("../[1차] 자음모음_preprocessing")

['.ipynb_checkpoints',
 '질의응답_K쇼핑_질문유형분류_자모음전처리.csv',
 '질의응답_K쇼핑_질문분류_자모음전처리.csv',
 '1_data_processing.ipynb']

In [7]:
# Read the preprocessed question-classification CSV; the file's first column is used as the index.
df=pd.read_csv("../[1차] 자음모음_preprocessing/질의응답_K쇼핑_질문분류_자모음전처리.csv", index_col=0)

# 3. 질문분류시작

* 고객 질문: 1, 상담원 질문: 2, 고객 및 상담원 대답: 0

In [8]:
# Drop rows that contain any missing value, then preview the first two rows.
df=df.dropna()
df.head(2)

Unnamed: 0,msg,QA
0,저는 입니다.,0
1,네. 아쿠아 청소기를 샀었는데요.,1


In [9]:
# Convert the DataFrame into a nested list: one [sentence, label-as-string]
# pair per row of the "msg" and "QA" columns.
data_list = [
    [message, str(qa_label)]
    for message, qa_label in zip(df["msg"], df["QA"])
]

## 3-1. Train / Test set 분리

* 라벨링은 이미 진행했으므로, 바로 train/test 분리 진행

In [10]:
# Hold out 25% of the samples as the test set; random_state=0 makes the split repeatable.
dataset_train, dataset_test = train_test_split(data_list, test_size=0.25, random_state=0)

In [11]:
# Show how many samples ended up in each split.
print(len(dataset_train))
print(len(dataset_test))

751035
250345


## 3-2. KoBERT 입력 데이터로 만들기

* 데이터를 train data와 test data로 나누었다면 각 데이터가 KoBERT 모델의 입력으로 들어갈 수 있는 형태가 되도록 토큰화, 정수 인코딩, 패딩 등을 해주어야 한다

In [12]:
class BERTDataset(Dataset):
    """In-memory dataset of BERT-ready samples built from indexable records.

    Every record is transformed once, at construction time, with
    ``nlp.data.BERTSentenceTransform``. The transformed sentences and the
    labels (converted with ``np.int32``) are stored as two parallel lists.

    Args:
        dataset: Collection of indexable records.
        sent_idx: Position of the sentence inside each record.
        label_idx: Position of the label inside each record.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence.
        self.sentences = []
        for record in dataset:
            self.sentences.append(to_bert_input([record[sent_idx]]))

        # Second pass: collect the labels as 32-bit integers.
        self.labels = []
        for record in dataset:
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        label_part = (self.labels[i], )
        return self.sentences[i] + label_part

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

In [13]:
# Setting parameters

max_len = 64 # maximum sequence length; BERT does not learn from tokens beyond this length
batch_size = 64  # samples per DataLoader batch
warmup_ratio = 0.1  # share of the total training steps used for learning-rate warmup
num_epochs = 5  # number of passes over the training set
max_grad_norm = 1  # threshold passed to gradient-norm clipping
log_interval = 200  # print training progress every this many batches
learning_rate = 5e-5  # learning rate passed to the AdamW optimizer

In [14]:
# Tokenization: wrap the KoBERT tokenizer together with the model vocabulary
tokenizer= get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Positional arguments: sentence index 0, label index 1, tokenizer, max_len, pad=True, pair=False
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

using cached model. /home/adminuser/notebooks/modeling/question/[1차] KoBERT_QA_자모음전처리/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [15]:
# Each item is a 4-tuple:
#   1st: the padded token-id sequence
#   2nd: the valid (unpadded) length
#   3rd: the segment (token-type) ids -- this is NOT the attention mask;
#        the mask is built later from the valid length inside the classifier
#   4th: the label

data_train[0]

(array([   2, 1469,  517,   54,  994, 5778, 3990, 7993, 4398, 6903, 4680,
        7736, 4004, 4981, 5330, 5330, 2391, 3155, 6553, 2443, 6116, 4756,
        6553, 1239, 7088, 5130, 7088, 4926, 3136, 6116, 1788, 6060, 6844,
        4931, 1434, 7132, 4223, 7111, 5130, 7794, 1706, 2964,  889, 6135,
         905,  830, 6701,  517,   54,    3,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32),
 array(50, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32),
 2)

In [16]:
from transformers import AutoModel, AutoTokenizer

In [17]:
# Wrap the datasets in PyTorch DataLoaders.
# The training loader reshuffles on every epoch so the model does not see the
# batches in the same fixed order each time. Evaluation does not depend on
# order, so the test loader stays unshuffled.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

## 3-3. KoBERT 학습모델 만들기

* 고객 질문: 1, 상담원 질문: 2, 고객 및 상담원 대답: 0 
* 3가지의 class를 분류하기 때문에 num_classes는 3으로 입력

In [18]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder callable. It is invoked with the keyword arguments
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must
            return a pair whose second element is the pooled output.
        hidden_size: Width of the pooled output fed into the linear layer.
        num_classes: Number of output classes (adjust to the task).
        dr_rate: Dropout probability applied to the pooled output.
            ``None`` or ``0`` disables dropout.
        params: Unused; kept so existing call sites keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=3,   # adjust to the number of classes
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor of token ids; only its shape and device are used.
            valid_length: Per-row count of real (non-padding) tokens.

        Returns:
            Float tensor on ``token_ids``' device, 1.0 for valid positions and
            0.0 elsewhere.
        """
        # Compare each column index with the row's valid length in one
        # vectorized operation instead of slicing row by row.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch.

        Args:
            token_ids: Padded token-id tensor.
            valid_length: Per-row count of real tokens, used to build the mask.
            segment_ids: Token-type ids; cast to ``long`` before the encoder call.

        Returns:
            The output of the linear classifier applied to the (optionally
            dropped-out) pooled encoder output.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask.float().to(token_ids.device))
        # Start from the pooled output so that `out` is defined even when
        # dropout is disabled (dr_rate is None or 0).
        out = pooler
        if self.dr_rate:
            out = self.dropout(out)
        return self.classifier(out)

In [19]:
# Build the classifier on top of the loaded KoBERT encoder and move it to the target device
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

# Optimizer and schedule setup
# Parameters whose names contain any of these substrings get no weight decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Total number of optimizer steps = batches per epoch * epochs;
# the first `warmup_ratio` share of them is used as warmup
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Helper for measuring accuracy
def calc_accuracy(X,Y):
    """Return the fraction of rows in ``X`` whose highest-scoring column equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: Tensor of target class indices, one per row of ``X``.

    Returns:
        Number of matching rows divided by the number of rows, as a NumPy value.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

## 3-4. KoBERT 모델 학습시키기

In [22]:
# Train for `num_epochs` epochs; after every epoch, evaluate on the test set.
# The accuracies printed are the mean of the per-batch accuracies.
# NOTE(review): `tqdm_notebook` is deprecated in favour of `tqdm.notebook.tqdm`
# (see the warning this cell prints); switch once the import cell is updated.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every test batch.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/11735 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.1500736474990845 train acc 0.375
epoch 1 batch id 201 loss 0.7987498641014099 train acc 0.49852300995024873
epoch 1 batch id 401 loss 0.40835580229759216 train acc 0.671290523690773
epoch 1 batch id 601 loss 0.352670818567276 train acc 0.753561772046589
epoch 1 batch id 801 loss 0.2385759949684143 train acc 0.7960362047440699
epoch 1 batch id 1001 loss 0.21680225431919098 train acc 0.8220529470529471
epoch 1 batch id 1201 loss 0.22247156500816345 train acc 0.8403934221482098
epoch 1 batch id 1401 loss 0.2968156039714813 train acc 0.8540551391862955
epoch 1 batch id 1601 loss 0.19841614365577698 train acc 0.8644889912554653
epoch 1 batch id 1801 loss 0.0808035358786583 train acc 0.8729698778456413
epoch 1 batch id 2001 loss 0.12221647053956985 train acc 0.8794743253373314
epoch 1 batch id 2201 loss 0.20200812816619873 train acc 0.8847185938209905
epoch 1 batch id 2401 loss 0.19360895454883575 train acc 0.8891672740524781
epoch 1 batch id 2601 loss 0.18798321485

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/3912 [00:00<?, ?it/s]

epoch 1 test acc 0.9468634852424062


  0%|          | 0/11735 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.07345065474510193 train acc 0.984375
epoch 2 batch id 201 loss 0.1601363867521286 train acc 0.9438743781094527
epoch 2 batch id 401 loss 0.2376118302345276 train acc 0.9446695760598504
epoch 2 batch id 601 loss 0.284757524728775 train acc 0.9460014559068219
epoch 2 batch id 801 loss 0.10044948756694794 train acc 0.9454783083645443
epoch 2 batch id 1001 loss 0.1155799999833107 train acc 0.9446491008991009
epoch 2 batch id 1201 loss 0.16350264847278595 train acc 0.9442651956702748
epoch 2 batch id 1401 loss 0.14466454088687897 train acc 0.9446266059957173
epoch 2 batch id 1601 loss 0.16562150418758392 train acc 0.9448489225484072
epoch 2 batch id 1801 loss 0.05261862277984619 train acc 0.9451172959466962
epoch 2 batch id 2001 loss 0.14288562536239624 train acc 0.9452695527236382
epoch 2 batch id 2201 loss 0.1508970558643341 train acc 0.94525215810995
epoch 2 batch id 2401 loss 0.21057377755641937 train acc 0.9451270304039984
epoch 2 batch id 2601 loss 0.20999878

  0%|          | 0/3912 [00:00<?, ?it/s]

epoch 2 test acc 0.9487806631565165


  0%|          | 0/11735 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.08281860500574112 train acc 0.984375
epoch 3 batch id 201 loss 0.12958069145679474 train acc 0.9509483830845771
epoch 3 batch id 401 loss 0.22628234326839447 train acc 0.9505922693266833
epoch 3 batch id 601 loss 0.20733550190925598 train acc 0.9510191347753744
epoch 3 batch id 801 loss 0.13298141956329346 train acc 0.9509597378277154
epoch 3 batch id 1001 loss 0.08878012001514435 train acc 0.9500655594405595
epoch 3 batch id 1201 loss 0.15067940950393677 train acc 0.9494822023313905
epoch 3 batch id 1401 loss 0.12806043028831482 train acc 0.9502587437544611
epoch 3 batch id 1601 loss 0.13606153428554535 train acc 0.950577763897564
epoch 3 batch id 1801 loss 0.07490627467632294 train acc 0.9506610910605219
epoch 3 batch id 2001 loss 0.1054530143737793 train acc 0.9507824212893553
epoch 3 batch id 2201 loss 0.1346980482339859 train acc 0.9507823148568832
epoch 3 batch id 2401 loss 0.14037488400936127 train acc 0.9506976259891712
epoch 3 batch id 2601 loss 0.211

  0%|          | 0/3912 [00:00<?, ?it/s]

epoch 3 test acc 0.9483133510399522


  0%|          | 0/11735 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.0873342901468277 train acc 0.96875
epoch 4 batch id 201 loss 0.13216979801654816 train acc 0.959032960199005
epoch 4 batch id 401 loss 0.1664436161518097 train acc 0.9574501246882793
epoch 4 batch id 601 loss 0.27084821462631226 train acc 0.9576487104825291
epoch 4 batch id 801 loss 0.09592141211032867 train acc 0.9578846754057428
epoch 4 batch id 1001 loss 0.0699266716837883 train acc 0.9573083166833167
epoch 4 batch id 1201 loss 0.14445099234580994 train acc 0.9567808076602831
epoch 4 batch id 1401 loss 0.12305692583322525 train acc 0.9573741970021413
epoch 4 batch id 1601 loss 0.06420314311981201 train acc 0.9574387101811368
epoch 4 batch id 1801 loss 0.09686057269573212 train acc 0.9574715435868961
epoch 4 batch id 2001 loss 0.10195808112621307 train acc 0.9574665792103948
epoch 4 batch id 2201 loss 0.09202592819929123 train acc 0.9574057246706043
epoch 4 batch id 2401 loss 0.12231045961380005 train acc 0.9573745314452311
epoch 4 batch id 2601 loss 0.13962

  0%|          | 0/3912 [00:00<?, ?it/s]

epoch 4 test acc 0.9486528512955759


  0%|          | 0/11735 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.0710766613483429 train acc 0.96875
epoch 5 batch id 201 loss 0.08026131987571716 train acc 0.9674284825870647
epoch 5 batch id 401 loss 0.13890217244625092 train acc 0.9655548628428927
epoch 5 batch id 601 loss 0.2610434591770172 train acc 0.9655522046589018
epoch 5 batch id 801 loss 0.06589925289154053 train acc 0.9648876404494382
epoch 5 batch id 1001 loss 0.0913609191775322 train acc 0.9646291208791209
epoch 5 batch id 1201 loss 0.08098041266202927 train acc 0.9641965029142381
epoch 5 batch id 1401 loss 0.11603882908821106 train acc 0.9648130799428979
epoch 5 batch id 1601 loss 0.04840148985385895 train acc 0.964787632729544
epoch 5 batch id 1801 loss 0.10716582089662552 train acc 0.9648198917268185
epoch 5 batch id 2001 loss 0.06180320680141449 train acc 0.964923788105947
epoch 5 batch id 2201 loss 0.07981313765048981 train acc 0.964781633348478
epoch 5 batch id 2401 loss 0.10174062848091125 train acc 0.9646956997084548
epoch 5 batch id 2601 loss 0.1295975

  0%|          | 0/3912 [00:00<?, ?it/s]

epoch 5 test acc 0.9485392624008678


In [23]:
import torch
import torchvision.models as models


# Save the whole model object, including its structure (not only the weights)
torch.save(model, 'KoBERT_QA_v.0.1.1_sona.pth')

# Loading it back
# NOTE(review): the path in this example differs from the file saved above
# model = torch.load('model.pth')

## 3-5. 새로운 문장 테스트

In [24]:
# Tokenization: recreate the tokenizer that `predict` uses to prepare its input
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

def predict(predict_sentence):
    """Classify one sentence with the trained model and print the verdict.

    The sentence is wrapped in a one-item ``BERTDataset`` with a dummy label
    of ``'0'``, run through ``model`` in evaluation mode, and the class with
    the highest logit is printed as a Korean label.

    Args:
        predict_sentence: The raw sentence to classify.
    """
    # Class index -> label text.
    # 0: answer (customer or agent), 1: customer question, 2: agent question
    label_names = {0: "대답", 1: "고객 질문", 2: "상담원 질문"}

    # The label is a placeholder; it is never used for prediction.
    dataset_another = [[predict_sentence, '0']]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    # A single sample does not justify starting worker processes on every
    # call, so load it in the main process.
    loader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    model.eval()

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model(token_ids, valid_length, segment_ids)

            # Index of the largest logit for every row of the batch
            predicted = out.argmax(dim=1).tolist()
            test_eval = [label_names[idx] for idx in predicted]

            print(">> 입력하신 내용은 " + test_eval[0] + " 라고 판단됩니다.")

using cached model. /home/adminuser/notebooks/modeling/question/[1차] KoBERT_QA_자모음전처리/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [25]:
# Ask PyTorch to release GPU memory it has cached but is not currently using
torch.cuda.empty_cache()

In [None]:
# Interactive loop: classify each entered sentence until the user types "0".
while True:
    sentence = input("하고싶은 말을 입력해주세요 : ")
    # input() always returns a str, so compare against the string "0".
    # Comparing with the integer 0 is never true, which made the loop
    # impossible to exit.
    if sentence.strip() == "0":
        break
    predict(sentence)
    print("\n")

하고싶은 말을 입력해주세요 : 지금 몇시야??
>> 입력하신 내용은 고객 질문 라고 판단됩니다.


하고싶은 말을 입력해주세요 : 언제 방송 끝나냥??
>> 입력하신 내용은 고객 질문 라고 판단됩니다.


하고싶은 말을 입력해주세요 : 지금 분홍색 말씀하신 것 맞죠?
>> 입력하신 내용은 상담원 질문 라고 판단됩니다.


하고싶은 말을 입력해주세요 : 언제 가능한가요?
>> 입력하신 내용은 고객 질문 라고 판단됩니다.


하고싶은 말을 입력해주세요 : 헐렁한 것을 원하시나요?
>> 입력하신 내용은 상담원 질문 라고 판단됩니다.


