# 0. 들어가기 앞서

* 'AS':0, '주문':1, '배송':2, '업무처리':3, '교환':4, '반품':5, '결제':6
* 참고: https://velog.io/@seolini43/KOBERT%EB%A1%9C-%EB%8B%A4%EC%A4%91-%EB%B6%84%EB%A5%98-%EB%AA%A8%EB%8D%B8-%EB%A7%8C%EB%93%A4%EA%B8%B0-%ED%8C%8C%EC%9D%B4%EC%8D%ACColab

* 한국어언어모델 다양하게 사용해보기 : https://littlefoxdiary.tistory.com/81

# 1. 라이브러리 로드

In [1]:
!pip install numpy --upgrade



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
# from tqdm.notebook import tqdm

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup



RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

In [3]:
##GPU 사용 시
device = torch.device("cuda:0")

In [4]:
import torch
torch.cuda.is_available()

True

# 2. 모델, 사전, 데이터셋 불러오기

In [5]:
# pip install numpy --upgrade

In [6]:
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /home/adminuser/notebooks/modeling/question/[2차] KoBERT_cat_이응전처리/.cache/kobert_v1.zip
using cached model. /home/adminuser/notebooks/modeling/question/[2차] KoBERT_cat_이응전처리/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [7]:
import os
os.listdir("../[2차] 이응_preprocessing")

['질의응답_K쇼핑_질문유형분류_이응전처리.csv',
 '.ipynb_checkpoints',
 '질의응답_K쇼핑_질문분류_이응전처리.csv',
 '2_data_processing.ipynb']

In [8]:
df=pd.read_csv("../[2차] 이응_preprocessing/질의응답_K쇼핑_질문유형분류_이응전처리.csv", index_col=0)

# 3. 질문유형분류 시작

* 'AS':0, '주문':1, '배송':2, '업무처리':3, '교환':4, '반품':5, '결제':6

In [9]:
df= df.dropna()
df.head(2)

Unnamed: 0,msg,cat
1,네. 아쿠아 청소기를 샀었는데요.,0
3,네. 얼마 전에도 전화 한 번 드렸던 적이 있어요.,0


In [10]:
# 2중 리스트로 변환됨

data_list = []
for q, label in zip(df["msg"],df["cat"])  :
    data = []
    data.append(q)
    data.append(str(label))

    data_list.append(data)

## 3-1. Train / Test set 분리

* 라벨링은 이미 진행했으므로, 바로 train/ test 분리 진행

In [11]:
dataset_train, dataset_test = train_test_split(data_list, test_size=0.25, random_state=0)

In [12]:
print(len(dataset_train))
print(len(dataset_test))

70761
23587


## 3-2. KoBERT 입력 데이터로 만들기

* 데이터를 train data와 test data로 나누었다면 각 데이터가 KoBERT 모델의 입력으로 들어갈 수 있는 형태가 되도록 토큰화, 정수 인코딩, 패딩 등을 해주어야 한다

In [13]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [14]:
# Setting parameters

max_len = 64 # 해당 길이를 초과하는 단어에 대해선 bert가 학습하지 않음
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

In [15]:
# 토큰화
tokenizer= get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

using cached model. /home/adminuser/notebooks/modeling/question/[2차] KoBERT_cat_이응전처리/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [16]:
# 첫 번째는 패딩된 시퀀스
# 두 번째는 길이와 타입에 대한 내용
# 세 번재는 어텐션 마스크 시퀀스

data_train[0]

(array([   2, 4213, 6607, 6116, 1574, 4971, 4128, 5330, 2493, 5920, 6060,
        5859, 1772, 5591, 6999,  633,    3,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32),
 array(17, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32),
 2)

In [17]:
from transformers import AutoModel, AutoTokenizer

In [18]:
# pytorch용 DataLoader 사용(torch 형식의 dataset을 만들어주기)
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

## 3-3. KoBERT 학습모델 만들기

* 질문유형: 'AS':0, '주문':1, '배송':2, '업무처리':3, '교환':4, '반품':5, '결제':6

* 7가지의 class를 분류하기 때문에 num_classes는 7으로 입력

In [19]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=7,   ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [20]:
#BERT 모델 불러오기
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

#optimizer와 schedule 설정
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

#정확도 측정을 위한 함수 정의
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

## 3-4. KoBERT 모델 학습시키기

In [21]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/1106 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.0826523303985596 train acc 0.140625
epoch 1 batch id 201 loss 1.7024240493774414 train acc 0.22737873134328357
epoch 1 batch id 401 loss 1.6031405925750732 train acc 0.2925498753117207
epoch 1 batch id 601 loss 1.6143873929977417 train acc 0.32729305324459235
epoch 1 batch id 801 loss 1.6281630992889404 train acc 0.34790496254681647
epoch 1 batch id 1001 loss 1.3357239961624146 train acc 0.3625124875124875
epoch 1 train acc 0.36879665516252813


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/369 [00:00<?, ?it/s]

epoch 1 test acc 0.422471447928765


  0%|          | 0/1106 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.7044872045516968 train acc 0.328125
epoch 2 batch id 201 loss 1.5365853309631348 train acc 0.40282960199004975
epoch 2 batch id 601 loss 1.4649910926818848 train acc 0.42749064059900166
epoch 2 batch id 801 loss 1.4705092906951904 train acc 0.43416432584269665
epoch 2 batch id 1001 loss 1.192931890487671 train acc 0.43921703296703296
epoch 2 train acc 0.4417175715608874


  0%|          | 0/369 [00:00<?, ?it/s]

epoch 2 test acc 0.4410532810685249


  0%|          | 0/1106 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.6318950653076172 train acc 0.328125
epoch 3 batch id 201 loss 1.4184070825576782 train acc 0.45732276119402987
epoch 3 batch id 401 loss 1.4705522060394287 train acc 0.46372350374064836


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



KeyboardInterrupt: 

In [None]:
import torch
import torchvision.models as models


#모델의 형태를 포함하여 저장하기
torch.save(model, 'KoBERT_cat_v.0.2.1_sona.pth')

#불러오기
# model = torch.load('model.pth')

## 3-5.새로운 문장 테스트

In [10]:
#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

def predict(predict_sentence):

    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)


        test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()
            # 'AS':0, '주문':1, '배송':2, '업무처리':3, '교환':4, '반품':5, '결제':6

            if np.argmax(logits) == 0:
                test_eval.append("AS")
            elif np.argmax(logits) == 1:
                test_eval.append("주문")
            elif np.argmax(logits) == 2:
                test_eval.append("배송")
            elif np.argmax(logits) == 3:
                test_eval.append("업무처리")
            elif np.argmax(logits) == 4:
                test_eval.append("교환")
            elif np.argmax(logits) == 5:
                test_eval.append("반품")
            elif np.argmax(logits) == 6:
                test_eval.append("결제")

        print(">> 입력하신 질문은 " + test_eval[0] + "유형이라고 판단됩니다.")

using cached model. /home/adminuser/notebooks/modeling/question/[1차] KoBERT_cat_자모음전처리/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [18]:
# #모델의 형태를 포함하여 저장하기
# torch.save(model, 'KoBERT_cat_v.0.0.1_sona.pth')

#불러오기
model = torch.load('KoBERT_cat_v.0.1.1_sona.pth')

In [None]:
end = 1
while end == 1 :
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == 0 :
        break
    predict(sentence)
    print("\n")

하고싶은 말을 입력해주세요 : 라지는 몇 cm인가요?
>> 입력하신 질문은 주문유형이라고 판단됩니다.


하고싶은 말을 입력해주세요 : 1개 가격은 얼마인가요?
>> 입력하신 질문은 주문유형이라고 판단됩니다.


하고싶은 말을 입력해주세요 : 환불하면 언제 다시 입금되나요?
>> 입력하신 질문은 업무처리유형이라고 판단됩니다.


하고싶은 말을 입력해주세요 : 옷 색깔은 몇 가지 인가요?
>> 입력하신 질문은 주문유형이라고 판단됩니다.


하고싶은 말을 입력해주세요 : 모바일로도 살 수 잇나요?
>> 입력하신 질문은 주문유형이라고 판단됩니다.


하고싶은 말을 입력해주세요 : 반품 가능한가요?
>> 입력하신 질문은 반품유형이라고 판단됩니다.




> restart 후 필요한 것
1. device 변수
2. model 로드
3. 라이브러리패키지 로드
4. class BERTClassifier
5. bertmodel, vocab = get_pytorch_kobert_model() 등등
==> 다시한번 확인하기!