# .py 코드식

In [6]:
# 필요라이브러리 임포트
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup 
import re
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the rest of the script still runs on machines without a usable GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the pretrained KoBERT model together with its vocabulary.
bertmodel, vocab = get_pytorch_kobert_model()

class BERTClassifier(nn.Module):
    """Sentence classifier made of a BERT encoder plus one linear layer.

    The pooled output of ``bert`` is optionally passed through dropout and
    then through a linear layer that produces ``num_classes`` logits.

    Args:
        bert: Encoder module. It is called with the keyword arguments
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must
            return a 2-tuple whose second element is the pooled output.
        hidden_size: Size of the pooled output fed to the linear layer.
        num_classes: Number of output classes (adjust to the task).
        dr_rate: Dropout probability. Dropout is skipped when this is falsy
            (``None`` or ``0``).
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=3,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask that is 1.0 on the valid tokens of each row.

        Args:
            token_ids: 2-D tensor indexed as ``[row, position]``.
            valid_length: One length per row (tensor or sequence of ints).
                Row ``i`` gets ones on its first ``valid_length[i]``
                positions and zeros elsewhere.

        Returns:
            Float tensor with the same shape and device as ``token_ids``.
        """
        target = token_ids.device
        positions = torch.arange(token_ids.size(1), device=target)
        # valid_length may live on another device (e.g. CPU) or be a plain
        # list, so normalize it to a column vector on the token_ids device.
        lengths = torch.as_tensor(valid_length, device=target).reshape(-1, 1)
        # A single broadcast comparison replaces the per-row Python loop.
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classification logits for a batch.

        Args:
            token_ids: Token-id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row count of non-padding tokens.
            segment_ids: Segment ids, cast to ``long`` and passed as
                ``token_type_ids``.

        Returns:
            Output of the linear classifier applied to the pooled encoding.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask,
        )
        # Without dropout the pooled output goes straight to the classifier.
        # Previously `out` was only assigned inside the dropout branch, so
        # the default dr_rate=None raised UnboundLocalError here.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

class BERTDataset(Dataset):
    """Dataset that pairs BERT-transformed sentences with integer labels.

    Args:
        dataset: Iterable of indexable rows.
        sent_idx: Position of the sentence within each row.
        label_idx: Position of the label within each row.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Passed as ``max_seq_length`` to the transform.
        pad: Passed as ``pad`` to the transform.
        pair: Passed as ``pair`` to the transform.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence (wrapped in a one-item list).
        self.sentences = []
        for row in dataset:
            self.sentences.append(to_features([row[sent_idx]]))

        # Second pass: convert every label to a 32-bit integer.
        self.labels = []
        for row in dataset:
            self.labels.append(np.int32(row[label_idx]))

    def __getitem__(self, i):
        """Return the transformed sentence fields followed by the label."""
        features = self.sentences[i]
        label = self.labels[i]
        # Tuple concatenation; assumes the transform output is a tuple.
        return features + (label,)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

# Setting parameters
max_len = 32      # max_seq_length handed to the sentence transform
batch_size = 32   # DataLoader batch size
# The values below are not used by the code visible in this file; presumably
# they are left over from the training script -- TODO confirm before removing.
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

# Rebuild the classifier and restore the saved weights.
model_pt = BERTClassifier(bertmodel,  dr_rate=0.5)
# map_location remaps the checkpoint's tensors onto `device`, so a checkpoint
# saved from a GPU can also be restored when `device` is a different device.
model_pt.load_state_dict(torch.load('QA.pt', map_location=device))
model_pt.to(device)

tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

def predict(predict_sentence):
    """Classify a single sentence with the fine-tuned KoBERT model.

    Label meaning: 1 = customer question, 2 = agent question,
    0 = answer by either the customer or the agent.

    Args:
        predict_sentence: Text to classify. It is converted with ``str()``
            first, so non-string values are accepted.

    Returns:
        Index of the largest logit, as returned by ``np.argmax``.
    """
    # Keep only spaces, ASCII letters/digits and Hangul; drop everything else.
    cleaned = re.sub('[^ A-Za-z0-9ㄱ-ㅣ가-힣]+', '', str(predict_sentence))

    # BERTDataset needs a label column; '0' is a placeholder that is ignored.
    dataset = BERTDataset([[cleaned, '0']], 0, 1, tok, max_len, True, False)
    # num_workers=0 loads in the calling process. Starting worker processes
    # for a single sample on every call only adds overhead.
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=0)

    model_pt.eval()

    # The dataset holds exactly one sample, so the first batch is the only one.
    token_ids, valid_length, segment_ids, _ = next(iter(loader))
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        out = model_pt(token_ids, valid_length, segment_ids)

    logits = out[0].detach().cpu().numpy()
    return np.argmax(logits)


# # 예측하기 (고객 질문: 1, 상담원 질문: 2, 고객 및 상담원 대답: 0에 대한 확률) 
# def softmax(vals, idx):
#     valscpu = vals.cpu().detach().squeeze(0)
#     a = 0
#     for i in valscpu:
#         a += np.exp(i)
#     return ((np.exp(valscpu[idx]))/a).item()
        
        
# def predict_softmax(predict_sentence):
#     predict_sentence = re.compile('[^ A-Za-z0-9ㄱ-ㅣ가-힣]+').sub('', predict_sentence)
#     data = [predict_sentence, '0']
#     dataset_another = [data]

#     another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
#     test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
#     model_pt.eval()

#     for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
#         token_ids = token_ids.long().to(device)
#         segment_ids = segment_ids.long().to(device)

#         valid_length= valid_length
#         label = label.long().to(device)
#         out = model_pt(token_ids, valid_length, segment_ids)

#         test_eval=[]
#         for i in out:
#             logits=i
#             logits = logits.detach().cpu().numpy()
#             idx=np.argmax(logits)
            
#             return softmax(out,idx)

using cached model. /usr/src/app/QA/[0차] KoBERT_QA_baseline/.cache/kobert_v1.zip
using cached model. /usr/src/app/QA/[0차] KoBERT_QA_baseline/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /usr/src/app/QA/[0차] KoBERT_QA_baseline/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [2]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Check whether PyTorch reports CUDA as available in this environment.
torch.cuda.is_available()

True

# 모델 테스트

In [2]:
import pandas as pd

# Read the comment data, using the first CSV column as the row index.
df = pd.read_csv("total_cr_naver.csv", index_col=0)

In [3]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,id,comment,program,time,love,play
0,seniecrew,걱정마세요. 언제나 잘 보고 잘 듣고 있어요.😍,느와르블랑💚온리블랑 특집전 ☺️,2022-04-23_23:33:16,11488,1433
1,수연마미,있어요♡,느와르블랑💚온리블랑 특집전 ☺️,2022-04-23_23:33:16,11488,1433
2,sea,블루밍 예술이여요,느와르블랑💚온리블랑 특집전 ☺️,2022-04-23_23:33:16,11488,1433
3,seniecrew,아이고 그러셨구나,느와르블랑💚온리블랑 특집전 ☺️,2022-04-23_23:33:16,11488,1433
4,찐찐,지금 플라워 원피스 사이즈 몇까지 가능할까요?,느와르블랑💚온리블랑 특집전 ☺️,2022-04-23_23:33:16,11488,1433


In [7]:
# Work on an independent copy of the first five rows of the "comment" column.
tmp = df[["comment"]].iloc[:5].copy()
tmp.head()

Unnamed: 0,comment
0,걱정마세요. 언제나 잘 보고 잘 듣고 있어요.😍
1,있어요♡
2,블루밍 예술이여요
3,아이고 그러셨구나
4,지금 플라워 원피스 사이즈 몇까지 가능할까요?


In [8]:
# Classify every comment and store the predicted label in a new column.
tmp["qa_score"] = tmp["comment"].map(predict)

In [9]:
# Preview the comments together with their predicted labels.
tmp.head()

Unnamed: 0,comment,qa_score
0,걱정마세요. 언제나 잘 보고 잘 듣고 있어요.😍,1
1,있어요♡,0
2,블루밍 예술이여요,0
3,아이고 그러셨구나,0
4,지금 플라워 원피스 사이즈 몇까지 가능할까요?,1
