In [22]:
import os

import gluonnlp as nlp
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset

from KoBERT.kobert.pytorch_kobert import get_pytorch_kobert_model
from KoBERT.kobert.utils import get_tokenizer

In [23]:
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still
# runs on machines where CUDA is not available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [24]:
# Configuration values

# --- data source ---
filename = 'fully_labelled_df_2'  # CSV base name, without the '.csv' extension
column_name = 'text'              # column holding the input text
labels = '파트너십'                # column holding the target label

# --- model / batching ---
num_class = 3        # number of output classes of the classifier head
max_len = 512        # maximum sequence length passed to the BERT transform
drop_out_rate = 0.2  # dropout probability used by the classifier
batch_size = 4       # DataLoader batch size

In [25]:
class BERTDataset(Dataset):
    """Torch ``Dataset`` of BERT-transformed sentences paired with integer labels.

    Every row of ``dataset`` is transformed eagerly in ``__init__``; the
    results are kept in memory in ``self.sentences`` and ``self.labels``.

    Args:
        dataset: Table indexable by column name (a pandas DataFrame in this
            notebook) containing ``column_name`` and ``labels`` columns.
        bert_tokenizer: Tokenizer handed to ``nlp.data.BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length`` to the transform.
        pad: Forwarded as ``pad`` to the transform.
        pair: Forwarded as ``pair`` to the transform.
        column_name: Name of the text column.
        labels: Name of the label column; each value is cast with ``np.int32``.
    """

    def __init__(self, dataset, bert_tokenizer, max_len, pad, pair, column_name=column_name, labels=labels):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # Iterate over the two columns directly instead of materialising a
        # full row Series (twice) per record via ``dataset.iloc[i][...]``.
        self.sentences = [transform([str(text)]) for text in dataset[column_name]]
        self.labels = [np.int32(target) for target in dataset[labels]]

    def __getitem__(self, i):
        """Return the transformed sentence for row ``i`` with its label appended.

        Assumes the transform output is a tuple (it is concatenated with a
        1-tuple holding the label) -- TODO confirm against GluonNLP.
        """
        return self.sentences[i] + (self.labels[i],)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    

In [26]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout, a linear head and softmax.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and must return a
            2-tuple whose second element is the pooled output.
        hidden_size: Size of the pooled output fed to the linear head.
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output. When falsy
            (``None`` or ``0``) no dropout layer is created or applied.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=num_class,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1 for the first ``valid_length[i]`` positions of row ``i``.

        The mask has the same shape and device as ``token_ids``.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return per-class probabilities (softmax over dim 1) for a batch."""
        # The mask is already float and on token_ids' device (built with zeros_like).
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask)

        # Previously `out` was only assigned when dropout was configured, so
        # the default dr_rate=None raised UnboundLocalError.
        out = self.dropout(pooler) if self.dr_rate else pooler

        # NOTE(review): the model emits probabilities, not raw logits. If the
        # training loss applies its own softmax (e.g. CrossEntropyLoss) this is
        # a double softmax -- confirm against the training code before changing.
        softmax = nn.Softmax(dim=1)
        return softmax(self.classifier(out))

In [27]:
# Read the labelled CSV file

def read_data(path='./', filename=filename, labels=labels):
    """Load ``<path>/<filename>.csv`` and drop every row containing a missing value.

    Args:
        path: Directory containing the CSV file. A trailing slash or an empty
            string are both accepted.
        filename: File name without the ``.csv`` extension.
        labels: Unused; kept so existing callers keep working.

    Returns:
        pandas.DataFrame: The file contents with NaN-containing rows removed.
    """
    # os.path.join avoids the doubled separator for paths like './data/' and,
    # unlike `path + '/' + ...`, does not turn an empty path into the
    # filesystem root.
    csv_path = os.path.join(path, filename + '.csv')
    df = pd.read_csv(csv_path, sep=',')
    return df.dropna(axis=0)

In [32]:
# Build a one-row inference frame: load the labelled CSV and keep only the
# 'text' column of the row whose index label is 2.
sample = read_data(
    path='./data/',
    filename='fully_labelled_df_2',
    labels='파트너십',
).loc[[2], ['text']]

# BERTDataset reads a label column, so store a placeholder value for it.
sample[labels] = 10

In [33]:
sample

Unnamed: 0,text,파트너십
2,고객 세그먼트는 주거용 건물 건축주(입주자아파트 등)\n건설부문 외주 및 자재 생산...,10


In [30]:
# BERTDataset and BERTClassifier must already be defined as globals before this cell runs

bertmodel, vocab = get_pytorch_kobert_model() # load the BERT model and its vocab
tokenizer = get_tokenizer() # load the tokenizer
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False) # tokenizes against vocab: passing text to tok() splits it into tokens found in vocab

def predict(data, max_len=max_len, dr_rate=drop_out_rate, batch_size=batch_size,
            model_path='파트너십_best_model.pt'):
    """Classify every row of ``data`` with the fine-tuned classifier and print the grades.

    Args:
        data: Table accepted by ``BERTDataset`` (must contain the text and
            label columns it reads).
        max_len: Maximum sequence length for tokenization/padding.
        dr_rate: Dropout rate used to construct ``BERTClassifier``.
        batch_size: DataLoader batch size.
        model_path: Path of the saved ``state_dict`` to load.

    Returns:
        list[str]: One grade per input row, in input order. Class index 0
        maps to "C", 1 to "B" and 2 to "A" (a 3-class model is assumed).
    """
    grade_by_index = {0: "C", 1: "B", 2: "A"}

    # Tokenize, integer-encode and pad the rows of `data` (previously the
    # global `sample` was used and the argument was ignored).
    dataset = BERTDataset(data, tok, max_len, True, False)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=4)

    # Build the classifier with the caller's dropout rate and load the weights;
    # map_location lets a checkpoint saved on a GPU load on whatever `device` is.
    model = BERTClassifier(bertmodel, dr_rate=dr_rate).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Accumulate across ALL batches; initialising here also keeps the final
    # print from failing when the loader yields nothing.
    test_eval = []

    with torch.no_grad():  # inference only: no autograd graph needed
        for token_ids, valid_length, segment_ids, _label in dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            # Softmax probabilities, one row per sample in the batch.
            probs = model(token_ids, valid_length, segment_ids)
            probs = probs.detach().cpu().numpy()

            # argmax per row (axis=1); a flat argmax over the whole batch
            # returns a single, meaningless index when batch size > 1.
            for class_index in np.argmax(probs, axis=1):
                test_eval.append(grade_by_index[int(class_index)])

    print("{}의 결과는 {}입니다.".format(labels, test_eval))
    return test_eval

using cached model
using cached model
using cached model


In [34]:
predict(sample)

파트너십의 결과는 ['B']입니다.
