In [10]:
from transformers.optimization import get_cosine_schedule_with_warmup
# from transformers import BertModel
from transformers import AdamW

from kobert import get_pytorch_kobert_model
from kobert import get_tokenizer

from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import torch

from sklearn.model_selection import train_test_split
import gluonnlp as nlp
from tqdm import tqdm
import pandas as pd
import numpy as np
import os

### 1. load data (naver shopping data)

In [11]:
data = pd.read_table("naver_shopping.txt", sep="\t")

# 긍정 : 1, 부정 : 0으로 변경
# 별점 1, 2 : 부정
# 별점 3, 4, 5 : 긍정
data['star'] = data['star'].map(lambda x: 0 if x < 3 else 1)

### 2. split review & star

In [12]:
data_list = [[d[0], d[1]] for d in data[['review', 'star']].values]

### 3. load pretrained model

In [13]:
bertmodel, vocab = get_pytorch_kobert_model()
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /home/gksthwls3566/.cache/kobert_v1.zip
using cached model. /home/gksthwls3566/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /home/gksthwls3566/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


### 4. define dataset

In [14]:
# dataset
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [i[label_idx] for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [15]:
# hyperparameter
PATH = os.getcwd()
seed = 0
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### 5. split train & test

In [16]:
train, valid = train_test_split(data_list, test_size=0.2, shuffle=True, random_state=seed)

train_dataset = BERTDataset(train, 0, 1, tok, max_len, True, False)
valid_dataset = BERTDataset(valid, 0, 1, tok, max_len, True, False)

train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=5)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=5)

### 6. define KoBERT model

In [17]:
# KoBERT 모델
class BERTClassifier(nn.Module):
  def __init__(self, bert, hidden_size=768, num_classes=2, dr_rate=None, params=None):
    super(BERTClassifier, self).__init__()
    self.bert = bert
    self.dr_rate = dr_rate

    self.classifier = nn.Linear(hidden_size , num_classes)
    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)

  def gen_attention_mask(self, token_ids, valid_length):
    attention_mask = torch.zeros_like(token_ids)
    for i, v in enumerate(valid_length):
      attention_mask[i][:v] = 1
    return attention_mask.float()

  def forward(self, token_ids, valid_length, segment_ids):
    attention_mask = self.gen_attention_mask(token_ids, valid_length)
    _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
    if self.dr_rate:
        out = self.dropout(pooler)
    else:
        out = pooler
    return self.classifier(out)

In [18]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# optimizer and schedule
no_decay = ['bias', 'LayerNorm.weight']

# 최적화해야 할 parameter를 optimizer에게 알려야 함
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [19]:
t_total = len(train_loader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [20]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [21]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return train_acc

### 7. train

In [58]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_loader), total=len(train_loader)):
        optimizer.zero_grad()
        
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        
        print(token_ids, segment_ids, valid_length, label)
        
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(valid_loader), total=len(valid_loader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/2500 [00:00<?, ?it/s]

tensor([[   2, 1773, 7767,  ...,    1,    1,    1],
        [   2, 3969, 5495,  ...,   54,    3,    1],
        [   2, 1096, 7601,  ...,    1,    1,    1],
        ...,
        [   2, 2287, 6621,  ...,    1,    1,    1],
        [   2, 3060, 7828,  ...,    1,    1,    1],
        [   2, 4832, 7788,  ...,    1,    1,    1]]) tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]) tensor([31, 63, 55, 64, 14, 23, 11, 31, 29, 54, 14, 16, 22, 40, 64, 34, 15, 53,
        60, 10, 15, 23, 18, 10, 20, 61, 11, 61, 35, 12, 64, 24,  8, 51, 12, 12,
        23, 24, 12,  8, 10, 13, 63, 13, 64, 37, 23, 55, 53, 60, 14, 18, 10, 25,
        25, 12, 33, 15, 29, 53, 64, 21, 14, 19], dtype=torch.int32) tensor([1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 

  0%|          | 0/2500 [00:14<?, ?it/s]


KeyboardInterrupt: 

In [56]:
# 모델 성능 accuracy
test_acc / (batch_id+1)

0.933025

In [23]:
# 모델 저장
torch.save(model.state_dict(), 'naverShoppingReview_state_dict.pt')  # 모델 객체의 state_dict 저장

### 9. prediction

In [52]:
def predict(predict_sentence):

    data = [predict_sentence, 0]
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
    test_loader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_loader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)
        test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()
            if np.argmax(logits) == 0:
                test_eval.append("부정적")
            else:
                test_eval.append("긍정적")
        print(">> 해당 리뷰는 " + test_eval[0] + " 리뷰 입니다.")

In [53]:
predict("해양 산업이 지속해서 상승세를 띄고 있습니다.")

>> 해당 리뷰는 긍정적 리뷰 입니다.
