이번 장에서는 Bi-LSTM과 lr schduler를 사용해서 NSMC 감성 분석 성능을 끌어 올려보겠습니다.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 91 (delta 43), reused 22 (delta 6), pack-reused 0[K
Unpacking objects: 100% (91/91), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 28.0MB/s 
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 41.5MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a20

In [5]:
from torchtext.legacy import data
from konlpy.tag import Mecab

In [6]:
cd /content/drive/MyDrive/torch_example/

/content/drive/MyDrive/torch_example


In [7]:
tokenizer = Mecab()

In [8]:
train = pd.read_csv("./data/nsmc/ratings_train.txt", sep='\t')
test = pd.read_csv("./data/nsmc/ratings_test.txt", sep='\t')

train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

print(train.shape)
print(test.shape)

(150000, 2)
(50000, 2)


In [9]:
train_data = train.dropna() #말뭉치에서 nan 값을 제거함
test_data = test.dropna()

print(train_data.shape)
print(test_data.shape)

(149995, 2)
(49997, 2)


In [10]:
train_data, valid_data = train_test_split(train_data, test_size=0.3, random_state=32)

In [11]:
print(len(train_data))
print(len(valid_data))

104996
44999


In [12]:
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=tokenizer.morphs, lower=False, batch_first=True, fix_length=20)
LABEL = data.LabelField(dtype = torch.float) 

In [13]:
def convert_dataset(input_data, text, label):
    list_of_example = [data.Example.fromlist(row.tolist(), fields=[('text', text), ('label', label)])  for _, row in input_data.iterrows()]
    dataset = data.Dataset(examples=list_of_example, fields=[('text', text), ('label', label)])
    return dataset

In [14]:
train_data = convert_dataset(train_data,TEXT,LABEL)
valid_data = convert_dataset(valid_data, TEXT, LABEL)

In [15]:
TEXT.build_vocab(train_data, max_size=10000)
LABEL.build_vocab(train_data)

In [16]:
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))
print('label 의 크기 : {}'.format(len(LABEL.vocab)))

단어 집합의 크기 : 10002
label 의 크기 : 2


10000개 단어 + \<unk> + \<pad> 토큰이 들어감


In [17]:
print(LABEL.vocab.stoi) #라벨 값이 

defaultdict(None, {0: 0, 1: 1})


In [18]:
test_data = convert_dataset(test_data, TEXT, LABEL)

In [19]:
batch_size = 5
train_iter, valid_iter, test_iter = data.Iterator.splits((train_data, valid_data, test_data), batch_size=batch_size, sort=False, device=device)

In [20]:
print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_iter)))
print('평가 데이터의 미니 배치 수 : {}'.format(len(valid_iter)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_iter)))

훈련 데이터의 미니 배치 수 : 21000
평가 데이터의 미니 배치 수 : 9000
테스트 데이터의 미니 배치 수 : 10000


In [30]:
class biLSTM(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, embedding_dim, dropout):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=0)
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True, dropout=dropout)
    self.linear = nn.Linear(hidden_dim*2, output_dim)
    self.dropout = nn.Dropout(dropout)
  
  def forward(self, x):
    embed = self.dropout(self.embedding(x))
    output, (hidden, cell) = self.rnn(embed)
    
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
    hidden = self.linear(hidden)
    return hidden
  
  def _init_state(self, batch_size=1):
    weight = next(self.parameters()).data
    return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [23]:
def binary_accuracy(prediction, target):
  '''
from https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb
  '''
  # round predictions to the closest integer (0 or 1)
  rounded_preds = torch.round(torch.sigmoid(prediction))
  
  #convert into float for division
  correct = (rounded_preds == target).float()

  # rounded_preds = [ 1   0   0   1   1   1   0   1   1   1]
  # targets       = [ 1   0   1   1   1   1   0   1   1   0]
  # correct       = [1.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0]
  acc = correct.sum() / len(correct)
  return acc

In [36]:
def train(model, train_iter):
  model.train()

  epoch_loss, epoch_acc = 0, 0
  for batch in train_iter:
    optimizer.zero_grad()
    x, y = batch.text.to(device), batch.label.to(device)
    y_hat = model(x).squeeze(1)

    loss = criterion(y_hat, y)
    acc = binary_accuracy(y_hat, y)

    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(train_iter), epoch_acc / len(train_iter)


In [25]:
def evaluate(model, valid_iter):
  model.eval()
  with torch.no_grad():
    eval_loss, eval_acc = 0, 0
    for batch in valid_iter:
      x,y = batch.text.to(device), batch.label.to(device)
      y_hat = model(x).squeeze(1)
      
      loss = criterion(y_hat, y)
      acc = binary_accuracy(y_hat, y)

      eval_loss += loss.item()
      eval_acc += acc.item()
      
  return eval_loss / len(valid_iter), eval_acc / len(valid_iter)

In [26]:
def inference(model, test_iter):
  model.eval()
  with torch.no_grad():
    test_loss, test_acc = 0, 0
    for batch in test_iter:
      x, y = batch.text.to(device), batch.label.to(device)
      y_hat = model(x).squeeze(1)

      loss = criterion(y_hat, y)
      acc = binary_accuracy(y_hat, y)
      
      test_loss += loss.item()
      test_acc += acc.item()

  return test_loss / len(test_iter), test_acc / len(test_iter)

In [37]:
model = biLSTM(len(TEXT.vocab), 256, len(LABEL.vocab)-1, 300, 0.3)
model.to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2)

In [38]:
best_val_loss = float('inf')
for _epoch in range(1,6): # 5 epoch
  train_loss, train_acc = train(model, train_iter)
  valid_loss, valid_acc = evaluate(model, valid_iter)
  print("[Epoch: %d] train loss : %5.2f | train accuracy : %5.2f" % (_epoch, train_loss, train_acc))
  print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (_epoch, valid_loss, valid_acc))

  scheduler.step() #lr scheduler
  
    # 검증 오차가 가장 적은 최적의 모델을 저장
  if valid_loss < best_val_loss:
    best_val_loss = valid_loss
    #torch.save(model.state_dict(), 'biLSTMmodel.pt')

[Epoch: 1] train loss :  0.42 | train accuracy :  0.80
[Epoch: 1] val loss :  0.36 | val accuracy :  0.84
[Epoch: 2] train loss :  0.33 | train accuracy :  0.86
[Epoch: 2] val loss :  0.34 | val accuracy :  0.85
[Epoch: 3] train loss :  0.28 | train accuracy :  0.88
[Epoch: 3] val loss :  0.34 | val accuracy :  0.85
[Epoch: 4] train loss :  0.27 | train accuracy :  0.88
[Epoch: 4] val loss :  0.34 | val accuracy :  0.86
[Epoch: 5] train loss :  0.26 | train accuracy :  0.89
[Epoch: 5] val loss :  0.34 | val accuracy :  0.85


In [39]:
test_loss, test_acc = inference(model, test_iter)
print('Test Loss: %5.2f | Test Acc: %5.2f '%(test_loss, test_acc*100))

Test Loss:  0.34 | Test Acc: 85.40 


In [52]:
def predict_sentiment(model, sentence):
  model.eval()
  tokenized = [token for token in tokenizer.morphs(sentence)]
  indexed = [TEXT.vocab.stoi[t] for t in tokenized]
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(1)
  prediction = torch.sigmoid(model(tensor))
  return prediction

In [53]:
predict_sentiment(model, "This film is terrible")

tensor([[0.1770],
        [0.4699],
        [0.3685],
        [0.4699]], device='cuda:0', grad_fn=<SigmoidBackward>)