이번에는 lstm에 attention layer를 넣어서 모델을 구현해보겠습니다.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [47]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 91 (delta 43), reused 22 (delta 6), pack-reused 0[K
Unpacking objects: 100% (91/91), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 7.4MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 11.5MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a20

In [5]:
from torchtext.legacy import data
from konlpy.tag import Mecab

In [6]:
cd /content/drive/MyDrive/torch_example/

/content/drive/MyDrive/torch_example


In [7]:
tokenizer = Mecab()

In [8]:
# Read the tab-separated NSMC train/test files and drop the `id` column
# right away.
train = pd.read_csv("./data/nsmc/ratings_train.txt", sep='\t').drop(columns=['id'])
test = pd.read_csv("./data/nsmc/ratings_test.txt", sep='\t').drop(columns=['id'])

# Show (rows, columns) of each split after the drop.
print(train.shape)
print(test.shape)

(150000, 2)
(50000, 2)


In [9]:
# Remove every row containing a NaN value from both splits.
train_data, test_data = train.dropna(), test.dropna()

# Show (rows, columns) of each split after the NaN rows are removed.
print(train_data.shape)
print(test_data.shape)

(149995, 2)
(49997, 2)


In [10]:
train_data, valid_data = train_test_split(train_data, test_size=0.3, random_state=32)

In [11]:
print(len(train_data))
print(len(valid_data))

104996
44999


In [12]:
# Text field: tokenized with the Mecab morpheme analyzer, case left as-is,
# batch dimension first. `fix_length=20` asks torchtext for fixed-length
# sequences of 20 tokens (see the torchtext Field documentation).
TEXT = data.Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenizer.morphs,
    lower=False,
    batch_first=True,
    fix_length=20,
)
# Label field whose values are produced as float tensors.
LABEL = data.LabelField(dtype=torch.float)

In [13]:
def convert_dataset(input_data, text, label):
    """Turn the rows of a DataFrame into a torchtext ``Dataset``.

    Each row is converted to a list and wrapped in an ``Example`` whose
    values are paired, in order, with the ``('text', text)`` and
    ``('label', label)`` fields. The resulting examples are returned as a
    ``Dataset`` declared with the same two fields.
    """
    field_spec = [('text', text), ('label', label)]
    examples = []
    for _, row in input_data.iterrows():
        examples.append(data.Example.fromlist(row.tolist(), fields=field_spec))
    return data.Dataset(examples=examples, fields=field_spec)

In [14]:
train_data = convert_dataset(train_data,TEXT,LABEL)
valid_data = convert_dataset(valid_data, TEXT, LABEL)

In [15]:
TEXT.build_vocab(train_data, max_size=10000)
LABEL.build_vocab(train_data)

In [16]:
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))
print('label 의 크기 : {}'.format(len(LABEL.vocab)))

단어 집합의 크기 : 10002
label 의 크기 : 2


단어 집합에는 빈도 상위 10000개 단어에 \<unk>, \<pad> 토큰 2개가 추가되어 총 10002개가 들어갑니다.


In [17]:
print(LABEL.vocab.stoi) # mapping from each raw label value to the index assigned by LABEL.build_vocab

defaultdict(None, {0: 0, 1: 1})


In [18]:
test_data = convert_dataset(test_data, TEXT, LABEL)

In [19]:
# One iterator per split, all sharing the same mini-batch size and placed
# on `device`; `sort=False` is passed so the examples are not sorted.
batch_size = 5
train_iter, valid_iter, test_iter = data.Iterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort=False,
    device=device,
)

In [20]:
print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_iter)))
print('평가 데이터의 미니 배치 수 : {}'.format(len(valid_iter)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_iter)))

훈련 데이터의 미니 배치 수 : 21000
평가 데이터의 미니 배치 수 : 9000
테스트 데이터의 미니 배치 수 : 10000


In [63]:
class LSTM(nn.Module):
  """LSTM sentence classifier with dot-product attention over the LSTM outputs.

  The final hidden state of the top LSTM layer acts as the attention query:
  every time step's output is scored against it, the scores are normalized
  with a softmax over the sequence, and the weighted sum of the outputs is
  fed to a linear layer.

  Args:
    input_dim: number of rows in the embedding table (vocabulary size).
    hidden_dim: LSTM hidden size.
    output_dim: size of the final linear projection.
    embedding_dim: embedding vector size.
    dropout: dropout probability applied to the embeddings, and between
      stacked LSTM layers when ``n_layers > 1``.
    n_layers: number of stacked LSTM layers (default 1, the previous
      behavior).
    padding_idx: embedding index treated as padding (default 0, the
      previous behavior).
  """

  def __init__(self, input_dim, hidden_dim, output_dim, embedding_dim, dropout,
               n_layers=1, padding_idx=0):
    super().__init__()
    # Stored on the instance because _init_state() reads them.
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim

    # NOTE(review): 0 is kept as the default for backward compatibility.
    # With torchtext's default Field specials, index 0 is presumably <unk>
    # and <pad> is index 1 -- confirm, and if so pass
    # padding_idx=TEXT.vocab.stoi[TEXT.pad_token] when building the model.
    self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=padding_idx)
    # nn.LSTM applies `dropout` only between stacked layers and warns when
    # it is non-zero with a single layer, so forward it only when it can act.
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                       batch_first=True,
                       dropout=dropout if n_layers > 1 else 0.0)
    self.linear = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Map token ids ``x`` (batch x seq len) to logits (batch x output_dim)."""
    embed = self.dropout(self.embedding(x))
    # output : batch x seq len x hidden, hidden : n_layers x batch x hidden
    output, (hidden, _) = self.rnn(embed)
    context = self.attention(output, hidden)
    return self.linear(context)

  def attention(self, lstm_output, final_hidden):
    '''
    Dot-product attention of the final hidden state over all time steps.

    lstm_output : batch x seq len x hidden
    final_hidden : n_layers x batch x hidden (a batch x hidden tensor is
                   also accepted)
    returns : batch x hidden, the attention-weighted sum of lstm_output
    '''
    if final_hidden.dim() == 3:
      # Use the top layer's final state as the query; for a single layer
      # this equals the former squeeze(0).
      final_hidden = final_hidden[-1]
    # final_hidden = batch x hidden

    # (batch x seq len x hidden) @ (batch x hidden x 1) -> batch x seq len
    scores = torch.bmm(lstm_output, final_hidden.unsqueeze(2)).squeeze(2)

    # Normalize over the sequence axis so each example's weights sum to 1.
    weights = F.softmax(scores, dim=1)

    # (batch x hidden x seq len) @ (batch x seq len x 1) -> batch x hidden
    context = torch.bmm(lstm_output.transpose(1, 2), weights.unsqueeze(2)).squeeze(2)
    return context

  def _init_state(self, batch_size=1):
    """Return a zero tensor of shape n_layers x batch_size x hidden_dim.

    The tensor uses the dtype and device of the model's first parameter.
    """
    reference = next(self.parameters())
    return torch.zeros(self.n_layers, batch_size, self.hidden_dim,
                       dtype=reference.dtype, device=reference.device)

softmax 의 dimension을 0으로 설정할지 1로 설정할지 판단해봅시다.

In [56]:
a = torch.randn([5,10])

In [57]:
a

tensor([[ 1.5034,  0.2618,  0.6832, -0.1158,  0.6630, -1.8900, -1.0180, -0.7647,
          0.1500,  1.7564],
        [ 2.8097,  1.1322,  1.8509,  0.5302, -0.2789,  1.9784, -0.0143, -0.8318,
         -0.5852,  1.9326],
        [ 1.6498,  1.5592, -0.8956, -1.3253, -0.1215,  1.5682, -1.0549,  0.1157,
          1.0565, -0.2665],
        [ 0.3731, -0.8187, -0.7748, -0.5350,  0.3235, -0.4353, -0.4696, -0.4680,
         -1.1258,  1.5218],
        [ 1.5963, -1.7130, -0.6497, -1.5825, -0.8156, -0.3597, -0.9544, -0.1249,
          0.7314,  0.6127]])

우선 dimension을 0으로 설정하면 한 행(가로 방향)의 값을 모두 더한 결과가 1을 넘습니다. 즉, softmax가 세로(열) 방향을 기준으로 계산된 것으로 판단할 수 있습니다.

In [59]:
F.softmax(a,0) # 세로로

tensor([[0.1375, 0.1329, 0.2034, 0.2442, 0.3589, 0.0112, 0.1335, 0.1318, 0.1660,
         0.2912],
        [0.5079, 0.3173, 0.6537, 0.4660, 0.1399, 0.5346, 0.3644, 0.1232, 0.0796,
         0.3473],
        [0.1592, 0.4863, 0.0419, 0.0729, 0.1638, 0.3547, 0.1287, 0.3178, 0.4110,
         0.0385],
        [0.0444, 0.0451, 0.0473, 0.1606, 0.2556, 0.0478, 0.2311, 0.1773, 0.0464,
         0.2303],
        [0.1509, 0.0184, 0.0536, 0.0563, 0.0818, 0.0516, 0.1423, 0.2499, 0.2970,
         0.0928]])

In [62]:
sum([0.1375, 0.1329, 0.2034, 0.2442, 0.3589, 0.0112, 0.1335, 0.1318, 0.1660,
         0.2912])

1.8106

반면에 dimension을 1로 설정하면 가로(행)의 합이 1이 되는 것을 확인할 수 있습니다. attention score는 batch 방향이 아니라 seq len 방향으로 정규화되어야 하므로 dimension을 1로 설정합니다.

In [60]:
F.softmax(a,1) #가로로

tensor([[0.2426, 0.0701, 0.1068, 0.0480, 0.1047, 0.0081, 0.0195, 0.0251, 0.0627,
         0.3124],
        [0.3719, 0.0695, 0.1426, 0.0381, 0.0169, 0.1620, 0.0221, 0.0097, 0.0125,
         0.1547],
        [0.2429, 0.2219, 0.0191, 0.0124, 0.0413, 0.2239, 0.0162, 0.0524, 0.1342,
         0.0357],
        [0.1305, 0.0396, 0.0414, 0.0526, 0.1242, 0.0582, 0.0562, 0.0563, 0.0292,
         0.4117],
        [0.4053, 0.0148, 0.0429, 0.0169, 0.0363, 0.0573, 0.0316, 0.0725, 0.1707,
         0.1516]])

In [61]:
sum([0.2426, 0.0701, 0.1068, 0.0480, 0.1047, 0.0081, 0.0195, 0.0251, 0.0627,
         0.3124])

0.9999999999999999

In [22]:
def binary_accuracy(prediction, target):
  """Return the fraction of predictions that match the targets.

  Adapted from
  https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

  `prediction` is passed through a sigmoid and rounded to the nearest
  integer (0 or 1) before being compared element-wise with `target`.
  The result is the number of matches divided by ``len`` of the
  comparison tensor, returned as a tensor.

  Example:
    rounded = [ 1   0   0   1 ]
    target  = [ 1   0   1   1 ]
    matches = [1.0 1.0 0.0 1.0]  ->  0.75
  """
  probabilities = torch.sigmoid(prediction)
  hard_labels = probabilities.round()

  # Cast the boolean comparison to float so it can be summed and divided.
  hits = hard_labels.eq(target).float()
  return hits.sum() / len(hits)

In [23]:
def train(model, train_iter):
  """Train `model` for one pass over `train_iter`.

  Relies on the notebook-level `optimizer`, `criterion` and `device`.
  Returns (summed batch loss / number of batches,
           summed batch accuracy / number of batches).
  """
  model.train()

  loss_total = 0
  acc_total = 0
  for batch in train_iter:
    optimizer.zero_grad()

    inputs = batch.text.to(device)
    labels = batch.label.to(device)
    # Drop dimension 1 of the model output so it lines up with the labels.
    logits = model(inputs).squeeze(1)

    batch_loss = criterion(logits, labels)
    batch_acc = binary_accuracy(logits, labels)

    batch_loss.backward()
    optimizer.step()

    loss_total += batch_loss.item()
    acc_total += batch_acc.item()

  n_batches = len(train_iter)
  return loss_total / n_batches, acc_total / n_batches


In [24]:
def evaluate(model, valid_iter):
  """Score `model` on `valid_iter` without updating any weights.

  The model is put in eval mode and gradients are disabled. Relies on the
  notebook-level `criterion` and `device`.
  Returns (summed batch loss / number of batches,
           summed batch accuracy / number of batches).
  """
  model.eval()
  with torch.no_grad():
    running_loss = 0
    running_acc = 0
    for batch in valid_iter:
      inputs = batch.text.to(device)
      labels = batch.label.to(device)
      logits = model(inputs).squeeze(1)

      running_loss += criterion(logits, labels).item()
      running_acc += binary_accuracy(logits, labels).item()

  n_batches = len(valid_iter)
  return running_loss / n_batches, running_acc / n_batches

In [25]:
def inference(model, test_iter):
  """Compute loss and accuracy of `model` over `test_iter`.

  Runs in eval mode with gradients disabled, using the notebook-level
  `criterion` and `device`. Both returned values are the per-batch
  results summed and divided by the number of batches:
  (loss, accuracy).
  """
  model.eval()
  with torch.no_grad():
    totals = {'loss': 0, 'acc': 0}
    for batch in test_iter:
      tokens = batch.text.to(device)
      gold = batch.label.to(device)
      scores = model(tokens).squeeze(1)

      step_loss = criterion(scores, gold)
      step_acc = binary_accuracy(scores, gold)

      totals['loss'] += step_loss.item()
      totals['acc'] += step_acc.item()

  batch_count = len(test_iter)
  return totals['loss'] / batch_count, totals['acc'] / batch_count

In [64]:
# Build the classifier; the output size is the number of labels minus one.
model = LSTM(
    input_dim=len(TEXT.vocab),
    hidden_dim=128,
    output_dim=len(LABEL.vocab) - 1,
    embedding_dim=300,
    dropout=0.2,
).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Loss that takes raw logits (the sigmoid is part of the loss).
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)

  "num_layers={}".format(dropout, num_layers))


BCEWithLogitsLoss()

In [65]:
# Train for 5 epochs, reporting train/validation metrics after each one and
# keeping a copy of the weights that achieved the lowest validation loss.
best_val_loss = float('inf')
best_state = None  # in-memory snapshot of the best weights seen so far
for _epoch in range(1,6): # 5 epoch
  train_loss, train_acc = train(model, train_iter)
  valid_loss, valid_acc = evaluate(model, valid_iter)
  print("[Epoch: %d] train loss : %5.2f | train accuracy : %5.2f" % (_epoch, train_loss, train_acc))
  print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (_epoch, valid_loss, valid_acc))

  # Keep the model with the lowest validation loss. state_dict() tensors
  # share storage with the live parameters, so they are cloned; otherwise
  # later epochs would overwrite the snapshot. Restore it with
  # model.load_state_dict(best_state).
  if valid_loss < best_val_loss:
    best_val_loss = valid_loss
    best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

[Epoch: 1] train loss :  0.42 | train accuracy :  0.81
[Epoch: 1] val loss :  0.36 | val accuracy :  0.84
[Epoch: 2] train loss :  0.33 | train accuracy :  0.86
[Epoch: 2] val loss :  0.35 | val accuracy :  0.85
[Epoch: 3] train loss :  0.30 | train accuracy :  0.87
[Epoch: 3] val loss :  0.35 | val accuracy :  0.85
[Epoch: 4] train loss :  0.28 | train accuracy :  0.88
[Epoch: 4] val loss :  0.36 | val accuracy :  0.85
[Epoch: 5] train loss :  0.27 | train accuracy :  0.89
[Epoch: 5] val loss :  0.36 | val accuracy :  0.85


In [66]:
# Evaluate on the test iterator; the accuracy is printed as a percentage.
test_loss, test_acc = inference(model, test_iter)
print('Test Loss: %5.2f | Test Acc: %5.2f ' % (test_loss, 100 * test_acc))

Test Loss:  0.36 | Test Acc: 84.58 
