In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [8]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

fatal: destination path 'Mecab-ko-for-Google-Colab' already exists and is not an empty directory.
/content/drive/My Drive/torch_example/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.3MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 43.2MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e227

In [9]:
from torchtext.legacy import data
from konlpy.tag import Mecab

In [12]:
cd /content/drive/MyDrive/torch_example/

/content/drive/MyDrive/torch_example


In [10]:
tokenizer = Mecab()

In [13]:
train = pd.read_csv("./data/nsmc/ratings_train.txt", sep='\t')
test = pd.read_csv("./data/nsmc/ratings_test.txt", sep='\t')

train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

print(train.shape)
print(test.shape)

(150000, 2)
(50000, 2)


In [14]:
train_data = train.dropna() #말뭉치에서 nan 값을 제거함
test_data = test.dropna()

print(train_data.shape)
print(test_data.shape)

(149995, 2)
(49997, 2)


In [15]:
train_data, valid_data = train_test_split(train_data, test_size=0.3, random_state=32)

In [16]:
print(len(train_data))
print(len(valid_data))

104996
44999


In [17]:
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=tokenizer.morphs, lower=False, batch_first=True, fix_length=20)
LABEL = data.LabelField(dtype = torch.float) 

In [18]:
def convert_dataset(input_data, text, label):
    list_of_example = [data.Example.fromlist(row.tolist(), fields=[('text', text), ('label', label)])  for _, row in input_data.iterrows()]
    dataset = data.Dataset(examples=list_of_example, fields=[('text', text), ('label', label)])
    return dataset

In [19]:
train_data = convert_dataset(train_data,TEXT,LABEL)
valid_data = convert_dataset(valid_data, TEXT, LABEL)

load pretrained embedding vector and used!

The pretrained embedding vector based on news paper with tokenized by Mecab. Dimension is 200, Iteration is 100, corpus size is 1.5GB

In [5]:
import torchtext.vocab as vocab
my_embedding = vocab.Vectors(name='./vector/mecab_200_vec.txt', cache='my_embedding', unk_init=torch.Tensor.normal_)

  0%|          | 0/181670 [00:00<?, ?it/s]Skipping token b'181670' with 1-dimensional vector [b'200']; likely a header
 99%|█████████▉| 180614/181670 [00:14<00:00, 11987.35it/s]

In [20]:
TEXT.build_vocab(train_data, max_size=10000, vectors=my_embedding)
LABEL.build_vocab(train_data)

In [21]:
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))
print('label 의 크기 : {}'.format(len(LABEL.vocab)))

단어 집합의 크기 : 10002
label 의 크기 : 2


10000개 단어 + \<unk> + \<pad> 토큰이 들어감


In [23]:
print(LABEL.vocab.stoi) #라벨 값이 

defaultdict(None, {0: 0, 1: 1})


In [22]:
print(TEXT.vocab.vectors.shape) #벡터 차원200, 토큰 개수10002개

torch.Size([10002, 200])


In [24]:
test_data = convert_dataset(test_data, TEXT, LABEL)

In [25]:
batch_size = 5
train_iter, valid_iter, test_iter = data.Iterator.splits((train_data, valid_data, test_data), batch_size=batch_size, sort=False, device=device)

In [26]:
print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_iter)))
print('평가 데이터의 미니 배치 수 : {}'.format(len(valid_iter)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_iter)))

훈련 데이터의 미니 배치 수 : 21000
평가 데이터의 미니 배치 수 : 9000
테스트 데이터의 미니 배치 수 : 10000


In [27]:
class LSTM(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, embedding_dim, dropout):
    super().__init__()
    # pretrained embedding vector with Freezing
    self.embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors)

    # if you want use unFreezing pretrained embedding write freezing option like under line.
    # self.embedding = nn.Embedding.from_pretrained(TEXT.vocab.vector, freeze=False)
    
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=dropout)
    self.linear = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout)
  
  def forward(self, x):
    embed = self.dropout(self.embedding(x))
    output, _ = self.rnn(embed)
    output = self.linear(output[:, -1, :])
    return output
  
  def _init_state(self, batch_size=1):
    weight = next(self.parameters()).data
    return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()

In [28]:
def binary_accuracy(prediction, target):
  '''
from https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb
  '''
  # round predictions to the closest integer (0 or 1)
  rounded_preds = torch.round(torch.sigmoid(prediction))
  
  #convert into float for division
  correct = (rounded_preds == target).float()

  # rounded_preds = [ 1   0   0   1   1   1   0   1   1   1]
  # targets       = [ 1   0   1   1   1   1   0   1   1   0]
  # correct       = [1.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0]
  acc = correct.sum() / len(correct)
  return acc

In [29]:
def train(model, train_iter):
  model.train()

  epoch_loss, epoch_acc = 0, 0
  for batch in train_iter:
    optimizer.zero_grad()
    x, y = batch.text.to(device), batch.label.to(device)
    y_hat = model(x).squeeze(1)

    loss = criterion(y_hat, y)
    acc = binary_accuracy(y_hat, y)

    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(train_iter), epoch_acc / len(train_iter)


In [30]:
def evaluate(model, valid_iter):
  model.eval()
  with torch.no_grad():
    eval_loss, eval_acc = 0, 0
    for batch in valid_iter:
      x,y = batch.text.to(device), batch.label.to(device)
      y_hat = model(x).squeeze(1)
      
      loss = criterion(y_hat, y)
      acc = binary_accuracy(y_hat, y)

      eval_loss += loss.item()
      eval_acc += acc.item()
      
  return eval_loss / len(valid_iter), eval_acc / len(valid_iter)

In [31]:
def inference(model, test_iter):
  model.eval()
  with torch.no_grad():
    test_loss, test_acc = 0, 0
    for batch in test_iter:
      x, y = batch.text.to(device), batch.label.to(device)
      y_hat = model(x).squeeze(1)

      loss = criterion(y_hat, y)
      acc = binary_accuracy(y_hat, y)
      
      test_loss += loss.item()
      test_acc += acc.item()

  return test_loss / len(test_iter), test_acc / len(test_iter)

In [33]:
model = LSTM(len(TEXT.vocab), 128, len(LABEL.vocab)-1, 200, 0.2)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)

  "num_layers={}".format(dropout, num_layers))


BCEWithLogitsLoss()

In [34]:
best_val_loss = float('inf')
for _epoch in range(1,6): # 5 epoch
  train_loss, train_acc = train(model, train_iter)
  valid_loss, valid_acc = evaluate(model, valid_iter)
  print("[Epoch: %d] train loss : %5.2f | train accuracy : %5.2f" % (_epoch, train_loss, train_acc))
  print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (_epoch, valid_loss, valid_acc))

    # 검증 오차가 가장 적은 최적의 모델을 저장
  if valid_loss < best_val_loss:
    best_val_loss = valid_loss
    torch.save(model.state_dict(),'tut1-model.pt')

[Epoch: 1] train loss :  0.45 | train accuracy :  0.78
[Epoch: 1] val loss :  0.38 | val accuracy :  0.83
[Epoch: 2] train loss :  0.37 | train accuracy :  0.83
[Epoch: 2] val loss :  0.35 | val accuracy :  0.84
[Epoch: 3] train loss :  0.34 | train accuracy :  0.85
[Epoch: 3] val loss :  0.35 | val accuracy :  0.85
[Epoch: 4] train loss :  0.32 | train accuracy :  0.86
[Epoch: 4] val loss :  0.34 | val accuracy :  0.85
[Epoch: 5] train loss :  0.30 | train accuracy :  0.87
[Epoch: 5] val loss :  0.35 | val accuracy :  0.85


In [35]:
test_loss, test_acc = inference(model, test_iter)
print('Test Loss: %5.2f | Test Acc: %5.2f '%(test_loss, test_acc*100))

Test Loss:  0.35 | Test Acc: 84.76 


check word vector in pretrained embedding

In [36]:
TEXT.vocab.vectors[TEXT.vocab.stoi['사과']]

tensor([-1.8117e-02, -1.3958e-01,  2.3630e-01, -2.7158e-02,  2.0225e-01,
        -8.9995e-02,  5.0562e-01, -5.4435e-01, -7.8093e-02, -1.8677e-01,
        -2.1117e-01, -1.3257e-01, -6.2217e-02, -1.2082e-01,  1.5681e-01,
        -3.7383e-01,  6.7868e-02,  8.6042e-01,  3.3801e-01,  8.7876e-02,
        -6.5293e-01,  5.4694e-02,  8.2626e-02,  2.8089e-01, -1.3018e-01,
        -3.1103e-02, -5.2466e-01,  2.0239e-01,  5.2359e-01,  2.1270e-01,
        -1.1202e-01, -1.4046e-01,  4.2592e-01, -3.5816e-01,  3.2228e-01,
        -2.0581e-02,  8.8021e-02, -7.8883e-02,  5.4781e-01, -4.8231e-01,
        -1.1365e-01,  3.7111e-01,  3.4829e-01, -1.0040e-01,  3.0546e-01,
        -1.1306e-01, -1.1919e-01,  1.4974e-01,  2.5124e-01, -3.1577e-01,
        -3.2229e-01, -2.5127e-01,  3.9752e-01,  1.5526e-01,  6.0606e-02,
        -1.5560e-03, -5.4798e-02,  3.5820e-01,  4.6365e-01, -3.5335e-01,
         9.4730e-02,  4.6722e-01, -4.6081e-02, -1.7977e-01, -5.2109e-01,
        -8.9270e-03, -1.5695e-01,  4.0390e-03,  6.9