In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 91 (delta 43), reused 22 (delta 6), pack-reused 0[K
Unpacking objects: 100% (91/91), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.6MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/98/88/f817ef1af6f794e8f11313dcd1549de833f4599abcec82746ab5ed086686/JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448kB

In [5]:
from torchtext.legacy import data
from konlpy.tag import Mecab

In [6]:
cd /content/drive/MyDrive/torch_example/

/content/drive/MyDrive/torch_example


In [7]:
tokenizer = Mecab()

In [8]:
train = pd.read_csv("./data/nsmc/ratings_train.txt", sep='\t')
test = pd.read_csv("./data/nsmc/ratings_test.txt", sep='\t')

train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

print(train.shape)
print(test.shape)

(150000, 2)
(50000, 2)


In [9]:
train_data = train.dropna() #말뭉치에서 nan 값을 제거함
test_data = test.dropna()

print(train_data.shape)
print(test_data.shape)

(149995, 2)
(49997, 2)


In [10]:
train_data, valid_data = train_test_split(train_data, test_size=0.3, random_state=32)

In [11]:
print(len(train_data))
print(len(valid_data))

104996
44999


In [12]:
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=tokenizer.morphs, lower=False, batch_first=True, fix_length=20)
LABEL = data.LabelField(dtype = torch.float) 

In [13]:
def convert_dataset(input_data, text, label):
    list_of_example = [data.Example.fromlist(row.tolist(), fields=[('text', text), ('label', label)])  for _, row in input_data.iterrows()]
    dataset = data.Dataset(examples=list_of_example, fields=[('text', text), ('label', label)])
    return dataset

In [14]:
train_data = convert_dataset(train_data,TEXT,LABEL)
valid_data = convert_dataset(valid_data, TEXT, LABEL)

In [15]:
TEXT.build_vocab(train_data, max_size=10000)
LABEL.build_vocab(train_data)

In [16]:
print('단어 집합의 크기 : {}'.format(len(TEXT.vocab)))
print('label 의 크기 : {}'.format(len(LABEL.vocab)))

단어 집합의 크기 : 10002
label 의 크기 : 2


10000개 단어 + \<unk> + \<pad> 토큰이 들어감


In [17]:
print(LABEL.vocab.stoi) #라벨 값이 

defaultdict(None, {0: 0, 1: 1})


In [18]:
test_data = convert_dataset(test_data, TEXT, LABEL)

In [19]:
batch_size = 5
train_iter, valid_iter, test_iter = data.Iterator.splits((train_data, valid_data, test_data), batch_size=batch_size, sort=False, device=device)

In [20]:
print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_iter)))
print('평가 데이터의 미니 배치 수 : {}'.format(len(valid_iter)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_iter)))

훈련 데이터의 미니 배치 수 : 21000
평가 데이터의 미니 배치 수 : 9000
테스트 데이터의 미니 배치 수 : 10000


text를 cnn으로 돌릴때, conv2d와 conv1d로 나눠서 돌릴 수 있다.
두 가지 모두 작성해보도록 한다. 

CNN을 2d로 볼 때, 이미지와 다르게 text는 input_chanel 이 1이다. (이미지는 3임) 왜냐면 이미지는 R,G,B로 3개이기 때문이다.
그 외에는 별다를게 없다.

CNN을 1d 로 볼 때, input_chanel은 embed dim이다. 

In [None]:
class CNN2D(nn.Module):
  def __init__(self,  vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

    self.conv1 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[0], embedding_dim))
    self.conv2 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[1], embedding_dim))
    self.conv3 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[2], embedding_dim))
    
    self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)
    self.dropout = nn.Dropout(dropout)
  
  def forward(self, x):
    # x : batch x seq len
    embedded = self.embedding(x)
    # embedded = batch x seq len x embed dim

    embedded = embedded.unsqueeze(1)
    # embedded = batch x 1 x seq len x embed dim

    out_conv1 = self.conv1(embedded)
    #out_conv1 = batch x feature map x (seq len - filter_size[0] + 1) x 1
    out_conv1 = F.relu(out_conv1.squeeze(3))
    # out_conv1 = batch x feature map x (seq len - filter size[0] + 1)

    out_conv2 = F.relu(self.conv2(embedded).squeeze(3))
    out_conv3 = F.relu(self.conv3(embedded).squeeze(3))

    pooled_1 = F.max_pool1d(out_conv1, out_conv1.shape[2]).squeeze(2)
    pooled_2 = F.max_pool1d(out_conv2, out_conv2.shape[2]).squeeze(2)
    pooled_3 = F.max_pool1d(out_conv3, out_conv3.shape[2]).squeeze(2)
    # pooled : batch x feature map

    output = self.fc(self.dropout(torch.cat((pooled_1, pooled_2, pooled_3), dim=1)))

    return output
    

In [None]:
#위의 코드를 nn.ModuleList를 활용해서 작성한 코드입니다.
class CNN2D(nn.Module):
  def __init__(self,  vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

    self.conv = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filtersize, embedding_dim)) \
                               for filtersize in filter_sizes])
    
    self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)
    self.dropout = nn.Dropout(dropout)
  
  def forward(self, x):
    # x : batch x seq len
    embedded = self.embedding(x)
    # embedded = batch x seq len x embed dim

    embedded = embedded.unsqueeze(1)
    # embedded = batch x 1 x seq len x embed dim

    conv_out = [F.relu(conv(embedded).squeeze(3)) for conv in self.conv]
    # conv_out : batch x feature map x (sent len - filter_size[n] + 1)

    pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conv_out]
    # pooled : batch x feature map

    output = self.fc(self.dropout(torch.cat((pooled[0], pooled[1], pooled[2]), dim=1)))

    return output

In [1]:
class CNN1D(nn.Module):
  def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
    super(CNN1D, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

    self.conv = nn.ModuleList([ nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=filter)\
                               for filter in filter_sizes])

    self.fc = nn.Linear( n_filters * len(filter_size), output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    embedded = self.embedding(x)
    #embedded : batch x seq len x embed dim

    embedded = embedded.premute(0,2,1)
    #embedded : batch x embed dim x seq len

    conv_out = [F.relu(conv(embedded)) for conv in self.conv] 
    #conv_out : batch x feature map x (sent len - filter_size[n] + 1)

    pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conv_out]
    # pooled : batch x feature map

    output = self.fc(self.dropout(torch.cat(pooled, dim =1)))

    return output
  
  def attention(self, conv_output, final_hidden):
    

SyntaxError: ignored

In [None]:
def binary_accuracy(prediction, target):
  '''
from https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb
  '''
  # round predictions to the closest integer (0 or 1)
  rounded_preds = torch.round(torch.sigmoid(prediction))
  
  #convert into float for division
  correct = (rounded_preds == target).float()

  # rounded_preds = [ 1   0   0   1   1   1   0   1   1   1]
  # targets       = [ 1   0   1   1   1   1   0   1   1   0]
  # correct       = [1.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0]
  acc = correct.sum() / len(correct)
  return acc

In [None]:
def train(model, train_iter):
  model.train()

  epoch_loss, epoch_acc = 0, 0
  for batch in train_iter:
    optimizer.zero_grad()
    x, y = batch.text.to(device), batch.label.to(device)
    y_hat = model(x).squeeze(1)

    loss = criterion(y_hat, y)
    acc = binary_accuracy(y_hat, y)

    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(train_iter), epoch_acc / len(train_iter)


In [None]:
def evaluate(model, valid_iter):
  model.eval()
  with torch.no_grad():
    eval_loss, eval_acc = 0, 0
    for batch in valid_iter:
      x,y = batch.text.to(device), batch.label.to(device)
      y_hat = model(x).squeeze(1)
      
      loss = criterion(y_hat, y)
      acc = binary_accuracy(y_hat, y)

      eval_loss += loss.item()
      eval_acc += acc.item()
      
  return eval_loss / len(valid_iter), eval_acc / len(valid_iter)

In [None]:
def inference(model, test_iter):
  model.eval()
  with torch.no_grad():
    test_loss, test_acc = 0, 0
    for batch in test_iter:
      x, y = batch.text.to(device), batch.label.to(device)
      y_hat = model(x).squeeze(1)

      loss = criterion(y_hat, y)
      acc = binary_accuracy(y_hat, y)
      
      test_loss += loss.item()
      test_acc += acc.item()

  return test_loss / len(test_iter), test_acc / len(test_iter)

In [None]:
#  vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN2D(len(TEXT.vocab), 128, 100, [3,4,5], 1, 0.2, PAD_IDX)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()
criterion.to(device)

BCEWithLogitsLoss()

In [None]:
best_val_loss = float('inf')
for _epoch in range(1,6): # 5 epoch
  train_loss, train_acc = train(model, train_iter)
  valid_loss, valid_acc = evaluate(model, valid_iter)
  print("[Epoch: %d] train loss : %5.2f | train accuracy : %5.2f" % (_epoch, train_loss, train_acc))
  print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (_epoch, valid_loss, valid_acc))

    # 검증 오차가 가장 적은 최적의 모델을 저장
  if valid_loss < best_val_loss:
    best_val_loss = valid_loss

[Epoch: 1] train loss :  0.46 | train accuracy :  0.78
[Epoch: 1] val loss :  0.39 | val accuracy :  0.83
[Epoch: 2] train loss :  0.37 | train accuracy :  0.84
[Epoch: 2] val loss :  0.38 | val accuracy :  0.84
[Epoch: 3] train loss :  0.32 | train accuracy :  0.87
[Epoch: 3] val loss :  0.39 | val accuracy :  0.83
[Epoch: 4] train loss :  0.27 | train accuracy :  0.89
[Epoch: 4] val loss :  0.43 | val accuracy :  0.83
[Epoch: 5] train loss :  0.23 | train accuracy :  0.91
[Epoch: 5] val loss :  0.50 | val accuracy :  0.83


In [None]:
test_loss, test_acc = inference(model, test_iter)
print('Test Loss: %5.2f | Test Acc: %5.2f '%(test_loss, test_acc*100))

Test Loss:  0.49 | Test Acc: 82.58 
