# 0. Preprocessing
- 파일 로드하기(File Loading) : 다양한 포맷의 코퍼스를 로드합니다.
- 토큰화(Tokenization) : 문장을 단어 단위로 분리해줍니다.
- 단어 집합(Vocab) : 단어 집합을 만듭니다.
- 정수 인코딩(Integer encoding) : 전체 코퍼스의 단어들을 각각의 고유한 정수로 맵핑합니다.
- 단어 벡터(Word Vector) : 단어 집합의 단어들에 고유한 임베딩 벡터를 만들어줍니다. 랜덤값으로 초기화한 값일 수도 있고, 사전 훈련된 임베딩 벡터들을 로드할 수도 있습니다.
- 배치화(Batching) : 훈련 샘플들의 배치를 만들어줍니다. 이 과정에서 패딩 작업(Padding)도 이루어집니다.

In [None]:
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# colab으로 mecab 실행 시 (설치 코드)
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 91, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 91 (delta 43), reused 22 (delta 6), pack-reused 0[K
Unpacking objects: 100% (91/91), done.
/content/Mecab-ko-for-Google-Colab/Mecab-ko-for-Google-Colab/Mecab-ko-for-Google-Colab
Installing konlpy.....
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.gz.......
from https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
--2021-06-18 12:12:34--  https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
Resolving bitbucket.org (bitbucket.org)... 18.205.93.2, 18.205.93.1, 18.205.93.0, ...
Connecting to bitbucket.org (bitbucket.org)|18.205.93.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://bbuseruploads.s3.amazonaws.com/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz?Si

In [None]:
import os
import io
import copy
import time
import tarfile
import numpy as np
from konlpy.tag import Mecab
import matplotlib.pyplot as plt
from collections import Counter

import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.nn import Dropout
from torch.nn import Parameter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer

In [None]:
# ---- Run configuration -------------------------------------------------------
# Model dimensions.
dim_embed = 128        # width of the word-embedding vectors
dim_model = 64         # width of each GRU hidden state
num_layers = 2         # number of stacked encoder layers
is_bidirection = True  # run the encoder in both directions
dropout_ratio = 0.3

# Training settings.
batch_size = 16
epochs = 30
clip = 1               # presumably the gradient-norm clipping threshold; verify against the training call
truncated = 5          # window length (in time steps) for truncated BPTT

# Project folder on the mounted Google Drive.
base_path = '/content/drive/MyDrive/04_프로젝트/seq2seq with attention/'

# Use the GPU whenever one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# path = '/content/drive/MyDrive/04_프로젝트/seq2seq with attention/dataset/'
# os.chdir(path)
# save_name = 'unzipped/'
# file_list = os.listdir(path)
# for file in file_list:
#   tar = tarfile.open(file)
#   tar.extractall('./' + save_name + file.split(".")[0])
#   tar.close()

In [None]:
# English text is tokenized with the callable returned by torchtext's
# get_tokenizer('spacy'); Korean text is tokenized with a konlpy Mecab instance.
en_tokenizer = get_tokenizer('spacy')
ko_tokenizer = Mecab()

In [None]:
# Common path prefix of the korean-english-park corpus files; the per-split
# suffixes below are appended to it to form the full file names.
path_base = "/content/drive/MyDrive/04_프로젝트/seq2seq with attention/dataset/unzipped/korean-english-park/korean-english-park."
# Each tuple is (English file suffix, Korean file suffix).
train_path = ('train.en', 'train.ko')
dev_path = ('dev.en', 'dev.ko')
test_path = ('test.en', 'test.ko')

In [None]:
# 출처 : https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html
ko_tokenizer = Mecab()
en_tokenizer = get_tokenizer('spacy')


def build_vocab(train_path, tokenizer):
  """Count the tokens of one corpus file and wrap the counts in a Vocab.

  Args:
    train_path: path of a UTF-8 text file that is read line by line.
    tokenizer: when 'ko' occurs in the last 10 characters of `train_path`
      the tokenizer's `morphs` method is used; otherwise the tokenizer is
      called directly on each line.

  Returns:
    A `Vocab` built from the token counts with `min_freq = 3` and the
    specials '<unk>', '<BOS>', '<EOS>', '<PAD>'.
  """
  # The language is inferred from the tail of the file name.
  use_morphs = 'ko' in train_path[-10:]
  token_counts = Counter()
  with open(train_path, encoding = 'UTF-8', newline = '\n') as corpus:
    for line in corpus:
      tokens = tokenizer.morphs(line) if use_morphs else tokenizer(line)
      token_counts.update(tokens)
  return Vocab(token_counts, min_freq = 3, specials = ('<unk>', '<BOS>', '<EOS>', "<PAD>"))


ko_vocab = build_vocab(path_base + train_path[1], ko_tokenizer)
en_vocab = build_vocab(path_base + train_path[0], en_tokenizer)

In [None]:
def data_preprocess(file_paths):
  """Convert a parallel corpus into a list of (Korean, English) index tensors.

  Args:
    file_paths: indexable pair where `file_paths[0]` is the English file and
      `file_paths[1]` is the Korean file. Both are read as UTF-8, line by line.

  Returns:
    A list of `(ko_tensor, en_tensor)` tuples of dtype `torch.long`, one per
    line pair. Reading stops at the end of the shorter file (`zip` semantics).
  """
  data = []
  # Context managers close both files even if tokenization raises; the
  # previous version left both handles open.
  with io.open(file_paths[1], encoding = 'UTF-8', newline = '\n') as ko_file, \
       io.open(file_paths[0], encoding = 'UTF-8', newline = '\n') as en_file:
    for raw_ko, raw_en in zip(ko_file, en_file):
      ko_ids = [ko_vocab[token] for token in ko_tokenizer.morphs(raw_ko)]
      en_ids = [en_vocab[token] for token in en_tokenizer(raw_en)]
      data.append((torch.tensor(ko_ids, dtype = torch.long),
                   torch.tensor(en_ids, dtype = torch.long)))
  return data


train_data= data_preprocess([path_base + path for path in train_path])
dev_data = data_preprocess([path_base + path for path in dev_path])
test_data = data_preprocess([path_base + path for path in test_path])

In [None]:
# Re-selects the compute device (GPU when available, otherwise CPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 16
# Special-token indices are looked up in the Korean vocabulary only, but
# generate_batch applies them to the English sequences as well.
# NOTE(review): this assumes both vocabularies assign the same indices to the
# special tokens (both are built with the same `specials` tuple) - confirm.
PAD_IDX = ko_vocab['<PAD>']
BOS_IDX = ko_vocab['<BOS>']
EOS_IDX = ko_vocab['<EOS>']

def generate_batch(data_batch):
  """Collate (Korean, English) index tensors into two padded batches.

  Every sequence is wrapped as [BOS_IDX, *tokens, EOS_IDX] and each language
  is then padded with PAD_IDX to the longest sequence of the batch.

  Args:
    data_batch: iterable of `(ko_item, en_item)` 1-D index tensors.

  Returns:
    `(ko_batch, en_batch)`, both batch-first.
  """
  def with_markers(tokens):
    return torch.cat([torch.tensor([BOS_IDX]), tokens, torch.tensor([EOS_IDX])], dim=0)

  wrapped = [(with_markers(ko_item), with_markers(en_item)) for ko_item, en_item in data_batch]
  ko_sequences = [pair[0] for pair in wrapped]
  en_sequences = [pair[1] for pair in wrapped]
  return (pad_sequence(ko_sequences, padding_value=PAD_IDX, batch_first = True),
          pad_sequence(en_sequences, padding_value=PAD_IDX, batch_first = True))

# Batched iterators over the three preprocessed splits; generate_batch adds the
# <BOS>/<EOS> markers and pads every batch.
# NOTE(review): the dev and test loaders are shuffled too, and drop_last is not
# set, so the last batch of a split may hold fewer than BATCH_SIZE samples -
# confirm the model copes with that.
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
dev_iter = DataLoader(dev_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=True, collate_fn=generate_batch)

# 1. Modeling

## 1-1. Encoder

In [None]:
class gru_cell(nn.Module):
  """One time step of a GRU, written out with explicit linear maps.

  Args:
    dim_model: size of the hidden state.
    dim_embed: size of the per-step input vector.
    batch_size: kept for backward compatibility; the initial state is now
      sized from the actual input, so this value is no longer used.
  """
  def __init__(self, dim_model, dim_embed, batch_size):
    super().__init__()
    self.dim_model = dim_model
    # Input-to-hidden maps (update gate, reset gate, candidate state).
    self.W_z = nn.Linear(dim_embed, dim_model)
    self.W_r = nn.Linear(dim_embed, dim_model)
    self.W_ = nn.Linear(dim_embed, dim_model)
    # Hidden-to-hidden maps (update gate, reset gate, candidate state).
    self.U_z = nn.Linear(dim_model, dim_model)
    self.U_r = nn.Linear(dim_model, dim_model)
    self.U_ = nn.Linear(dim_model, dim_model)

  def forward(self, x, h_old = None):
    """Advance the hidden state by one step.

    Args:
      x: input of one time step for the whole batch, (batch, dim_embed).
      h_old: previous hidden state, (batch, dim_model). When None a zero
        state is used.

    Returns:
      The new hidden state, (batch, dim_model).
    """
    if h_old is None:
      # Build the zero state from x so it always matches the real batch size,
      # dtype and device. The previous pre-allocated tensor was tied to the
      # constructor batch size and to a global device.
      h_old = x.new_zeros(x.size(0), self.dim_model)
    r = torch.sigmoid(self.W_r(x) + self.U_r(h_old))
    z = torch.sigmoid(self.W_z(x) + self.U_z(h_old))
    h_new = torch.tanh(self.W_(x) + self.U_(r*h_old))
    return (1 - z)*h_old + z*h_new

In [None]:
class gru(nn.Module):
  """Unrolls one GRU cell over a sequence, with truncated BPTT.

  Args:
    dim_model: hidden-state size passed to the cell.
    dim_embed: input size passed to the cell.
    batch_size: passed through to the cell constructor.
    gru_cell: cell class, constructed as `gru_cell(dim_model, dim_embed,
      batch_size)` and called as `cell(x_t, h_prev)`.
    truncated: number of time steps after which the gradient path through
      the recurrent state is cut. A falsy value disables truncation.
  """
  def __init__(self, dim_model, dim_embed, batch_size, gru_cell, truncated):
    super().__init__()
    self.layer = nn.Sequential(gru_cell(dim_model, dim_embed, batch_size))
    self.truncated = truncated

  def forward(self, x, max_seq, return_states = True, backward = False):
    """Run the cell over `max_seq` time steps.

    Args:
      x: batch-first input, (batch, time, features).
      max_seq: number of time steps to process.
      return_states: when True return all states, otherwise only the state
        produced by the last processed step.
      backward: when True the sequence is consumed from the last step to the
        first; the returned states are still in original time order.

    Returns:
      (batch, max_seq, dim_model) when `return_states` is True, else the
      final hidden state (batch, dim_model).
    """
    cell = self.layer[0]
    steps = x.permute(1, 0, 2)  # (time, batch, features)
    order = range(max_seq - 1, -1, -1) if backward else range(max_seq)

    carried = None  # state fed to the next step
    h = None
    states = []
    for done, time_step in enumerate(order, start = 1):
      h = cell(steps[time_step], carried)
      states.append(h)
      # Truncated BPTT: every `truncated` processed steps, cut the gradient
      # path through the recurrent state. The state kept in `states` stays
      # attached so this step's own output still trains the cell. The earlier
      # code called h.detach() without using the result, and its condition
      # did not depend on the current step.
      if self.truncated and done % self.truncated == 0:
        carried = h.detach()
      else:
        carried = h

    if backward:
      states.reverse()  # restore original time order
    stacked = torch.stack(states).permute(1, 0, 2)
    if return_states:
      return stacked
    return h  # previously `self.h`, an attribute this class never defines

In [None]:
class Encoder(nn.Module):
  """Stack of (optionally bidirectional) recurrent layers.

  Args:
    dim_model: hidden size of each direction.
    dim_embed: feature size of the input to the first layer.
    batch_size: passed through to every layer.
    num_layers: number of stacked layers.
    gru: layer class, constructed as `gru(dim_model, input_dim, batch_size,
      gru_cell, truncated)` and called as `layer(x, max_seq, backward=...)`.
    gru_cell: cell class handed to every layer.
    truncated: truncated-BPTT window handed to every layer.
    bidirection: when True a second, independent stack reads the sequence
      in reverse and both outputs are concatenated after every layer.
  """
  def __init__(self, dim_model, dim_embed, batch_size, num_layers, gru, gru_cell, truncated, bidirection = False):
    super().__init__()
    self.direction = 2 if bidirection else 1
    self.num_layers = num_layers
    # Filled in by forward(); the previous code read an undefined `max_seq`
    # here, which only worked if a global of that name happened to exist.
    self.max_seq = None
    # Layers above the first consume the concatenated output of all directions.
    rest_layer_input = 2*dim_model if bidirection else dim_model

    def build_stack():
      input_dims = [dim_embed] + [rest_layer_input] * (num_layers - 1)
      return nn.ModuleList([gru(dim_model, dim, batch_size, gru_cell, truncated) for dim in input_dims])

    self.forward_layers = build_stack()
    if self.direction == 2:
      # A separate stack: previously the forward stack's first layer was
      # inserted here, so both directions shared first-layer weights.
      self.backward_layers = build_stack()

  def forward(self, x, return_states = False):
    """Encode a batch of embedded sequences.

    Args:
      x: batch-first embedded input, (batch, time, dim_embed).
      return_states: accepted for backward compatibility; unused.

    Returns:
      `(states, final)`: `states` is the top layer's output for every time
      step with all directions concatenated on the last axis. `final`
      concatenates the forward state of the last time step with, when
      bidirectional, the backward state of the first time step, i.e. each
      direction's state after reading the whole sequence.
    """
    max_seq = x.size()[1]
    self.max_seq = max_seq
    layer_input = x
    per_direction = []
    for floor in range(self.num_layers):
      per_direction = [self.forward_layers[floor](layer_input, max_seq)]
      if self.direction == 2:
        # backward=True was never passed before, so the "backward" stack
        # actually read the sequence left to right.
        per_direction.append(self.backward_layers[floor](layer_input, max_seq, backward = True))
      # Each layer sees every direction of the layer below.
      layer_input = torch.cat(per_direction, dim = 2)
    self.states = torch.cat(per_direction, dim = 2)

    final_parts = [per_direction[0][:, -1, :]]
    if self.direction == 2:
      final_parts.append(per_direction[1][:, 0, :])
    return self.states, torch.cat(final_parts, dim = 1)

## 1-2. Decoder

In [None]:
# NOTE: dim_embed, dim_model and batch_size were already assigned in the
# configuration cell near the top of the file (128 / 64 / 16). These
# assignments rebind them, so code executed after this cell sees
# dim_embed = 512 and dim_model = 128.
dim_embed = 512
vocab_size = 1000
dim_model = 128
max_seq = 50
batch_size = 16
encoder_bidirectional = True

In [None]:
class attention(nn.Module):
  """Additive attention over the encoder states.

  Args:
    dim_model: size of the internal scoring space; both the query and the
      encoder states are expected to have `2*dim_model` features.
    batch_size: stored for backward compatibility; no longer needed because
      the computation broadcasts over the actual batch.
  """
  def __init__(self, dim_model, batch_size):
    super().__init__()
    self.W = nn.Linear(2*dim_model, dim_model)
    self.U = nn.Linear(2*dim_model, dim_model)
    self.v = nn.Linear(dim_model, 1)
    self.batch_size = batch_size

  def get_attention(self, hidden_state, encoding_matrix):
    """Return the attention-weighted sum of the encoder states.

    Args:
      hidden_state: query, (batch, 2*dim_model) or (batch, 1, 2*dim_model).
      encoding_matrix: encoder states, (batch, max_seq, 2*dim_model).

    Returns:
      Context vector, (batch, 2*dim_model).
    """
    if hidden_state.dim() == 2:
      hidden_state = hidden_state.unsqueeze(1)  # (batch, 1, 2*dim_model)
    attention_score = self.v(torch.tanh(self.W(hidden_state) + self.U(encoding_matrix)))  # (batch, max_seq, 1)
    attention_dist = torch.softmax(attention_score, dim = 1)  # normalized over time
    # Broadcasting multiplies every encoder state by its weight. The earlier
    # explicit expand() used the global `dim_model` and the constructor batch
    # size, so it failed for any other batch size or model width.
    return (attention_dist * encoding_matrix).sum(dim = 1)

In [None]:
class Decoder(nn.Module):
  """One decoding step: a GRU-style cell that also reads an attention context.

  Args:
    dim_model: encoder hidden size per direction.
    dim_embed: size of the embedded input token.
    batch_size: forwarded to the attention module.
    encoder_bidirectional: when True the decoder state has `2*dim_model`
      features, otherwise `dim_model`.

  NOTE(review): the hidden-state and context projections always take
  `2*dim_model` input features, whatever `encoder_bidirectional` is - confirm
  that a unidirectional encoder is really supported.
  """
  def __init__(self, dim_model, dim_embed, batch_size, encoder_bidirectional = True):
    super().__init__()
    self.direction = 2 if encoder_bidirectional else 1
    dim_decoder = self.direction*dim_model

    # Three projection families, each with an update (z), reset (r) and
    # candidate-state (s) member: W_* reads the input embedding, U_* the
    # previous hidden state and V_* the attention context.
    projection_inputs = (("W", dim_embed), ("U", 2*dim_model), ("V", 2*dim_model))
    for family, in_features in projection_inputs:
      for gate in ("z", "r", "s"):
        setattr(self, f"{family}_{gate}", nn.Linear(in_features, dim_decoder))

    self.attention = nn.ModuleList([attention(dim_model, batch_size)])

  def forward(self, x, h_old, encoding_matrix):
    """Compute the next decoder hidden state.

    Args:
      x: embedding of the previously emitted token, (batch, dim_embed).
      h_old: previous decoder hidden state.
      encoding_matrix: encoder states that the attention module attends over.

    Returns:
      The new hidden state, same shape as `h_old`.
    """
    context = self.attention[0].get_attention(h_old, encoding_matrix)
    reset_gate = torch.sigmoid(self.W_r(x) + self.U_r(h_old) + self.V_r(context))
    update_gate = torch.sigmoid(self.W_z(x) + self.U_z(h_old) + self.V_z(context))
    candidate = torch.tanh(self.W_s(x) + self.U_s(reset_gate*h_old) + self.V_s(context))
    return (1 - update_gate)*h_old + update_gate*candidate

## 1-3. Full Model

In [None]:
class Model(nn.Module):
  """Sequence-to-sequence translator: embedding + encoder, attention decoder, output projection.

  Args:
    Encoder: encoder class, built as `Encoder(dim_model, dim_embed,
      batch_size, num_layers, gru, gru_cell, truncated, bidirection=...)`;
      its forward must return `(encoding_matrix, hidden_state)`.
    Decoder: decoder-step class, built as `Decoder(dim_model, dim_embed,
      batch_size, encoder_bidirectional=...)` and called as
      `decoder(prev_embed, hidden_state, encoding_matrix)`.
    dim_model: hidden size per encoder direction.
    dim_embed: embedding size for both languages.
    batch_size: nominal batch size handed to the sub-modules; forward()
      itself adapts to the real batch size of its inputs.
    num_layers: number of encoder layers.
    gru, gru_cell: classes forwarded to the encoder.
    is_bidirection: whether the encoder is bidirectional.
    truncated: truncated-BPTT window forwarded to the encoder.
    dropout_ratio: dropout applied to the encoder embeddings.
    encoder_vocab: source vocabulary (only `len()` is used).
    decoder_vocab: target vocabulary (`len()` and `.stoi` are used).
    device: stored on the instance as `self.device`.
  """
  def __init__(self, Encoder, Decoder, dim_model, dim_embed, batch_size, num_layers, gru, gru_cell, is_bidirection, truncated, dropout_ratio, encoder_vocab, decoder_vocab, device):
    super().__init__()
    self.device = device

    self.encoder_vocab = encoder_vocab
    self.decoder_vocab = decoder_vocab
    self.encoder_vocab_size = len(self.encoder_vocab)
    self.decoder_vocab_size = len(self.decoder_vocab)
    self.batch_size = batch_size

    # Honor the caller's `num_layers`; the depth used to be hard-coded to 3.
    encoder_partial = Encoder(dim_model, dim_embed, batch_size, num_layers, gru, gru_cell, truncated, bidirection = is_bidirection)
    encoder_embedding = nn.Embedding(self.encoder_vocab_size, dim_embed)
    encoder_dropout = Dropout(dropout_ratio)
    encoder = nn.Sequential(encoder_embedding, encoder_dropout, encoder_partial)

    decoder = Decoder(dim_model, dim_embed, batch_size, encoder_bidirectional = is_bidirection)
    linear_input_dim = 2*dim_model if is_bidirection else dim_model
    decoder_linear = nn.Linear(linear_input_dim, self.decoder_vocab_size)

    self.layers = nn.ModuleDict({"encoder" : encoder, "decoder" : decoder, "dense" : decoder_linear})

    # Keep the special-token indices as plain ints: nn.Embedding documents
    # `padding_idx` as an int, and index tensors are created on the input's
    # device inside forward().
    self.__sos_token_index = int(self.decoder_vocab.stoi["<BOS>"])
    self.__eos_token_index = int(self.decoder_vocab.stoi["<EOS>"])
    self.__pad_token_index = int(self.decoder_vocab.stoi['<PAD>'])
    self.decoder_embedding = nn.Sequential(nn.Embedding(num_embeddings = self.decoder_vocab_size, embedding_dim = dim_embed, padding_idx = self.__pad_token_index))

  def forward(self, input_tokens, gold_tokens, teach_forcing_ratio = 0.7):
    """Translate a batch and return the unnormalized scores of every step.

    Args:
      input_tokens: source token indices, (batch, src_len).
      gold_tokens: target token indices starting with <BOS>, (batch, tgt_len).
      teach_forcing_ratio: probability, drawn once per time step for the
        whole batch, of feeding the gold token instead of the model's own
        prediction into the next step.

    Returns:
      Logits before softmax, (batch, tgt_len, decoder_vocab_size). Position
      `t` holds the prediction made after the decoder has consumed <BOS> and
      the tokens fed for steps before `t`.
    """
    # Sizes come from the inputs rather than from globals or the constructor
    # batch size, so a smaller last batch works too.
    n_batch = gold_tokens.size(0)
    max_seq_decoder = gold_tokens.size(-1)

    gold_embed = self.decoder_embedding(gold_tokens)  # (batch, tgt_len, dim_embed)
    encoding_matrix, hidden_state = self.layers["encoder"](input_tokens)

    # The first decoder input is <BOS> for every sample.
    sos_tokens = gold_tokens.new_full((n_batch,), self.__sos_token_index)
    output_embed = self.decoder_embedding(sos_tokens)  # (batch, dim_embed)
    teach_forcing_prob = torch.rand(max_seq_decoder)

    # Allocate the output next to the activations instead of on the CPU.
    sentence_made = gold_embed.new_zeros((n_batch, max_seq_decoder, self.decoder_vocab_size))

    for seq in range(max_seq_decoder):
      hidden_state = self.layers["decoder"](output_embed, hidden_state, encoding_matrix)
      output_token_vect = self.layers["dense"](hidden_state)
      sentence_made[:, seq, :] = output_token_vect  # kept unnormalized for the loss
      if teach_forcing_prob[seq] < teach_forcing_ratio:
        output_embed = gold_embed[:, seq, :]  # teacher forcing
      else:
        # argmax of the logits equals argmax of the softmax
        output_embed = self.decoder_embedding(output_token_vect.argmax(dim = 1))

    return sentence_made


In [None]:
# Build the translator: the English vocabulary feeds the encoder (source side)
# and the Korean vocabulary feeds the decoder (target side).
model = Model(Encoder, Decoder, dim_model, dim_embed, batch_size, num_layers, gru, gru_cell, is_bidirection, truncated, dropout_ratio, encoder_vocab = en_vocab, decoder_vocab = ko_vocab, device = device)

In [None]:
# count_parameters 함수 출처 : https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb
def count_parameters(model):
    """Return how many scalar parameters of `model` are trainable.

    Only parameters whose `requires_grad` flag is set are counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,831,532 trainable parameters


# 2. Training

In [None]:
class early_stopping:
  """Track the best validation loss, checkpoint on improvement, signal when to stop.

  Args:
    patience: number of consecutive non-improving calls after which
      `stop` becomes True.
    save_path: file that receives the `state_dict` of every new best model.

  Attributes:
    best_score: lowest validation loss seen so far (starts at +inf).
    best_model: deep copy of the model at its best validation loss, or None
      when no call has improved on the initial score yet.
    count: consecutive calls without improvement.
    stop: True once `count` has reached `patience`.
  """
  def __init__(self, patience = 10, save_path = "./save_model"):
    self.patience = patience
    self.save_path = save_path
    self.count = 0
    # float("inf") instead of np.Inf, an alias that NumPy 2.0 removed.
    self.best_score = float("inf")
    self.stop = False
    self.best_model = None

  def __call__(self, val_loss, model):
    """Record one validation result and update the stopping state."""
    # `best_score is None` is tolerated for callers that reset it by hand.
    if self.best_score is None or val_loss < self.best_score:
      self.best_score = val_loss
      self.save_model(model)
      self.count = 0
      print("new best model is saved")
      self.best_model = copy.deepcopy(model)
      return

    self.count += 1
    if self.count >= self.patience:
      print('-'*50)
      print(f"training is over")
      print('-'*50)
      self.stop = True

  def save_model(self, model):
    """Write the model's `state_dict` to `save_path`."""
    torch.save(model.state_dict(), self.save_path)

  # The method used to be defined under this misspelled name while __call__
  # invoked `save_model`, so the first improvement raised AttributeError.
  # The old name is kept as an alias.
  save_mode = save_model

In [None]:
def train_per_epoch(model, iterator, optimizer, criterion, clip, device):
  """Train `model` for one pass over `iterator` and return the mean batch loss.

  Args:
    model: called as `model(source, target)`; must return batch-first
      logits of shape (batch, tgt_len, vocab).
    iterator: yields `(target_lang, source_lang)` batch-first index tensors.
    optimizer: optimizer over the model's parameters.
    criterion: loss taking `(N, vocab)` logits and `(N,)` targets.
    clip: maximum gradient norm passed to `clip_grad_norm_`.
    device: device the batches are moved to.

  Returns:
    The sum of the per-batch losses divided by `len(iterator)`.
  """
  model.train()
  epoch_loss = 0
  for enu, (target_lang, source_lang) in enumerate(iterator):
    source_lang, target_lang = source_lang.to(device), target_lang.to(device)
    optimizer.zero_grad()
    output = model(source_lang, target_lang)

    # Skip the first *time step* (the <BOS> position). The tensors are
    # batch-first, so the earlier `[1:]` dropped the first sample of the batch
    # and kept <BOS> in the loss. reshape() is used because the time slice is
    # not contiguous.
    logits = output[:, 1:].reshape(-1, output.size()[-1]).to(device)
    targets = target_lang[:, 1:].reshape(-1)

    loss = criterion(logits, targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

    batch_loss = loss.item()
    print(f"-----------------------the loss of {enu} iter is {batch_loss:.3f}-----------------------")
    epoch_loss += batch_loss

  return epoch_loss/len(iterator)

def evaluate(model, iterator, criterion, device):
  """Return the mean batch loss of `model` over `iterator` without teacher forcing.

  Args:
    model: called as `model(source, target, teach_forcing_ratio = 0)`; must
      return batch-first logits of shape (batch, tgt_len, vocab).
    iterator: yields `(target_lang, source_lang)` batch-first index tensors.
    criterion: loss taking `(N, vocab)` logits and `(N,)` targets.
    device: device the batches are moved to.

  Returns:
    The sum of the per-batch losses divided by `len(iterator)`.
  """
  model.eval()
  epoch_loss = 0
  with torch.no_grad():
    for target_lang, source_lang in iterator:
      source_lang, target_lang = source_lang.to(device), target_lang.to(device)

      output = model(source_lang, target_lang, teach_forcing_ratio = 0)

      # Skip the first *time step* (the <BOS> position). The tensors are
      # batch-first, so the earlier `[1:]` dropped the first sample instead.
      logits = output[:, 1:].reshape(-1, output.size()[-1]).to(device)
      targets = target_lang[:, 1:].reshape(-1)

      epoch_loss += criterion(logits, targets).item()
  return epoch_loss/len(iterator)

def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: earlier timestamp in seconds.
        end_time: later timestamp in seconds.

    Returns:
        `(minutes, seconds)`, both truncated to ints.
    """
    span = end_time - start_time
    minutes = int(span / 60)
    seconds = int(span - (minutes * 60))
    return minutes, seconds

def train(num_epochs, clip, model, train_iter, dev_iter, test_iter, criterion, device, optimizer = None):
  """Run the training loop with early stopping, then report the test loss.

  Per epoch: one training pass, one validation pass, an early-stopping
  update and a printed summary. Afterwards the best checkpointed model is
  evaluated on `test_iter` and the loss curves are saved to
  ./training_fig.png.

  Args:
    num_epochs: maximum number of epochs.
    clip: gradient-norm clipping threshold forwarded to `train_per_epoch`.
    model: model to train.
    train_iter, dev_iter, test_iter: batch iterators for the three splits.
    criterion: loss function.
    device: device the batches are moved to.
    optimizer: optimizer to use. When omitted, the module-level `optimizer`
      is used, which is what this function relied on before.

  Raises:
    ValueError: if no optimizer is passed and no module-level `optimizer`
      exists.
  """
  # Function-scope import: the prints below need math.exp, and the file's
  # import cell never imports math.
  import math

  if optimizer is None:
    optimizer = globals().get("optimizer")
  if optimizer is None:
    raise ValueError("train() needs an optimizer: pass optimizer=... or define a module-level `optimizer`")

  train_loss_log = []
  valid_loss_log = []
  early_stopper = early_stopping(patience = 10, save_path = "./save_model")
  for epoch in range(num_epochs):
    start_time = time.time()

    train_loss = train_per_epoch(model, train_iter, optimizer, criterion, clip, device)
    valid_loss = evaluate(model, dev_iter, criterion, device)
    early_stopper(valid_loss, model)
    train_loss_log.append(train_loss)
    valid_loss_log.append(valid_loss)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    if early_stopper.stop:
      break

  # No snapshot exists if the validation loss never improved (for example
  # when it is NaN); evaluate the live model in that case.
  best_model = early_stopper.best_model if early_stopper.best_model is not None else model
  test_loss = evaluate(best_model, test_iter, criterion, device)

  plt.plot(train_loss_log)
  plt.plot(valid_loss_log)
  plt.title('model loss')
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(['train', 'dev'], loc='upper left')
  plt.savefig(f"./training_fig.png", )

  print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

### 이미 1100 이터레이션(iteration) 정도 학습된 이후이다. 

In [None]:
# Move the model to the selected device, then train for up to 10 epochs with
# gradient clipping at 1. Padding positions are excluded from the loss via
# ignore_index. train() reads the module-level `optimizer` defined here.
model.to(device)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters())
train(10, 1, model, train_iter, dev_iter, test_iter, criterion, device)

-----------------------the loss of 0 iter is 10.214-----------------------
-----------------------the loss of 1 iter is 10.181-----------------------
-----------------------the loss of 2 iter is 10.148-----------------------
-----------------------the loss of 3 iter is 10.107-----------------------
-----------------------the loss of 4 iter is 10.031-----------------------
-----------------------the loss of 5 iter is 9.950-----------------------
-----------------------the loss of 6 iter is 9.859-----------------------
-----------------------the loss of 7 iter is 9.778-----------------------
-----------------------the loss of 8 iter is 9.642-----------------------
-----------------------the loss of 9 iter is 9.498-----------------------
-----------------------the loss of 10 iter is 9.369-----------------------
-----------------------the loss of 11 iter is 9.163-----------------------
-----------------------the loss of 12 iter is 8.932-----------------------
-----------------------the los

성능이 무척 안 좋다. 이유는 다음과 같을 것으로 추측된다. 
1. 길이 제한을 두지 않았다. 이로 인해 길이가 50 이상인 긴 문장들이 다수 존재하고, 이는 seq2seq의 한계로 작용한다. 
2. 모델 구조가 단순하다. 인코더를 더 쌓고 싶었으나, 학습에 너무 오랜 시간이 걸려 2층밖에 쌓지 못했다. 
3. 2번과 비슷한 이유로, 코랩을 사용하여 학습하다 보니 램의 한계로 모델을 크게 만들 수 없었다. 이로 인해 임베딩 벡터와 모델 내부 벡터의 크기를 128과 64로 작게 잡을 수밖에 없었다. 실제로 벡터 크기를 늘리자 조금 더 학습이 원활해지는 모습이다. 
4. pretrained-embedding을 사용하지 않았다. 
5. tokenizer의 문제가 있다. mecab과 spacy를 사용했는데 전처리를 거의 가하지 않아, 고유명사, 인명, 숫자 등이 그대로 입력되었다. 이로 인해 모델이 맥락을 파악하는 것이 어려웠을 것으로 추측된다. 
