In [None]:
# Make sure IPython is installed; its Image helper is used below to display the figures.
!pip install IPython
from IPython.display import Image

In [None]:
# Environment setup: a Python 3.7 package from apt and a pinned torchtext release.
# NOTE(review): installing python3.7 through apt presumably does not switch the interpreter
# of the already-running kernel — confirm this line is actually required.
!apt install python3.7
!pip install -U torchtext==0.8.1

# Download the spaCy English and German models used to tokenize sentences
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [None]:
!git clone https://github.com/Jiyoon52/LG_ES_RNN.git # only needed when running on Colab

# [Sequence to Sequence - many to many] 

##### jupyter notebook 단축키

- ctrl+enter: 셀 실행   
- shift+enter: 셀 실행 및 다음 셀 이동   
- alt+enter: 셀 실행, 다음 셀 이동, 새로운 셀 생성
- a: 상단에 새로운 셀 만들기
- b: 하단에 새로운 셀 만들기
- dd: 셀 삭제 (x: 셀 잘라내기)

In [None]:
# Display a figure from the cloned repository (the absolute path assumes the Colab clone location).
Image('/content/LG_ES_RNN/image/image13.JPG')

### 1. 모듈 불러오기

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

import torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import io

import random
from typing import Tuple

import warnings
warnings.filterwarnings(action='ignore') 

### 2. 데이터 불러오기

In [None]:
# Seed every random number generator used in this notebook so that runs are repeatable.
random_seed = 2022

# Python and NumPy generators
random.seed(random_seed)
np.random.seed(random_seed)

# PyTorch generators: CPU, current GPU, and all GPUs (multi-GPU setups)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

- https://www.statmt.org/wmt16/multimodal-task.html#task1
- 독일어-영어 번역 작업을 위해 제공하는 데이터셋
- DE: 독일어, EN: 영어

In [None]:
# Base URL of the raw Multi30k task-1 files and the archive names of each split
# (German file first, English file second).
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')


def _download_and_extract(archive_names):
  """Download each archive below `url_base`, extract it, and return the first extracted path of each."""
  extracted_paths = []
  for archive_name in archive_names:
    archive_path = download_from_url(url_base + archive_name)
    extracted_paths.append(extract_archive(archive_path)[0])
  return extracted_paths


train_filepaths = _download_and_extract(train_urls)
val_filepaths = _download_and_extract(val_urls)
test_filepaths = _download_and_extract(test_urls)

- tokenizers는 문장을 개별 token으로 변환해주는 데 사용
  - e.g. "I love you!" --> ["I", "love", "you", "!"]
- nlp를 쉽게 할 수 있도록 도와주는 python package인 `spaCy`를 이용하여, token화

In [None]:
# spaCy-backed tokenizers, one per language: German (model input) and English (model output)
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

- url로부터 내려받은 데이터를 tokenizing한 뒤, 등장한 단어(token)들로 vocab(단어 집합) 생성

In [None]:
def build_vocab(filepath, tokenizer):
  """Build a vocabulary from the token frequencies found in a text file.

  Args:
    filepath: Path of a UTF-8 text file; every line is tokenized on its own.
    tokenizer: Callable that turns one line of text into an iterable of tokens.

  Returns:
    A `Vocab` created from the token counts, with the special tokens
    '<unk>', '<pad>', '<bos>' and '<eos>'.
  """
  special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
  with io.open(filepath, encoding="utf8") as text_file:
    # Count tokens line by line, in file order
    token_counts = Counter(
        token for line in text_file for token in tokenizer(line))
  return Vocab(token_counts, specials=special_tokens)

# Vocabularies are built from the training files only (index 0: German file, index 1: English file)
de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

- vocab(단어 군)에서의 index를 활용하여 각 단어(토큰)에 숫자를 할당

In [None]:
def data_process(filepaths):
  """Convert a pair of parallel text files into tensors of vocabulary indices.

  Relies on the module-level `de_vocab`, `en_vocab`, `de_tokenizer` and
  `en_tokenizer`.

  Args:
    filepaths: Sequence whose first item is the German file path and whose
      second item is the English file path (UTF-8, one sentence per line).

  Returns:
    A list of `(de_tensor, en_tensor)` tuples of dtype `torch.long`, one per
    aligned line pair. Iteration stops at the end of the shorter file.
  """
  data = []
  # Open both files in one `with` so the handles are closed even if tokenization fails
  with io.open(filepaths[0], encoding="utf8") as de_file, \
       io.open(filepaths[1], encoding="utf8") as en_file:
    for (raw_de, raw_en) in zip(de_file, en_file):
      de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
                              dtype=torch.long)
      en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                              dtype=torch.long)
      data.append((de_tensor_, en_tensor_))
  return data

# Numericalize every split into (German tensor, English tensor) pairs
train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

# Number of sentence pairs in each split
# (the first line previously reported the test split twice instead of the training split)
print(f'train_data length is {len(train_data)}')
print(f'val_data length is {len(val_data)}')
print(f'test_data length is {len(test_data)}')

- 각 pair에 대한 예시 확인해보기

In [None]:
# Index-to-token lookup tables of both vocabularies
de_itos = de_vocab.itos
en_itos = en_vocab.itos

# Decode the first training pair back into tokens to sanity-check the numericalization.
# (The bare `train_data[0][0]` / `train_data[0][1]` expressions were dropped: a notebook
# only displays the last expression of a cell, so they had no visible effect.)
exam_de = [de_itos[i] for i in train_data[0][0]]
exam_en = [en_itos[i] for i in train_data[0][1]]

print('German sentence')
print(exam_de)

print('English sentence')
print(exam_en)

In [None]:
# Display a figure from the cloned repository (the absolute path assumes the Colab clone location).
Image('/content/LG_ES_RNN/image/image14.JPG')

In [None]:
# Number of sentence pairs per mini-batch
batch_size = 128
# Special-token indices, looked up in the German vocabulary. generate_batch applies the same
# indices to the English sequences too; both vocabularies are built with the same `specials`
# list in build_vocab — presumably yielding identical indices, TODO confirm.
PAD_IDX = de_vocab['<pad>'] # padding token
BOS_IDX = de_vocab['<bos>'] # begin of sentence token
EOS_IDX = de_vocab['<eos>'] # end of sentence token

In [None]:
def generate_batch(data_batch):
  """Collate `(de_tensor, en_tensor)` pairs into two padded batch tensors.

  Every sequence is wrapped as `[BOS_IDX, ..., EOS_IDX]`, then the sequences of
  each language are padded with `PAD_IDX` to the longest one in the batch.

  Args:
    data_batch: Iterable of `(de_item, en_item)` 1-D index tensors.

  Returns:
    `(de_batch, en_batch)`, each shaped `[max len, batch size]`.
  """
  bos = torch.tensor([BOS_IDX])
  eos = torch.tensor([EOS_IDX])

  def _wrap(sequence):
    # Surround one sentence with the begin/end-of-sentence markers
    return torch.cat([bos, sequence, eos], dim=0)

  pairs = list(data_batch)
  de_sequences = [_wrap(de_item) for de_item, _ in pairs]
  en_sequences = [_wrap(en_item) for _, en_item in pairs]
  return (pad_sequence(de_sequences, padding_value=PAD_IDX),
          pad_sequence(en_sequences, padding_value=PAD_IDX))

In [None]:
# Training batches are reshuffled at every epoch.
train_iter = DataLoader(train_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch)
# Evaluation loaders keep the dataset order: shuffling brings no benefit when no gradient
# step is taken, and a fixed order makes the reported loss reproducible and lets the
# predictions returned by evaluate() be traced back to specific examples.
valid_iter = DataLoader(val_data, batch_size=batch_size,
                        shuffle=False, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=batch_size,
                       shuffle=False, collate_fn=generate_batch)

### 3. Seq2Seq Modeling

#### 3.1 Define the Model Structure

In [None]:
Image('/content/LG_ES_RNN/image/image15.JPG')
# Reference: https://deep-learning-study.tistory.com/686

Encoder는 입력 sequence를 받아 encode하여 고정된 크기의 context vector를 생성하며, 
초기 cell state와 초기 hidden state는 zero tensor로 설정된다.

In [None]:
class Encoder(nn.Module):
    """LSTM encoder: embeds the source tokens and returns the final LSTM state.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        emb_dim: Size of each token embedding.
        hid_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Kept as attributes so Seq2Seq can check them against the decoder
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        # Token index -> emb_dim vector
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)

        # Embedded sequence -> hidden and cell states of size hid_dim
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim,
                           num_layers=n_layers, dropout=dropout)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: Token indices shaped [src len, batch size].

        Returns:
            `(hidden, cell)`, each shaped [n layers, batch size, hid dim].
        """
        token_vectors = self.embedding(src)        # [src len, batch size, emb dim]
        rnn_input = self.dropout(token_vectors)

        # No initial state is passed, so the LSTM starts from zeros.
        # The per-step outputs are not needed; only the final state is returned.
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state

        return hidden, cell

In [None]:
# Display a figure from the cloned repository (the absolute path assumes the Colab clone location).
Image('/content/LG_ES_RNN/image/image16.JPG')

Decoder는 encode된 context vector를 입력받아 decode하여 단어를 예측

In [None]:
class Decoder(nn.Module):
    """LSTM decoder that predicts one target token per call.

    Args:
        output_dim: Size of the target vocabulary (embedding rows and logits per step).
        emb_dim: Size of each token embedding.
        hid_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Kept as attributes so Seq2Seq can size its output buffer and check compatibility
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        # Target token index -> emb_dim vector
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)

        # Embedded token + previous state -> new hidden and cell states of size hid_dim
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim,
                           num_layers=n_layers, dropout=dropout)

        # Top-layer LSTM output -> one score per target-vocabulary entry
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: Current token indices, shaped [batch size].
            hidden: Previous hidden state, shaped [n layers, batch size, hid dim].
            cell: Previous cell state, shaped [n layers, batch size, hid dim].

        Returns:
            `(prediction, hidden, cell)` where `prediction` is shaped
            [batch size, output dim] and the states keep their input shape.
        """
        # The LSTM expects a sequence axis, so add one of length 1: [1, batch size]
        step_tokens = input.unsqueeze(0)

        step_embedded = self.dropout(self.embedding(step_tokens))   # [1, batch size, emb dim]

        rnn_out, (hidden, cell) = self.rnn(step_embedded, (hidden, cell))
        # rnn_out = [1, batch size, hid dim]

        # Drop the length-1 sequence axis before projecting onto the vocabulary
        prediction = self.fc_out(rnn_out.squeeze(0))

        return prediction, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target sequence step by step.

    Args:
        encoder: Module returning `(hidden, cell)` for a source batch; must expose
            `hid_dim` and `n_layers`.
        decoder: Module mapping `(input, hidden, cell)` to `(prediction, hidden, cell)`;
            must expose `hid_dim`, `n_layers` and `output_dim`.
        device: Device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder state is handed to the decoder as-is, so the shapes must agree
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Produce decoder scores for every target position after the first.

        Args:
            src: Source token indices, shaped [src len, batch size].
            trg: Target token indices, shaped [trg len, batch size].
            teacher_forcing_ratio: Probability, drawn independently at each step, of
                feeding the ground-truth token as the next decoder input instead of
                the decoder's own best prediction (e.g. 0.75 uses the ground truth
                about 75% of the time).

        Returns:
            Tensor shaped [trg len, batch size, decoder.output_dim]; row 0 is left
            as zeros because decoding starts at position 1.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim   # number of scores per step

        # Buffer collecting the decoder scores of every step
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

        # The final encoder state initialises the decoder state
        hidden, cell = self.encoder(src)

        # The first decoder input is the first target row (the <bos> tokens)
        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            # One random draw per step decides between ground truth and own prediction
            if random.random() < teacher_forcing_ratio:
                input = trg[t]
            else:
                input = output.argmax(1)

        return outputs

- encoder에 source(input) sentence를 입력
- encoder를 학습시켜 고정된 크기의 context vector를 출력
- context vector를 decoder에 넣어 예측된 target(output) sentence를 생성

#### 3.2 Define the Training and Testing Strategy

In [None]:
def train(model: nn.Module,
          iterator: torch.utils.data.DataLoader,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):
    """Train `model` for one pass over `iterator`.

    Batches are moved to the module-level `device` before the forward pass.

    Args:
        model: Called as `model(src, trg)`; its output is indexed as
            [trg len, batch size, n classes].
        iterator: Sized iterable yielding `(src, trg)` batches.
        optimizer: Optimizer updating the model parameters.
        criterion: Loss applied to the flattened scores and targets.
        clip: Maximum gradient norm enforced before each optimizer step.

    Returns:
        Sum of the batch losses divided by `len(iterator)`.
    """
    model.train()
    running_loss = 0

    for src, trg in iterator:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Position 0 is skipped on both sides; the remaining steps are flattened
        # into one long list of (scores, target) rows for the criterion.
        n_classes = scores.shape[-1]
        flat_scores = scores[1:].view(-1, n_classes)
        flat_targets = trg[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_targets)
        batch_loss.backward()

        # Rescale gradients whose total norm exceeds `clip`
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: torch.utils.data.DataLoader,
             criterion: nn.Module):
    """Compute the average loss of `model` over `iterator` without updating it.

    Batches are moved to the module-level `device`. The model is switched to eval
    mode and called with teacher forcing turned off (`model(src, trg, 0)`).

    Args:
        model: Module whose output is indexed as [trg len, batch size, n classes].
        iterator: Sized iterable yielding `(src, trg)` batches.
        criterion: Loss applied to the flattened scores and targets.

    Returns:
        `(mean_loss, output)` where `mean_loss` is the sum of batch losses divided
        by `len(iterator)` and `output` holds the flattened scores of the LAST batch
        only, shaped [(trg len - 1) * batch size, n classes].
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            scores = model(src, trg, 0)  # third argument 0: no teacher forcing

            # Skip position 0 and flatten steps and batch into rows
            n_classes = scores.shape[-1]
            output = scores[1:].view(-1, n_classes)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(output, flat_targets).item()

    return running_loss / len(iterator), output
  
def epoch_time(start_time: float,
               end_time: float) -> Tuple[int, int]:
    """Split an elapsed interval into whole minutes and whole seconds.

    Args:
        start_time: Start timestamp in seconds (e.g. from `time.time()`).
        end_time: End timestamp in seconds.

    Returns:
        `(minutes, seconds)` as integers; fractional seconds are truncated.
    """
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

### 4. Model Training

#### 4.1 Hyperparameter Setting

In [None]:
# Vocabulary sizes: the encoder reads German tokens, the decoder scores English tokens
input_dim = len(de_vocab)
output_dim = len(en_vocab)
# Embedding sizes of the encoder and the decoder
enc_emb_dim = 256
dec_emb_dim = 256
# LSTM hidden size and number of layers (shared by encoder and decoder)
hid_dim = 512
n_layers = 2
# Dropout probabilities of the encoder and the decoder
enc_dropout = 0.5
dec_dropout = 0.5
# Number of passes over the training data
num_epochs = 10
# Maximum gradient norm handed to train()
clip = 1
# Use the first GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu') 
# Where the trained weights are written and later reloaded from
# NOTE(review): assumes the ckpt directory already exists in the cloned repository — confirm
best_model_path = '/content/LG_ES_RNN/ckpt/seq2seq.pt'

#### 4.2 Construct Data Loaders and Model

In [None]:
# Encoder over the German vocabulary, decoder over the English vocabulary
enc = Encoder(input_dim, enc_emb_dim, hid_dim, n_layers, enc_dropout)
dec = Decoder(output_dim, dec_emb_dim, hid_dim, n_layers, dec_dropout)

# Combine both into the Seq2Seq module and move its parameters to the selected device
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
# Weight initialisation
def init_weights(m: nn.Module):
    """Re-initialise every parameter reachable from `m`, in place.

    Parameters whose name contains 'weight' are drawn from a normal distribution
    with mean 0 and standard deviation 0.01; all other parameters are set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            # e.g. biases
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
# Module.apply calls init_weights on every submodule and on the model itself. Because
# init_weights walks named_parameters() of whatever it receives, nested parameters are
# re-drawn more than once; the final values still follow the same initialisation rule.
model.apply(init_weights)

In [None]:
# Count the parameters of the model that will be trained
def count_parameters(model: nn.Module):
    """Return the total number of elements across parameters with `requires_grad` set."""
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total

# Report the trainable-parameter count with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

#### 4.3 Model Training and Save Weights(Parameters)

In [None]:
# Adam optimizer with its default settings
optimizer = optim.Adam(model.parameters())
# Rebinds the global PAD_IDX (first taken from de_vocab) using the English vocabulary;
# generate_batch reads this same global when it pads batches.
PAD_IDX = en_vocab.stoi['<pad>']

# Targets equal to the <pad> index are ignored by the loss
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
# Lowest validation loss seen so far; a checkpoint is written whenever it improves.
# (Previously this value was never updated and the weights of the LAST epoch were
# saved to best_model_path, regardless of their validation loss.)
best_valid_loss = float('inf')

for epoch in range(num_epochs):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, clip)
    valid_loss, _ = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the best epoch on the validation set
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), best_model_path)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

if math.isinf(best_valid_loss):
    # No epoch produced a finite, improving validation loss (e.g. num_epochs == 0):
    # save the current weights so the checkpoint file still exists for the next section.
    torch.save(model.state_dict(), best_model_path)
else:
    # Report the test loss of the best checkpoint rather than of the last epoch
    model.load_state_dict(torch.load(best_model_path))

test_loss, pred = evaluate(model, test_iter, criterion)
print(f'| Test Loss: {test_loss:.3f}')

### 5. Model Validation

In [None]:
# Restore the saved weights. map_location remaps the stored tensors onto the current
# device, so a checkpoint written on a GPU also loads on a CPU-only runtime.
model.load_state_dict(torch.load(best_model_path, map_location=device))

In [None]:
# Evaluate the restored model on the test set. evaluate() returns the mean batch loss and
# the flattened decoder scores of the last batch it processed (not of the whole test set).
test_loss, pred_output = evaluate(model, test_iter, criterion)

# EOD