In [1]:
# Code from https://codlingual.tistory.com/91

In [2]:
!pip install torchtext==0.6.0
!python -m spacy download en_core_web_sm
!python -m spacy download de

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 634 kB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 40.7 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
Successfully installed sentencepiece-0.1.97 torchtext-0.6.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
from torchtext import datasets

import spacy
import numpy as np

import random
import math
import time

In [4]:
seed = 42

# Seed PyTorch, NumPy and Python's own RNG with the same value so runs are repeatable.
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Load the spaCy pipeline for each language; the tokenize_* helpers below use their tokenizers.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [6]:
def tokenize_de(text):
    """Tokenize German text and return the token strings in reversed order."""
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    # The source sentence is fed to the encoder back to front.
    tokens.reverse()
    return tokens
    
def tokenize_en(text):
    """Tokenize English text and return the token strings in their original order."""
    words = []
    for token in spacy_en.tokenizer(text):
        words.append(token.text)
    return words

# <b>1. Torchtext로 전처리하기</b>

## <b>(1) Field 정의하기</b>

In [7]:
# SRC: the source (input) side, tokenized with tokenize_de.
SRC = data.Field(
    tokenize=tokenize_de,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)
# TRG: the target (output) side, tokenized with tokenize_en.
TRG = data.Field(
    tokenize=tokenize_en,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

## <b>(2) dataset 생성하기</b>

In [8]:
# exts: which languages to use, with the input language listed first.
# fields: the (input, output) fields, in the same order as exts.
trn_data, val_data, tst_data = datasets.Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 5.52MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 1.79MB/s]

downloading mmt_task1_test2016.tar.gz



mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 1.59MB/s]


In [9]:
# Inspect the first training example. The German 'src' tokens appear back to
# front because tokenize_de reverses each token list before returning it.
from pprint import pprint
pprint(vars(trn_data.examples[0]))

{'src': ['.',
         'büsche',
         'vieler',
         'nähe',
         'der',
         'in',
         'freien',
         'im',
         'sind',
         'männer',
         'weiße',
         'junge',
         'zwei'],
 'trg': ['two',
         'young',
         ',',
         'white',
         'males',
         'are',
         'outside',
         'near',
         'many',
         'bushes',
         '.']}


## <b>(3) 단어 집합(vocab) 만들기</b>

In [10]:
# Build both vocabularies from the training split only.
# min_freq=2: presumably tokens seen fewer than twice are left out of the
# vocabulary — confirm against the torchtext Field.build_vocab documentation.
SRC.build_vocab(trn_data, min_freq=2)
TRG.build_vocab(trn_data, min_freq=2)

In [11]:
# Show the 20 most frequent source-side tokens with their counts.
pprint(SRC.vocab.freqs.most_common(20))

[('.', 28809),
 ('ein', 18851),
 ('einem', 13711),
 ('in', 11895),
 ('eine', 9909),
 (',', 8938),
 ('und', 8925),
 ('mit', 8843),
 ('auf', 8745),
 ('mann', 7805),
 ('einer', 6765),
 ('der', 4990),
 ('frau', 4186),
 ('die', 3949),
 ('zwei', 3873),
 ('einen', 3479),
 ('im', 3107),
 ('an', 3062),
 ('von', 2363),
 ('sich', 2273)]


In [12]:
# Show the 20 most frequent target-side tokens with their counts.
pprint(TRG.vocab.freqs.most_common(20))

[('a', 49165),
 ('.', 27623),
 ('in', 14886),
 ('the', 10955),
 ('on', 8035),
 ('man', 7781),
 ('is', 7525),
 ('and', 7379),
 ('of', 6871),
 ('with', 6179),
 ('woman', 3973),
 (',', 3963),
 ('two', 3886),
 ('are', 3717),
 ('to', 3128),
 ('people', 3122),
 ('at', 2927),
 ('an', 2861),
 ('wearing', 2623),
 ('shirt', 2324)]


## <b>(4) data를 불러오기 위한 iterator 생성하기</b>
- torchtext.data.BucketIterator

In [13]:
# Model and training configuration.
# Vocabulary sizes; passed to Encoder / Decoder as input_dim / output_dim.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Hidden size; Seq2Seq asserts it is the same for encoder and decoder.
HIDDEN_DIM = 512

# Embedding sizes for the encoder and decoder.
ENC_EMBED_DIM = 256
DEC_EMBED_DIM = 256

# Number of stacked LSTM layers; Seq2Seq asserts it is the same for encoder and decoder.
N_LAYERS=2

# Dropout probabilities for the encoder and decoder.
ENC_DROPOUT=0.5
DEC_DROPOUT=0.5
# Batch size handed to the BucketIterator.
BATCH_SIZE = 64
# Learning rate.
LR = 0.001
# Number of passes over the training set.
EPOCHS = 10

In [14]:
# One iterator per split, all with the same batch size and target device.
trn_iter, val_iter, tst_iter = data.BucketIterator.splits(
    datasets=(trn_data, val_data, tst_data),
    batch_size=BATCH_SIZE,
    device=device,
)

# <b> 2. seq-to-seq model 구현하기(LSTM)</b>

## <b>(1) Encoder</b>

In [15]:
class Encoder(nn.Module):
    """Embed source token ids and encode them with a multi-layer LSTM.

    Args:
        input_dim: size of the source vocabulary (rows of the embedding table).
        embed_dim: embedding size.
        hidden_dim: LSTM hidden-state size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to the LSTM.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim, n_layers, dropout):
        super().__init__()

        # Exposed as attributes so a wrapper can check them against the decoder's.
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embed = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: token ids, [src len, batch size].

        Returns:
            (hidden, cell), each [n layers * n directions, batch size, hid dim].
        """
        # token_vectors: [src len, batch size, emb dim]
        token_vectors = self.embed(src)
        rnn_input = self.dropout(token_vectors)

        # The per-step outputs ([src len, batch size, hid dim * n directions])
        # are discarded; only the final states are handed on.
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state

        return hidden, cell


## <b>(2) Decoder</b>

In [16]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: one target token in, vocabulary scores out.

    Args:
        output_dim: size of the target vocabulary.
        embed_dim: embedding size.
        hidden_dim: LSTM hidden-state size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed to the LSTM.
    """

    def __init__(self, output_dim, embed_dim, hidden_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.embed = nn.Embedding(output_dim, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Decode one time step.

        Args:
            input: current token ids, [batch size].
            hidden: LSTM hidden state, [n layers, batch size, hid dim].
            cell: LSTM cell state, [n layers, batch size, hid dim].

        Returns:
            (pred, hidden, cell): pred is [batch size, output dim]; hidden and
            cell are the updated states with the same shapes as the inputs.
        """
        # Add a sequence axis of length 1: [batch size] -> [1, batch size].
        step_tokens = input.unsqueeze(0)

        # step_embedded: [1, batch size, emb dim]
        step_embedded = self.dropout(self.embed(step_tokens))

        # One token is decoded per call, so seq len is 1 and
        # step_output is [1, batch size, hid dim].
        step_output, next_state = self.rnn(step_embedded, (hidden, cell))
        hidden, cell = next_state

        # Remove the length-1 sequence axis before projecting onto the vocabulary.
        pred = self.fc_out(step_output.squeeze(0))

        return pred, hidden, cell

## <b>(3) Seq2Seq</b>

In [17]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one token at a time.

    Args:
        encoder: module returning (hidden, cell) for a source batch; must expose
            `hidden_dim` and `n_layers`.
        decoder: module mapping (input, hidden, cell) to (pred, hidden, cell);
            must expose `hidden_dim`, `n_layers` and `output_dim`.

    Raises:
        AssertionError: if encoder and decoder disagree on hidden size or layer count.
    """

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

        # The encoder's final states seed the decoder, so their shapes must match.
        assert encoder.hidden_dim == decoder.hidden_dim, \
            "Hidden dimensions of encoder and decoder must be equal"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have the same number of layers"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Run the encoder once, then decode step by step.

        Args:
            src: source token ids, [src len, batch size].
            trg: target token ids, [trg len, batch size]; row 0 is the <sos> token.
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor [trg len, batch size, output dim] of decoder scores, on the
            same device as `trg`. Row 0 is never written and stays all zeros.
        """
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate the result buffer on the same device as the targets, so the
        # per-step writes below do not copy across devices when running on a GPU.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=trg.device)

        # The encoder's final states are the decoder's initial states.
        hidden, cell = self.encoder(src)

        # The first decoder input is the <sos> token.
        input = trg[0, :]

        # Start at 1: position 0 of trg is always <sos>, so outputs[0] is left as zeros.
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            # random.random() is uniform on [0, 1), so this is True with
            # probability teacher_forcing_ratio.
            teacher_force = random.random() < teacher_forcing_ratio

            # Highest-scoring token for each batch element.
            top1 = output.argmax(1)

            # Next input: the ground-truth token when teacher forcing, else the prediction.
            input = trg[t] if teacher_force else top1

        return outputs

## <b>(4) 모델 객체 생성 및 optimizer 정의</b>

In [18]:
# Instantiate encoder and decoder from the configuration constants and wrap
# them in the Seq2Seq model, all on `device`.
enc = Encoder(
    input_dim=INPUT_DIM,
    embed_dim=ENC_EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
).to(device)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    embed_dim=DEC_EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
).to(device)
model = Seq2Seq(enc, dec).to(device)


# <b>3. 모델 학습 및 평가 함수 생성하기</b>

In [19]:
# Pass the configured learning rate explicitly so that editing LR in the
# configuration cell actually changes the optimizer.
optimizer = optim.Adam(model.parameters(), lr=LR)

# Positions whose target is the <pad> (padding) token are excluded from the loss.
trg_pad_idx = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

In [20]:
def train(model, iterator, optimizer, criterion, clip, device):
    """Train `model` for one pass over `iterator`.

    Args:
        model: callable as model(src, trg), returning [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg` tensors.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (flattened scores, flattened targets).
        clip: maximum gradient norm.
        device: device the batch tensors and model output are moved to.

    Returns:
        The summed batch loss divided by the number of batches.
    """
    model.train()
    total_loss = 0

    for batch in iterator:
        src, trg = batch.src.to(device), batch.trg.to(device)

        optimizer.zero_grad()

        # trg:    [trg len, batch size]
        # logits: [trg len, batch size, output dim]
        logits = model(src, trg).to(device)
        vocab_size = logits.shape[-1]

        # Skip position 0 and flatten:
        # flat_logits:  [(trg len - 1) * batch size, output dim]
        # flat_targets: [(trg len - 1) * batch size]
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        batch_loss = criterion(flat_logits, flat_targets)
        batch_loss.backward()

        # Clip the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        total_loss += batch_loss.item()

    return total_loss / len(iterator)

In [21]:
def evaluate(model, iterator, criterion, device):
    """Compute the mean batch loss of `model` over `iterator` without updating it.

    Args:
        model: callable as model(src, trg, teacher_forcing_ratio), returning
            [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg` tensors.
        criterion: loss taking (flattened scores, flattened targets).
        device: device the batch tensors and model output are moved to.

    Returns:
        The summed batch loss divided by the number of batches.
    """
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src.to(device), batch.trg.to(device)

            # Teacher forcing is switched off (ratio 0): the model must not be
            # given the ground-truth tokens during evaluation.
            logits = model(src, trg, 0).to(device)
            vocab_size = logits.shape[-1]

            # Skip position 0 and flatten:
            # flat_logits:  [(trg len - 1) * batch size, output dim]
            # flat_targets: [(trg len - 1) * batch size]
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            total_loss += criterion(flat_logits, flat_targets).item()

    return total_loss / len(iterator)

In [22]:
# Maximum gradient norm passed to train().
CLIP = 1

# Lowest validation loss seen so far; the checkpoint is only written on improvement.
best_val_loss = float('inf')

for epoch in range(EPOCHS):

    start_time = time.time()

    trn_loss = train(model, trn_iter, optimizer, criterion, CLIP, device)
    val_loss = evaluate(model, val_iter, criterion, device)

    end_time = time.time()

    elapsed_time = end_time-start_time

    if val_loss < best_val_loss:
        # Update the tracked best value (previously a misspelled name was assigned
        # here, so the comparison above was always against +inf and the checkpoint
        # was overwritten after every epoch).
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02} | Elapsed Time: {elapsed_time:.2f}s')
    print(f'\tTrain Loss: {trn_loss:.4f} | Train PPL: {math.exp(trn_loss):7.3f}')
    print(f'\t Val. Loss: {val_loss:.4f} |  Val. PPL: {math.exp(val_loss):7.3f}')

Epoch: 01 | Elapsed Time: 176.19s
	Train Loss: 4.8179 | Train PPL: 123.710
	 Val. Loss: 4.7150 |  Val. PPL: 111.613
Epoch: 02 | Elapsed Time: 175.90s
	Train Loss: 4.2756 | Train PPL:  71.922
	 Val. Loss: 4.5061 |  Val. PPL:  90.565
Epoch: 03 | Elapsed Time: 176.21s
	Train Loss: 4.0001 | Train PPL:  54.603
	 Val. Loss: 4.3038 |  Val. PPL:  73.981
Epoch: 04 | Elapsed Time: 175.83s
	Train Loss: 3.7943 | Train PPL:  44.449
	 Val. Loss: 4.1464 |  Val. PPL:  63.205
Epoch: 05 | Elapsed Time: 176.03s
	Train Loss: 3.6027 | Train PPL:  36.699
	 Val. Loss: 4.0731 |  Val. PPL:  58.737
Epoch: 06 | Elapsed Time: 174.95s
	Train Loss: 3.4380 | Train PPL:  31.124
	 Val. Loss: 3.9179 |  Val. PPL:  50.294
Epoch: 07 | Elapsed Time: 174.96s
	Train Loss: 3.3156 | Train PPL:  27.538
	 Val. Loss: 3.8592 |  Val. PPL:  47.429
Epoch: 08 | Elapsed Time: 175.49s
	Train Loss: 3.1845 | Train PPL:  24.156
	 Val. Loss: 3.8551 |  Val. PPL:  47.235
Epoch: 09 | Elapsed Time: 175.12s
	Train Loss: 3.0755 | Train PPL:  21.6