### data  로드
* https://github.com/cpm0722/transformer_pytorch/ 참고 

In [1]:
import torch
import torchtext
import spacy
import os
import pickle
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
import torchtext.transforms as T

In [2]:
# One-time setup: download the spaCy models before running this cell.
# ! python -m spacy download en
# ! python -m spacy download de

# Language code -> name of the spaCy pipeline used to tokenize that language.
spacy_lang_dict = dict(en="en_core_web_sm", de="de_core_news_sm")

# The source side is tokenized with the English pipeline, the target side
# with the German one.
tokenizer_src = get_tokenizer("spacy", language=spacy_lang_dict["en"])
tokenizer_tgt = get_tokenizer("spacy", language=spacy_lang_dict["de"])

In [7]:
cache_dir = '../data/multi30k'
raw_dir = os.path.join(cache_dir, "raw")

# Target paths for the (currently disabled) pickle caches of each split.
train_file = os.path.join(cache_dir, "train.pkl")
valid_file = os.path.join(cache_dir, "valid.pkl")
test_file = os.path.join(cache_dir, "test.pkl")


def _read_lines(name):
    """Return the lines of ``raw_dir/name`` with trailing whitespace removed."""
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding; the corpus presumably contains non-ASCII characters
    # (German text) — verify against the raw files.
    with open(os.path.join(raw_dir, name), "r", encoding="utf-8") as f:
        return [text.rstrip() for text in f]


def _pair_up(src_lines, tgt_lines, split):
    """Zip two parallel line lists into ``(src, tgt)`` tuples.

    Raises:
        ValueError: if the two sides have different lengths. ``zip`` would
            otherwise silently drop the surplus lines and hide a
            misaligned corpus.
    """
    if len(src_lines) != len(tgt_lines):
        raise ValueError(
            f"{split}: source has {len(src_lines)} lines but target has "
            f"{len(tgt_lines)}"
        )
    return list(zip(src_lines, tgt_lines))


# train.pkl
train_en = _read_lines("train.en")
train_de = _read_lines("train.de")
train = _pair_up(train_en, train_de, "train")

# with open(train_file, "wb") as f:
#         pickle.dump(train, f)

# valid.pkl
valid_en = _read_lines("val.en")
valid_de = _read_lines("val.de")
valid = _pair_up(valid_en, valid_de, "valid")

# with open(valid_file, "wb") as f:
#         pickle.dump(valid, f)

# test.pkl
test_en = _read_lines("test_2016_flickr.en")
test_de = _read_lines("test_2016_flickr.de")
test = _pair_up(test_en, test_de, "test")

# with open(test_file, "wb") as f:
#         pickle.dump(test, f)

In [8]:
# build_vocab()

def yield_tokens(is_src=True):
    """Yield one list of string tokens per training pair.

    Args:
        is_src: when true, tokenize element 0 of each pair with
            ``tokenizer_src``; otherwise tokenize element 1 with
            ``tokenizer_tgt``.
    """
    if is_src:
        tokenize, column = tokenizer_src, 0
    else:
        tokenize, column = tokenizer_tgt, 1

    for text_pair in train:
        yield [str(token) for token in tokenize(text_pair[column])]

            
# Special tokens and the vocabulary index each one is expected to occupy.
specials={
        "<unk>": 0,
        "<pad>": 1,
        "<sos>": 2,
        "<eos>": 3
        }

# Pass the specials as a real list, ordered by their declared index, so the
# indices the vocab assigns do not depend on the dict's insertion order.
special_tokens = sorted(specials, key=specials.get)

vocab_src_file = os.path.join(cache_dir, "vocab_en.pkl")
vocab_src = build_vocab_from_iterator(yield_tokens(is_src=True), min_freq=2, specials=special_tokens)
# Out-of-vocabulary tokens map to <unk>.
vocab_src.set_default_index(specials["<unk>"])
# with open(vocab_src_file, "wb") as f:
#         pickle.dump(vocab_src, f)

vocab_tgt_file = os.path.join(cache_dir, "vocab_de.pkl")
vocab_tgt = build_vocab_from_iterator(yield_tokens(is_src=False), min_freq=2, specials=special_tokens)
vocab_tgt.set_default_index(specials["<unk>"])
# with open(vocab_tgt_file, "wb") as f:
#         pickle.dump(vocab_tgt, f)
        
def get_transform(vocab, max_seq_len=256, sos_idx=2, eos_idx=3, pad_idx=1):
    """Build the token-list -> padded index tensor pipeline for ``vocab``.

    Args:
        vocab: vocabulary used to map tokens to indices.
        max_seq_len: maximum output length including the two boundary tokens.
        sos_idx: index added at the start of every sequence.
        eos_idx: index added at the end of every sequence.
        pad_idx: value used to pad shorter sequences in a batch.

    Returns:
        A ``T.Sequential`` applying lookup, truncation, boundary tokens and
        tensor conversion, in that order.
    """
    return T.Sequential(
            T.VocabTransform(vocab),
            # Truncate to max_seq_len - 2 so the two tokens added below still fit.
            T.Truncate(max_seq_len-2),
            T.AddToken(token=sos_idx, begin=True),
            T.AddToken(token=eos_idx, begin=False),
            T.ToTensor(padding_value=pad_idx))

# One transform pipeline per vocabulary.
transform_src, transform_tgt = (
    get_transform(vocabulary) for vocabulary in (vocab_src, vocab_tgt)
)


def collate_fn(pairs):
    """Turn a batch of text pairs into ``(batch_src, batch_tgt)``.

    Element 0 of each pair is tokenized with ``tokenizer_src`` and element 1
    with ``tokenizer_tgt``; each token batch is then passed through the
    matching transform.
    """
    src_tokens = []
    tgt_tokens = []
    for pair in pairs:
        src_tokens.append(tokenizer_src(pair[0]))
        tgt_tokens.append(tokenizer_tgt(pair[1]))
    return transform_src(src_tokens), transform_tgt(tgt_tokens)

# One loader per split, all sharing the same collate function. No batch size
# is passed, so DataLoader's default applies.
train_iter, valid_iter, test_iter = (
    DataLoader(split, collate_fn=collate_fn) for split in (train, valid, test)
)

In [9]:
# Grab only the first training batch for inspection.
for idx, batch in enumerate(train_iter):
    src, trg = batch
    trg_x = trg[:, :-1]  # target without its last position
    trg_y = trg[:, 1:]   # target without its first position
    break
    

### mask 생성 
* https://github.com/cpm0722/transformer_pytorch/ 참고 

In [13]:
# Transformer class 내부 mask 생성방법 이해 

def make_src_mask(src):
    """Return the padding mask for ``src`` attending to itself."""
    return make_pad_mask(src, src)

def make_tgt_mask(tgt):
    """Return the target self-attention mask.

    The result is the element-wise AND of the padding mask and the
    subsequent (lower-triangular) mask, both built from ``tgt``.
    """
    padding = make_pad_mask(tgt, tgt)
    causal = make_subsequent_mask(tgt, tgt)
    return padding & causal

def make_src_tgt_mask(src, tgt):
    """Return the padding mask with ``tgt`` as query and ``src`` as key."""
    return make_pad_mask(tgt, src)

import numpy as np

def make_pad_mask(query, key, pad_idx=1):
    """Build a boolean mask that is True where neither side is padding.

    Args:
        query: index tensor of shape (n_batch, query_seq_len).
        key: index tensor of shape (n_batch, key_seq_len).
        pad_idx: index value treated as padding.

    Returns:
        Bool tensor of shape (n_batch, 1, query_seq_len, key_seq_len);
        entry [b, 0, i, j] is True iff query[b, i] and key[b, j] both
        differ from ``pad_idx``.
    """
    key_valid = key.ne(pad_idx)[:, None, None, :]      # (n_batch, 1, 1, key_seq_len)
    query_valid = query.ne(pad_idx)[:, None, :, None]  # (n_batch, 1, query_seq_len, 1)

    # Broadcasting the AND expands both operands to the full 4-D shape.
    return key_valid & query_valid
 
def make_subsequent_mask(query, key): #  make_no_peak_mask()
    """Return a (query_seq_len, key_seq_len) lower-triangular bool mask.

    The diagonal is included, so position i may attend to positions 0..i.
    The mask is created on ``query``'s device.
    """
    n_rows, n_cols = query.size(1), key.size(1)
    lower = torch.ones(n_rows, n_cols, device=query.device).tril()
    return lower.bool()

## src mask: step-by-step version of make_pad_mask(src, src)
query = src
key = src
pad_idx = 1

query_seq_len, key_seq_len = query.size(1),key.size(1) # src is (n_batch, seq_len); e.g. 13 for the sample batch

key_mask = key.ne(pad_idx).unsqueeze(1).unsqueeze(2)  # (n_batch, 1, 1, key_seq_len)
key_mask = key_mask.repeat(1, 1, query_seq_len, 1)    # (n_batch, 1, query_seq_len, key_seq_len)

query_mask = query.ne(pad_idx).unsqueeze(1).unsqueeze(3) # (n_batch, 1, query_seq_len, 1)
query_mask = query_mask.repeat(1, 1, 1, key_seq_len)     # (n_batch, 1, query_seq_len, key_seq_len)

# True only where both the query token and the key token are not padding.
src_mask = key_mask & query_mask
src_mask.requires_grad = False

## src_tgt mask: same computation with trg_x as query and src as key
query = trg_x
key = src
pad_idx = 1

query_seq_len, key_seq_len = query.size(1),key.size(1) # e.g. 14, 13 for the sample batch

key_mask = key.ne(pad_idx).unsqueeze(1).unsqueeze(2)  # (n_batch, 1, 1, key_seq_len)
key_mask = key_mask.repeat(1, 1, query_seq_len, 1)    # (n_batch, 1, query_seq_len, key_seq_len)

query_mask = query.ne(pad_idx).unsqueeze(1).unsqueeze(3) # (n_batch, 1, query_seq_len, 1)
query_mask = query_mask.repeat(1, 1, 1, key_seq_len)     # (n_batch, 1, query_seq_len, key_seq_len)

src_trg_mask = key_mask & query_mask
src_trg_mask.requires_grad = False

## trg mask: padding mask AND subsequent mask
pad_mask = make_pad_mask(trg_x, trg_x)

# subsequent mask
query = trg_x
key = trg_x
query_seq_len, key_seq_len = query.size(1), key.size(1) # e.g. 14, 14 for the sample batch
tril = np.tril(np.ones((query_seq_len, key_seq_len)), k=0).astype('uint8') # lower triangle; k=0 keeps the main diagonal
seq_mask = torch.tensor(tril, dtype=torch.bool, requires_grad=False, device=query.device)

# seq_mask is 2-D and broadcasts against the 4-D pad_mask.
trg_mask = pad_mask & seq_mask
trg_mask.requires_grad = False

### Transformer
* https://github.com/hyunwoongko/transformer 참고 

In [53]:
import math
import torch
from torch import nn

class TokenEmbedding(nn.Embedding):
    """Token lookup table whose output is scaled by ``sqrt(d_model)``.

    The class *is* the embedding (it subclasses ``nn.Embedding``), so no
    second inner embedding is created. Index 1 is the padding index, so its
    row is initialised to zeros and receives no gradient.
    """

    def __init__(self, vocab_size, d_model):
        """
        Args:
            vocab_size: number of rows in the embedding table.
            d_model: size of each embedding vector.
        """
        super().__init__(vocab_size, d_model, padding_idx=1)
        self.d_embed = d_model

    def forward(self, x):
        """Embed ``x`` of shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, seq_len, d_model), scaled by
            ``sqrt(d_model)``.
        """
        return super().forward(x) * math.sqrt(self.d_embed)

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding.

    The transformer does not consume tokens sequentially, so position
    information has to be supplied separately from the token embedding.
    """

    def __init__(self, d_model, max_len, device):
        """
        Args:
            d_model: size of each encoding vector.
            max_len: number of positions to precompute.
            device: device on which the table is created.
        """
        super().__init__()

        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)  # (max_len, 1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()           # [0, 2, 4, ...]
        angles = pos / (10000 ** (_2i / d_model))                               # (max_len, ceil(d_model / 2))

        encoding = torch.zeros(max_len, d_model, device=device)                 # (max_len, d_model)
        encoding[:, 0::2] = torch.sin(angles)                                   # even columns
        # For odd d_model there is one fewer odd column than even columns.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])                # odd columns

        # A non-persistent buffer is never trained, follows Module.to(...),
        # and is kept out of the state_dict.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return the encodings for the first ``x.size(1)`` positions.

        Args:
            x: index tensor of shape (batch_size, seq_len).

        Returns:
            Tensor of shape (seq_len, d_model).

        Raises:
            ValueError: if ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.encoding.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.encoding.size(0)}"
            )
        return self.encoding[:seq_len, :]
    

In [24]:
from torch import nn

class TransformerEmbedding(nn.Module):
    """
    token embedding + positional encoding, followed by dropout
    """
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        """
        Args:
            vocab_size: vocabulary size passed to the token embedding.
            d_model: embedding dimension.
            max_len: number of positions passed to the positional encoding.
            drop_prob: dropout probability applied to the summed embedding.
            device: device passed to the positional encoding.
        """
        super(TransformerEmbedding, self).__init__()

        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        # The class defined in this file is spelled PositionalEncoding.
        self.pos_emb = PositionalEncoding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Return dropout(token embedding + positional encoding) for ``x``."""
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        return self.drop_out(tok_emb + pos_emb)

In [26]:
class ScaleDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(q @ k^T / sqrt(d)) @ v."""

    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        '''
        q (batch_size, num_head, query_seq_len, d_model//num_head)
        k (batch_size, num_head, key_seq_len, d_model//num_head)
        v (batch_size, num_head, key_seq_len, d_model//num_head)
        mask: optional tensor broadcastable to
              (batch_size, num_head, query_seq_len, key_seq_len);
              positions where it equals 0/False are not attended to.

        Returns (attention_value, attention_score).
        '''
        d_tensor = k.size(-1)

        # 1. similarity between every query and every key, scaled by sqrt(d)
        k_t = k.transpose(2, 3)
        score = (q @ k_t) / math.sqrt(d_tensor)  # (batch_size, num_head, query_seq_len, key_seq_len)

        # 2. apply the mask. `mask is not None` avoids the ambiguous truth
        #    value of a multi-element tensor; masked positions get a large
        #    negative score so softmax gives them ~0 weight.
        if mask is not None:
            score = score.masked_fill(mask == 0, -100000)

        # 3. attention weights
        score = self.softmax(score)

        # 4. weighted sum of the values
        v = score @ v  # (batch_size, num_head, query_seq_len, d_model//num_head)

        return v, score # attention_value, attention_score

In [71]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project, split into heads, attend, merge, project."""

    def __init__(self, d_model, num_head):
        """
        Args:
            d_model: model dimension; must be divisible by ``num_head``.
            num_head: number of attention heads.

        Raises:
            ValueError: if ``d_model`` is not divisible by ``num_head``.
        """
        super(MultiHeadAttention, self).__init__()

        if d_model % num_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_head ({num_head})"
            )

        self.num_head = num_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        # Output projection applied after the heads are concatenated
        # (W^O in "Attention Is All You Need").
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """
        Args:
            q, k, v: tensors of shape (batch_size, seq_len, d_model).
            mask: optional attention mask passed through to the attention layer.

        Returns:
            Tensor of shape (batch_size, query_seq_len, d_model).
        """
        ## 1. linear projections, each (batch_size, seq_len, d_model)
        _q, _k, _v = self.w_q(q), self.w_k(k), self.w_v(v)

        ## 2. split into heads: (batch_size, num_head, seq_len, d_model//num_head)
        q, k, v = self.split(_q), self.split(_k), self.split(_v)

        ## 3. attention layer (the attention scores are not used here)
        out, _ = self.attention(q, k, v, mask)

        ## 4. merge the heads and apply the output projection
        out = self.concat(out)
        return self.w_concat(out)

    def split(self, tensor):
        """(batch_size, seq_len, d_model) -> (batch_size, num_head, seq_len, d_model//num_head)."""
        batch_size, seq_len, d_model = tensor.size()
        d_tensor = d_model // self.num_head
        spt_tensor = tensor.view(batch_size, seq_len, self.num_head, d_tensor).transpose(1, 2)

        return spt_tensor

    def concat(self, tensor):
        """(batch_size, num_head, seq_len, d_model//num_head) -> (batch_size, seq_len, d_model)."""
        batch_size, num_head, seq_len, d_tensor = tensor.size()
        d_model = num_head * d_tensor
        # transpose makes the tensor non-contiguous, so contiguous() is
        # required before view().
        concat_tensor = tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)

        return concat_tensor
    
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift."""

    def __init__(self, d_model, eps=1e-12):
        """
        Args:
            d_model: size of the last (normalized) dimension.
            eps: constant added to the variance for numerical stability.
        """
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model)) # (d_model)
        self.beta = nn.Parameter(torch.zeros(d_model)) # (d_model)
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` of shape (..., d_model) along its last dimension."""
        # Mean and variance use the same axis (dim=-1), so inputs of any
        # rank work, not only (batch_size, seq_len, d_model).
        mean = x.mean(dim=-1, keepdim=True)                # (..., 1)
        var = x.var(dim=-1, unbiased=False, keepdim=True)  # (..., 1), biased variance

        out = (x - mean) / torch.sqrt(var + self.eps)      # (..., d_model)
        out = out * self.gamma + self.beta                 # (..., d_model)

        return out
    
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        Args:
            d_model: input and output feature size.
            hidden: feature size between the two linear layers.
            drop_prob: dropout probability applied after the activation.
        """
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden, bias=True)  # bias=True is the default
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Map ``x`` (batch_size, seq_len, d_model) to the same shape."""
        # (batch_size, seq_len, hidden) until the final projection
        activated = self.dropout(self.relu(self.linear1(x)))
        return self.linear2(activated)
    
class EncoderLayer(nn.Module):
    """One encoder block.

    Self-attention and a position-wise feed-forward network, each followed
    by dropout, a residual addition and layer normalization.
    """

    def __init__(self, d_model, n_head, hidden, drop_prob):
        super().__init__()

        self.attention = MultiHeadAttention(d_model, n_head)

        self.norm1 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model, hidden, drop_prob)

        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, src_mask):
        """Apply the block to ``x`` using ``src_mask`` for self-attention.

        Every intermediate is annotated as (batch_size, seq_len, d_model)
        in the original code.
        """
        # 1-2. self-attention, then dropout + residual + norm
        residual = x
        attended = self.attention(q=x, k=x, v=x, mask=src_mask)
        x = self.norm1(self.dropout1(attended) + residual)

        # 3-4. feed-forward network, then dropout + residual + norm
        residual = x
        transformed = self.ffn(x)
        return self.norm2(self.dropout2(transformed) + residual)
    
class Encoder(nn.Module):
    """Embedding followed by a stack of ``num_layer`` encoder layers."""

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device, n_head, ffn_hidden, num_layer):
        super().__init__()

        self.embd = TransformerEmbedding(vocab_size, d_model, max_len, drop_prob, device)

        blocks = []
        for _ in range(num_layer):
            blocks.append(EncoderLayer(d_model, n_head, ffn_hidden, drop_prob))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, src_mask):
        """Embed ``x`` and pass it through every layer with ``src_mask``."""
        hidden = self.embd(x)
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return hidden
    
class DecoderLayer(nn.Module):
    """One decoder block.

    Masked self-attention, encoder-decoder attention and a position-wise
    feed-forward network, each followed by dropout, a residual addition
    and layer normalization.
    """

    def __init__(self, d_model, n_head, drop_prob, ffn_hidden):
        super().__init__()

        self.self_attention = MultiHeadAttention(d_model=d_model, num_head=n_head)

        self.norm1 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, num_head=n_head)

        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model, ffn_hidden, drop_prob)

        self.norm3 = LayerNorm(d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_trg_mask):
        """
        Args:
            dec: decoder-side input.
            enc: encoder output, used as key and value in the second attention.
            trg_mask: mask for decoder self-attention (look-ahead mask).
            src_trg_mask: mask for encoder-decoder attention.
        """
        # 1-2. self-attention over the decoder input, then add & norm
        residual = dec
        attended = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
        x = self.norm1(self.dropout1(attended) + residual)

        # 3-4. attention over the encoder output, then add & norm
        residual = x
        attended = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_trg_mask)
        x = self.norm2(self.dropout2(attended) + residual)

        # 5-6. position-wise feed-forward, then add & norm
        residual = x
        transformed = self.ffn(x)
        return self.norm3(self.dropout3(transformed) + residual)
    
class Decoder(nn.Module):
    """Embedding, a stack of decoder layers and a projection to vocabulary logits."""

    def __init__(self, dec_voc_size, d_model, max_len, drop_prob, device, num_layer, n_head, ffn_hidden=None):
        """
        Args:
            dec_voc_size: target vocabulary size (embedding rows and output logits).
            d_model: model dimension.
            max_len: number of positions for the positional encoding.
            drop_prob: dropout probability.
            device: device passed to the embedding.
            num_layer: number of decoder layers.
            n_head: number of attention heads per layer.
            ffn_hidden: hidden size of each layer's feed-forward network.
                Optional so existing seven-argument callers keep working.
        """
        super(Decoder, self).__init__()

        if ffn_hidden is None:
            # Backward compatibility: this class used to read a module-level
            # `ffn_hidden` implicitly. Prefer that value if it exists,
            # otherwise fall back to 4 * d_model.
            ffn_hidden = globals().get("ffn_hidden", 4 * d_model)

        self.emb = TransformerEmbedding(dec_voc_size, d_model, max_len, drop_prob, device)
        self.layers = nn.ModuleList([DecoderLayer(d_model, n_head, drop_prob, ffn_hidden) for _ in range(num_layer)])
        self.linear = nn.Linear(d_model, dec_voc_size)


    def forward(self, dec, enc, trg_mask, src_trg_mask):
        """Return vocabulary logits for ``dec`` given the encoder output ``enc``."""
        x = self.emb(dec)

        for decoder_layer in self.layers :
            x = decoder_layer(x, enc, trg_mask, src_trg_mask)

        # Project the last feature dimension from d_model to dec_voc_size.
        output = self.linear(x)

        return output
    

In [73]:
class Transformer(nn.Module):
    """Encoder-decoder transformer that builds its own attention masks."""

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, \
                 enc_voc_size, dec_voc_size, d_model, n_head, max_len, ffn_hidden, num_layer, drop_prob, device):
        """
        Args:
            src_pad_idx: padding index of the source vocabulary.
            trg_pad_idx: padding index of the target vocabulary.
            trg_sos_idx: start-of-sequence index of the target vocabulary
                (stored only; not used inside this class).
            enc_voc_size: source vocabulary size.
            dec_voc_size: target vocabulary size.
            d_model: model dimension.
            n_head: number of attention heads.
            max_len: number of positions for the positional encodings.
            ffn_hidden: feed-forward hidden size, forwarded to the encoder.
            num_layer: number of layers in the encoder and in the decoder.
            drop_prob: dropout probability.
            device: device used for the embeddings and the no-peek mask.
        """
        super(Transformer, self).__init__()

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.device = device

        self.encoder = Encoder(enc_voc_size, d_model, max_len, drop_prob, device, n_head, ffn_hidden, num_layer)
        # The decoder is constructed without ffn_hidden, exactly as before.
        self.decoder = Decoder(dec_voc_size, d_model, max_len, drop_prob, device, num_layer, n_head)

    def forward(self, src, trg):
        """
        Args:
            src: source indices, (n_batch, src_seq_len).
            trg: target indices, (n_batch, trg_seq_len).

        Returns:
            The decoder output for ``trg`` given the encoded ``src``.
        """
        # 1. build the masks
        src_mask = self.make_pad_mask(src, src, self.src_pad_idx, self.src_pad_idx)

        # Boolean masks are combined with a logical AND.
        trg_mask = self.make_pad_mask(trg, trg, self.trg_pad_idx, self.trg_pad_idx) & \
                   self.make_no_peak_mask(trg, trg)

        src_trg_mask = self.make_pad_mask(trg, src, self.trg_pad_idx, self.src_pad_idx)
        # 2. encoder
        enc_src = self.encoder(src, src_mask)
        # 3. decoder
        output = self.decoder(trg, enc_src, trg_mask, src_trg_mask)

        return output

    def make_pad_mask(self, q, k, q_pad_idx, k_pad_idx):
        """Return a (n_batch, 1, q_seq_len, k_seq_len) bool mask.

        An entry is True where the query token differs from ``q_pad_idx``
        and the key token differs from ``k_pad_idx``.
        """
        q_seq_len, k_seq_len = q.size(1), k.size(1)        # inputs are (n_batch, seq_len)

        k_mask = k.ne(k_pad_idx).unsqueeze(1).unsqueeze(2) # (n_batch, 1, 1, key_seq_len)
        # repeat(1, 1, q_seq_len, 1): tile q_seq_len times along dim 2 only.
        k_mask = k_mask.repeat(1, 1, q_seq_len, 1)         # (n_batch, 1, query_seq_len, key_seq_len)

        q_mask = q.ne(q_pad_idx).unsqueeze(1).unsqueeze(3) # (n_batch, 1, query_seq_len, 1)
        q_mask = q_mask.repeat(1, 1, 1, k_seq_len)         # (n_batch, 1, query_seq_len, key_seq_len)

        mask = k_mask & q_mask
        return mask


    def make_no_peak_mask(self, q, k):
        """Return a (q_seq_len, k_seq_len) lower-triangular bool mask on ``self.device``."""
        q_seq_len, k_seq_len = q.size(1), k.size(1)

        # q_seq_len x k_seq_len, diagonal included
        mask = torch.tril(torch.ones(q_seq_len, k_seq_len)).type(torch.BoolTensor).to(self.device)

        return mask


In [234]:
# Inspect one input batch
for idx, (src, trg) in enumerate(train_iter):
    trg_x = trg[:, :-1]
    trg_y = trg[:, 1:]
    break

print(src, '\n')
print(trg_x)

# Build the model. Transformer has no default arguments, so every
# hyperparameter is passed explicitly.
# NOTE(review): Decoder is constructed without ffn_hidden and looks the name
# up at module level, so it is defined here as a variable — confirm.
ffn_hidden = 32
model = Transformer(src_pad_idx=1,
                    trg_pad_idx=1,
                    trg_sos_idx=2,
                    enc_voc_size=len(vocab_src),
                    dec_voc_size=len(vocab_tgt),
                    d_model=64,
                    n_head=2,
                    max_len=1000,
                    ffn_hidden=ffn_hidden,
                    num_layer=2,
                    drop_prob=0.1,
                    device=src.device)
output = model(src, trg_x)

tensor([[   2,   19,   25,   15, 1169,  808,   17,   57,   84,  336, 1339,    5,
            3]]) 

tensor([[   2,   21,   85,  257,   31,   87,   22,   94,    7,   16,  112, 7910,
         3209,    4]])


### train.py

In [None]:
# https://github.com/hyunwoongko/transformer/blob/master/train.py

In [76]:
def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def initialize_weights(m):
    """Kaiming-uniform initialise ``m.weight`` in place when it has more than one dimension.

    Intended for ``model.apply(initialize_weights)``. Modules without a
    ``weight`` attribute, with ``weight`` set to None, or with a 1-D weight
    are left untouched.
    """
    weight = getattr(m, 'weight', None)
    if weight is not None and weight.dim() > 1:
        # kaiming_uniform_ is the in-place initialiser; the name without the
        # trailing underscore is a deprecated alias.
        nn.init.kaiming_uniform_(weight.data)


In [None]:
# 1) Declare the model
src_pad_idx = specials["<pad>"]
trg_pad_idx = specials["<pad>"]
# <sos> is index 2 in `specials`; 0 is <unk>.
trg_sos_idx = specials["<sos>"]
d_model = 64
enc_voc_size = len(vocab_src)
dec_voc_size = len(vocab_tgt)
max_len = 1000
ffn_hidden = 32
n_heads = 2
n_layers = 2
drop_prob = 0.1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = Transformer(src_pad_idx=src_pad_idx,
                    trg_pad_idx=trg_pad_idx,
                    trg_sos_idx=trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=enc_voc_size,
                    dec_voc_size=dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_head=n_heads,
                    num_layer=n_layers,
                    drop_prob=drop_prob,
                    device=device).to(device)

print(f'The model has {count_parameters(model):,} trainable parameters')

# Initialise the weights. Module.apply calls the function on every submodule
# recursively and on the model itself.
model.apply(initialize_weights)

# Not equivalent: children() yields only the direct children, so nested
# layers would be skipped.
# for submodule in model.children() :
#     initialize_weights(submodule)

In [None]:
# 2) Define the loss. The labels are target-side token indices, so the
#    ignored index must be the target padding index.
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

In [99]:
# 3) Define the optimizer and the learning-rate scheduler
from torch import nn, optim
from torch.optim import Adam

# Optimizer hyperparameters
init_lr, weight_decay, adam_eps = 1e-5, 5e-4, 5e-9

clip = 1.0
inf = float('inf')

optimizer = optim.Adam(model.parameters(), lr=init_lr, weight_decay=weight_decay, eps=adam_eps)

# ReduceLROnPlateau lowers the learning rate when the monitored value
# (e.g. valid_loss or another metric) stops improving.
factor = 0.9   # new_lr = lr * factor
patience = 10  # number of non-improving steps tolerated before lowering the lr
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    factor=factor,
    patience=patience,
    verbose=True,
)

In [None]:
# Training
num_epoch = 1000
warmup = 100
best_loss = inf

# torch.save does not create missing directories.
os.makedirs('saved', exist_ok=True)


def _batch_loss(src, trg):
    """Return the teacher-forced cross-entropy loss for one batch."""
    src, trg = src.to(device), trg.to(device)

    # The decoder sees trg without its last token and is scored against
    # trg without its first token.
    output = model(src, trg[:, :-1])          # (n_batch, trg_len - 1, vocab)
    labels = trg[:, 1:]                       # (n_batch, trg_len - 1)

    # CrossEntropyLoss expects (N, C) logits and (N,) class indices.
    return criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))


for epoch in range(num_epoch):

    # ---- train ----
    model.train()
    epoch_loss = 0.0

    for src, trg in train_iter:
        loss = _batch_loss(src, trg)

        # Gradients accumulate across backward() calls by default, so clear
        # them before every step.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss = epoch_loss / max(len(train_iter), 1)

    # ---- validate ----
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for src, trg in valid_iter:
            valid_loss += _batch_loss(src, trg).item()
    valid_loss = valid_loss / max(len(valid_iter), 1)

    print(f'epoch:{epoch}  loss:{epoch_loss}')

    # Feed the scheduler after the warm-up epochs; valid_loss is the
    # monitored quantity.
    if epoch > warmup:
        scheduler.step(valid_loss)

    # Save the model only when valid_loss improves.
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), 'saved/model-{0}.pt'.format(valid_loss))
