In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
 
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()

In [2]:
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split

import matplotlib.ticker as ticker
import matplotlib.pyplot as plt

import time
import re
import os
import io

# Step 1. 데이터 다운로드

아래 링크에서 korean-english-park.train.tar.gz 를 다운로드 받아 한영 병렬 데이터를 확보합니다.

https://github.com/jungyeul/korean-parallel-corpora/tree/master/korean-english-news-v1

In [3]:
import os

folder_path = os.getenv('HOME')+'/aiffel/transformer/'

In [4]:
kor_data_path = folder_path + 'korean-english-park.train.ko'
en_data_path  = folder_path + 'korean-english-park.train.en'

In [5]:
with open(kor_data_path, "r") as f:
    kor_data = f.read().splitlines()

    
with open(en_data_path, "r") as f:
    en_data = f.read().splitlines()

In [6]:
print("Data Size:", len(kor_data))
print("Example:")
for sen in kor_data[0:100][::20]: print(">>", sen)

Data Size: 94123
Example:
>> 개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"
>> 북한의 핵무기 계획을 포기하도록 하려는 압력이 거세지고 있는 가운데, 일본과 북한의 외교관들이 외교 관계를 정상화하려는 회담을 재개했다.
>> "경호 로보트가 침입자나 화재를 탐지하기 위해서 개인적으로, 그리고 전문적으로 사용되고 있습니다."
>> 수자원부 당국은 논란이 되고 있고, 막대한 비용이 드는 이 사업에 대해 내년에 건설을 시작할 계획이다.
>> 또한 근력 운동은 활발하게 걷는 것이나 최소한 20분 동안 뛰는 것과 같은 유산소 활동에서 얻는 운동 효과를 심장과 폐에 주지 않기 때문에, 연구학자들은 근력 운동이 심장에 큰 영향을 미치는지 여부에 대해 논쟁을 해왔다.


In [7]:
print("Data Size:", len(en_data))
print("Example:")
for sen in en_data[0:100][::20]: print(">>", sen)

Data Size: 94123
Example:
>> Much of personal computing is about "can you top this?"
>> Amid mounting pressure on North Korea to abandon its nuclear weapons program Japanese and North Korean diplomats have resumed talks on normalizing diplomatic relations.
>> “Guard robots are used privately and professionally to detect intruders or fire,” Karlsson said.
>> Authorities from the Water Resources Ministry plan to begin construction next year on the controversial and hugely expensive project.
>> Researchers also have debated whether weight-training has a big impact on the heart, since it does not give the heart and lungs the kind of workout they get from aerobic activities such as brisk walking or running for at least 20 minutes.


# Step 2. 데이터 정제

1) set 데이터형이 중복을 허용하지 않는다는 것을 활용해 중복된 데이터를 제거하도록 합니다. 데이터의 병렬 쌍이 흐트러지지 않게 주의하세요! 중복을 제거한 데이터를 cleaned_corpus 에 저장합니다.

2) 정제 함수를 아래 조건을 만족하게 정의하세요.

- 모든 입력을 소문자로 변환합니다.
- 알파벳, 문장부호, 한글만 남기고 모두 제거합니다.
- 문장부호 양옆에 공백을 추가합니다.
- 문장 앞뒤의 불필요한 공백을 제거합니다.

3) 한글 말뭉치 kor_corpus 와 영문 말뭉치 eng_corpus 를 각각 분리한 후, 정제하여 토큰화를 진행합니다! 토큰화에는 Sentencepiece를 활용하세요. 첨부된 공식 사이트를 참고해 아래 조건을 만족하는 generate_tokenizer() 함수를 정의합니다.

최종적으로 ko_tokenizer 과 en_tokenizer 를 얻으세요. en_tokenizer에는 set_encode_extra_options("bos:eos") 함수를 실행해 타겟 입력이 문장의 시작 토큰과 끝 토큰을 포함할 수 있게 합니다.

4) 모든 데이터를 사용할 경우 학습에 굉장히 오랜 시간이 걸립니다. 토크나이저를 활용해 토큰의 길이가 50 이하인 데이터를 선별하여 src_corpus 와 tgt_corpus 를 각각 구축하고 텐서 enc_train 과 dec_train 으로 변환하세요!

### 2-1

In [8]:
cleaned_corpus = set(zip(kor_data,en_data))

### 2-2

In [9]:
def preprocess_sentence_ko(sentence):
    
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^ㄱ-ㅎ|가-힣?.!,]+", " ", sentence)
    sentence = sentence.strip()

    return sentence

In [10]:
def preprocess_sentence_en(sentence):
    
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    sentence = sentence.strip()

    return sentence

In [11]:
kor_corpus = []
en_corpus = []

for i in cleaned_corpus:
    temp_kor = preprocess_sentence_ko(i[0])
    temp_en  = preprocess_sentence_en(i[1])
    kor_corpus.append(temp_kor)
    en_corpus.append(temp_en)

In [12]:
print(len(kor_corpus))
print(len(en_corpus))

78968
78968


In [13]:
print(kor_corpus[100])
print(en_corpus[100])

모슬리 회장은 매춘부들을 자신의 아파트에 부른 것을 인정하지만 , 나치처럼 꾸미고 성관계를 가진 것은 부인했다 .
mosley admits to visiting the prostitutes , but denies there were nazi overtones to the encounter .


### 2-3

In [14]:
def generate_tokenizer_ko(corpus,
                       vocab_size,
                       lang="ko",
                       pad_id=0,
                       bos_id=1,
                       eos_id=2,
                       unk_id=3):
    file = "./%s_corpus.txt" % lang
    model = "%s_spm" % lang

    with open(file, 'w') as f:
        for row in corpus: f.write(str(row) + '\n')

    import sentencepiece as spm
    spm.SentencePieceTrainer.Train(
        '--input=./%s --model_prefix=%s --vocab_size=%d'\
        % (file, model, vocab_size) + \
        '--pad_id==%d --bos_id=%d --eos_id=%d --unk_id=%d'\
        % (pad_id, bos_id, eos_id, unk_id)
    )

    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load('%s.model' % model)

    return tokenizer

In [15]:
def generate_tokenizer_en(corpus,
                       vocab_size,
                       lang="eng",
                       pad_id=0,
                       bos_id=1,
                       eos_id=2,
                       unk_id=3):
    file = "./%s_corpus.txt" % lang
    model = "%s_spm" % lang

    with open(file, 'w') as f:
        for row in corpus: f.write(str(row) + '\n')

    import sentencepiece as spm
    spm.SentencePieceTrainer.Train(
        '--input=./%s --model_prefix=%s --vocab_size=%d'\
        % (file, model, vocab_size) + \
        '--pad_id==%d --bos_id=%d --eos_id=%d --unk_id=%d'\
        % (pad_id, bos_id, eos_id, unk_id)
    )

    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load('%s.model' % model)

    return tokenizer

In [16]:
VOCAB_SIZE = 20000
tokenizer_ko = generate_tokenizer_ko(kor_corpus, VOCAB_SIZE)
tokenizer_en = generate_tokenizer_en(en_corpus, VOCAB_SIZE)
tokenizer_en.set_encode_extra_options("bos:eos")

True

In [17]:
from tqdm import tqdm_notebook

src_corpus = []
tgt_corpus = []

for i in tqdm_notebook(range(len(kor_corpus))):

    src_tokens = tokenizer_ko.encode_as_ids(kor_corpus[i])
    tgt_tokens = tokenizer_en.encode_as_ids(en_corpus[i])

    if (len(src_tokens) > 50): continue
    if (len(tgt_tokens) > 50): continue
    
    src_corpus.append(src_tokens)
    tgt_corpus.append(tgt_tokens)

len(src_corpus)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=78968.0), HTML(value='')))




68324

In [18]:
enc_tensor = tf.keras.preprocessing.sequence.pad_sequences(src_corpus, padding='post')
dec_tensor = tf.keras.preprocessing.sequence.pad_sequences(tgt_corpus, padding='post')

enc_train, enc_val, dec_train, dec_val = \
train_test_split(enc_tensor, dec_tensor, test_size=0.01)

print(len(enc_train), len(enc_val), len(dec_train), len(dec_val))

67640 684 67640 684


# Step 3. 훈련하기

앞서 필요한 것들을 모두 정의했기 때문에 우리는 훈련만 하면 됩니다! 아래 과정을 차근차근 따라가며 모델을 훈련하고, 예문에 대한 멋진 번역을 제출하세요!

1. 2 Layer를 가지는 트랜스포머를 선언하세요. 하이퍼파라미터는 자유롭게 조절합니다.
2. 논문에서 사용한 것과 동일한 Learning Rate Scheduler를 선언하고, 이를 포함하는 Adam Optimizer를 선언하세요. Optimizer의 파라미터 역시 논문과 동일하게 설정합니다.
3. Loss 함수를 정의하세요. Sequence-to-sequence 모델에서 사용했던 Loss와 유사하되, Masking 되지 않은 입력의 개수로 Scaling하는 과정을 추가합니다. (트랜스포머가 모든 입력에 대한 Loss를 한 번에 구하기 때문)
4. 입력 데이터에 알맞은 Mask를 생성하고, 이를 모델에 전달하여 연산에서 사용할 수 있게 합니다.
5. 매 Epoch 마다 제시된 예문에 대한 번역을 생성하고, 멋진 번역이 생성되면 그때의 하이퍼파라미터와 생성된 번역을 제출하세요!

### Positional Encoding

In [19]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i) / d_model)

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])

    return sinusoid_table

### Generate Padding Mask

In [20]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)

    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)

    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)

    return enc_mask, dec_enc_mask, dec_mask

### Multi Head Attention

In [21]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        self.depth = d_model // self.num_heads

        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)

        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions


    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x


    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)

        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)

        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)

        out = self.combine_heads(out)
        out = self.linear(out)

        return out, attention_weights

### Position-wise Feed Forward Network

In [22]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)

        return out

### EncoderLayer

In [23]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):

        """
        Multi-Head Attention
        """
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual

        """
        Position-Wise Feed Forward Network
        """
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, enc_attn

### Decoder Layer

In [24]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, enc_out, causality_mask, padding_mask):

        """
        Masked Multi-Head Attention
        """
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        """
        Multi-Head Attention
        """
        residual = out
        out = self.norm_2(out)
        out, dec_enc_attn = self.dec_self_attn(out, enc_out, enc_out, causality_mask)
        out = self.do(out)
        out += residual

        """
        Position-Wise Feed Forward Network
        """
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn

### Encoder

In [25]:
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):
        out = x

        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)

        return out, enc_attns

### Decoder

In [26]:
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]


    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x

        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, causality_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

### Transformer(Full Model)

In [27]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out


    def call(self, enc_in, dec_in, enc_mask, causality_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)

        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, causality_mask, dec_mask)

        logits = self.fc(dec_out)

        return logits, enc_attns, dec_attns, dec_enc_attns

In [28]:
N_LAYERS= 2
D_MODEL = 512
N_HEADS = 8
D_FF    = 2048
DROPOUT = 0.3

In [29]:
transformer = Transformer(
    n_layers=N_LAYERS,
    d_model=D_MODEL,
    n_heads=N_HEADS,
    d_ff=D_FF,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=DROPOUT,
    shared_fc=True,
    shared_emb=True)

d_model = 512

### Learning Rate Scheduler

In [30]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()

        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)

        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

### Learning Rate & Optimizer

In [31]:
learning_rate = LearningRateScheduler(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.98, 
                                        epsilon=1e-9)

### Loss Function

In [32]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

### Train Step

In [33]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]
    gold = tgt[:, 1:]

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

### Attention 시각화 함수

In [34]:
def visualize_attention(src, tgt, enc_attns, dec_attns, dec_enc_attns):
    def draw(data, ax, x="auto", y="auto"):
        import seaborn
        seaborn.heatmap(data, 
                        square=True,
                        vmin=0.0, vmax=1.0, 
                        cbar=False, ax=ax,
                        xticklabels=x,
                        yticklabels=y)

    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Encoder Layer", layer + 1)
        for h in range(4):
            draw(enc_attns[layer][0, h, :len(src), :len(src)], axs[h], src, src)
        plt.show()

    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Decoder Self Layer", layer+1)
        for h in range(4):
            draw(dec_attns[layer][0, h, :len(tgt), :len(tgt)], axs[h], tgt, tgt)
        plt.show()

        print("Decoder Src Layer", layer+1)
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        for h in range(4):
            draw(dec_enc_attns[layer][0, h, :len(tgt), :len(src)], axs[h], src, tgt)
        plt.show()

### 번역 생성 함수

In [35]:
def evaluate(sentence, model, src_tokenizer, tgt_tokenizer):
    sentence = preprocess_sentence_ko(sentence)

    pieces = src_tokenizer.encode_as_pieces(sentence)
    tokens = src_tokenizer.encode_as_ids(sentence)

    _input = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                           maxlen=enc_train.shape[-1],
                                                           padding='post')

    ids = []
    output = tf.expand_dims([tgt_tokenizer.bos_id()], 0)
    for i in range(dec_train.shape[-1]):
        enc_padding_mask, combined_mask, dec_padding_mask = \
        generate_masks(_input, output)

        predictions, enc_attns, dec_attns, dec_enc_attns =\
        model(_input, 
              output,
              enc_padding_mask,
              combined_mask,
              dec_padding_mask)

        predicted_id = \
        tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()

        if tgt_tokenizer.eos_id() == predicted_id:
            result = tgt_tokenizer.decode_ids(ids)
            return pieces, result, enc_attns, dec_attns, dec_enc_attns

        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)

    result = tgt_tokenizer.decode_ids(ids)

    return pieces, result, enc_attns, dec_attns, dec_enc_attns

### 번역 생성 및 Attention 시각화 결합

In [36]:
def translate(sentence, model, src_tokenizer, tgt_tokenizer, plot_attention=False):
    pieces, result, enc_attns, dec_attns, dec_enc_attns = \
    evaluate(sentence, model, src_tokenizer, tgt_tokenizer)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    if plot_attention:
        visualize_attention(pieces, result.split(), enc_attns, dec_attns, dec_enc_attns)

In [37]:
sentences = ['오바마는 대통령이다.', '시민들은 도시 속에 산다.', '커피는 필요 없다.', '일곱 명의 사망자가 발생했다.']

### Train

In [38]:
import warnings

warnings.filterwarnings(action='ignore') 

In [39]:
import random
from tqdm import tqdm_notebook 

BATCH_SIZE = 64
EPOCHS = 15



for epoch in range(EPOCHS):
    total_loss = 0

    idx_list = list(range(0, enc_train.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm_notebook(idx_list)

    for (batch, idx) in enumerate(t):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_train[idx:idx+BATCH_SIZE],
                    dec_train[idx:idx+BATCH_SIZE],
                    transformer,
                    optimizer)

        total_loss += batch_loss

        t.set_description_str('Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))
    
    print('Translations')
    for i in sentences:
        translate(i, transformer, tokenizer_ko, tokenizer_en, plot_attention=False)
    print('\n','Hyperparameters')
    print('> n_layers:', N_LAYERS)
    print('> d_model:', D_MODEL)
    print('> n_heads:', N_HEADS)
    print('> d_ff:', D_FF)
    print('> dropout:', DROPOUT)
    print('\n','Training Parameters')
    print('> Warmup Steps: 4000')
    print('> Batch Size: 64')
    print('> Epoch At', epoch+1)

HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama says they is the first time .
Input: 시민들은 도시 속에 산다.
Predicted translation: they are they are being they .
Input: 커피는 필요 없다.
Predicted translation: they are not to they to they .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the ministry were killed in the town of the town of the town of the town of the town of the town of the capital .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 1


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama has been a president elect barack obama s president elect barack obama s president elect obama .
Input: 시민들은 도시 속에 산다.
Predicted translation: they have been in the city of the city .
Input: 커피는 필요 없다.
Predicted translation: he is not a major league .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: a day of deaths were killed in the deaths .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 2


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama s presidential president .
Input: 시민들은 도시 속에 산다.
Predicted translation: they have to been in the city of the city .
Input: 커피는 필요 없다.
Predicted translation: no longer than they were not going to the way .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll from the death toll at the death toll at the death toll at the death toll on thursday .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 3


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama is the president of the president .
Input: 시민들은 도시 속에 산다.
Predicted translation: the city of the city .
Input: 커피는 필요 없다.
Predicted translation: and don t need to be needed to be needed to be needed to be needed to be needed to be needed to be needed .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll from the death toll from a seven day .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 4


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama s president .
Input: 시민들은 도시 속에 산다.
Predicted translation: they re gonna be the city of the city .
Input: 커피는 필요 없다.
Predicted translation: don t do not have no reservations .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll from the u . s . s .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 5


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama is a good time .
Input: 시민들은 도시 속에 산다.
Predicted translation: they have long been in the city .
Input: 커피는 필요 없다.
Predicted translation: don t need care .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: cnn seven deaths were reported tuesday .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 6


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: president barack obama takes on the presidency .
Input: 시민들은 도시 속에 산다.
Predicted translation: louis city citizens in mountain citys in the city .
Input: 커피는 필요 없다.
Predicted translation: coffee needs no coffee
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the seven death toll rose to thursday .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 7


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama is the president obama presiden too good .
Input: 시민들은 도시 속에 산다.
Predicted translation: they are in the city .
Input: 커피는 필요 없다.
Predicted translation: coffee is no need to doesn t .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed and were initially climbed into the seven deaths .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 8


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama is the president .
Input: 시민들은 도시 속에 산다.
Predicted translation: in the city is in for the city of the city .
Input: 커피는 필요 없다.
Predicted translation: cup cannot be no longer de coffee .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: a seven death toll was killed and others were sunday when a death toll soared .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 9


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: president barack obama says he s been a lot of morning .
Input: 시민들은 도시 속에 산다.
Predicted translation: some people have even at the city of the city only one .
Input: 커피는 필요 없다.
Predicted translation: not coffee no coffees or coffee is no cause .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed and seven died wednesday s parliament tuesday .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 10


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama is morning .
Input: 시민들은 도시 속에 산다.
Predicted translation: some in the city of yeah city mountain is the city on track .
Input: 커피는 필요 없다.
Predicted translation: not necessarily palestinians .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll was killed and another were reported thursday .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 11


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: president barack obama is the president .
Input: 시민들은 도시 속에 산다.
Predicted translation: some republicans inside city mountain mountain mountain god .
Input: 커피는 필요 없다.
Predicted translation: not cause no coffee does not need .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll from the may have been killed by a may metropolitan police community and put the death toll more than a day .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 12


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: president barack obama is the president .
Input: 시민들은 도시 속에 산다.
Predicted translation: some citizens in the city .
Input: 커피는 필요 없다.
Predicted translation: not necessarily is no energy is no alcohol or never stops from coffee .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: at the same time killed people were arrested .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 13


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: president barack obama is the head of the obama camp .
Input: 시민들은 도시 속에 산다.
Predicted translation: some people in the city of the city .
Input: 커피는 필요 없다.
Predicted translation: not necessary is no needy .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: seven people were killed and others were reported tuesday .

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 14


HBox(children=(FloatProgress(value=0.0, max=1057.0), HTML(value='')))


Translations
Input: 오바마는 대통령이다.
Predicted translation: obama the democratic president is the same
Input: 시민들은 도시 속에 산다.
Predicted translation: the territory has even bothered the city
Input: 커피는 필요 없다.
Predicted translation: the coffee is no needs to de its coffee .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the death toll hit the seven death toll

 Hyperparameters
> n_layers: 2
> d_model: 512
> n_heads: 8
> d_ff: 2048
> dropout: 0.3

 Training Parameters
> Warmup Steps: 4000
> Batch Size: 64
> Epoch At 15


# 결과 및 평가

Transformer를 사용하여 번역기를 만들어 보았습니다.  
확실히 seq2seq를 썼을 때보다 결과가 훨씬 좋은 것 같습니다.  
또 epoch이 증가하면 증가할수록 번역의 결과가 좋아지는 것을 확인 할 수 있었습니다.  