<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Modeling" data-toc-modified-id="Modeling-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Modeling</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Inference" data-toc-modified-id="Inference-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Inference</a></span></li></ul></div>

In [2]:
!pip install -q tf-nightly

In [3]:
import time
import tensorflow_datasets as tfds
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import tensorflow as tf
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Layer, Dense, Flatten, Dropout, Concatenate, Add, Dot, Multiply, Reshape, Activation, BatchNormalization, LayerNormalization, SimpleRNNCell, RNN, SimpleRNN, LSTM, Embedding, Bidirectional, TimeDistributed, Conv1D, Conv2D, MaxPool1D, MaxPool2D, GlobalMaxPool1D, GlobalMaxPool2D, AveragePooling1D, AveragePooling2D, GlobalAveragePooling1D, GlobalAveragePooling2D, ZeroPadding2D, RepeatVector
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from tensorflow.keras.optimizers import SGD, Adagrad, Adam
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
# MeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError, BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy, CosineSimilarity
from tensorflow.keras import losses
# MeanSquaredError, RootMeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError, BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy, CosineSimilarity
from tensorflow.keras import metrics
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.activations import linear, sigmoid, relu
from tensorflow.keras.initializers import RandomNormal, glorot_uniform, he_uniform, Constant

plt.style.use("dark_background")

In [4]:
# `with_info`: If `True`, `tfds.load()` will return the tuple `(tf.data.Dataset, tfds.core.DatasetInfo)`, the latter containing the info associated with the builder.
# `as_supervised`: If `True`, the returned `tf.data.Dataset` will have a 2-tuple structure `(input, label)` according to `builder.info.supervised_keys`. If `False`, the returned `tf.data.Dataset` will have a dictionary with all the features.
dataset, metadata = tfds.load("ted_hrlr_translate/pt_to_en", with_info=True, as_supervised=True)
dataset_tr = dataset["train"]
dataset_val = dataset["validation"]
dataset_te = dataset["test"]

In [5]:
tokenizer_src = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus((pt.numpy() for pt, en in dataset_tr), target_vocab_size=2**13)
tokenizer_tar = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus((en.numpy() for pt, en in dataset_tr), target_vocab_size=2**13)

In [6]:
max_len = 40
buffer_size = tf.data.experimental.cardinality(dataset_tr).numpy()
batch_size = 64

def encode(lang1, lang2):
    lang1 = [tokenizer_src.vocab_size] + tokenizer_src.encode(lang1.numpy()) + [tokenizer_src.vocab_size + 1]
    lang2 = [tokenizer_tar.vocab_size] + tokenizer_tar.encode(lang2.numpy()) + [tokenizer_tar.vocab_size + 1]
    return lang1, lang2

def tf_encode(pt, en):
    # `func`: A Python function that accepts `inp` as arguments, and returns a value (or list of values) whose type is described by `Tout`.
    # `inpt`: Input arguments for func. A list whose elements are Tensors or a single Tensor.
    result_pt, result_en = tf.py_function(func=encode, inp=[pt, en], Tout=[tf.int64, tf.int64])
    result_pt.set_shape([None])
    result_en.set_shape([None])
    return result_pt, result_en

def filter_max_len(x, y):
    return tf.logical_and(tf.size(x) <= max_len, tf.size(y) <= max_len)

# This transformation applies `map_func` to each element of this dataset, and returns a new dataset containing the transformed elements, in the same order as they appeared in the input. `map_func` can be used to change both the values and the structure of a dataset's elements.
dataset_tr = dataset_tr.map(tf_encode)
# `predicate`: A function mapping a dataset element to a boolean.
# Returns the dataset containing the elements of this dataset for which `predicate` is `True`.
dataset_tr = dataset_tr.filter(filter_max_len)
# The first time the dataset is iterated over, its elements will be cached either in the specified file or in memory. Subsequent iterations will use the cached data.
# `filename`: When caching to a file, the cached data will persist across runs. Even the first iteration through the data will read from the cache file. Changing the input pipeline before the call to `cache()` will have no effect until the cache file is removed or the `filename` is changed. If a `filename` is not provided, the dataset will be cached in memory.
dataset_tr = dataset_tr.cache()
# For perfect shuffling, a `buffer_size` greater than or equal to the full size of the dataset is required.
# If not, only the first `buffer_size` elements will be selected randomly.
# `reshuffle_each_iteration` controls whether the shuffle order should be different for each epoch.
dataset_tr = dataset_tr.shuffle(buffer_size)
# Pad to the smallest per-`batch size` that fits all elements.
# Unlike `batch()`, the input elements to be batched may have different shapes, and this transformation will pad each component to the respective shape in `padded_shapes`. The `padded_shapes` argument determines the resulting shape for each dimension of each component in an output element.
# `padded_shapes`:
    # If `None`: The dimension is unknown, the component will be padded out to the maximum length of all elements in that dimension.
    # If not `None`: The dimension is a constant, the component will be padded out to that length in that dimension.
# `padding_values`
# `drop_remainder`
dataset_tr = dataset_tr.padded_batch(batch_size)
# Most dataset input pipelines should end with a call to prefetch. This allows later elements to be prepared while the current element is being processed. This often improves latency and throughput, at the cost of using additional memory to store prefetched elements.
# `buffer_size`: The maximum number of elements that will be buffered when prefetching. If the value `tf.data.AUTOTUNE` is used, then the buffer size is dynamically tuned.
dataset_tr = dataset_tr.prefetch(tf.data.AUTOTUNE)

dataset_val = dataset_val.map(tf_encode)
dataset_val = dataset_val.filter(filter_max_len)
dataset_val = dataset_val.padded_batch(batch_size)

# Modeling

In [203]:
n_layers = 4
d_model = 128
dff = 512
n_heads = 8
dk = d_model//n_heads

def padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # `(batch_size, 1, 1, seq_len)`
    return seq[:, None, None, :]

def look_ahead_mask(tar):
    size = tf.shape(tar)[1]
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    p_mask = padding_mask(tar)
    return tf.maximum(p_mask, mask)

# `x`: `(batch_size, seq_len)`
class Embedder(Layer):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.embedding = Embedding(input_dim=vocab_size, output_dim=d_model)
        self.dropout = Dropout(rate=0.1)
        
    def __call__(self, x, training):
        # `(batch_size, seq_len, d_model)`
        z = self.embedding(x)

        # Positional Encoding
        seq_len = x.shape[1]
        a, b = np.meshgrid(np.arange(d_model), np.arange(1000))
        pe_mat = b/10000**(2*(a//2)/d_model)
        pe_mat[:, 0::2] = np.sin(pe_mat[:, 0::2])
        pe_mat[:, 1::2] = np.cos(pe_mat[:, 1::2])
        pe_mat = pe_mat[None, :seq_len, :]

        z = (d_model**0.5)*z + pe_mat
    #     z = (d_model**0.5)*z + pe_mat[:, :seq_len, :]
        z = self.dropout(z, training=training)
        # `(batch_size, seq_len, d_model)`
        return z

class MultiheadAttention(Layer):
    def __init__(self):
        super().__init__()

        self.dense_q = Dense(units=d_model)
        self.dense_k = Dense(units=d_model)
        self.dense_v = Dense(units=d_model)
        self.dense = Dense(units=d_model)
        
    def __call__(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]

        def split_heads(x):
            x = tf.reshape(x, shape=(batch_size, -1, n_heads, dk))
            return tf.transpose(x, perm=[0, 2, 1, 3])

        # `(batch_size, seq_len_dec, d_model)`
        queries = self.dense_q(queries)
        # `(batch_size, seq_len_enc, d_model)`
        keys = self.dense_k(keys)
        # `(batch_size, seq_len_enc, d_model)`
        values = self.dense_v(values)

        # `(batch_size, n_heads, seq_len_dec, dk)`
        queries = split_heads(queries)
        # `(batch_size, n_heads, seq_len_enc, dk)`
        keys = split_heads(keys)
        # `(batch_size, n_heads, seq_len_enc, dk)`
        values = split_heads(values)

        # Scaled Dot-Product Attention
        # `(batch_size, n_heads, seq_len_dec, seq_len_enc)`
        attn_scores = tf.matmul(queries, keys, transpose_b=True)/dk**0.5
        if mask is not None:
            attn_scores = attn_scores + (mask*-1e9)
        attn_weights = tf.nn.softmax(attn_scores, axis=-1)
        # `(batch_size, n_heads, seq_len_dec, dk)`
        context_vec = tf.matmul(attn_weights, values)

        # `(batch_size, seq_len_dec, n_heads, dk)`
        z = tf.transpose(context_vec, perm=[0, 2, 1, 3])
        # `(batch_size, seq_len_dec, d_model)`
        z = tf.reshape(z, shape=(batch_size, -1, d_model))
        z = self.dense(z)
        return z, attn_weights

def positionwise_ffnn(x):
    # `(batch_size, seq_len, dff)`
    z = Dense(units=dff, activation="relu")(x)
    # `(batch_size, seq_len, d_model)`
    z = Dense(units=d_model)(z)
    return z

# ENCODER
class Encoder(Layer):
    def __init__(self):
        super().__init__()
        
        self.mha = MultiheadAttention()
        
def __call__(self, x, training, padding_mask):
    ## "Self-Attention" Part
    z1, _ = self.mha(queries=x, keys=x, values=x, mask=padding_mask) 
    z1 = Dropout(rate=0.1)(z1, training=training)
    ## "Add & Normalize" Part
    z1 = x + z1
    z1 = LayerNormalization(epsilon=1e-6)(z1)
    ## "Feed Forward" Part
    z2 = positionwise_ffnn(z1)
    z2 = Dropout(rate=0.1)(z2, training=training)
    ## "Add & Normalize" Part
    z2 = z1 + z2
    z2 = LayerNormalization(epsilon=1e-6)(z2)
    # (batch_size, seq_len_enc, d_model)
    return z2

# DECODER
class Decoder(Layer):
    def __init__(self):
        super().__init__()
def decoder(x, enc_output, training, padding_mask, look_ahead_mask):
    ## "Self-Attention" Part
    z1, attn_weights1 = multihead_attention(queries=x, keys=x, values=x, mask=look_ahead_mask)
    z1 = Dropout(rate=0.1)(z1, training=training)
    ## "Add & Normalize" Part
    z1 = x + z1
    z1 = LayerNormalization(epsilon=1e-6)(z1)
    ## "Encoder-Decoder Attention" Part
    z2, attn_weights2 = multihead_attention(queries=z1, keys=enc_output, values=enc_output, mask=padding_mask)
    z2 = Dropout(rate=0.1)(z2, training=training)
    ## "Add & Normalize" Part
    z2 = z1 + z2
    z2 = LayerNormalization(epsilon=1e-6)(z2)
    ## "Feed Forward" Part
    z3 = positionwise_ffnn(z2)
    z3 = Dropout(rate=0.1)(z3, training=training)
    ## "Add & Normalize" Part
    z3 = z2 + z3
    # (batch_size, seq_len_dec, d_model)
    z3 = LayerNormalization(epsilon=1e-6)(z3)
    return z3, attn_weights1, attn_weights2

class Transformer(Model):
    def __init__(self):
        super().__init__()
        
        self.embedder_enc = Embedder(vocab_size=tokenizer_src.vocab_size + 2)
        self.encoder = Encoder()
        
    def __call__(self, enc, dec, training):
        enc_p_mask = padding_mask(enc)
        dec_p_mask = padding_mask(enc)
        l_mask = look_ahead_mask(dec)

        z_enc = self.embedder_enc(enc, training=training)
        for _ in range(n_layers):
            z_enc = self.encoder(z_enc, training=training, padding_mask=enc_p_mask)

        z_dec = embedder(dec, vocab_size=tokenizer_tar.vocab_size + 2, training=training)
#         dic = dict()
        for i in range(n_layers):
            z_dec, attn_weights1, attn_weights2 = decoder(z_dec, enc_output=z_enc, training=training, padding_mask=dec_p_mask, look_ahead_mask=l_mask)
#             dic[f"self_attention_attn_weights_{i}"] = attn_weights1
#             dic[f"encoder_decoder_attention_attn_weights_{i}"] = attn_weights2

        outputs = Dense(units=tokenizer_tar.vocab_size + 2)(z_dec)
        return outputs

# Training

In [204]:
# model = Transformer()

# # Optimizer
# class CustomSchedule(LearningRateSchedule):
#     def __init__(self, warmup_steps=4000):
#         super(CustomSchedule, self).__init__()

#         self.warmup_steps = warmup_steps
    
#     def __call__(self, step):
#         arg1 = tf.math.rsqrt(step)
#         arg2 = step*(self.warmup_steps**-1.5)
#         return tf.math.rsqrt(d_model) * tf.math.minimum(arg1, arg2)

# learning_rate = CustomSchedule()
# optimizer = Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
optimizer = Adam()

In [205]:
# Loss function
def loss_func(y_true, y_pred):
    # `False` if `0` else `True`
    mask = tf.math.logical_not(tf.math.equal(y_true, 0))
    # `1` if True else `0`
    mask = tf.cast(mask, dtype=tf.float32)
    # Padding을 고려하지 않은 Losses를 계산합니다.
    # `from_logits=True`: Computes loss from a logit (i.e, value in [int, int])
# `from_logits=False`: Computes loss from a probability (i.e, value in [0, 1])
    # `reduction="none"`: 계산된 Losses에 대해 별도의 연산을 하지 않습니다.
    # `reduction="auto"`: Batch에 대해 평균을 출력합니다.
    # `reduction="sum"`: Batch에 대해 합을 출력합니다.
    loss_obj = losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
    loss = loss_obj(y_true, y_pred)
    loss *= mask
    return tf.math.reduce_sum(loss)/tf.math.reduce_sum(mask)

# def accuracy_function(real, pred):
#     accuracies = tf.equal(real, tf.argmax(pred, axis=2))

#     mask = tf.math.logical_not(tf.math.equal(real, 0))
#     accuracies = tf.math.logical_and(mask, accuracies)

#     accuracies = tf.cast(accuracies, dtype=tf.float32)
#     mask = tf.cast(mask, dtype=tf.float32)
#     return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)

metrics_obj_mean = metrics.Mean(name="train_loss")
metrics_obj_sca = metrics.SparseCategoricalAccuracy(name="train_accuracy")

# tf.function을 사용하면 그래프를 미리 컴파일 하기 때문에 속도가 상당히 빠름
# 같은 GPU여도 케라스에 비해서 체감상 7~8배 정도의 차이가 나는 것 같음
# `input_signature`: A possibly nested sequence of `tf.TensorSpec()` objects specifying the `shape`s and `dtype`s of the Tensors that will be supplied to this function. If `None`, a separate function is instantiated for each inferred `input_signature`. If `input_signature` is specified, every input to func must be a Tensor, and `func` cannot accept `**kwargs`.
# Since TensorFlow matches tensors based on their shape, using a `None` dimension as a wildcard will allow functions to reuse traces for variably-sized input. Variably-sized input can occur if you have sequences of different length, or images of different sizes for each batch.
@tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int64), tf.TensorSpec(shape=(None, None), dtype=tf.int64)])
# @tf.function
def train_step(enc, dec):
    dec_input = dec[:, :-1]
    dec_true = dec[:, 1:]
    with tf.GradientTape() as tape:
        dec_pred = model(enc=enc, dec=dec_input, training=True)
        loss = loss_func(dec_true, dec_pred)
    grads = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    metrics_obj_mean(loss)
    metrics_obj_sca(dec_true, dec_pred)

In [206]:
ckpt_path = "./pt_to_en_transformer"
# TensorFlow objects may contain trackable state, such as `tf.Variables`, `tf.keras.optimizers.Optimizer` implementations, `tf.data.Dataset` iterators, `tf.keras.Layer` implementations, or `tf.keras.Model` implementations. These are called trackable objects.
# A `Checkpoint` object can be constructed to save either a single or group of trackable objects to a checkpoint file. It maintains a `save_counter` for numbering checkpoints.
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=model)
# Manages multiple checkpoints by keeping some and deleting unneeded ones.
ckpt_manager = tf.train.CheckpointManager(ckpt, directory=ckpt_path, max_to_keep=5)
# The prefix of the most recent checkpoint in directory.
if ckpt_manager.latest_checkpoint:
    # `save_path`: The path to the checkpoint, as returned by `save` or `tf.train.latest_checkpoint`.
    ckpt.restore(save_path=ckpt_manager.latest_checkpoint)
    print ("Latest checkpoint restored!")

In [207]:
epochs = 20
for epoch in range(1, epochs + 1):
    start = time.time()
    metrics_obj_mean.reset_states()
    metrics_obj_sca.reset_states()
    for (batch, (src, tar)) in enumerate(dataset_tr):
        train_step(src, tar)

        if batch%50 == 0:
            print(f"Epoch: {epoch:3d} | Batch: {batch:5d} | Loss: {metrics_obj_mean.result():5.4f} | Accuracy: {metrics_obj_sca.result():5.4f}")

    if epoch%1 == 0:
        # Every time `ckpt_manager.save()` is called, `save_counter` is increased.
        # `save_path`: The path to the new checkpoint. It is also recorded in the `checkpoints` and `latest_checkpoint` properties. `None` if no checkpoint is saved.
        save_path = ckpt_manager.save()
        print (f"Saving checkpoint for epoch {epoch} at {save_path}")
        print(f"Epoch: {epoch:3d} | Loss: {metrics_obj_mean.result():5.4f} | Accuracy: {metrics_obj_sca.result():5.4f}")
        print (f"Time taken for 1 epoch: {time.time() - start:5.0f} secs\n")

ValueError: in user code:

    File "<ipython-input-205-0967798310c4>", line 41, in train_step  *
        dec_pred = model(enc=enc, dec=dec_input, training=True)
    File "<ipython-input-183-5b205cefb16d>", line 158, in __call__  *
        z_enc = encoder(z_enc, training=training, padding_mask=enc_p_mask)
    File "<ipython-input-170-d198ae4ad3df>", line 105, in encoder  *
        z1, _ = multihead_attention(queries=x, keys=x, values=x, mask=padding_mask)
    File "<ipython-input-170-d198ae4ad3df>", line 66, in multihead_attention  *
        queries = Dense(units=d_model)(queries)
    File "C:\Users\82104\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler  **
        raise e.with_traceback(filtered_tb) from None

    ValueError: tf.function only supports singleton tf.Variables created on the first call. Make sure the tf.Variable is only created once or created outside tf.function. See https://www.tensorflow.org/guide/function#creating_tfvariables for more information.


In [164]:
@tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int64), tf.TensorSpec(shape=(None, None), dtype=tf.int64)])
def train_step(enc, dec):
    dec_input = dec[:, :-1]
    dec_true = dec[:, 1:]
    embedder(src, vocab_size=tokenizer_src.vocab_size + 2, training=True)
#     encoder(z, training=True, padding_mask=padding_mask(enc))
    
epochs = 20
for epoch in range(1, epochs + 1):
    start = time.time()
    metrics_obj_mean.reset_states()
    metrics_obj_sca.reset_states()
    for (batch, (src, tar)) in enumerate(dataset_tr):
        if batch == 1:
            train_step(src, tar)

NotImplementedError: in user code:

    File "<ipython-input-164-935a436ba5f3>", line 5, in train_step  *
        embedder(src, vocab_size=tokenizer_src.vocab_size + 2, training=True)
    File "<ipython-input-152-26dbc7406331>", line 41, in embedder  *
        z = (d_model**0.5)*z + pe_mat

    NotImplementedError: Cannot convert a symbolic tf.Tensor (mul:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported.


In [132]:
src

<tf.Tensor: shape=(64, 37), dtype=int64, numpy=
array([[8214,   42,    8, ...,    0,    0,    0],
       [8214,   42,    8, ...,    0,    0,    0],
       [8214,   32,    6, ...,  393,  437, 8215],
       ...,
       [8214, 2112, 7990, ...,    0,    0,    0],
       [8214,  104,    1, ...,    0,    0,    0],
       [8214,   66,   89, ...,    0,    0,    0]], dtype=int64)>

# Inference
- 번역할 포르투갈어는 인코더 레이어를 거쳐 인코딩이 되고, 디코더에는 영어 문장을 넣지 않고, 영어 문장의 시작 토큰만 인풋으로 들어가게 됩니다. 그러면 인코딩 된 것과 + 시작 토큰을 활용해서 다음 단어를 예측하고, 인코딩 된 것 + 시작 토큰 + 전에 예측된 단어를 활용해서 다음 단어를 예측하는 방식입니다.

In [None]:
def evaluate(src_sentence):
    # src_sentence : 문자 (string)
    bos = [tokenizer_src.vocab_size]
    eos = [tokenizer_src.vocab_size + 1]

    src_sentence = bos + tokenizer_src.encode(src_sentence) + eos
    encoder_input = tf.expand_dims(src_sentence, 0)

    decoder_input = [tokenizer_tar.vocab_size]
    output = tf.expand_dims(decoder_input, 0)
    for i in range(max_len):
        # (batch_size, seq_len, vocab_size)
        predictions = model(encoder_input, output, False)
        # 예측 결과에서 마지막 부분만 추출
        # (batch_size, 1, vocab_size)
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        if predicted_id == tokenizer_tar.vocab_size + 1:
            return tf.squeeze(output, axis=0)
        # 예측된 단어를 전 단어와 결합하여 다음 예측에 써먹음
        output = tf.concat([output, predicted_id], axis=-1)
        return tf.squeeze(output, axis=0)

def translate(sentence):
    result= evaluate(sentence)
    predicted_sentence = tokenizer_tar.decode([i for i in result if i < tokenizer_tar.vocab_size])  

    print("input: {}".format(sentence))
    print("Predicted translation: {}".format(predicted_sentence))

In [None]:
translate("este é um problema que temos que resolver.")
print ("Real translation: this is a problem we have to solve .")

In [None]:
translate("os meus vizinhos ouviram sobre esta ideia.")
print ("Real translation: and my neighboring homes heard about this idea .")

In [None]:
translate("vou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram.")
print ("Real translation: so i "ll just share with you some stories very quickly of some magical things that have happened .")

In [None]:
translate("este é o primeiro livro que eu fiz.")
print ("Real translation: this is the first book i"ve ever done.")

출처  
http://jalammar.github.io/illustrated-gpt2/  
https://d2l.ai/chapter_recurrent-modern/seq2seq.html  
https://www.tensorflow.org/tutorials/text/transformer
