<a href="https://colab.research.google.com/github/KennethanCeyer/research/blob/master/DL/transformer/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Review: Transformer

## Credits

- **Presenter**: JaeYoung Lee
- **Reviewer**: SungMin Han
- **Source code was copied from:** https://pytorch.org/tutorials/beginner/transformer_tutorial.html

## Paper information

- https://arxiv.org/abs/1706.03762


# Cell 1

This section covers most of the model architectures discussed in Attention Is All You Need, such as the definition of the Transformer encoder and decoder, the implementation of MultiHeadAttention and Positional Encoding.

Comments on each implementation are specified for each block, and the utility functions are separated into separate blocks and explained.

## Imports

Preload a list of packages for future use.

In [None]:
import copy
import math
from typing import Optional, Any

import torchtext
from torchtext.data.utils import get_tokenizer

import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F
from torch.nn import Module
from torch.nn import ModuleList
from torch.nn.init import xavier_uniform_
from torch.nn.init import constant_
from torch.nn import Dropout
from torch.nn import Linear
from torch.nn import LayerNorm
from torch.nn import Parameter
from torch.nn.modules.linear import _LinearWithBias

## Utility functions

Afterwards, functions frequently used in the code are bundled and provided as utility functions. The feature of each function is as follows.

function name | description
--------------|--------------
`_get_clones()` | It duplicates the layers of each encoder and decoder by the number of given arguments N.
`_get_activation_fn()` | Returns the activation function from `torch.nn.functional` that matches the activation name (`'relu'` or `'gelu'`) provided as a string.

In [None]:
def _get_clones(module, N):
    return ModuleList([copy.deepcopy(module) for i in range(N)])


def _get_activation_fn(activation):
    if activation == "relu":
        return F.relu
    elif activation == "gelu":
        return F.gelu

    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))

# MultiheadAttention

Multi-head attention splits `Q`, `K`, and `V` into `h` heads, performs the attention operation on each head in parallel, and then concatenates the per-head results.


In [None]:
class MultiheadAttention(Module):
    """Multi-head attention layer that delegates the math to
    ``F.multi_head_attention_forward``.

    The module only owns the parameters (input projections, optional key/value
    biases, output projection); ``forward`` hands them to the functional
    implementation.

    Args:
        embed_dim: Model dimension. Must be divisible by ``num_heads``.
        num_heads: Number of attention heads; each head works on
            ``embed_dim // num_heads`` features.
        dropout: Dropout probability forwarded to the functional call.
        bias: If true, a packed input-projection bias of size ``3 * embed_dim``
            is created; otherwise ``in_proj_bias`` is registered as ``None``.
        add_bias_kv: If true, learnable ``bias_k`` / ``bias_v`` parameters of
            shape ``(1, 1, embed_dim)`` are created and forwarded.
        add_zero_attn: Forwarded unchanged to the functional call.
        kdim: Feature size of ``key`` (defaults to ``embed_dim``).
        vdim: Feature size of ``value`` (defaults to ``embed_dim``).
    """
    bias_k: Optional[torch.Tensor]
    bias_v: Optional[torch.Tensor]

    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout

        # Split the embedding evenly across the attention heads.
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        if not self._qkv_same_embed_dim:
            # K and/or V have their own feature size, so Q, K and V each get a
            # separate projection weight and the packed weight is unused.
            self.q_proj_weight = Parameter(torch.empty(embed_dim, embed_dim))
            self.k_proj_weight = Parameter(torch.empty(embed_dim, self.kdim))
            self.v_proj_weight = Parameter(torch.empty(embed_dim, self.vdim))
            self.register_parameter('in_proj_weight', None)
        else:
            # Q, K and V share one feature size, so their projections are packed
            # into a single (3 * embed_dim, embed_dim) weight.
            self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
            self.register_parameter('q_proj_weight', None)
            self.register_parameter('k_proj_weight', None)
            self.register_parameter('v_proj_weight', None)

        # The input-projection bias is packed as well: one vector of size
        # 3 * embed_dim covering Q, K and V.
        if bias:
            self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        else:
            self.register_parameter('in_proj_bias', None)

        # Output projection: a plain linear layer that always has a bias.
        # The public ``Linear`` is used instead of the private
        # ``_LinearWithBias`` helper so the class does not depend on a torch
        # internal; parameter names (``out_proj.weight`` / ``out_proj.bias``)
        # are the same.
        self.out_proj = Linear(embed_dim, embed_dim, bias=True)

        if add_bias_kv:
            self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
            self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        # Initialize parameters
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize projection weights (Xavier uniform) and biases."""
        # Xavier-uniform samples the weights from U(-a, a).
        if self._qkv_same_embed_dim:
            xavier_uniform_(self.in_proj_weight)
        else:
            xavier_uniform_(self.q_proj_weight)
            xavier_uniform_(self.k_proj_weight)
            xavier_uniform_(self.v_proj_weight)

        # Projection biases start at zero.
        if self.in_proj_bias is not None:
            constant_(self.in_proj_bias, 0.)
            constant_(self.out_proj.bias, 0.)
        # The optional key/value biases use Xavier-normal. It is reached through
        # ``nn.init`` because ``xavier_normal_`` is not imported by name in this
        # file (the bare name raised NameError when add_bias_kv=True).
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)

    def __setstate__(self, state):
        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        if '_qkv_same_embed_dim' not in state:
            state['_qkv_same_embed_dim'] = True

        super(MultiheadAttention, self).__setstate__(state)

    def forward(self, query, key, value, key_padding_mask=None,
                need_weights=True, attn_mask=None):
        """Run multi-head attention through ``F.multi_head_attention_forward``.

        Tensor shape conventions are those of the functional implementation.
        The functional's return value is passed through unchanged; callers in
        this file take element ``[0]`` as the attention output.
        """
        # The computation itself lives in torch's multi_head_attention_forward:
        # https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py
        # Summary of that function, as reviewed against the linked source:
        #  1. If Q, K and V share the embedding size, q/k/v are obtained with a
        #     linear operation from the packed input projection and bias;
        #     otherwise each one is projected with its own weight.
        #  2. Q is scaled by pow(head_dim, -0.5).
        #  3. attn_mask is applied to the attention scores. In this notebook the
        #     mask comes from `generate_square_subsequent_mask`, which puts -inf
        #     above the diagonal, so a position cannot attend to later tokens
        #     and only sees the current and previous ones.
        #  4. The masked scores go through softmax, then dropout.
        #     (https://github.com/pytorch/pytorch/blob/671ee71ad4b6f507218d1cad278a8e743780b716/torch/nn/functional.py#L4297-L4299)
        #  5. The resulting weights are multiplied by V to get the attention output.
        #     (https://github.com/pytorch/pytorch/blob/671ee71ad4b6f507218d1cad278a8e743780b716/torch/nn/functional.py#L4301-L4304)

        if not self._qkv_same_embed_dim:
            # Separate projection weights: tell the functional to ignore the
            # (None) packed weight and use q/k/v_proj_weight instead.
            return F.multi_head_attention_forward(
                query, key, value, self.embed_dim, self.num_heads,
                self.in_proj_weight, self.in_proj_bias,
                self.bias_k, self.bias_v, self.add_zero_attn,
                self.dropout, self.out_proj.weight, self.out_proj.bias,
                training=self.training,
                key_padding_mask=key_padding_mask, need_weights=need_weights,
                attn_mask=attn_mask, use_separate_proj_weight=True,
                q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
                v_proj_weight=self.v_proj_weight)

        return F.multi_head_attention_forward(
            query, key, value, self.embed_dim, self.num_heads,
            self.in_proj_weight, self.in_proj_bias,
            self.bias_k, self.bias_v, self.add_zero_attn,
            self.dropout, self.out_proj.weight, self.out_proj.bias,
            training=self.training,
            key_padding_mask=key_padding_mask, need_weights=need_weights,
            attn_mask=attn_mask)

# TransformerEncoder

A TransformerEncoder consisting of `N` TransformerEncoderLayers is calculated by sequentially using the output returned from the model as the input value of the next layer in the forward process. This process is a sequential operation and cannot be processed in parallel.

In [None]:
class TransformerEncoder(Module):
    """A stack of ``num_layers`` encoder layers applied one after another.

    ``encoder_layer`` is used as a template: it is cloned ``num_layers`` times
    via ``_get_clones``. ``norm``, when given, is applied once to the output
    of the last layer.
    """
    __constants__ = ['norm']

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Feed ``src`` through every layer in order and return the result.

        ``mask`` is handed to each layer as ``src_mask``; the key padding mask
        is passed through under its own name.
        """
        hidden = src
        for layer in self.layers:
            hidden = layer(hidden, src_mask=mask, src_key_padding_mask=src_key_padding_mask)

        # The final normalization is optional.
        return hidden if self.norm is None else self.norm(hidden)

# TransformerDecoder

It is similar in structure to `TransformerEncoder`. A TransformerDecoder consists of `N` TransformerDecoderLayers; in the forward pass the output of each layer is used as the `tgt` input of the next layer, while the same `memory` is passed to every layer. The layers are applied sequentially and cannot be processed in parallel.

In [None]:
class TransformerDecoder(Module):
    """A stack of ``num_layers`` decoder layers applied one after another.

    ``decoder_layer`` is used as a template: it is cloned ``num_layers`` times
    via ``_get_clones``. ``norm``, when given, is applied once to the output
    of the last layer.
    """
    __constants__ = ['norm']

    def __init__(self, decoder_layer, num_layers, norm=None):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Feed ``tgt`` through every layer in order and return the result.

        Each layer receives the previous layer's output as its target input,
        together with the same ``memory`` and the same four masks.
        """
        # The masks do not change between layers, so bundle them once.
        shared_masks = dict(
            tgt_mask=tgt_mask,
            memory_mask=memory_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )

        hidden = tgt
        for layer in self.layers:
            hidden = layer(hidden, memory, **shared_masks)

        # The final normalization is optional.
        return hidden if self.norm is None else self.norm(hidden)

# TransformerEncoderLayer

The encoder has 2 sub-layers. The first layer performs multi-head attention and normalizes the residual connection `src` and attention dropout values. In the second layer, feed forward is performed and the result is applied as a dropout. Once again, it is added with the residual connection `src` and normalized to obtain the result.

In [None]:
class TransformerEncoderLayer(Module):
    """One encoder layer: self-attention followed by a feed-forward network.

    Both sub-layers are wrapped the same way: dropout on the sub-layer output,
    residual addition with the sub-layer input, then layer normalization.

    Args:
        d_model: Feature size of the input and output.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout probability used by every dropout in the layer.
        activation: ``"relu"`` or ``"gelu"`` (resolved by ``_get_activation_fn``).
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
        super(TransformerEncoderLayer, self).__init__()
        # Sub-layer 1: multi-head self-attention.
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        # Sub-layer 2: feed-forward network,
        # d_model -> dim_feedforward -> activation -> dropout -> d_model.
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        # One LayerNorm and one residual dropout per sub-layer.
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)

    def __setstate__(self, state):
        # States saved without an 'activation' entry fall back to relu.
        state.setdefault('activation', F.relu)
        super().__setstate__(state)

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Apply the attention and feed-forward sub-layers to ``src``."""
        # Norm(residual input + self-attention output)
        attended = self.norm1(src + self._attention_block(src, src_mask, src_key_padding_mask))
        # Norm(residual input + feed-forward output)
        return self.norm2(attended + self._feed_forward_block(attended))

    def _attention_block(self, x, attn_mask, key_padding_mask):
        """Self-attention over ``x`` followed by the first residual dropout."""
        attn_out = self.self_attn(x, x, x, attn_mask=attn_mask,
                                  key_padding_mask=key_padding_mask)[0]
        return self.dropout1(attn_out)

    def _feed_forward_block(self, x):
        """Feed-forward network followed by the second residual dropout."""
        hidden = self.dropout(self.activation(self.linear1(x)))
        return self.dropout2(self.linear2(hidden))

# TransformerDecoderLayer

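The decoder layer has the same two kinds of sub-layer as the encoder layer (self-attention and feed-forward) plus one additional sub-layer between them: multi-head attention over the encoder output (`memory`).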

In [None]:
class TransformerDecoderLayer(Module):
    """One decoder layer: self-attention, attention over ``memory``, feed-forward.

    All three sub-layers are wrapped the same way: dropout on the sub-layer
    output, residual addition with the sub-layer input, then layer
    normalization.

    Args:
        d_model: Feature size of the input and output.
        nhead: Number of attention heads in both attention sub-layers.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout probability used by every dropout in the layer.
        activation: ``"relu"`` or ``"gelu"`` (resolved by ``_get_activation_fn``).
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
        super(TransformerDecoderLayer, self).__init__()
        # Sub-layer 1: self-attention over the target sequence.
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Sub-layer 2: attention with the target as query and memory as key/value.
        self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Sub-layer 3: feed-forward network.
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        # One LayerNorm and one residual dropout per sub-layer.
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
        self.dropout3 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)

    def __setstate__(self, state):
        # States saved without an 'activation' entry fall back to relu.
        state.setdefault('activation', F.relu)
        super().__setstate__(state)

    def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Apply the three sub-layers to ``tgt``, attending to ``memory`` in the second."""
        hidden = self.norm1(tgt + self._self_attention_block(tgt, tgt_mask, tgt_key_padding_mask))
        hidden = self.norm2(
            hidden + self._memory_attention_block(hidden, memory, memory_mask, memory_key_padding_mask))
        return self.norm3(hidden + self._feed_forward_block(hidden))

    def _self_attention_block(self, x, attn_mask, key_padding_mask):
        """Self-attention over ``x`` followed by the first residual dropout."""
        attn_out = self.self_attn(x, x, x, attn_mask=attn_mask,
                                  key_padding_mask=key_padding_mask)[0]
        return self.dropout1(attn_out)

    def _memory_attention_block(self, x, memory, attn_mask, key_padding_mask):
        """Attention of ``x`` over ``memory`` followed by the second residual dropout."""
        attn_out = self.multihead_attn(x, memory, memory, attn_mask=attn_mask,
                                       key_padding_mask=key_padding_mask)[0]
        return self.dropout2(attn_out)

    def _feed_forward_block(self, x):
        """Feed-forward network followed by the third residual dropout."""
        hidden = self.dropout(self.activation(self.linear1(x)))
        return self.dropout3(self.linear2(hidden))

# PositionalEncoding

Unlike RNN, positional encoding(PE) is performed to register the order information of the input data in the Transformer model that processes input values in parallel. In this case, a function that provides frequencies such as `sin()` and `cos()` is used.

`max_len` is the maximum sequence length for which encodings are precomputed, and `d_model` is the size of the embedding dimension. Positional Encoding injects order information into the embeddings by adding sinusoids of different frequencies to them.

In [None]:
class PositionalEncoding(Module):
    """Add fixed sinusoidal positional encodings to an input, then apply dropout.

    A table of shape ``(max_len, 1, d_model)`` is precomputed and stored as the
    non-trainable buffer ``pe``. Even feature indices hold ``sin`` values and
    odd feature indices hold ``cos`` values.

    Args:
        d_model: Embedding dimension. Both even and odd sizes are supported.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions to precompute; ``forward`` can only
            handle inputs whose first dimension is at most ``max_len``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # One row of d_model features for each of the max_len positions.
        pe = torch.zeros(max_len, d_model)

        # Column vector of positions 0 .. max_len - 1; the extra dimension lets
        # it broadcast against the row of frequencies below.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # Frequencies 1 / 10000^(2i / d_model) from "Attention Is All You Need",
        # computed as exp(2i * -ln(10000) / d_model), which is the same value.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        # sin goes into the even-numbered features.
        pe[:, 0::2] = torch.sin(angles)

        # cos goes into the odd-numbered features. When d_model is odd there is
        # one fewer odd feature than even ones, so the last frequency is
        # dropped; without the slice the assignment fails with a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # (max_len, d_model) -> (max_len, 1, d_model), so the table broadcasts
        # over dimension 1 of the input in forward().
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        The first dimension of ``x`` is treated as the position axis.
        """
        # Add PE with input to put the positional information to the input embedding
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Cell 2

## Purpose


In [None]:
# Cell 2

#####################################################################################################
# class variable        model variable        description
# (Cell2)               (Cell5)
#####################################################################################################
# ntoken                ntokens               size of vocabulary
# ninp                  emsize                embedding dimension
# nhead                 nhead                 the number of heads in the multi-head attention models
# nhid                  nhid                  the dimension of the feedforward network model 
# nlayers               nlayers               the number of TransformerEncoder
# dropout               dropout               the dropout value
#####################################################################################################
class TransformerModel(nn.Module):
    """Language model: embedding -> positional encoding -> encoder stack -> linear.

    Args:
        ntoken: Vocabulary size (embedding rows and output logits).
        ninp: Embedding dimension.
        nhead: Number of attention heads per encoder layer.
        nhid: Hidden size of the feed-forward network in each encoder layer.
        nlayers: Number of encoder layers.
        dropout: Dropout probability for the positional encoding and encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        layer_template = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(layer_template, nlayers)
        # Note the naming: `encoder` is the token embedding and `decoder` is
        # the linear layer that maps features back to vocabulary logits.
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return an ``(sz, sz)`` float mask: ``-inf`` above the diagonal, ``0.0`` elsewhere.

        Row ``i`` therefore leaves positions ``0..i`` unmasked and masks every
        later position.
        """
        blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float)
        # triu(diagonal=1) keeps only the strictly upper triangle and zeroes the rest.
        return torch.triu(blocked, diagonal=1)

    def init_weights(self):
        """Draw embedding and output weights from U(-0.1, 0.1) and zero the output bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, src_mask):
        """Return vocabulary logits for the token ids in ``src``."""
        # Embeddings are scaled by sqrt(ninp) before the positional encoding is added.
        embedded = self.encoder(src) * math.sqrt(self.ninp)
        encoded = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(encoded)

# Cell 3

Cell 3 goes through the process of preparing a dataset for training or testing the Transformer. The dataset is based on torchtext and uses the WikiText2 dataset. Specify the batch size of `20`.

In [None]:
# Text pipeline: tokenize with the "spacy" tokenizer, lowercase every token and
# wrap each example in <sos> / <eos> markers.
# NOTE(review): `torchtext.data.Field` and `WikiText2.splits` belong to the
# older torchtext API — confirm the installed torchtext version still provides them.
TEXT = torchtext.data.Field(tokenize=get_tokenizer("spacy"),
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)
# Load the train / validation / test splits of WikiText2, processed through TEXT.
train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
# The vocabulary is built from the training split only.
TEXT.build_vocab(train_txt)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batchify(data, bsz):
    """Turn a dataset split into ``bsz`` parallel token columns on ``device``.

    The text of the split's first example is numericalized with ``TEXT``,
    trimmed to a multiple of ``bsz`` along dimension 0, and laid out so that
    the returned tensor has ``bsz`` columns, each one a contiguous run of the
    token stream.
    """
    # presumably a tensor of vocabulary ids — verify against TEXT.numericalize
    token_ids = TEXT.numericalize([data.examples[0].text])

    # Keep only as many elements as fit evenly into bsz columns; the remainder is dropped.
    rows_per_column = token_ids.size(0) // bsz
    trimmed = token_ids.narrow(0, 0, rows_per_column * bsz)

    # view() produces bsz rows of consecutive tokens; transposing turns those
    # rows into columns.
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

# Number of parallel token columns: 20 for training, 10 for validation and test.
batch_size = 20
eval_batch_size = 10
# Each split is numericalized, trimmed and moved to `device` by batchify().
train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)

downloading wikitext-2-v1.zip


wikitext-2-v1.zip: 100%|██████████| 4.48M/4.48M [00:00<00:00, 8.61MB/s]


extracting


# Cell 4

Generates an input chunk and its target chunk of at most `bptt` rows, for training and evaluation. For the Wiki2 dataset, `data` is a slice of up to `bptt` rows of the source starting at row `i`, and `target` is the same slice shifted forward by one token and flattened.

In [None]:
# Maximum number of rows (time steps) in one training/evaluation chunk.
bptt = 35


def get_batch(source, i):
    """Cut one (input, target) chunk out of ``source`` starting at row ``i``.

    Returns:
        A tuple ``(data, target)``: ``data`` is ``source[i:i + seq_len]`` and
        ``target`` is the same window shifted forward by one row and flattened
        to 1-D. ``seq_len`` is ``bptt`` except near the end of ``source``,
        where it shrinks so the shifted window still fits.
    """
    # The last row has no successor to predict, hence the "- 1".
    seq_len = min(bptt, len(source) - 1 - i)
    stop = i + seq_len

    data = source[i:stop]
    # Flattened because the targets are fed to the cross-entropy loss.
    target = source[i + 1:stop + 1].reshape(-1)
    return data, target

# Cell 5

This is the section that defines the main parameters used in the Transformer model.
The descriptions of the main parameters are as follows.

## Parameters

**ntokens**

The size of the vocabulary, i.e. the number of distinct tokens in the vocab built from the Wiki2 dataset; here it is `28871`.

**emsize**

As word embedding used in Transformer, an embedding vector consisting of `200` dimensions is used. The paper uses a size of 512, but the number of source input tokens is relatively small compared to that of the paper, so the code uses a smaller value.

**nlayers**

The number of stacked Transformer layers. Every layer has exactly the same structure, but the layers do not share weights. Each encoder layer has two sub-layers. Instead of the 6 layers specified in the paper for each of the encoder and the decoder, the number of layers is set to `2` here for testing. Note that in this notebook `nlayers` is only passed to `TransformerEncoder`.

**nhead**

This is the number of heads used in the multi-head attention mechanism. The embedding dimension is split evenly across the heads, so `emsize` must be divisible by `nhead`. Because the heads are independent of one another they can be computed in parallel, so on hardware with many cores a larger number of heads can make good use of the available compute. Here, it is set to `2` for testing purposes.

**dropout**

The dropout probability. It is used in the positional encoding, inside the attention layers, and in the feed-forward sub-layers.

In [None]:
# Hyper-parameters of the language model. emsize must be divisible by nhead
# (MultiheadAttention asserts this); here each head gets 200 / 2 = 100 features.
ntokens = len(TEXT.vocab.stoi)  # the size of vocabulary
emsize = 200                    # embedding dimension
nhid = 200                      # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2                     # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2                       # the number of heads in the multiheadattention models
dropout = 0.2                   # the dropout value
# Build the model and move its parameters to the selected device.
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

#####################################################################################################
# class variable        model variable        description
#####################################################################################################
# ntoken                ntokens               size of vocabulary
# ninp                  emsize                embedding dimension
# nhead                 nhead                 the number of heads in the multi-head attention models
# nhid                  nhid                  the dimension of the feedforward network model 
# nlayers               nlayers               the number of TransformerEncoder
# dropout               dropout               the dropout value
#####################################################################################################

# Cell 6

In this section, we train the model on batches of the Wiki2 data and evaluate the loss. For training and evaluation of the model, the `train()` and `evaluate()` functions are provided respectively.

In [None]:
# Cross-entropy between the flattened logits and the flattened target ids.
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 at every scheduler.step(); the training
# loop calls it once per epoch. (The second argument, 1.0, is the step size.)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():
    """Run one training epoch over ``train_data`` and log progress.

    Relies on module-level state: ``model``, ``optimizer``, ``scheduler``,
    ``criterion``, ``train_data``, ``bptt``, ``device``, ``TEXT`` and the
    current ``epoch`` number set by the surrounding training loop (used only
    in the log line).

    Every 200 batches the average loss, its perplexity, the time per batch and
    the current learning rate are printed, and the running totals are reset.
    """
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    log_interval = 200
    num_batches = len(train_data) // bptt
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        # A chunk can be shorter than bptt (the final one); the mask must match
        # the chunk length, so rebuild it whenever the sizes differ.
        if data.size(0) != src_mask.size(0):
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # Clip the gradient norm to 0.5 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # get_last_lr() reports the rate the optimizer is actually using.
            # get_lr() is meant to be called from inside scheduler.step() and
            # returns a misleading value (with a warning) when called here.
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, num_batches, scheduler.get_last_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    """Return the average per-row loss of ``eval_model`` on ``data_source``.

    The model is put in evaluation mode and run without gradient tracking.
    Each chunk's loss is weighted by its number of rows, and the total is
    divided by ``len(data_source) - 1``.

    Relies on module-level state: ``criterion``, ``bptt``, ``device``, ``TEXT``
    and ``get_batch``.
    """
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    # Build the mask from the model being evaluated, not from the global
    # `model`, so the function works for any model passed in.
    src_mask = eval_model.generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            # A chunk can be shorter than bptt (the final one); the mask must
            # match the chunk length, so rebuild it whenever the sizes differ.
            if data.size(0) != src_mask.size(0):
                src_mask = eval_model.generate_square_subsequent_mask(data.size(0)).to(device)
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            # criterion returns a mean, so weight it by the chunk's row count.
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

# Cell 7

Train the model for `3` epochs and record the model with the lowest validation loss as `best_model`. One caveat from the reference code: `model` is an object whose parameters are updated in place by `train()`, so a plain assignment `best_model = model` only stores a reference, and `best_model` would always reflect the state after the last epoch rather than the best one. A copy of the model (for example `copy.deepcopy(model)`) is needed to really keep the best weights.

In [None]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

# `epoch` is also read by train() for its log line.
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Keep a snapshot of the weights. A plain `best_model = model` would
        # only alias the live model, which later epochs keep updating in place.
        best_model = copy.deepcopy(model)

    # Decay the learning rate once per epoch.
    scheduler.step()



| epoch   1 |   200/ 3195 batches | lr 5.00 | ms/batch 18.02 | loss  7.39 | ppl  1619.44
| epoch   1 |   400/ 3195 batches | lr 5.00 | ms/batch 16.79 | loss  6.29 | ppl   541.46
| epoch   1 |   600/ 3195 batches | lr 5.00 | ms/batch 16.60 | loss  5.97 | ppl   393.34
| epoch   1 |   800/ 3195 batches | lr 5.00 | ms/batch 16.68 | loss  5.81 | ppl   334.41
| epoch   1 |  1000/ 3195 batches | lr 5.00 | ms/batch 16.84 | loss  5.81 | ppl   333.84
| epoch   1 |  1200/ 3195 batches | lr 5.00 | ms/batch 17.03 | loss  5.75 | ppl   314.98
| epoch   1 |  1400/ 3195 batches | lr 5.00 | ms/batch 16.96 | loss  5.71 | ppl   301.03
| epoch   1 |  1600/ 3195 batches | lr 5.00 | ms/batch 16.87 | loss  5.60 | ppl   271.56
| epoch   1 |  1800/ 3195 batches | lr 5.00 | ms/batch 17.10 | loss  5.63 | ppl   278.36
| epoch   1 |  2000/ 3195 batches | lr 5.00 | ms/batch 17.07 | loss  5.63 | ppl   279.01
| epoch   1 |  2200/ 3195 batches | lr 5.00 | ms/batch 17.17 | loss  5.57 | ppl   262.34
| epoch   1 |  2400/ 

# Cell 8

Finally, the eval value of `best_model`, which shows the best result, is printed.

In [None]:
# Evaluate the selected model on the held-out test split and report the loss
# together with its perplexity, exp(loss).
test_loss = evaluate(best_model, test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss  4.84 | test ppl   126.72
