## I referred to [the webpage](https://qiita.com/halhorn/items/c91497522be27bde17ce) for the implementation.

In [30]:
import numpy as np
import math

import tensorflow as tf

from keras.models import Model
from keras.layers import Dense, Dropout, Activation, Layer, Embedding, Input
from keras import backend as K
from keras.initializers import RandomNormal

In [56]:
class MultiheadAttention(Model):
    ## hidden_dim has to be multiples of head_num
    def __init__(self, hidden_dim=512, head_num=8, dropout_rate=0.1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hidden_dim = hidden_dim
        self.head_num = head_num
        self.dropout_rate = dropout_rate
        
        self.q_dense_layer = Dense(hidden_dim, use_bias=False, name="q_dense_layer")
        self.k_dense_layer = Dense(hidden_dim, use_bias=False, name="k_dense_layer")
        self.v_dense_layer = Dense(hidden_dim, use_bias=False, name="v_dense_layer")
        self.output_dense_layer = Dense(hidden_dim, use_bias=False, name="output_dense_layer")
        self.attention_dropout_layer = Dropout(dropout_rate, name="attention_dropout_layer")
        
    def split_heads(self, x):
        print("split_heads inputs", x)
        batch_size, max_len, hidden_dim = tf.unstack(tf.shape(x))
        #max_len, hidden_dim = tf.unstack(tf.shape(x))
        x = tf.reshape(x, [batch_size, max_len, self.head_num, self.hidden_dim // self.head_num])
        return tf.transpose(x, [0, 2, 1, 3])
    
    def combine_heads(self, heads):
        batch_size, _, max_len, _ = tf.unstack(tf.shape(heads))
        heads = tf.transpose(heads, [0, 2, 1, 3])
        return tf.reshape(heads, [batch_size, max_len, self.hidden_dim])
        
    def call(self, query, memory):
        #two arguments of query and memory are already encoded as embedded vectors for all words
        q = self.q_dense_layer(query)
        print("q", q)
        k = self.k_dense_layer(memory)
        v = self.v_dense_layer(memory)
        
        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)
        
        #for scaled dot-product
        depth_inside_each_head = self.hidden_dim // self.head_num
        q *= depth_inside_each_head ** -0.5
        
        score = tf.matmul(q, k, transpose_b=True)
        normalized_score = Activation("softmax")(score, name="attention_weight")
        normalized_score = self.attention_dropout_layer(normalized_score)
        attention_weighted_output = tf.matmul(normalized_score, v)
        attention_weighted_output = self.combine_head(attention_weighted_output)
        return self.output_dense_layer(attention_weighted_output)

In [57]:
# SlefAttention class inherits MultiheadAttention class so that it can make query and memory come from the same source.
class SelfAttention(MultiheadAttention):
    
    def call(self, query,):
        return super().call(query, query)

In [33]:
class PositionwiseFeedForwardNetwork(Model):
    
    def __init__(self, hidden_dim, dropout_rate, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hidden_dim = hidden_dim
        self.dropout_rate = dropout_rate
        
        self.first_dense_layer = Dense(hidden_dim*4, use_bias=True, activation="relu", name="first_dense_layer")
        self.second_dense_layer = Dense(hidden_dim, use_bias=True, activation="linear", name="second_dense_layer")
        self.dropout_layer = Dropout(dropout_rate, name="PFFN_dropout")
        
    def call(self, inputs):
        # make the network more flexible to learn for the first dense layer(non-linear transformation is used),
        # and put the network back into the same hidden dim as original(linear transformation is used)
        x = self.first_dense_layer(inputs)
        x = self.dropout_layer(x)
        return self.second_dense_layer(x)

In [59]:
class LayerNormalization(Layer):
    def build(self, input_shape):
        print("input shape", input_shape)
        hidden_dim = 512
        print("hidden dim:", hidden_dim)
        self.scale = self.add_weight("layer_norm_scale", shape=[hidden_dim],
                                    initializer="ones")
        self.shift = self.add_weight("layer_norm_shift", shape=[hidden_dim],
                                    initializer="zeros")
        super().build(input_shape)
        
    def call(self, inputs, epsilon=1e-6):
        print("inside of layer norm inputs", inputs)
        mean = K.mean(inputs, axis=[-1], keepdims=True)
        variance = K.var(inputs, axis=[-1], keepdims=True)
        normalized_inputs = (inputs - mean) / (K.sqrt(variance) + epsilon)
        print("normalized inputs", normalized_inputs)
        return normalized_inputs * self.scale + self.shift

In [40]:
class PreLayerNormPostResidualConnectionWrapper(Model):
    def __init__(self, layer, dropout_rate, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.layer = layer
        self.layer_norm = LayerNormalization()
        self.dropout_layer = Dropout(dropout_rate)
        
    def call(self, inputs, *args, **kwargs):
        print("pre layer norm starts")
        print("inside of prepos inputs", inputs)
        x = self.layer_norm(inputs)
        x = self.layer(x, *args, **kwargs)
        output = self.dropout_layer(x)
        return inputs + output

In [36]:
class AddPositionalEncoding(Layer):
    def call(self, inputs):
        data_type = inputs.dtype
        batch_size, max_len, emb_dim = tf.unstack(tf.shape(inputs))
        # i is from 0 to 255 when emb_dim is 512
        #so the doubled_i is from 0 to 510
        doubled_i = K.arange(emb_dim) // 2 * 2
        exponent = K.tile(K.expand_dims(doubled_i, 0), [max_len, 1])
        denominator_matrix = K.pow(10000.0, K.cast(exponent / emb_dim, data_type))
        
        # since cos(x) = sin(x + π/2), we convert the series of [sin, cos, sin, cos, ...]
        # into [sin, sin, sin, sin, ...]
        to_convert = K.cast(K.arange(emb_dim) % 2, data_type) * math.pi / 2
        convert_matrix = K.tile(tf.expand_dims(to_convert, 0), [max_len, 1])
        
        seq_pos = K.arange(max_len)
        numerator_matrix = K.cast(K.tile(K.expand_dims(seq_pos, 1), [1, emb_dim]), data_type)
        
        positinal_encoding = K.sin(numerator_matrix / denominator_matrix + convert_matrix)
        batched_positional_encoding = K.tile(K.expand_dims(positinal_encoding, 0), [batch_size, 1, 1])
        return inputs + batched_positional_encoding

In [37]:
PAD_ID = 0

class TokenEmbedding(Layer):
    def __init__(self, vocab_size, emb_dim, data_type="float32", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.data_type = data_type
        
    def build(self, input_shape):
        self.embedding_layer = Embedding(self.vocab_size,
                                   self.emb_dim,
                                   embeddings_initializer=RandomNormal(mean=0.0, stddev=self.emb_dim**-0.5)
                                  )
        super().build(input_shape)
        
    def call(self, inputs):
        mask_for_pads = tf.to_float(tf.not_equal(inputs, PAD_ID))
        embedding = self.embedding_layer(inputs)
        pads_masked_embedding = embedding * tf.expand_dims(mask_for_pads, -1)
        return pads_masked_embedding * (self.emb_dim ** 0.5)

In [38]:
class Encoder(Model):
    def __init__(self, vocab_size, stack_num, head_num, emb_dim, dropout_rate, max_len, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stack_num = stack_num
        self.head_num = head_num
        self.emb_dim = emb_dim
        self.dropout_rate = dropout_rate
        
        self.token_emb_layer = TokenEmbedding(vocab_size, emb_dim)
        self.add_pos_enc_layer = AddPositionalEncoding()
        self.input_dropout_layer = Dropout(dropout_rate)
        
        self.attention_block_list = []
        for _ in range(stack_num):
            self_attention_layer = SelfAttention(emb_dim, head_num, dropout_rate, name="self_attention_layer")
            pffn_layer = PositionwiseFeedForwardNetwork(emb_dim, dropout_rate, "pffn_layer")
            self.attention_block_list.append([
                PreLayerNormPostResidualConnectionWrapper(self_attention_layer, dropout_rate, name="prepos_self_attention_wrapper"),
                PreLayerNormPostResidualConnectionWrapper(pffn_layer, dropout_rate, name="prepos_pffn_wrapper")
            ])
        self.output_layer_norm = LayerNormalization()
        
    def call(self, inputs):
        x = self.token_emb_layer(inputs)
        print("first x", x)
        x = self.add_pos_enc_layer(x)
        print("second x", x)
        x = self.input_dropout_layer(x)
        print("third x", x)
        
        for i, set_of_layers_list in enumerate(self.attention_block_list):
            self_attention_layer, pffn_layer = tuple(set_of_layers_list)
            print("fourth x", x)
            x = self_attention_layer(x)
            print("fifth x", x)
            x = pffn_layer(x)
            
        return self.output_layer_norm(x)

In [60]:
# Transformer classification model
MAX_LEN = 717

inputs = Input(shape=(MAX_LEN,))
transformer_encoder = Encoder(vocab_size=8000, stack_num=6, head_num=8, emb_dim=512, dropout_rate=0.1, max_len=MAX_LEN)
encoder_output = transformer_encoder(inputs)
summarized_vecs = encoder_output[:, 0, :]
outputs = Dense(MAX_LEN, activation="softmax")
model = Model(inputs, outputs)
model.summary()

first x Tensor("encoder_17/token_embedding_17/mul_1:0", shape=(?, 717, 512), dtype=float32)
second x Tensor("encoder_17/add_positional_encoding_17/add_1:0", shape=(?, 717, 512), dtype=float32)
third x Tensor("encoder_17/dropout_209/cond/Merge:0", shape=(?, 717, 512), dtype=float32)
fourth x Tensor("encoder_17/dropout_209/cond/Merge:0", shape=(?, 717, 512), dtype=float32)
pre layer norm starts
inside of prepos inputs Tensor("encoder_17/dropout_209/cond/Merge:0", shape=(?, 717, 512), dtype=float32)
input shape (None, 717)
hidden dim: 512
inside of layer norm inputs Tensor("encoder_17/dropout_209/cond/Merge:0", shape=(?, 717, 512), dtype=float32)
normalized inputs Tensor("encoder_17/prepos_self_attention_wrapper/layer_normalization_209/truediv:0", shape=(?, 717, 512), dtype=float32)
q Tensor("encoder_17/prepos_self_attention_wrapper/self_attention_layer/q_dense_layer/Reshape_2:0", shape=(?, 512), dtype=float32)
split_heads inputs Tensor("encoder_17/prepos_self_attention_wrapper/self_atten

NameError: name 'batch_size' is not defined