# Transformer

encoder结构

In [None]:
import tensorflow as tf
from sequence_feature_layer import SequenceFeatures
from tensorflow import feature_column as fc
from tensorflow.keras.layers import Layer, Dense, LayerNormalization, Dropout, Embedding, Conv1D

## 准备工作

In [None]:
# Two hashed sequence-id columns (10 buckets each), each wrapped in a 64-d embedding column.
seq = fc.sequence_categorical_column_with_hash_bucket(
    key='seq', hash_bucket_size=10, dtype=tf.int64)
target = fc.sequence_categorical_column_with_hash_bucket(
    key='target', hash_bucket_size=10, dtype=tf.int64)

seq_col = fc.embedding_column(categorical_column=seq, dimension=64)
target_col = fc.embedding_column(categorical_column=target, dimension=64)
columns = [seq_col, target_col]

# Toy batch of three examples: 'seq' holds up to two ids per example
# (the third example has only one), 'target' holds exactly one id per example.
features = {
    "seq": tf.sparse.SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0], [1, 1], [2, 0]],
        values=[1100, 1101, 1102, 1101, 1103],
        dense_shape=[3, 2],
    ),
    "target": tf.sparse.SparseTensor(
        indices=[[0, 0], [1, 0], [2, 0]],
        values=[1102, 1103, 1100],
        dense_shape=[3, 1],
    ),
}

# Display the padded dense view of the 'seq' feature.
tf.sparse.to_dense(features['seq'])

<tf.Tensor: shape=(3, 2), dtype=int32, numpy=
array([[1100, 1101],
       [1102, 1101],
       [1103,    0]], dtype=int32)>

In [None]:
# Turn the sparse features into per-column embedding tensors and length vectors.
# Both results are indexed by '<column key>_embedding'.
sequence_feature_layer = SequenceFeatures(columns, name='sequence_features_input_layer')
sequence_inputs, sequence_lengths = sequence_feature_layer(features)

sequence_input, sequence_length = (
    sequence_inputs['seq_embedding'],
    sequence_lengths['seq_embedding'],
)
target_input, target_length = (
    sequence_inputs['target_embedding'],
    sequence_lengths['target_embedding'],
)

# Display the embedded 'seq' shape together with its per-example lengths.
(tf.shape(sequence_input), sequence_length)

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  2, 64], dtype=int32)>,
 <tf.Tensor: shape=(3,), dtype=int64, numpy=array([2, 2, 1])>)

In [None]:
# Append the target embedding after the history along the time axis (axis 1),
# and add up the corresponding lengths.
x_ = tf.concat(values=[sequence_input, target_input], axis=1)
x_length = tf.add(sequence_length, target_length)

# Display the shape of the concatenated input.
tf.shape(x_)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  3, 64], dtype=int32)>

In [None]:
# Build 1.0/0.0 padding masks with a trailing singleton axis so they broadcast
# over the embedding dimension.
# maxlen is pinned to the time dimension of the matching embedding tensor:
# without it tf.sequence_mask sizes the mask by the longest length in the batch,
# which would no longer line up with x_ if every row were shorter than the
# padded width.
seq_mask = tf.expand_dims(
    tf.sequence_mask(sequence_length, maxlen=tf.shape(sequence_input)[1], dtype=tf.float32),
    axis=-1)
target_mask = tf.expand_dims(
    tf.sequence_mask(target_length, maxlen=tf.shape(target_input)[1], dtype=tf.float32),
    axis=-1)

# Same concatenation order as x_ (history first, then target).
mask_ = tf.concat([seq_mask, target_mask], axis=1)
mask_

<tf.Tensor: shape=(3, 3, 1), dtype=float32, numpy=
array([[[1.],
        [1.],
        [1.]],

       [[1.],
        [1.],
        [1.]],

       [[1.],
        [0.],
        [1.]]], dtype=float32)>

## Layer搭建

### multi_head_attention

In [None]:
def scaled_dot_product_attention(q, k, v, mask, causality=True):
    """
    Scaled dot-product attention.

    :param q: A 3d tensor with shape of (None, seq_len, depth), depth = d_model // num_heads
    :param k: A 3d tensor with shape of (None, seq_len, depth)
    :param v: A 3d tensor with shape of (None, seq_len, depth)
    :param mask: A tensor that is compared with 0 and broadcast against the
        (None, seq_len, seq_len) logits; logits are replaced by a large negative
        value wherever the broadcast mask equals 0.
    :param causality: Boolean. If True, logits above the main diagonal are also
        replaced by the large negative value, default True
    :return: A 3d tensor with shape of (None, seq_len, depth)
    """
    # Stand-in for "minus infinity" used to suppress logits before the softmax.
    neg_large = -2 ** 32 + 1

    # Q.K^T scaled by sqrt(depth)  -> (None, seq_len, seq_len)
    depth = tf.cast(k.shape[-1], dtype=tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.sqrt(depth)

    # Suppress the entries selected by the caller-supplied mask.
    mask_fill = tf.ones_like(logits) * neg_large
    logits = tf.where(tf.equal(mask, 0), mask_fill, logits)

    if causality:
        # Ones on and below the diagonal, zeros above it.
        lower_tri = tf.linalg.band_part(tf.ones_like(logits), -1, 0)
        causal_fill = tf.ones_like(lower_tri) * neg_large
        logits = tf.where(tf.equal(lower_tri, 0), causal_fill, logits)

    # Softmax over the last axis, then weight the values  -> (None, seq_len, depth)
    weights = tf.nn.softmax(logits=logits)
    return tf.matmul(weights, v)


class EncoderLayer(Layer):
    """One Transformer encoder block: multi-head self-attention followed by a
    single-layer feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(residual + Dropout(sublayer(...)))``.
    Because of the two residual additions, the last dimension of the input,
    ``att_hidden`` and ``ffn_dims`` must be addition-compatible (in practice equal).

    :param num_heads: number of attention heads; ``att_hidden`` is split into
        this many equal chunks.
    :param att_hidden: output width of the query/key/value projections.
    :param embedding_size: stored on the layer; not otherwise used by this block.
    :param ffn_dims: number of units of the feed-forward Dense layer.
    :param dropout: rate passed to both Dropout layers.
    :param norm_training: ``trainable`` flag of both LayerNormalization layers.
    :param causality: forwarded to ``scaled_dot_product_attention``; a falsy
        value (including the default ``None``) disables the causal mask.
    """

    def __init__(self, num_heads=None, att_hidden=None, embedding_size=None, ffn_dims=None, dropout=None, norm_training=None, causality=None,
                 **kwargs):
        # Initialise the Keras base class before assigning any sub-layer so the
        # attribute tracking happens on a fully constructed Layer.
        super().__init__(**kwargs)

        self.num_heads = num_heads
        self.att_hidden = att_hidden
        self.embedding_size = embedding_size
        self.ffn_dims = ffn_dims
        self.dropout = dropout
        self.norm_training = norm_training
        self.causality = causality

        def _uniform_init():
            # A fresh initializer instance per layer.
            return tf.keras.initializers.VarianceScaling(distribution='uniform')

        # Bias-free linear projections for queries, keys and values.
        self.wq = Dense(self.att_hidden, activation=None, use_bias=False,
                        kernel_initializer=_uniform_init(), name='wq')
        self.wk = Dense(self.att_hidden, activation=None, use_bias=False,
                        kernel_initializer=_uniform_init(), name='wk')
        self.wv = Dense(self.att_hidden, activation=None, use_bias=False,
                        kernel_initializer=_uniform_init(), name='wv')

        # Dropout and layer normalisation, one pair per sub-layer.
        self.dropout_1 = Dropout(self.dropout, trainable=True)
        self.dropout_2 = Dropout(self.dropout, trainable=True)
        self.layer_norm_1 = LayerNormalization(epsilon=1e-6, trainable=self.norm_training)
        self.layer_norm_2 = LayerNormalization(epsilon=1e-6, trainable=self.norm_training)

        # Feed-forward network.
        self.ffn = Dense(units=self.ffn_dims, activation='relu', use_bias=True,
                         kernel_initializer=_uniform_init())

    def call(self, inputs, *args, **kwargs):
        """Apply the encoder block.

        :param inputs: a pair ``(x, mask_)``; ``x`` is the 3d input sequence and
            ``mask_`` a 3d mask that is replicated once per head and handed to
            ``scaled_dot_product_attention``.
        :return: the block output, produced by the second layer normalisation.
        """
        x, mask_ = inputs

        # Project, then move the heads into the batch axis.
        q = self._process_multi_head(self.wq(x))
        k = self._process_multi_head(self.wk(x))
        v = self._process_multi_head(self.wv(x))

        # One copy of the mask per head, stacked along the batch axis in the same
        # order as q/k/v (equivalent to tiling on the last axis and re-splitting).
        mask = tf.tile(mask_, multiples=[self.num_heads, 1, 1])

        # Honour the configured causality instead of the callee's default (True).
        att_output = scaled_dot_product_attention(q, k, v, mask, causality=self.causality)
        # Undo the head split: batch-stacked heads back onto the feature axis.
        mha_outputs = tf.concat(tf.split(att_output, self.num_heads, axis=0), axis=2)  # (N, seq_len, d_model)

        # Sub-layer 1: attention + residual + norm.
        dropout_1_out = self.dropout_1(mha_outputs)
        layer_norm_1_out = self.layer_norm_1(x + dropout_1_out)

        # Sub-layer 2: the feed-forward network consumes the output of sub-layer 1
        # (not the raw attention output), followed by residual + norm.
        ffn_out = self.ffn(layer_norm_1_out)
        dropout_2_out = self.dropout_2(ffn_out)
        trm_out = self.layer_norm_2(dropout_2_out + layer_norm_1_out)

        return trm_out

    def _process_multi_head(self, emb):
        """Split the last axis of ``emb`` into ``num_heads`` chunks and stack them along axis 0."""
        emb_split = tf.split(emb, self.num_heads, axis=2)
        emb = tf.concat(emb_split, axis=0)
        return emb


### Trm Layer

In [None]:
class Transformer(Layer):
    """Stack of ``EncoderLayer`` blocks on top of a learned position embedding.

    :param num_blocks: number of encoder blocks applied in sequence.
    :param num_heads: number of attention heads of every block.
    :param seq_len: ``input_dim`` of the position embedding table.
        NOTE(review): inputs longer than this would index past the table - confirm
        callers never exceed it.
    :param att_hidden: width of the query/key/value projections of every block.
    :param embedding_size: ``output_dim`` of the position embedding; it is added
        to the input, so it has to match the input's last dimension.
    :param ffn_dims: number of units of every block's feed-forward layer.
    :param dropout: dropout rate used on the input and inside every block.
    :param norm_training: ``trainable`` flag of the blocks' LayerNormalization layers.
    :param causality: passed to every ``EncoderLayer``.
    """

    def __init__(self, num_blocks=3, num_heads=4, seq_len=100, att_hidden=64, embedding_size=64, ffn_dims=64, dropout=0.05, norm_training=True,
                 causality=False,
                 **kwargs):
        # Initialise the Keras base class before assigning any sub-layer so the
        # attribute tracking happens on a fully constructed Layer.
        super().__init__(**kwargs)

        self.num_blocks = num_blocks
        self.num_heads = num_heads
        self.att_hidden = att_hidden
        self.embedding_size = embedding_size
        self.ffn_dims = ffn_dims
        self.seq_len = seq_len
        self.dropout = dropout
        self.norm_training = norm_training
        self.causality = causality

        # Learned position embedding, one row per position index.
        self.pos_encoding = Embedding(
            input_dim=self.seq_len,
            output_dim=self.embedding_size,
            name="position_embedding")

        # Dropout applied to the position-augmented input.
        self.dropout_layer = Dropout(self.dropout)

        # Encoder blocks, applied in list order.
        self.encoder_layer = [EncoderLayer(self.num_heads, self.att_hidden, self.embedding_size, self.ffn_dims,
                                           self.dropout, self.norm_training, self.causality)
                              for _ in range(self.num_blocks)]

    def call(self, inputs, *args, **kwargs):
        """Run the encoder stack.

        :param inputs: a pair ``(x_, mask_)``; ``x_`` is the 3d input sequence and
            ``mask_`` a mask that is multiplied into the output of every block.
        :return: the output of the last encoder block, multiplied by ``mask_``.
        """
        x_, mask_ = inputs

        # One position index per time step, embedded and broadcast over the batch.
        positions = tf.range(start=0, limit=tf.shape(x_)[1], delta=1)
        x = x_ + tf.expand_dims(self.pos_encoding(positions), 0)
        net = self.dropout_layer(x)

        for index, encoder in enumerate(self.encoder_layer):
            print('index_{}_encoder_{}_net_{}'.format(index, encoder, tf.shape(net)))
            net = encoder([net, mask_])
            # Zero the masked positions again after every block.
            net *= mask_
            # The placeholder was missing, so the tensor was silently dropped from the message.
            print('after mask net is {}'.format(net))

        return net


## 执行

In [None]:
# Build the encoder stack with the constructor's default hyper-parameters.
trm_layer = Transformer()
trm_layer

<__main__.Transformer at 0x7fb81e064290>

In [None]:
# Run the encoder stack on the concatenated embeddings and their padding mask.
output = trm_layer([x_, mask_])

index_0_encoder_<__main__.EncoderLayer object at 0x7fb81e064810>_net_[ 3  3 64]
after mask net is 
index_1_encoder_<__main__.EncoderLayer object at 0x7fb81e0062d0>_net_[ 3  3 64]
after mask net is 
index_2_encoder_<__main__.EncoderLayer object at 0x7fb81e001950>_net_[ 3  3 64]
after mask net is 


In [None]:
# Encoder output at the last time step of every example
# (the target embedding was concatenated after the history, so it sits last).
output[:,-1]

<tf.Tensor: shape=(3, 64), dtype=float32, numpy=
array([[ 2.9326947 ,  0.69362676,  0.9936812 , -0.8257617 ,  0.08538669,
         0.23545605,  1.29105   ,  0.11086974, -0.8754775 , -0.5112791 ,
        -1.2423049 , -0.45985162, -0.6020464 ,  1.4475819 ,  0.62697715,
        -0.16384442,  0.5519566 , -0.86469024,  0.8320564 , -0.8523632 ,
        -0.14820299,  1.12581   , -1.3312322 ,  2.5265052 ,  0.2369073 ,
         0.6300299 , -1.277069  , -0.6257964 , -1.0155196 , -0.29722834,
        -0.09147831,  0.13335192,  0.8710997 , -0.12049151,  0.5275751 ,
         0.10439041,  0.31844503, -1.0451658 , -0.7986394 , -0.50971556,
         2.2151523 , -0.6969824 ,  0.6148121 ,  0.6670119 , -1.1124527 ,
        -0.39789087,  1.4995996 ,  0.04672161,  0.6685069 , -1.529521  ,
        -0.1502356 , -0.1818445 , -1.1916966 , -0.05184604, -1.0963775 ,
         1.4292254 , -0.4543887 ,  0.7681211 , -0.87967885, -0.49879047,
        -1.9696634 ,  0.25545433, -1.6738725 ,  1.1033427 ],
       [ 1.988