基于Transformer的multi_head_attention结构得到待预测id的序列embedding

In [None]:
import tensorflow as tf
from sequence_feature_layer import SequenceFeatures
from tensorflow import feature_column as fc
from tensorflow.keras.layers import Layer, Dense, LayerNormalization, Dropout, Embedding, Conv1D

## 0.准备工作

### 0.1定义feature_column

In [1]:
# Hyper-parameters for the toy Transformer block built in this notebook.
num_heads = 4
batch_size = 3
seq_len = 2 + 1  # sequence ids fed in (2) + the target id (1)
emb_dims = 64
# Per-head attention width. The attention output is num_heads * att_hidden wide
# and is later added back onto the emb_dims-wide input (residual connection),
# so the two must agree: derive att_hidden instead of hard-coding it.
if emb_dims % num_heads != 0:
    raise ValueError("emb_dims must be divisible by num_heads")
att_hidden = emb_dims // num_heads
out_dims = 64
# The remaining size that has to be chosen separately is the FFN output width
# (ffn_output_dims).

In [None]:
# Two hashed sequence-id features: 'seq' carries the input id sequence and
# 'target' the id to be predicted. Each gets its own emb_dims-wide embedding.
seq = fc.sequence_categorical_column_with_hash_bucket(
    'seq', hash_bucket_size=10, dtype=tf.int64)
target = fc.sequence_categorical_column_with_hash_bucket(
    'target', hash_bucket_size=10, dtype=tf.int64)
seq_col = fc.embedding_column(seq, dimension=emb_dims)
target_col = fc.embedding_column(target, dimension=emb_dims)
columns = [seq_col, target_col]

# Toy batch of three samples. The third sample has a single 'seq' id, so its
# second slot is empty in the sparse tensor (shown as 0 once densified).
seq_ids = tf.sparse.SparseTensor(
    indices=[[0, 0], [0, 1], [1, 0], [1, 1], [2, 0]],
    values=[1100, 1101, 1102, 1101, 1103],
    dense_shape=[3, 2])
target_ids = tf.sparse.SparseTensor(
    indices=[[0, 0], [1, 0], [2, 0]],
    values=[1102, 1103, 1100],
    dense_shape=[3, 1])
features = {"seq": seq_ids, "target": target_ids}
tf.sparse.to_dense(features['seq'])

<tf.Tensor: shape=(3, 2), dtype=int32, numpy=
array([[1100, 1101],
       [1102, 1101],
       [1103,    0]], dtype=int32)>

### 0.2 定义input_layer

In [None]:
# Run both sparse features through the project's SequenceFeatures layer. It
# returns two dicts (embeddings and lengths); the entries read here are keyed
# '<feature>_embedding'.
sequence_feature_layer = SequenceFeatures(columns, name='sequence_features_input_layer')
sequence_inputs, sequence_lengths = sequence_feature_layer(features)
target_input, target_length = (
    sequence_inputs['target_embedding'], sequence_lengths['target_embedding'])
sequence_input, sequence_length = (
    sequence_inputs['seq_embedding'], sequence_lengths['seq_embedding'])
tf.shape(sequence_input), sequence_length

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  2, 64], dtype=int32)>,
 <tf.Tensor: shape=(3,), dtype=int64, numpy=array([2, 2, 1])>)

## 1.拼接序列，并构造mask

In [None]:
# Append the target step after the sequence steps along axis 1, and add up the
# per-sample lengths of both parts.
x_ = tf.concat(values=[sequence_input, target_input], axis=1)
x_length = tf.add(sequence_length, target_length)
x_, x_length

(<tf.Tensor: shape=(3, 3, 64), dtype=float32, numpy=
 array([[[-0.13805474, -0.12819062, -0.08316334,  0.15519094,
           0.08654672, -0.0417437 , -0.12233107,  0.06252771,
          -0.05552072, -0.12069733, -0.11733798,  0.19201703,
          -0.0947831 ,  0.09968773,  0.12797266,  0.14191905,
          -0.02172974, -0.12107637, -0.23050526, -0.21904746,
           0.02279476,  0.02023425,  0.09253869,  0.11605652,
           0.01733863,  0.10320976, -0.07052449,  0.09014963,
          -0.15206148,  0.05858514,  0.22653651, -0.02475267,
          -0.05450587,  0.03590911,  0.02578084, -0.06262375,
          -0.08051723,  0.16072297, -0.01146526, -0.14211865,
           0.10708775,  0.03645298, -0.06645644, -0.08966307,
           0.08810062,  0.23838545,  0.13731799,  0.08355393,
           0.03483437, -0.09295635, -0.04672443, -0.16190618,
          -0.140681  , -0.07738471,  0.03678761,  0.05667284,
          -0.18610348, -0.02829157, -0.06727324, -0.02758603,
          -0.0044

In [None]:
# Per-step validity mask (1.0 = real id, 0.0 = padding) with a trailing axis of
# size 1. `maxlen` is taken from the embedding tensors so the mask always has
# the same number of time steps as the embeddings it is concatenated alongside;
# without it tf.sequence_mask sizes the mask to the longest length in the batch,
# which is too short when every row is padded.
seq_mask = tf.expand_dims(
    tf.sequence_mask(sequence_length,
                     maxlen=tf.shape(sequence_input)[1],
                     dtype=tf.float32),
    axis=-1)
target_mask = tf.expand_dims(
    tf.sequence_mask(target_length,
                     maxlen=tf.shape(target_input)[1],
                     dtype=tf.float32),
    axis=-1)
# Same layout as x_: sequence steps first, target step last.
mask_ = tf.concat([seq_mask, target_mask], axis=1)
mask_

<tf.Tensor: shape=(3, 3, 1), dtype=float32, numpy=
array([[[1.],
        [1.],
        [1.]],

       [[1.],
        [1.],
        [1.]],

       [[1.],
        [0.],
        [1.]]], dtype=float32)>

## 2.序列位置编码

In [None]:
# Position table implemented as a Keras Embedding: one emb_dims-wide vector for
# each of the seq_len slots.
pos_encoding = tf.keras.layers.Embedding(
    input_dim=seq_len, output_dim=emb_dims, name="position_embedding")
# Position indices 0 .. (number of time steps in x_) - 1.
positions = tf.range(tf.shape(x_)[1])
positions, pos_encoding(0), tf.expand_dims(pos_encoding(positions), 0)

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([0, 1, 2], dtype=int32)>,
 <tf.Tensor: shape=(64,), dtype=float32, numpy=
 array([-0.0239528 ,  0.01356203,  0.02028309,  0.0455563 ,  0.01913543,
         0.01106744,  0.01685603, -0.04790261, -0.0008321 ,  0.03737653,
        -0.01793382,  0.01415796, -0.02430677,  0.0235954 ,  0.02223745,
        -0.00408934, -0.04091454,  0.00440548, -0.01014294, -0.03660297,
         0.02920336, -0.02311252, -0.01433514, -0.00996177, -0.0245832 ,
         0.04747171,  0.02383449,  0.03898003, -0.03891411,  0.04956288,
         0.02569861,  0.00105651,  0.02441013, -0.0307085 , -0.01080358,
        -0.00674318,  0.02624693, -0.01815044, -0.04034762, -0.03648641,
         0.02181803,  0.03624168,  0.04769038,  0.01515422,  0.03644714,
        -0.01503394, -0.01342198,  0.02367103, -0.04513092,  0.02090872,
        -0.03307921,  0.02280761, -0.00067054, -0.00466581, -0.01769658,
        -0.0184207 , -0.00984343,  0.01305796,  0.03739792,  0.01999433,


In [None]:
# Give the position vectors a leading axis of size 1 so they broadcast over the
# batch, then add them to the id embeddings.
x = x_ + pos_encoding(positions)[tf.newaxis, ...]
x

<tf.Tensor: shape=(3, 3, 64), dtype=float32, numpy=
array([[[-0.16200754, -0.11462858, -0.06288025,  0.20074724,
          0.10568215, -0.03067626, -0.10547504,  0.0146251 ,
         -0.05635282, -0.0833208 , -0.1352718 ,  0.206175  ,
         -0.11908988,  0.12328313,  0.15021011,  0.1378297 ,
         -0.06264427, -0.11667089, -0.24064821, -0.25565043,
          0.05199812, -0.00287828,  0.07820354,  0.10609475,
         -0.00724457,  0.15068147, -0.04669   ,  0.12912966,
         -0.19097558,  0.10814802,  0.2522351 , -0.02369616,
         -0.03009574,  0.00520061,  0.01497726, -0.06936693,
         -0.0542703 ,  0.14257254, -0.05181288, -0.17860505,
          0.12890579,  0.07269465, -0.01876606, -0.07450886,
          0.12454776,  0.22335151,  0.12389601,  0.10722497,
         -0.01029655, -0.07204764, -0.07980363, -0.13909857,
         -0.14135154, -0.08205052,  0.01909103,  0.03825215,
         -0.1959469 , -0.01523361, -0.02987531, -0.0075917 ,
         -0.04457304,  0.09702648

### 2.1 校验位置编码

In [None]:
# Sanity check for time step 0: adding the position-0 vector by hand must
# reproduce the corresponding slice of x.
first_step_expected = x_[:, 0] + pos_encoding(0)
first_step_expected == x[:, 0]

<tf.Tensor: shape=(3, 64), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, 

## 3.multi_head_attention

### 3.1初始化Wq，Wk，Wv

In [None]:
# Query / key / value projection matrices, each drawn from U[0, 1) with shape
# (emb_dims, num_heads * att_hidden). Drawn in the order wq, wk, wv.
# NOTE(review): all three draws pass the same op-level seed (7); whether they
# come out different depends on TF's seeding rules for repeated calls - confirm.
proj_shape = [emb_dims, num_heads * att_hidden]
wq, wk, wv = (
    tf.random.uniform(shape=proj_shape, minval=0, maxval=1, dtype=tf.float32, seed=7)
    for _ in range(3)
)

In [None]:
# Project the position-encoded input with each weight matrix.
q_, k_, v_ = (tf.matmul(x, weight) for weight in (wq, wk, wv))
tf.shape(q_)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  3, 64], dtype=int32)>

### 3.2对Q，K，V按照multi_head切分

In [None]:
def process_multi_head(emb, num_heads):
    """Fold the heads of a 3-d tensor into its leading axis.

    The last axis of ``emb`` is cut into ``num_heads`` equal slices, which are
    then stacked one after another along axis 0.

    :param emb: 3-d tensor whose last axis is divisible by ``num_heads``.
    :param num_heads: number of slices to cut the last axis into.
    :return: tensor with axis 0 multiplied by ``num_heads`` and axis 2 divided by it.
    """
    head_slices = tf.split(emb, num_heads, axis=2)
    stacked = tf.concat(head_slices, axis=0)
    # Debug trace: shape of one slice, then shape of the stacked result.
    print(head_slices[0].shape, stacked.shape)
    return stacked
# Fold the head axis into the batch axis for Q, K and V.
q = process_multi_head(q_, num_heads)
k = process_multi_head(k_, num_heads)
v = process_multi_head(v_, num_heads)
# Repeat the mask once per head and fold it the same way. The tiled copy gets
# its own name so `mask_` keeps its original shape: overwriting it would make
# this cell fail (or silently change shape) when it is run a second time.
mask_tiled = tf.tile(mask_, multiples=[1, 1, num_heads])
mask = process_multi_head(mask_tiled, num_heads)
tf.shape(mask)

(3, 3, 16) (12, 3, 16)
(3, 3, 16) (12, 3, 16)
(3, 3, 16) (12, 3, 16)
(3, 3, 1) (12, 3, 1)


<tf.Tensor: shape=(3,), dtype=int32, numpy=array([12,  3,  1], dtype=int32)>

### 3.3scaled_dot_product_attention计算

In [None]:
def scaled_dot_product_attention(q, k, v, mask, causality=True):
    """
    Scaled dot-product attention: softmax(q @ k^T / sqrt(depth)) @ v.

    :param q: A 3d tensor with shape of (None, seq_len, depth), depth = d_model // num_heads
    :param k: A 3d tensor with shape of (None, seq_len, depth)
    :param v: A 3d tensor with shape of (None, seq_len, depth)
    :param mask: Validity mask, non-zero for real positions and 0 for padding.
        A per-position mask of shape (None, seq_len, 1) is treated as a key
        mask (padded positions cannot be attended to). Masks already shaped
        (None, 1, seq_len) or (None, seq_len, seq_len) are applied to the
        logits as given.
    :param causality: Boolean. If True, position i only attends to positions
        j <= i. Default True.
    :return: A 3d tensor with shape of (None, seq_len, depth)
    """
    # Large negative logit: masked entries get ~0 weight after softmax. If every
    # entry of a row is masked, the row falls back to uniform weights.
    masked_logit = -2 ** 32 + 1

    mat_qk = tf.matmul(q, k, transpose_b=True)  # (None, seq_len, seq_len)
    # Read depth from the dynamic shape so an unknown static dim does not break the cast.
    dk = tf.cast(tf.shape(k)[-1], dtype=tf.float32)
    scaled_att_logits = mat_qk / tf.sqrt(dk)

    # Logits are indexed [batch, query, key]. A (None, seq_len, 1) mask would
    # broadcast over the key axis and blank out whole query rows while padded
    # keys still received attention; move it onto the key axis instead.
    mask = tf.convert_to_tensor(mask)
    if mask.shape.rank == 3 and mask.shape[-1] == 1 and mask.shape[-2] == k.shape[-2]:
        mask = tf.transpose(mask, perm=[0, 2, 1])  # (None, 1, seq_len)

    paddings = tf.ones_like(scaled_att_logits) * masked_logit
    outputs = tf.where(tf.equal(mask, 0), paddings, scaled_att_logits)  # (None, seq_len, seq_len)

    # Causality: keep only the lower triangle (key index <= query index).
    if causality:
        lower_tri = tf.linalg.band_part(tf.ones_like(outputs), -1, 0)  # (None, seq_len, seq_len)
        outputs = tf.where(tf.equal(lower_tri, 0), paddings, outputs)

    # Normalise over the key axis, then mix the values.
    outputs = tf.nn.softmax(logits=outputs, axis=-1)  # (None, seq_len, seq_len)
    outputs = tf.matmul(outputs, v)  # (None, seq_len, depth)

    return outputs

In [None]:
# Run attention over the head-folded tensors with the causal mask switched off.
outputs = scaled_dot_product_attention(q, k, v, mask, causality=False)
tf.shape(outputs)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([12,  3, 16], dtype=int32)>

### 3.4恢复原始序列emb

In [None]:
# Undo process_multi_head: cut axis 0 back into num_heads chunks and lay them
# side by side along the last axis.
head_outputs = tf.split(outputs, num_heads, axis=0)
mha_outputs = tf.concat(head_outputs, axis=2)  # (N, seq_len, d_model)
tf.shape(mha_outputs)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  3, 64], dtype=int32)>

## 4.Dropout与LayerNorm

In [None]:
# One dropout + layer-norm pair per sub-layer (attention, feed-forward).
dropout_1, dropout_2 = Dropout(0.01), Dropout(0.02)
layer_norm_1 = LayerNormalization(epsilon=1e-6, trainable=True)
layer_norm_2 = LayerNormalization(epsilon=1e-6, trainable=True)
# Feed-forward network: a single ReLU dense layer that keeps the width at
# emb_dims, so its output can be added to its input.
ffn = Dense(
    units=emb_dims,
    activation='relu',
    use_bias=True,
    kernel_initializer=tf.keras.initializers.VarianceScaling(distribution='uniform'),
)


In [None]:
# Sub-layer 1: dropout on the attention output, residual add with the
# position-encoded input, then layer norm.
# NOTE(review): the Dropout layers are called without a `training` argument;
# confirm whether dropout is meant to be active here.
dropout_1_out = dropout_1(mha_outputs)
attention_residual = tf.add(x, dropout_1_out)
layer_norm_1_out = layer_norm_1(attention_residual)

# Sub-layer 2: feed-forward, dropout, residual add with the sub-layer 1 output,
# then layer norm.
ffn_out = ffn(layer_norm_1_out)
drouput_2_out = dropout_2(ffn_out)
ffn_residual = tf.add(drouput_2_out, layer_norm_1_out)
trm_out = layer_norm_2(ffn_residual)
tf.shape(trm_out)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  3, 64], dtype=int32)>

## 5.取待预测向量

In [None]:
# The target step was concatenated last along the time axis, so the final time
# step of the block output is the vector for the id to be predicted.
trm_out[:, -1, :]

<tf.Tensor: shape=(3, 64), dtype=float32, numpy=
array([[-0.96063447,  0.53353155, -0.10756716, -0.62171066,  0.3796619 ,
        -0.18158105, -0.21348801, -1.1662719 ,  0.03126302, -0.3147969 ,
         1.7155861 ,  0.34300265,  0.35236105,  0.68642306,  0.73129237,
         0.8369678 ,  1.7741693 , -1.0348194 ,  0.7429557 , -0.79573417,
        -0.54471695,  1.80662   ,  0.5339227 , -0.20000356,  0.13934907,
        -1.6805193 , -0.28901866, -0.91347325, -0.771244  ,  2.3199646 ,
        -0.23767787,  0.44956222,  0.34320107, -0.76763415, -0.2788762 ,
        -0.42716184,  1.0298506 , -1.7698371 , -0.31335628, -0.6394391 ,
        -0.21522582, -0.6182866 ,  0.22501227,  0.72568035, -2.0019896 ,
         0.903931  ,  0.25917563,  0.8449507 ,  0.07633522,  1.992932  ,
         1.1084348 ,  0.41804144, -2.3003209 , -0.08705986, -0.59299177,
         0.23203978, -1.39345   ,  0.94254184,  1.0015532 ,  1.901718  ,
        -1.2914509 , -0.4555504 , -1.9442124 , -0.25193006],
       [-1.526