<a href="https://colab.research.google.com/github/Muzhi1920/awesome-models/blob/main/06-%E5%BA%8F%E5%88%97%E6%8E%A8%E8%8D%90/02_SASRec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SASRec

应用self-attention做序列推荐。

1. 基于马尔可夫的方法，通过简单的假设，学习状态转移；**高维稀疏数据下表现较好，但复杂样本中捕捉信息能力较差**
2. 基于RNN的方法需要样本较多，稠密数据下效果较好，但效率较低。

序列的自注意力机制：每个位置的输出依赖于输入序列中与之相关的位置，自注意力会为这些更相关的输入分配更大的权重。


- 变长seq的Embedding表示：原始SASRec在序列左侧填充零向量；注意本notebook的示例数据是右侧填充（下文`seq_mask`中第3条样本为`[1, 0]`），因此取“最后一个位置”时需要按`sequence_length`索引；
- 位置编码：Transformer（Trm）自身不含位置信息，需要额外加入位置编码；位置信息的重要性因场景而异，需要实验验证；
- $Q_i K^T_j$的计算中，$i<j$时，存在序列穿越的现象（位置$i$看到了未来的位置$j$），**强**序列推荐需要禁止这种“穿越交互”；具体操作是使用下三角矩阵作为mask：右上角(前对后的交互)的logits置为极小值，使softmax后的权重近似为0，保留左下角（后对前的交互）。

1. 推荐场景（电商等）——>序列强相关——>（pos_emb + 禁止穿越）
2. 推荐场景（信息流等）——>序列弱相关——>（pos_emb/穿越，需尝试）——>去掉后退化为特征交叉，兴趣提取。

In [None]:
import tensorflow as tf
from sequence_feature_layer import SequenceFeatures
from tensorflow import feature_column as fc
from tensorflow.keras.layers import Layer, Dense, LayerNormalization, Dropout, Embedding, Conv1D

## 0.准备工作

### 0.1定义feature_column

In [None]:
# Hyper-parameters shared by every cell of this walkthrough.
num_heads = 4
batch_size = 3
seq_len = 2 + 1  # seq_nums_feed + target_feed
emb_dims = 64
out_dims = 64

# The per-head attention width equals emb_dims / num_heads, so only one of the
# two needs to be fixed. Deriving it (instead of hard-coding 16) keeps
# num_heads * att_hidden == emb_dims, which the residual connection after the
# multi-head attention relies on.
if emb_dims % num_heads != 0:
    raise ValueError(
        "emb_dims ({}) must be divisible by num_heads ({})".format(emb_dims, num_heads))
att_hidden = emb_dims // num_heads
# The other quantity that still has to be chosen is ffn_output_dims.

In [None]:
# Hashed categorical sequence columns for the behaviour sequence and the
# target item, each wrapped in its own embedding column of width emb_dims.
seq, target = (
    fc.sequence_categorical_column_with_hash_bucket(
        key, hash_bucket_size=10, dtype=tf.int64)
    for key in ('seq', 'target')
)
seq_col, target_col = (
    fc.embedding_column(column, dimension=emb_dims) for column in (seq, target)
)
columns = [seq_col, target_col]

# Toy batch of three samples: "seq" holds up to two item ids per sample (the
# third sample only has one), "target" holds exactly one id per sample.
features = {
    "seq": tf.sparse.SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0], [1, 1], [2, 0]],
        values=[1100, 1101, 1102, 1101, 1103],
        dense_shape=[3, 2]),
    "target": tf.sparse.SparseTensor(
        indices=[[0, 0], [1, 0], [2, 0]],
        values=[1102, 1103, 1100],
        dense_shape=[3, 1]),
}
# Display the densified sequence ids (missing entries become 0).
tf.sparse.to_dense(features['seq'])

<tf.Tensor: shape=(3, 2), dtype=int32, numpy=
array([[1100, 1101],
       [1102, 1101],
       [1103,    0]], dtype=int32)>

### 0.2 定义input_layer

In [None]:
# Turn the sparse features into dense embeddings plus per-sample lengths.
# NOTE(review): the '<column>_embedding' keys are presumably generated by the
# project-local SequenceFeatures layer - confirm against its implementation.
sequence_feature_layer = SequenceFeatures(columns, name='sequence_features_input_layer')
sequence_inputs, sequence_lengths = sequence_feature_layer(features)

target_input, target_length = (
    per_column['target_embedding'] for per_column in (sequence_inputs, sequence_lengths)
)
sequence_input, sequence_length = (
    per_column['seq_embedding'] for per_column in (sequence_inputs, sequence_lengths)
)
# Display the embedding shape together with the true sequence lengths.
tf.shape(sequence_input), sequence_length

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  2, 64], dtype=int32)>,
 <tf.Tensor: shape=(3,), dtype=int64, numpy=array([2, 2, 1])>)

## 1.拼接序列，与mask

In [None]:
# Encoder input: the behaviour-sequence embeddings, used as-is (the target
# embedding is not concatenated in this cell).
x_= sequence_input
# Display the shape; the captured output shows (batch, seq, emb) = (3, 2, 64).
tf.shape(x_)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  2, 64], dtype=int32)>

In [None]:
# Float padding mask with a trailing singleton axis: 1.0 for real items,
# 0.0 for padded slots.
# `maxlen` is pinned to the padded width of the embeddings; without it
# tf.sequence_mask sizes the mask to the longest length in the batch, which
# would silently mis-broadcast whenever every sample is shorter than the
# padded width. `dtype=tf.float32` replaces the tf.where(bool, 1.0, 0.0) trip.
seq_mask = tf.expand_dims(
    tf.sequence_mask(
        sequence_length,
        maxlen=tf.shape(sequence_input)[1],
        dtype=tf.float32),
    axis=-1)
mask_ = seq_mask
mask_

<tf.Tensor: shape=(3, 2, 1), dtype=float32, numpy=
array([[[1.],
        [1.]],

       [[1.],
        [1.]],

       [[1.],
        [0.]]], dtype=float32)>

## 2.序列位置编码

In [None]:
# Learned position embedding: one emb_dims-wide vector for each of the
# seq_len possible positions.
pos_encoding = Embedding(
    input_dim=seq_len, output_dim=emb_dims, name="position_embedding")
# Position ids 0 .. (current sequence width - 1).
positions = tf.range(tf.shape(x_)[1])
tf.shape(positions)

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([2], dtype=int32)>

In [None]:
# Add the position vectors to every sample; the new leading axis lets the
# (seq, emb) table broadcast over the batch dimension.
position_bias = pos_encoding(positions)[tf.newaxis, ...]
x = x_ + position_bias
tf.shape(x)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  2, 64], dtype=int32)>

### 2.1 校验位置编码

In [None]:
# Sanity check: re-adding the position-0 vector to the raw embeddings must
# reproduce the first time step of `x` element-wise.
tf.equal(x_[:, 0] + pos_encoding(0), x[:, 0])

<tf.Tensor: shape=(3, 64), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, 

### 2.2 dropout与mask

In [None]:
# Apply dropout in training mode to the position-aware embeddings, then zero
# out the padded time steps with the padding mask.
input_dropout = Dropout(rate=0.1)
xx = tf.multiply(input_dropout(x, training=True), mask_)
tf.shape(xx)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  2, 64], dtype=int32)>

## 3.multi_head_attention

### 3.1初始化Wq，Wk，Wv

In [None]:
# Projection matrices for queries, keys and values. They are plain tensors
# drawn uniformly from [0, 1) and are created in the order wq, wk, wv.
projection_shape = [emb_dims, num_heads * att_hidden]
uniform_kwargs = dict(minval=0, maxval=1, dtype=tf.float32, seed=7)
wq = tf.random.uniform(shape=projection_shape, **uniform_kwargs)
wk = tf.random.uniform(shape=projection_shape, **uniform_kwargs)
wv = tf.random.uniform(shape=projection_shape, **uniform_kwargs)

In [None]:
# Project the encoder input into query / key / value space.
# The input is `xx` (dropout applied, padded steps zeroed), not the raw `x`:
# otherwise the result of the "dropout and mask" step is never consumed and
# padded positions leak their position embedding into the attention.
q_ = tf.matmul(xx, wq)
k_ = tf.matmul(xx, wk)
v_ = tf.matmul(xx, wv)
tf.shape(q_)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  2, 64], dtype=int32)>

### 3.2对Q，K，V按照multi_head切分

In [None]:
def process_multi_head(emb, num_heads):
    """Fold attention heads into the batch axis.

    The last axis of ``emb`` is cut into ``num_heads`` equal slices and the
    slices are stacked along axis 0, so a (N, seq, H * d) tensor becomes
    (H * N, seq, d). The shape of one slice and of the stacked result are
    printed for inspection.

    :param emb: A 3d tensor whose last axis is divisible by ``num_heads``
    :param num_heads: Number of heads to split into
    :return: The head-stacked tensor
    """
    head_slices = tf.split(emb, num_or_size_splits=num_heads, axis=2)
    stacked = tf.concat(head_slices, axis=0)
    print(head_slices[0].shape, stacked.shape)
    return stacked
# Split the projections into heads stacked on the batch axis.
q = process_multi_head(q_, num_heads)
k = process_multi_head(k_, num_heads)
v = process_multi_head(v_, num_heads)
# Replicate the padding mask once per head and stack it the same way.
# The tiled tensor is passed straight through instead of being written back
# to `mask_`: overwriting `mask_` made this cell non-idempotent (every re-run
# tiled it again) and broke earlier cells that multiply by the original mask.
mask = process_multi_head(tf.tile(mask_, multiples=[1, 1, num_heads]), num_heads)
tf.shape(mask)

(3, 2, 16) (12, 2, 16)
(3, 2, 16) (12, 2, 16)
(3, 2, 16) (12, 2, 16)
(3, 2, 1) (12, 2, 1)


<tf.Tensor: shape=(3,), dtype=int32, numpy=array([12,  2,  1], dtype=int32)>

### 3.3scaled_dot_product_attention计算

In [None]:
def scaled_dot_product_attention(q, k, v, mask, causality=True):
    """
    Scaled dot-product attention with padding and (optional) causal masking.

    Intermediate tensors are printed so the masking steps can be inspected.

    :param q: A 3d tensor with shape of (None, seq_len, depth), depth = d_model // num_heads
    :param k: A 3d tensor with shape of (None, seq_len, depth)
    :param v: A 3d tensor with shape of (None, seq_len, depth)
    :param mask: A tensor broadcastable to (None, seq_len, seq_len); wherever it
        equals 0 the attention logit is replaced by a very large negative value
    :param causality: Boolean. If True, logits above the main diagonal are also
        replaced by the large negative value, default True
    :return: A 3d tensor with shape of (None, seq_len, depth)
    """
    neg_inf = -2 ** 32 + 1  # large negative stand-in for -inf
    depth = tf.cast(k.shape[-1], dtype=tf.float32)
    scaled_att_logits = tf.matmul(q, k, transpose_b=True) / tf.sqrt(depth)  # (None, seq_len, seq_len)

    # Padding mask. When every logit of a row is replaced, the softmax below
    # turns that row into a uniform distribution.
    outputs = tf.where(
        tf.equal(mask, 0),
        tf.ones_like(scaled_att_logits) * neg_inf,
        scaled_att_logits)  # (None, seq_len, seq_len)

    if causality:
        # Keep the lower triangle (diagonal included) and mask the rest.
        diag_vals = tf.ones_like(outputs)  # (None, seq_len, seq_len)
        print('diag_vals is {}'.format(diag_vals))
        masks = tf.linalg.band_part(diag_vals, -1, 0)  # (None, seq_len, seq_len)
        print('masks is {}'.format(masks))
        paddings = tf.ones_like(masks) * neg_inf
        print('paddings is {}'.format(paddings))
        outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (None, seq_len, seq_len)
        print('outputs is {}'.format(outputs))

    # Softmax over the last axis, then weight the values.
    outputs = tf.nn.softmax(logits=outputs)  # (None, seq_len, seq_len)
    print('outputs is {}'.format(outputs))
    outputs = tf.matmul(outputs, v)  # (None, seq_len, depth)
    print('outputs is {}'.format(outputs))

    return outputs

In [None]:
# Run causal attention over the head-stacked tensors (causality spelled out
# explicitly; True is also the function's default).
outputs = scaled_dot_product_attention(q, k, v, mask, causality=True)
tf.shape(outputs)

diag_vals is [[[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]

 [[1. 1.]
  [1. 1.]]]
masks is [[[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 1.]]]
paddings is [[[-4.2949673e+09 -4.2949673e+09]
  [-4.2949673e+09 -4.2949673e+09]]

 [[-4.2949673e+09 -4.2949673e+09]
  [-4.2949673e+09 -4.2949673e+09]]

 [[-4.2949673e+09 -4.2949673e+09]
  [-4.2949673e+09 -4.2949673e+09]]

 [[-4.2949673e+09 -4.2949673e+09]
  [-4.2949673e+09 -4.2949673e+09]]

 [[-4.2949673e+09 -4.2949673e+09]
  [-4.2949673e+09 -4.2949673e+09]]

 [[-4.2949673e+09 -4.2949673e+09]
  [-4.2949673e+09 -4.2949673e+09]]

 [[-4.2949673e+09 

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([12,  2, 16], dtype=int32)>

### 3.4恢复原始序列emb

In [None]:
# Undo the head stacking: cut the batch axis back into num_heads groups and
# place the groups side by side on the feature axis.
per_head_outputs = tf.split(outputs, num_or_size_splits=num_heads, axis=0)
mha_outputs = tf.concat(per_head_outputs, axis=2)  # (N, seq_len, d_model)
tf.shape(mha_outputs)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  2, 64], dtype=int32)>

## 4.Dropout与LayerNorm

In [None]:
# Layers of one encoder block, created in the same order as they are applied.
dropout_1 = Dropout(rate=0.01)
dropout_2 = Dropout(rate=0.02)
layer_norm_1 = LayerNormalization(epsilon=1e-6, trainable=True)
layer_norm_2 = LayerNormalization(epsilon=1e-6, trainable=True)
# FFN: a single ReLU dense layer that keeps the width at emb_dims so its
# output can be added back to its input.
ffn = Dense(
    units=emb_dims,
    activation='relu',
    use_bias=True,
    kernel_initializer=tf.keras.initializers.VarianceScaling(distribution='uniform'),
)


In [None]:
# Attention sub-layer: dropout -> residual add -> layer norm.
# The residual branch uses `xx` (the dropout-and-masked encoder input) rather
# than the raw `x`, so padded time steps do not re-enter through the skip
# connection.
# NOTE(review): the Dropout layers are called without `training=True`, so they
# are presumably inactive in this eager walkthrough - confirm if that is intended.
dropout_1_out = dropout_1(mha_outputs)
layer_norm_1_out = layer_norm_1(xx + dropout_1_out)

# Feed-forward sub-layer: dense -> dropout -> residual add -> layer norm.
ffn_out = ffn(layer_norm_1_out)

# (the misspelled name `drouput_2_out` is kept in case later cells refer to it)
drouput_2_out = dropout_2(ffn_out)
trm_out = layer_norm_2(drouput_2_out + layer_norm_1_out)
tf.shape(trm_out)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 3,  2, 64], dtype=int32)>

## 5.取待预测向量

原生SASRec用于召回，这里有改动：后面操作很自由，可以变换为类似bst，用于排序；也可以用于原生SASRec实现召回.

In [None]:
# Take the encoder output at the last *valid* position of each sample.
# The toy batch is right-padded (the third sample has length 1), so a plain
# `[:, -1]` would return the padding slot for short sequences. Index with
# sequence_length - 1 instead, clamped at 0 for empty sequences.
last_valid_index = tf.maximum(sequence_length - 1, 0)
tf.gather(trm_out, last_valid_index, axis=1, batch_dims=1)

<tf.Tensor: shape=(3, 64), dtype=float32, numpy=
array([[-1.84379369e-01, -9.45800185e-01, -2.22851932e-02,
        -8.38192582e-01,  9.56860602e-01,  3.14615428e-01,
         7.33368933e-01, -4.57641661e-01, -2.87721753e-02,
        -1.66162729e+00, -3.33662450e-01,  6.51713789e-01,
        -2.27731556e-01,  1.04329467e+00,  6.65294230e-01,
         5.56918323e-01, -7.19381154e-01, -9.85158741e-01,
        -3.48450541e-01,  1.16099119e-02,  8.54185045e-01,
         1.98537827e+00,  5.62301576e-01,  1.17580986e+00,
        -8.53503704e-01, -5.00624835e-01, -5.60072839e-01,
         9.97676253e-02, -9.24546361e-01,  7.36404955e-01,
        -5.75597525e-01, -1.00922728e+00, -2.32741570e+00,
        -7.83360243e-01,  1.39773190e-01, -1.43922091e-01,
         2.26295710e+00,  3.37453485e-02,  2.00631928e+00,
         1.76497293e+00, -1.69512177e+00, -1.60295320e+00,
        -4.00020689e-01, -1.33954775e+00,  8.61033797e-03,
         3.13110828e-01,  4.96499836e-01, -2.39545435e-01,
       