# Attention layer testing

In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Attention
from tensorflow import keras

In [None]:
# Tokenizer: counts the words in the given texts and builds a vocabulary (word -> integer index), which is then used to turn text into integer sequences. The num_words constructor argument sets the maximum vocabulary size.
# pad_sequences: pads sequences so that they all have the same length.

## 測試資料

In [15]:
# Five single-word sample documents used throughout the notebook.
docs = [
    'king',
    'queen',
    'table',
    'ball',
    'chair',
]

## 分詞，轉成數字

In [18]:
# Build the vocabulary. fit_on_texts expects a list in which every element
# is one document or sentence.
t = Tokenizer()
t.fit_on_texts(docs)

# For the sample docs, t.word_index is
# {'king': 1, 'queen': 2, 'table': 3, 'ball': 4, 'chair': 5}.
# Word indices start at 1, so the extra 1 leaves room for index 0.
vocab_size = 1 + len(t.word_index)

# Replace every word by its integer index.
encoded_docs = t.texts_to_sequences(docs)
print('encoded_docs:', encoded_docs, sep='\n')

encoded_docs:
[[1], [2], [3], [4], [5]]


## 補零

In [19]:
# Bring every sequence to max_length entries. padding='post' appends the
# zeros after the sequence; 'pre' would put them in front of it.
max_length = 1
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=max_length)
print('padded_docs:', padded_docs, sep='\n')

padded_docs:
[[1]
 [2]
 [3]
 [4]
 [5]]


## 讀取 GloVe 檔案

In [20]:
# Load the whole GloVe embedding into memory as {word: float32 vector}.
# Every line is parsed as "<word> <v1> <v2> ..." separated by whitespace.
embeddings_index = {}
# The with-block closes the file even if parsing one of the lines raises.
with open('./glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


## 轉為 GloVe 向量

In [8]:
# One row per vocabulary index. Rows of words that are missing from
# embeddings_index stay all-zero, and so does the unused row 0.
# The 100 columns presumably match the glove.6B.100d vectors - verify if the
# GloVe file is swapped for another dimension.
embedding_matrix = np.zeros((vocab_size, 100))
for token, row in t.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

## 建立模型

In [9]:
# Two integer-token inputs of arbitrary sequence length: one feeds the
# queries and the other the values of the attention layer.
query_input = keras.Input(shape=(None,), dtype='int32')
value_input = keras.Input(shape=(None,), dtype='int32')

# Shared embedding lookup, initialised with the GloVe-based embedding_matrix.
# `input_length` is deliberately not passed: it contradicted the
# variable-length inputs declared above and is a deprecated Embedding argument.
token_embedding = Embedding(vocab_size, 100, weights=[embedding_matrix])
# Query embeddings of shape [batch_size, Tq, dimension].
query_embeddings = token_embedding(query_input)
# Value embeddings of shape [batch_size, Tv, dimension].
value_embeddings = token_embedding(value_input)

# Query-value attention of shape [batch_size, Tq, dimension].
query_value_attention_seq = Attention()([query_embeddings, value_embeddings])

model = keras.Model(inputs=[query_input, value_input], outputs=query_value_attention_seq)
# The cells visible in this notebook only run a forward pass; compile() is
# kept unchanged so the model stays ready for fit() if training is added.
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 100)            600       ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 attention (Attention)       (None, None, 100)            0         ['embedding[0][0]',       

## 模型預測

In [10]:
# Forward pass that uses the same padded batch as both query and value.
# NOTE(review): the docs are padded to max_length = 1, so every query attends
# over a single value; presumably the output then equals the plain word
# embedding - confirm before drawing conclusions about the attention layer.
attention_inputs = [padded_docs, padded_docs]
result = model(attention_inputs)
result

<tf.Tensor: shape=(5, 1, 100), dtype=float32, numpy=
array([[[-0.32307  , -0.87616  ,  0.21977  ,  0.25268  ,  0.22976  ,
          0.7388   , -0.37954  , -0.35307  , -0.84369  , -1.1113   ,
         -0.30266  ,  0.33178  , -0.25113  ,  0.30448  , -0.077491 ,
         -0.89815  ,  0.092496 , -1.1407   , -0.58324  ,  0.66869  ,
         -0.23122  , -0.95855  ,  0.28262  , -0.078848 ,  0.75315  ,
          0.26584  ,  0.3422   , -0.33949  ,  0.95608  ,  0.065641 ,
          0.45747  ,  0.39835  ,  0.57965  ,  0.39267  , -0.21851  ,
          0.58795  , -0.55999  ,  0.63368  , -0.043983 , -0.68731  ,
         -0.37841  ,  0.38026  ,  0.61641  , -0.88269  , -0.12346  ,
         -0.37928  , -0.38318  ,  0.23868  ,  0.6685   , -0.43321  ,
         -0.11065  ,  0.081723 ,  1.1569   ,  0.78958  , -0.21223  ,
         -2.3211   , -0.67806  ,  0.44561  ,  0.65707  ,  0.1045   ,
          0.46217  ,  0.19912  ,  0.25802  ,  0.057194 ,  0.53443  ,
         -0.43133  , -0.34311  ,  0.59789  , -0.58

## 結果轉換為NumPy array

In [11]:
# Flatten the model output to one row per document: (len(docs), features).
# np.asarray accepts both the tensor returned by the model and an array that
# was already converted, so re-running this cell no longer fails (calling
# .numpy() on the reassigned `result` would raise AttributeError).
result = np.asarray(result).reshape(len(docs), -1)

## 相似度計算

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of 'king' (first row) against
# ['queen', 'table', 'ball', 'chair'] (all remaining rows).
king_vector = result[:1]
other_vectors = result[1:]
cosine_similarity(king_vector, other_vectors)

array([[0.750769  , 0.29662594, 0.28252047, 0.3039691 ]], dtype=float32)

In [13]:
# Cosine similarity of 'chair' (last row) against
# ['king', 'queen', 'table', 'ball'] (all preceding rows).
cosine_similarity(result[-1:], result[:-1])

array([[0.3039691 , 0.3016673 , 0.49246815, 0.35501537]], dtype=float32)