In [1]:
import numpy as np

In [5]:
# Normalizes raw attention scores so that every row sums to 1, which is the
# standard form of attention weights.
def softmax(x):
    """Numerically stable softmax along the last axis.

    Args:
        x: Array-like of scores. Each slice along the last axis is
            normalized independently of the others.

    Returns:
        np.ndarray with the same shape as ``x``; entries are non-negative and
        sum to 1 along the last axis.
    """
    x = np.asarray(x)
    # Shift by the maximum of each row, not the global maximum. With a global
    # shift, a row whose values are all far below the largest entry of the
    # whole array underflows to exp(...) == 0 everywhere and the division
    # below becomes 0 / 0 (NaN).
    shifted = x - np.max(x, axis=-1, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=-1, keepdims=True)

# Computes scaled dot-product attention.
def attention(query, key, value):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        query: np.ndarray of query vectors; used in this notebook as a 2-D
            array with one row per query.
        key: np.ndarray of key vectors; its last dimension is d_k and must
            match the last dimension of ``query``.
        value: np.ndarray with one row per key.

    Returns:
        Tuple ``(output, weights)`` where ``weights`` is the row-wise softmax
        of the scaled scores and ``output`` is ``weights`` applied to
        ``value``.
    """
    # Build the scaled scores out of place. An in-place ``scores /= scale``
    # cannot store the float quotient back into an integer array, so it
    # raises when query and key are integer-typed.
    scale = np.sqrt(key.shape[-1])
    scores = np.dot(query, key.T) / scale

    weights = softmax(scores)

    output = np.dot(weights, value)
    return output, weights

# embedding_dim is adjustable.
# Embeds the sentence, applies self-attention to it, and prints the result.
def sentence_to_attention(sentence, embedding_dim=2):
    """Print self-attention output and weights for a whitespace-split sentence.

    The sentence is split on whitespace only, so text written without spaces
    is treated as a single token. The embeddings are random numbers drawn
    after seeding NumPy's global generator with 42, one row per token
    position; they do not depend on the token text.

    Args:
        sentence: String to tokenize with ``str.split()``.
        embedding_dim: Number of columns of the random embedding matrix.

    Returns:
        None. The attention output and weights are printed.

    Raises:
        ValueError: If ``sentence`` contains no tokens.
    """
    words = sentence.split()
    if not words:
        # Without this guard an empty sentence fails later inside NumPy with
        # an unhelpful zero-size reduction error (also a ValueError).
        raise ValueError("sentence must contain at least one whitespace-separated token")

    np.random.seed(42)
    embeddings = np.random.rand(len(words), embedding_dim)

    # Self-attention: the same matrix serves as query, key and value.
    output, weights = attention(embeddings, embeddings, embeddings)

    print("Attention Output:\n", output)
    print("Attention Weights:\n", weights)

# Usage demo: one English sentence, then the same Chinese sentence written
# without and with spaces. sentence_to_attention splits on whitespace only,
# so the unspaced variant is processed as a single token.
sentence1 = "Today is July 19th."
sentence2 = "今 天 是 7 月 19 日。"

for demo_index, demo_sentence in enumerate((sentence1, "今天是7月19日。", sentence2)):
    if demo_index > 0:
        # Divider between consecutive demos.
        print("\n")
        print("----------分界线----------")
        print("\n")
    sentence_to_attention(demo_sentence)

Attention Output:
 [[0.34849442 0.70189894]
 [0.36651102 0.68131782]
 [0.33805986 0.6537384 ]
 [0.33371996 0.69768299]]
Attention Weights:
 [[0.30399137 0.26374837 0.16815237 0.26410789]
 [0.28623393 0.29672212 0.18259781 0.23444614]
 [0.25963374 0.25978996 0.23217766 0.24839865]
 [0.29723294 0.24312372 0.18105339 0.27858995]]


----------分界线----------


Attention Output:
 [[0.37454012 0.95071431]]
Attention Weights:
 [[1.]]


----------分界线----------


Attention Output:
 [[0.39296863 0.68747658]
 [0.42702208 0.65494911]
 [0.40233016 0.64356457]
 [0.37516149 0.69187854]
 [0.41509897 0.66579213]
 [0.36971587 0.69921133]
 [0.44548969 0.62722922]]
Attention Weights:
 [[0.17237952 0.14955956 0.09535147 0.14976343 0.15549089 0.15899318
  0.11846195]
 [0.15936433 0.16520376 0.10166362 0.13053083 0.16170176 0.13376868
  0.147767  ]
 [0.14742525 0.14751396 0.13183514 0.14104574 0.14716478 0.1420799
  0.14293523]
 [0.17007298 0.13911236 0.10359649 0.1594057  0.14795681 0.16960074
  0.1102549 ]
 

In [10]:
def initialize_random_embeddings(vocab_size, embedding_dim):
    """Return a reproducible random embedding table.

    Reseeds NumPy's global random generator with 42 on every call, so the
    returned table is the same for the same arguments and any draws made
    afterwards from the global generator are reproducible as well.

    Args:
        vocab_size: Number of rows (one per vocabulary entry).
        embedding_dim: Number of columns.

    Returns:
        np.ndarray of shape (vocab_size, embedding_dim) with uniform samples
        from [0, 1).
    """
    np.random.seed(42)
    table_shape = (vocab_size, embedding_dim)
    return np.random.random_sample(table_shape)

# Positional encoding used by the encoder/decoder below. It injects the
# position of each token so that attention can take word order into account.
def get_positional_encoding(seq_length, embedding_dim):
    """Build the sinusoidal positional-encoding table.

    Even columns hold sin(pos / 10000 ** (i / embedding_dim)); each odd column
    holds the cosine at the frequency of the even column just before it.

    Args:
        seq_length: Number of positions (rows).
        embedding_dim: Number of columns.

    Returns:
        np.ndarray of shape (seq_length, embedding_dim).
    """
    table = np.zeros((seq_length, embedding_dim))
    for dim in range(embedding_dim):
        is_even = dim % 2 == 0
        # An odd column reuses the exponent of the preceding even column.
        paired_even_dim = dim if is_even else dim - 1
        divisor = 10000 ** (paired_even_dim / embedding_dim)
        wave = np.sin if is_even else np.cos
        for pos in range(seq_length):
            table[pos, dim] = wave(pos / divisor)
    return table

def softmax(x):
    """Numerically stable softmax along the last axis.

    Note: a function with this name is also defined in an earlier cell of
    this notebook; this definition replaces it once the cell runs.

    Args:
        x: Array-like of scores. Each slice along the last axis is
            normalized independently of the others.

    Returns:
        np.ndarray with the same shape as ``x``; entries are non-negative and
        sum to 1 along the last axis.
    """
    x = np.asarray(x)
    # Shift by the maximum of each row, not the global maximum. With a global
    # shift, a row whose values are all far below the largest entry of the
    # whole array underflows to exp(...) == 0 everywhere and the division
    # below becomes 0 / 0 (NaN).
    shifted = x - np.max(x, axis=-1, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=-1, keepdims=True)

def attention(query, key, value, mask=None):
    """Scaled dot-product attention with an optional additive mask.

    Computes softmax(Q K^T / sqrt(d_k) + mask * -1e9) V.

    Args:
        query: np.ndarray of query vectors; used in this notebook as a 2-D
            array with one row per query.
        key: np.ndarray of key vectors; its last dimension is d_k and must
            match the last dimension of ``query``.
        value: np.ndarray with one row per key.
        mask: Optional array-like broadcastable to the score matrix. Each
            entry is multiplied by -1e9 and added to the scores, so entries
            equal to 1 receive (almost) zero weight and entries equal to 0
            are left untouched.

    Returns:
        Tuple ``(output, weights)`` where ``weights`` is the row-wise softmax
        of the (masked) scaled scores and ``output`` is ``weights`` applied
        to ``value``.
    """
    # Build the scaled scores out of place. An in-place ``scores /= scale``
    # cannot store the float quotient back into an integer array, so it
    # raises when query and key are integer-typed.
    scale = np.sqrt(key.shape[-1])
    scores = np.dot(query, key.T) / scale
    if mask is not None:
        # asarray lets plain nested lists be used as masks as well.
        scores = scores + np.asarray(mask) * -1e9
    weights = softmax(scores)
    output = np.dot(weights, value)
    return output, weights

# Standardizes the values of x along the last axis.
def layer_norm(x, epsilon=1e-6):
    """Shift each last-axis slice to zero mean and divide by (std + epsilon).

    There are no learned gain or bias terms; epsilon is added to the standard
    deviation (not the variance) to avoid division by zero.

    Args:
        x: Array-like to normalize.
        epsilon: Small constant added to the standard deviation.

    Returns:
        np.ndarray with the same shape as ``x``.
    """
    centered = x - np.mean(x, axis=-1, keepdims=True)
    spread = np.std(x, axis=-1, keepdims=True)
    return centered / (spread + epsilon)

# Two-layer feed-forward block with a ReLU activation in between.
def feed_forward(x, hidden_dim):
    """Apply a randomly initialised two-layer feed-forward network to x.

    Fresh weights and biases are drawn from NumPy's global random generator
    on every call (in the order W1, b1, W2, b2), so the result is only
    reproducible if the generator was seeded beforehand.

    Args:
        x: np.ndarray whose last dimension is the model width.
        hidden_dim: Width of the hidden layer.

    Returns:
        np.ndarray with the same last dimension as ``x``.
    """
    model_dim = x.shape[-1]
    w_in = np.random.rand(model_dim, hidden_dim)
    b_in = np.random.rand(hidden_dim)
    w_out = np.random.rand(hidden_dim, model_dim)
    b_out = np.random.rand(model_dim)

    pre_activation = np.dot(x, w_in) + b_in
    activated = np.maximum(0, pre_activation)  # ReLU
    return np.dot(activated, w_out) + b_out

# From the study material:
def create_mask(seq_length):
    """Return a square float32 mask with ones strictly above the diagonal.

    Entry (i, j) is 1.0 when j > i and 0.0 otherwise, i.e. the ones mark the
    positions that come after the current one.

    Args:
        seq_length: Side length of the square mask.

    Returns:
        np.ndarray of shape (seq_length, seq_length) and dtype float32.
    """
    all_ones = np.ones((seq_length, seq_length), dtype=np.float32)
    return np.triu(all_ones, k=1)
def encode(sentence, vocab, embedding_dim=16, hidden_dim=32):
    """Run one encoder layer over a tokenized sentence.

    Steps: look up random embeddings, add positional encodings, apply
    self-attention, add & normalize, apply the feed-forward block, then
    add & normalize again.

    Args:
        sentence: Sequence of tokens; each must be present in ``vocab``
            (``vocab.index`` raises ValueError otherwise).
        vocab: Sequence of known tokens; a token's position is its row in
            the embedding table.
        embedding_dim: Width of the embeddings.
        hidden_dim: Hidden width of the feed-forward block.

    Returns:
        Tuple ``(encoder_output, attention_weights)``.
    """
    # The embedding table is regenerated (and the global RNG reseeded) on
    # every call.
    table = initialize_random_embeddings(len(vocab), embedding_dim)
    position_table = get_positional_encoding(len(sentence), embedding_dim)

    token_ids = [vocab.index(word) for word in sentence]
    tokens = np.array([table[token_id] for token_id in token_ids])
    tokens += position_table

    # Self-attention sub-layer with residual connection and normalization.
    attended, attention_weights = attention(tokens, tokens, tokens)
    attended = layer_norm(tokens + attended)

    # Feed-forward sub-layer with residual connection and normalization.
    transformed = feed_forward(attended, hidden_dim)
    encoder_output = layer_norm(attended + transformed)
    return encoder_output, attention_weights
def decode(target_sentence, encoder_output, vocab, embedding_dim=16, hidden_dim=32):
    """Run one decoder layer over a tokenized target sentence.

    Steps: look up random embeddings, add positional encodings, apply masked
    self-attention, add & normalize, attend over the encoder output,
    add & normalize, apply the feed-forward block, then add & normalize.

    Args:
        target_sentence: Sequence of tokens; each must be present in
            ``vocab`` (``vocab.index`` raises ValueError otherwise).
        encoder_output: Array used as both key and value of the
            encoder-decoder attention.
        vocab: Sequence of known tokens; a token's position is its row in
            the embedding table.
        embedding_dim: Width of the embeddings.
        hidden_dim: Hidden width of the feed-forward block.

    Returns:
        Tuple ``(decoder_output, attention_weights,
        enc_dec_attention_weights)`` with the masked self-attention weights
        second and the encoder-decoder attention weights third.
    """
    target_len = len(target_sentence)

    # The embedding table is regenerated (and the global RNG reseeded) on
    # every call.
    table = initialize_random_embeddings(len(vocab), embedding_dim)
    position_table = get_positional_encoding(target_len, embedding_dim)

    token_ids = [vocab.index(word) for word in target_sentence]
    targets = np.array([table[token_id] for token_id in token_ids])
    targets += position_table

    # Masked self-attention: each position is blocked from later positions.
    look_ahead_mask = create_mask(target_len)
    self_attended, attention_weights = attention(targets, targets, targets, look_ahead_mask)
    self_attended = layer_norm(targets + self_attended)

    # Encoder-decoder attention: queries come from the decoder, keys and
    # values from the encoder output.
    cross_attended, enc_dec_attention_weights = attention(self_attended, encoder_output, encoder_output)
    cross_attended = layer_norm(self_attended + cross_attended)

    # Feed-forward sub-layer with residual connection and normalization.
    transformed = feed_forward(cross_attended, hidden_dim)
    decoder_output = layer_norm(cross_attended + transformed)
    return decoder_output, attention_weights, enc_dec_attention_weights


# Demo: encode a short source sentence, decode a target sentence against it,
# and print every intermediate result.
vocab = ['today', 'is', 'july', 'the', '19th']
input_sentence = ['today', 'is', 'july']
target_sentence = ['the', '19th']

encoder_output, encoder_attention_weights = encode(
    input_sentence, vocab, embedding_dim=4, hidden_dim=8
)

decoder_output, decoder_attention_weights, enc_dec_attention_weights = decode(
    target_sentence, encoder_output, vocab, embedding_dim=4, hidden_dim=8
)

report = (
    ("Encoder Output:\n", encoder_output),
    ("Encoder Attention Weights:\n", encoder_attention_weights),
    ("Decoder Output:\n", decoder_output),
    ("Decoder Self-Attention Weights:\n", decoder_attention_weights),
    ("Encoder-Decoder Attention Weights:\n", enc_dec_attention_weights),
)
for report_index, (heading, matrix) in enumerate(report):
    if report_index > 0:
        # Blank spacer between consecutive sections.
        print("\n")
    print(heading, matrix)

Encoder Output:
 [[-1.48293084  0.75647753 -0.3313181   1.05777141]
 [-0.51012977 -0.37033002 -0.82764469  1.70810449]
 [-0.11888339 -0.71121139 -0.83705269  1.66714747]]


Encoder Attention Weights:
 [[0.63398533 0.20356514 0.16244953]
 [0.28802469 0.31861361 0.3933617 ]
 [0.18720687 0.32038272 0.49241041]]


Decoder Output:
 [[-1.02147432  0.2144661  -0.73336813  1.54037635]
 [-0.72770163 -0.09804882 -0.83610401  1.66185446]]


Decoder Self-Attention Weights:
 [[1.         0.        ]
 [0.44322455 0.55677545]]


Encoder-Decoder Attention Weights:
 [[0.31823617 0.36909888 0.31266495]
 [0.20718191 0.39893009 0.39388801]]
