# Embedding an Input Sentence

In [1]:
sentence = 'Life is short, eat dessert first'

# Vocabulary: drop the comma, split on whitespace, sort the words, and map
# each word to its position in that sorted order.
dc = dict(
    (word, index)
    for index, word in enumerate(sorted(sentence.replace(',', '').split()))
)
dc

{'Life': 0, 'dessert': 1, 'eat': 2, 'first': 3, 'is': 4, 'short': 5}

Next, we use this dictionary to assign an integer index to each word:

In [2]:
import torch

# Translate the comma-stripped sentence, word by word and in sentence order,
# into the integer indices stored in the vocabulary `dc`.
sentence_int = torch.tensor(
    [dc[token] for token in sentence.replace(',', '').split()]
)
sentence_int

tensor([0, 4, 5, 2, 1, 3])

Now, using the integer-vector representation of the input sentence, we can use an embedding layer to encode the inputs into a real-vector embedding. Here, we will use a 16-dimensional embedding such that each input word is represented by a 16-dimensional vector. Since the sentence consists of 6 words, this will result in a 6×16-dimensional embedding:

In [5]:
import torch.nn as nn

# Seed the global RNG so the randomly initialised embedding table is reproducible.
torch.manual_seed(42)

# Lookup table holding 10 rows of 16-dimensional vectors.
embed = nn.Embedding(num_embeddings=10, embedding_dim=16)
# Fetch one row per word index; detach() cuts the result from the autograd graph.
embedded_sentence = embed(sentence_int).detach()

embedded_sentence.shape

torch.Size([6, 16])

# Defining the Weight Matrices

In [7]:
# Re-seed so the three matrices below are reproducible.
torch.manual_seed(42)
d = embedded_sentence.shape[1]  # size of the second axis of the embedded sentence

# Three independent d x d matrices sampled uniformly from [0, 1), drawn in the
# order query, key, value so the seeded values match that order.
U_query, U_key, U_value = (torch.rand(d, d) for _ in range(3))

In [13]:
# Three random 4-D tensors, each of shape (2, 3, 3, 4).
a = torch.rand(2, 3, 3, 4)
b = torch.rand(2, 3, 3, 4)
c = torch.rand(2, 3, 3, 4)

# Swap the last two axes of b: (2, 3, 3, 4) -> (2, 3, 4, 3).
bt = b.transpose(2, 3)

# Batched matrix product over the last two axes: (..., 3, 4) @ (..., 4, 3) -> (..., 3, 3).
s = torch.matmul(a, bt)
s.size()

torch.Size([2, 3, 3, 3])

In [14]:
import torch.nn.functional as F

# Normalise along the last axis (index 3 of this 4-D tensor) so each row sums to 1.
s = F.softmax(s, dim=-1)
s.size()

torch.Size([2, 3, 3, 3])

In [16]:
# Batched matrix product of s with c over the last two axes; the result replaces c.
c = torch.matmul(s, c)
c.size()

torch.Size([2, 3, 3, 4])

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention followed by a linear layer, with residuals.

    ``config`` must expose:

    * ``n_head``   -- number of attention heads (positive int),
    * ``dim``      -- model width; must be divisible by ``n_head``,
    * ``drop_out`` -- dropout probability applied to the final output.

    An optional ``config.device`` selects where the parameters live; when it
    is absent (or ``None``) the module stays on the CPU.
    """

    def __init__(self, config):
        super().__init__()

        self.n_head = config.n_head
        self.dim = config.dim
        if self.n_head <= 0 or self.dim % self.n_head != 0:
            # Fail early with a clear message instead of an opaque reshape
            # error inside split().
            raise ValueError(
                f"dim ({self.dim}) must be divisible by a positive "
                f"n_head ({self.n_head})"
            )

        device = getattr(config, "device", None)
        self.device = torch.device("cpu") if device is None else device

        # Linear projections producing queries, keys and values.
        self.wq = nn.Linear(self.dim, self.dim)
        self.wk = nn.Linear(self.dim, self.dim)
        self.wv = nn.Linear(self.dim, self.dim)

        # Softmax over the last axis of the (batch, head, seq, seq) scores.
        self.softmax = nn.Softmax(dim=-1)
        self.fc = nn.Linear(self.dim, self.dim)
        self.norm = nn.LayerNorm(self.dim)
        self.dropout = nn.Dropout(config.drop_out)

        # Move every sub-module at once so no layer is left on another device.
        self.to(self.device)

    def split(self, tensor: torch.Tensor) -> torch.Tensor:
        """Split the last axis into heads.

        (batch_size, seq_len, d_model) -> (batch_size, n_head, seq_len, d_head)
        where ``d_head = d_model // n_head``.
        """
        a, b, c = tensor.size()
        d = c // self.n_head
        return tensor.reshape(a, b, self.n_head, d).permute(0, 2, 1, 3)

    def attention(self, k: torch.Tensor, q: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        """Scaled dot-product attention on per-head tensors.

        All three inputs are (batch_size, n_head, seq_len, d_head); note the
        argument order is ``k, q, v``. Returns a tensor of the same shape as
        ``v``.
        """
        d = k.size(-1)
        kt = torch.transpose(k, 2, 3)
        # (batch, head, seq, d_head) @ (batch, head, d_head, seq)
        #   -> (batch, head, seq, seq); scaled by sqrt(d_head).
        s = (q @ kt) / (d ** 0.5)
        s = self.softmax(s)
        # (batch, head, seq, seq) @ (batch, head, seq, d_head)
        #   -> (batch, head, seq, d_head)
        return s @ v

    def concat(self, tensor: torch.Tensor) -> torch.Tensor:
        """Merge the heads back together (inverse of :meth:`split`).

        (batch_size, n_head, seq_len, d_head) -> (batch_size, seq_len, d_model)
        """
        a, b, c, d = tensor.size()
        # Bring seq_len in front of n_head before flattening the last two axes.
        tensor = tensor.permute(0, 2, 1, 3)
        return tensor.reshape(a, c, b * d)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to ``x`` of shape (batch_size, seq_len, d_model).

        Returns a tensor of the same shape.
        """
        # Linear projections of the input give k, q, v,
        # each (batch_size, seq_len, d_model).
        k, q, v = self.wk(x), self.wq(x), self.wv(x)

        # Split into heads: (batch_size, n_head, seq_len, d_head).
        k, q, v = self.split(k), self.split(q), self.split(v)

        # Attention output: (batch_size, n_head, seq_len, d_head).
        v = self.attention(k, q, v)

        # Concatenate the heads: (batch_size, seq_len, d_model).
        v = self.concat(v)

        # Keep a copy of the attention output for the second residual connection.
        vb = v

        # First residual connection (with the block input) + layer norm.
        v = self.norm(v + x)

        v = self.fc(v)

        # Second residual connection + layer norm.
        # NOTE(review): this adds the pre-norm attention output and reuses the
        # same LayerNorm instance as above -- confirm this is intended.
        v = self.norm(v + vb)

        return self.dropout(v)


In [25]:
# Random tensors used to experiment with shapes.
a = torch.rand(2, 3, 2, 4)  # axes: a=2, b=3, c=2, d=4
b = torch.rand(2, 3, 3, 4)
c = torch.rand(2, 3, 3, 4)
s = torch.rand(2, 3, 3, 3)
v = torch.rand(2, 3, 1, 3)

# Flatten `a` straight to (2, 2, 12) and show the resulting size. No
# permute(0, 2, 1, 3) is applied first, so only the shape is being checked here.
print(a.reshape(2, 2, 3 * 4).size())

torch.Size([2, 2, 12])
