## The following is a very simple transformer. It shows the inner workings of the architecture in detail while being very basic. Please look at the pytorch_transformers notebook for a better representation of the Transformer architecture.

In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from tqdm.notebook import tqdm
import numpy as np
%config Completer.use_jedi = False

In [2]:
torch.manual_seed(1)

z1 = torch.randn(3,3)

print("z1:")
print(z1)

z2 = torch.randn(3,3)
print("z2:")
print(z2)

z_concatenated = torch.cat((z1,z2), dim=1)
print("z_concatenated:")
print(z_concatenated)

z1:
tensor([[ 0.6614,  0.2669,  0.0617],
        [ 0.6213, -0.4519, -0.1661],
        [-1.5228,  0.3817, -1.0276]])
z2:
tensor([[-0.5631, -0.8923, -0.0583],
        [-0.1955, -0.9656,  0.4224],
        [ 0.2673, -0.4212, -0.5107]])
z_concatenated:
tensor([[ 0.6614,  0.2669,  0.0617, -0.5631, -0.8923, -0.0583],
        [ 0.6213, -0.4519, -0.1661, -0.1955, -0.9656,  0.4224],
        [-1.5228,  0.3817, -1.0276,  0.2673, -0.4212, -0.5107]])


# Transformer Encoder Layer

**Consider an input of 2 words and each word has 3 features.**

In [3]:
sentence = ["thinking", "machines"]
print(sentence)

word2idx = {}

for word in sentence:
    if word not in word2idx:
        word2idx[word] = len(word2idx)
        
print(word2idx)

seq = list(word2idx.values())

print(seq)

seq_tensor = torch.tensor(seq, dtype=torch.long)
print(seq_tensor)

['thinking', 'machines']
{'thinking': 0, 'machines': 1}
[0, 1]
tensor([0, 1])


## Compute Positional embeddings to feed into the Self-Attention layer in the Transformer Encoder

In [4]:
torch.manual_seed(1)

def add_pos_encoding(embeddings, embed_len, seq_len):
    pe = torch.zeros((seq_len, embed_len), dtype = torch.float32)
    
    for pos in range(seq_len):
        v1 = pe[pos]
        for i in range(v1.shape[0]):
            if i % 2 == 0:
                pwr = (2 * i) / embed_len
                v1[i] = np.sin(pos / (10000 ** pwr))
            else:
                pwr = (2 * i) / embed_len
                v1[i] = np.cos(pos / (10000 ** pwr))
                
        pe[pos] = v1
            
#     print(pe)
    
    return embeddings + pe

embedding_len = 3
embeddings = nn.Embedding(len(word2idx), embedding_len)
embeddings = embeddings(seq_tensor)
# print(em)
# print(embeddings)
positional_embeddings = add_pos_encoding(embeddings, embed_len=embeddings.shape[1], seq_len=embeddings.shape[0])
print("These are the positional embeddings:")
print(positional_embeddings)

These are the positional embeddings:
tensor([[ 0.6614,  1.2669,  0.0617],
        [ 1.4628,  0.5481, -0.1661]], grad_fn=<AddBackward0>)


## The Multi-head Attention/ Self-Attention layer for the Transformer Encoder

In [5]:
torch.manual_seed(1)

queries = nn.ModuleList()
keys = nn.ModuleList()
values = nn.ModuleList()


vector_dim = 64
divide_by = np.sqrt(vector_dim)

for i in range(positional_embeddings.shape[0]):
    q = nn.Linear(vector_dim, 1, bias = False)
    k = nn.Linear(vector_dim, 1)
    v = nn.Linear(vector_dim, 1, bias = False)
    
    queries.append(q)
    keys.append(k)
    values.append(v)

# calculate scores for each word:
scores = {}
for i in range(positional_embeddings.shape[0]):
    
    q_weight = queries[i].weight
    scores_list = []
    for j in range(len(keys)):
        k_weight, k_bias = keys[j].weight, keys[j].bias
        s = q_weight @ torch.transpose(k_weight, 0, 1) + k_bias
        scores_list.append(s)
    
    scores_list = torch.stack(scores_list, dim=1).view(1, -1)
    scores["word_"+str(i+1)] = scores_list

# divide them by the dimension of the vectors
# and apply softmax
softmax = nn.Softmax(dim=1)

for word, tensors in scores.items():
    tensors = softmax(tensors / divide_by)
    tensors = tensors.squeeze(0).tolist()
    scores[word] = tensors

softmax_scores = list(scores.values())

# multiply the softmax with the value vector of each word
# for every word
summed_values = {}

for i in range(positional_embeddings.shape[0]):
    vals = softmax_scores[i]
    v_weight = values[i].weight
    arr = []
    for val in vals:
        arr.append(val * v_weight)
    
    temp = 0
    for tensor in arr:
        temp = temp + tensor
    summed_values["z"+str(i+1)] = temp
    
print(summed_values)

{'z1': tensor([[ 0.1160, -0.0776,  0.0271,  0.1079,  0.0828,  0.0779,  0.0888,  0.0791,
          0.0323, -0.0855, -0.1050, -0.0573, -0.0146, -0.0766,  0.0457,  0.0387,
         -0.0283,  0.0480,  0.0404,  0.0763,  0.0842, -0.0423,  0.1221, -0.0145,
         -0.0043, -0.1180, -0.0805, -0.0730, -0.0535,  0.0889, -0.0408, -0.0934,
          0.0481,  0.0400,  0.0810, -0.0647,  0.0271, -0.0455, -0.0281, -0.0996,
         -0.0570, -0.0383,  0.0535,  0.0228,  0.0309,  0.1248,  0.1218,  0.0853,
          0.0040, -0.0865,  0.0977, -0.0313, -0.0101, -0.1077, -0.0247, -0.0806,
          0.1149, -0.1081, -0.0974, -0.0042, -0.0676,  0.0447, -0.0481, -0.0587]],
       grad_fn=<AddBackward0>), 'z2': tensor([[-0.0237, -0.0461,  0.0522, -0.0426, -0.0702,  0.0284, -0.0904, -0.0267,
          0.0639,  0.0467,  0.1064, -0.0853,  0.0011, -0.0369, -0.0699, -0.0703,
          0.0315,  0.0131, -0.0638, -0.0881, -0.1036,  0.1225, -0.0213, -0.0289,
         -0.0460, -0.0625, -0.1143, -0.0734,  0.0764,  0.0274,

## Matrix Calculation of Self-Attention

In [6]:
def get_self_attention(pos_embedds, embedding_size, vector_size, div_val):
    # calculating q:
    wq = nn.Linear(embedding_size, vector_size)
    q = wq(pos_embedds)

    # calculating k:
    wk = nn.Linear(embedding_size, vector_size)
    k = wk(pos_embedds)

    # calculating v:
    wv = nn.Linear(embedding_size, vector_size)
    v = wv(pos_embedds)

#     print(f"q shape: {q.shape}, k shape:{k.shape}, v shape: {v.shape}")

    score = q @ torch.transpose(k, 0, 1)
    score = softmax(score / div_val)
    z = score @ v
    
    return z

z = get_self_attention(positional_embeddings, embedding_len, vector_dim, divide_by)
print("z shape:", z.shape)

z shape: torch.Size([2, 64])


## The Beast with Many Heads

**Using the 'z' matrices computed above; we concatenate them and then pass them through a linear layer to get the final output of the Self-Attention layer**

In [7]:
number_of_heads = 2
z_matrices = {}

for i in range(number_of_heads):
    z = get_self_attention(positional_embeddings, embedding_len, vector_dim, divide_by)
    z_matrices["z"+str(i+1)] = z

# print(z_matrices)

list_of_zs = []
for z in z_matrices.values():
    list_of_zs.append(z)
    
z_tuple = tuple(list_of_zs)

z_concatenated = torch.cat(z_tuple, dim=1)

wo = nn.Linear(vector_dim*number_of_heads, embedding_len)
z = wo(z_concatenated)

print("Final output (z) shape:", z.shape)

Final output (z) shape: torch.Size([2, 3])


**Add the output of the multi-headed attention layer to the positional embeddings (input into the M-HAL) and normalize it. This is called a risidual connection**

In [8]:
torch.manual_seed(1)

risidual_connection = z + positional_embeddings

m = nn.LayerNorm([len(word2idx), embedding_len])

layer_norm_output = m(risidual_connection)

print("Layer norm output shape:", layer_norm_output.shape)
print(layer_norm_output)

Layer norm output shape: torch.Size([2, 3])
tensor([[-0.6960,  1.6577, -0.8487],
        [ 0.5519,  0.5397, -1.2046]], grad_fn=<NativeLayerNormBackward>)


**Pointwise feed forward network**

In [9]:
torch.manual_seed(1)

lin1 = nn.Linear(embedding_len, embedding_len)
relu = nn.ReLU()
lin2 = nn.Linear(embedding_len, embedding_len)

encoder_output = lin1(layer_norm_output)
encoder_output = relu(encoder_output)
encoder_output = lin2(encoder_output)

print(f"Encoder output:\n{encoder_output}")
print("Shape:", encoder_output.shape)

Encoder output:
tensor([[-0.3710, -0.0980, -0.3630],
        [-0.3306, -0.1015, -0.2085]], grad_fn=<AddmmBackward>)
Shape: torch.Size([2, 3])


# Transformer Decoder Layer

In [36]:
torch.manual_seed(1)

target_sentence = ["SOS", "tänkande", "maskiner", "EOS"]
target_tensor = torch.tensor([0,1,2,3], dtype=torch.long)

vocab_size = 4
embedding_len = 3
vector_len = 64
number_of_heads = 2
divide_by = np.sqrt(vector_len)


embeddings = nn.Embedding(vocab_size, embedding_len)
embeddings = embeddings(target_tensor)
# print(embeddings)
# print()
positional_embeddings = add_pos_encoding(embeddings, embed_len=embedding_len, seq_len=vocab_size)
print(positional_embeddings)

tensor([[ 0.6614,  1.2669,  0.0617],
        [ 1.4628,  0.5481, -0.1661],
        [-0.6135,  1.3817, -1.0276],
        [-0.4219,  0.1077, -0.0582]], grad_fn=<AddBackward0>)


## Masked multihead attention of a Decoder layer

In [37]:
torch.manual_seed(1)

def get_masked_attention(scores):
    masked_attention = torch.zeros(scores.shape[0], scores.shape[1])
    
#     print(masked_attention)
    for i in range(len(masked_attention)):
        masked_attention[i, (i+1):] = -float('inf')
        
#     print(masked_attention)
    
    return masked_attention

def get_masked_multihead_attention(positional_embeddings, vector_len, embedding_len, divide_by, mask):
    wq = nn.Linear(embedding_len, vector_len)
    q = wq(positional_embeddings)
    
    wk = nn.Linear(embedding_len, vector_len)
    k = wk(positional_embeddings)
    
    wv = nn.Linear(embedding_len, vector_len)
    v = wv(positional_embeddings)
    
    score = q @ torch.transpose(k, 0, 1)
    
    #scale the score
    score = score / divide_by
    
    # before sending the score to the softmax layer
    # we have to add the masked attention to it
    if mask == True:
        masked_attention = get_masked_attention(score)
        score = score + masked_attention
    
    softmax = nn.Softmax(dim=1)
    score = softmax(score)
        
    z = score @ v
    
    return z
    
# z = get_masked_multihead_attention(positional_embeddings, vector_len, embedding_len, divide_by, True)
# print("z shape:", z.shape)

z_matrices = {}

for i in range(number_of_heads):
    z = get_masked_multihead_attention(positional_embeddings, 
                                       vector_len, 
                                       embedding_len, 
                                       divide_by, 
                                       True)
    z_matrices['z'+str(i+1)] = z
    
list_of_zs = []
for z in z_matrices.values():
    list_of_zs.append(z)
    
z_tuple = tuple(list_of_zs)

z_concatenated = torch.cat(z_tuple, dim=1)

wo = nn.Linear(vector_dim*number_of_heads, embedding_len)
z = wo(z_concatenated)

print("Final output (z) shape:", z.shape)

Final output (z) shape: torch.Size([4, 3])


## Adding and getting the normalized value of the output from the Masked MHAL

In [38]:
torch.manual_seed(1)

risidual_connection = z + positional_embeddings

m = nn.LayerNorm([vocab_size, embedding_len])

layer_norm_output = m(risidual_connection)

print("Layer norm output shape:", layer_norm_output.shape)
print(layer_norm_output)

Layer norm output shape: torch.Size([4, 3])
tensor([[ 0.6209,  0.9641, -0.1326],
        [ 1.8560, -0.0939, -0.6452],
        [-0.9726,  1.4778, -1.6857],
        [-0.8058, -0.3499, -0.2332]], grad_fn=<NativeLayerNormBackward>)


In [54]:
torch.manual_seed(1)

def get_multihead_attention_decoder(q, k, v, div):
    softmax = nn.Softmax(dim=1)
    score = q @ torch.transpose(k, 0, 1)
    score = softmax(score / div)
    
    z = score @ v
    
    return z
    
# z = get_multihead_attention_decoder(layer_norm_output, encoder_output, encoder_output, divide_by)
# print("z shape:", z.shape)

z_matrices = {}

for i in range(number_of_heads):
    z = get_multihead_attention_decoder(layer_norm_output, 
                                       encoder_output, 
                                       encoder_output, 
                                       divide_by)
    z_matrices['z'+str(i+1)] = z
    

list_of_zs = []
for z in z_matrices.values():
    list_of_zs.append(z)
    
z_tuple = tuple(list_of_zs)

z_concatenated = torch.cat(z_tuple, dim=1)

# print("z_concatenated shape:", z_concatenated.shape)

wo = nn.Linear(z_concatenated.shape[1], embedding_len)
z = wo(z_concatenated)

# print("Final output (z) shape:", z.shape)

risidual_connection = z + layer_norm_output
m = nn.LayerNorm([vocab_size, embedding_len])
decoder_MHAL_output = m(risidual_connection)
print("Decoder multihead attention output:", decoder_MHAL_output)
print(decoder_MHAL_output.shape)

Decoder multihead attention output: tensor([[ 0.7282,  0.9485, -0.3348],
        [ 1.8690, -0.0289, -0.8083],
        [-0.7441,  1.4229, -1.7695],
        [-0.5897, -0.2653, -0.4278]], grad_fn=<NativeLayerNormBackward>)
torch.Size([4, 3])


## Decoder feed forward network and AddNorm layer

In [56]:
torch.manual_seed(1)

lin1 = nn.Linear(embedding_len, embedding_len)
relu = nn.ReLU()
lin2 = nn.Linear(embedding_len, embedding_len)

decoder_FF_output = lin1(decoder_MHAL_output)
decoder_FF_output = relu(decoder_FF_output)
decoder_FF_output = lin2(decoder_FF_output)

print(f"Decoder FFNN output:\n{decoder_FF_output}. Shape: {decoder_FF_output.shape}")

risidual_connection = decoder_FF_output + decoder_MHAL_output
m = nn.LayerNorm([vocab_size, embedding_len])
decoder_output = m(risidual_connection)
print(f"Decoder output:\n{decoder_output}. Shape: {decoder_output.shape}")

Decoder FFNN output:
tensor([[-0.3569, -0.0972, -0.2907],
        [-0.3163, -0.0934,  0.1176],
        [-0.3652, -0.0977, -0.3335],
        [-0.3487, -0.0968, -0.2490]], grad_fn=<AddmmBackward>). Shape: torch.Size([4, 3])
Decoder output:
tensor([[ 0.5701,  1.0403, -0.4064],
        [ 1.7275,  0.0866, -0.4703],
        [-0.8804,  1.5046, -1.8538],
        [-0.7130, -0.1484, -0.4567]], grad_fn=<NativeLayerNormBackward>). Shape: torch.Size([4, 3])


## Final steps of the decoder. Pass the output through a linear layer and then a softmax to get the probabilities

In [65]:
decoder_output_flattened = decoder_output.view(1, -1)
# print(decoder_output_flattened.shape)

lin = nn.Linear(decoder_output_flattened.shape[1], vocab_size)
softmax = nn.Softmax(dim=1)
probabilities = softmax(lin(decoder_output_flattened))
print(probabilities)

tensor([[0.1771, 0.0780, 0.4744, 0.2706]], grad_fn=<SoftmaxBackward>)
