In [2]:
import torch
#Use this space for manually computing the scaled dot product attention

In [3]:
# Example input from the paper, as produced by the embedding layers
# (5 rows by 4 columns).
# NOTE: the name shadows Python's built-in input(); it is kept because the
# later cells refer to this tensor by that name.
input = torch.tensor(
    [
        [-1.5898, 1.5792, 0.8666, 1.0245],
        [2.2796, 0.9813, 1.4023, -0.6616],
        [0.1588, 0.4028, -1.7044, -1.8284],
        [0.4366, -1.0197, -0.4217, 1.4877],
        [-0.8311, -0.1762, -1.8709, 2.2313],
    ],
    dtype=torch.float32,
)
input

tensor([[-1.5898,  1.5792,  0.8666,  1.0245],
        [ 2.2796,  0.9813,  1.4023, -0.6616],
        [ 0.1588,  0.4028, -1.7044, -1.8284],
        [ 0.4366, -1.0197, -0.4217,  1.4877],
        [-0.8311, -0.1762, -1.8709,  2.2313]])

In [4]:
# Initialize the weights of the query (Q), key (K) and value (V) projections.
# printTensor is defined in the helper cell (In [9]) further down the
# notebook; that cell has to be run before this one.
import torch.nn as nn
import torch.nn.init as init

# The example input has 4 columns, i.e. 4 features per row. The linear layers
# below must use that same size.
d_model = 4

# Square, bias-free projections: d_model inputs -> d_model outputs.
W_Q = nn.Linear(d_model, d_model, bias=False)
W_K = nn.Linear(d_model, d_model, bias=False)
W_V = nn.Linear(d_model, d_model, bias=False)

# Overwrite the default initialization with Kaiming-normal samples.
# NOTE(review): no RNG seed is set in the visible cells, so these weights
# (and every output derived from them) change on each run.
init.kaiming_normal_(W_Q.weight)
init.kaiming_normal_(W_K.weight)
init.kaiming_normal_(W_V.weight)

# The weights require grad; printing happens with gradient tracking disabled.
with torch.no_grad():
    for heading, projection in (
        ("WQ matrix:", W_Q),
        ("\nWK matrix:", W_K),
        ("\nWV matrix:", W_V),
    ):
        print(heading)
        printTensor(projection.weight)



WQ matrix:
          0         1         2         3
0  0.280131  0.973659  0.650142  0.658475
1  0.652911 -0.443674 -0.941379 -1.285408
2  0.454926 -0.340234 -0.077881  0.398767
3 -0.085917  0.376892  1.278757 -0.799545

WK matrix:
          0         1         2         3
0 -1.256685 -0.824019 -0.450124 -0.223081
1 -0.152469  0.114830  0.037177  0.903964
2  0.079489 -0.151013 -0.610239  1.435603
3 -0.242462  0.790393  0.287243 -0.669850

WV matrix:
          0         1         2         3
0 -0.380946 -0.126559  0.535825 -0.000506
1 -0.148489  0.819622  0.664562 -0.365945
2  0.334110 -0.488277  0.150901  0.558298
3  1.126758 -0.039417 -0.866881  0.231026


In [5]:
import math

# First step: project the input through the three linear layers to obtain
# the Q, K and V matrices, then print each of them.
with torch.no_grad():
    Q, K, V = W_Q(input), W_K(input), W_V(input)
    for heading, projected in (
        ("Q matrix:", Q),
        ("\nK matrix:", K),
        ("\nV matrix:", V),
    ):
        print(heading)
        printTensor(projected)
    

Q matrix:
          0         1         2         3
0  2.330270 -3.871346 -0.919493  1.020816
1  2.070086  0.583329  0.330141  2.496168
2 -1.875383  3.879697 -0.661170 -0.579457
3 -0.165087 -0.777848  1.171646 -2.150563
4 -0.151472 -1.571365  0.717338 -4.171453

K matrix:
          0         1         2         3
0  0.077962  1.382064  0.577092  1.196317
1 -4.156968 -0.780814 -1.772519  1.068870
2  0.643597 -1.694131 -1.632971  1.015044
3  0.149523  1.145488  2.581776 -2.029489
4  1.533999  2.053942  4.305502 -1.969797

V matrix:
          0         1         2         3
0  0.869595  1.731415 -0.599508 -2.368121
1 -0.240874  1.639825  0.124729  1.161404
2 -1.023807 -0.157023 -1.421610  1.218156
3 -0.263981 -1.725261  1.410713  1.241398
4 -0.664702 -2.080871  0.771765  1.207833


In [6]:
# Scaled dot-product: Q times K transposed, divided by sqrt(d_model).
# At this point attention_weights holds the raw scores; the softmax is
# applied in the next cell.
with torch.no_grad():
    attention_weights = (Q @ K.transpose(-2, -1)) / math.sqrt(d_model)
    print("Scaled Dot Product:")
    printTensor(attention_weights)

Scaled Dot Product:
          0         1         2         3         4
0 -2.239093 -1.971559  5.298000 -4.265907 -5.173285
1  2.072158 -3.488925  1.169339 -1.617938  0.439054
2  2.070504  2.659580 -3.644104  1.816371  1.693290
3 -1.492257 -1.540911 -1.442327  3.236886  3.714900
4 -3.379974 -1.936814 -1.420495  4.247648  3.922783


In [7]:
with torch.no_grad():
    # Softmax over the last dimension, so every row sums to 1.
    # NOTE: this overwrites attention_weights in place; re-running the cell
    # without re-running the previous one applies the softmax a second time.
    attention_weights = attention_weights.softmax(dim=-1)
    print("Softmax:")
    printTensor(attention_weights)

Softmax:
          0         1         2         3         4
0  0.000532  0.000695  0.998674  0.000070  0.000028
1  0.613662  0.002359  0.248794  0.015323  0.119862
2  0.234358  0.422388  0.000773  0.181765  0.160716
3  0.003347  0.003188  0.003518  0.378874  0.611073
4  0.000282  0.001192  0.001998  0.578494  0.418034


In [8]:
with torch.no_grad():
    # Final step of the attention computation: multiply the attention
    # weights by V, giving a weighted sum of the rows of V for each row.
    print("Output:")
    output = attention_weights @ V
    printTensor(output)

Output:
          0         1         2         3
0 -1.022191 -0.154933 -1.419836  1.216209
1  0.194635  0.751453 -0.607167 -0.983621
2 -0.053548  0.450272  0.291540  0.356277
3 -0.507656 -1.914749  0.999476  1.208470
4 -0.432667 -1.865798  1.135852  1.226208


In [13]:
# Layer normalization applied to a mini batch.
# d_model, our embedding dimension, is the size that gets normalized.
layer_norm = nn.LayerNorm(normalized_shape=d_model)
# Call the layer on the attention output. This runs outside torch.no_grad(),
# so the displayed result carries a grad_fn.
layer_norm(output)

tensor([[-0.6697,  0.1882, -1.0631,  1.5446],
        [ 0.5253,  1.3475, -0.6585, -1.2143],
        [-1.6539,  0.9940,  0.1598,  0.5000],
        [-0.3597, -1.4746,  0.8344,  1.0000],
        [-0.3529, -1.4806,  0.8812,  0.9523]],
       grad_fn=<NativeLayerNormBackward0>)

In [9]:
#This is how you'll do diagrams for tensors from here on out
import pandas as pd
#pd.DataFrame(input)

def printTensor(tensor):
    """Print a tensor as a pandas DataFrame table.

    Args:
        tensor: A 1-D or 2-D ``torch.Tensor``, or any other data that
            ``pd.DataFrame`` accepts. Tensors are detached from the autograd
            graph and moved to the CPU before conversion, so tensors that
            require grad can be printed without wrapping the call in
            ``torch.no_grad()``.

    Returns:
        None. The table is written to stdout.
    """
    # Duck-typed check: anything exposing ``detach`` is treated as a tensor.
    # Without the detach, the numpy conversion raises for tensors that
    # require grad while gradient tracking is enabled.
    if hasattr(tensor, "detach"):
        tensor = tensor.detach().cpu().numpy()
    print(pd.DataFrame(tensor))

If a tensor requires grad, detach it from the autograd graph before converting it:
# Detach the tensor, move it to the CPU and convert it to a numpy array
tensor_numpy = tensor.detach().cpu().numpy()

# Create a DataFrame using the numpy array
df = pd.DataFrame(tensor_numpy)

# Display the DataFrame
print(df)