In [2]:
import torch

# Toy sentence; token i is represented by row i of `inputs`.
words = ['Dream', 'big', 'and', 'work', 'for', 'it']

# One 3-dimensional embedding per token (rows follow the order of `words`).
inputs = torch.tensor([
    [0.72, 0.45, 0.31],  # Dream
    [0.75, 0.20, 0.55],  # big
    [0.30, 0.80, 0.40],  # and
    [0.85, 0.35, 0.60],  # work
    [0.55, 0.15, 0.75],  # for
    [0.25, 0.20, 0.85],  # it
])

In [3]:
# Work with the 2nd token ("big"), i.e. row index 1 of `inputs`.
x_2 = inputs[1]

# d_in / d_out are the row / column sizes of the W_query, W_key and W_value matrices.
d_in, d_out = inputs.size(1), 2

print(x_2, d_in, sep="\n")

tensor([0.7500, 0.2000, 0.5500])
3


In [4]:
# Randomly initialise the query, key and value projection matrices (each d_in x d_out).
# The seed makes the draws repeatable; the matrices are drawn in query, key, value order.
torch.manual_seed(123)
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    for _ in range(3)
)

In [5]:
# Inspect the query projection matrix (d_in rows, d_out columns).
print(W_query)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])


In [6]:
# Project the 2nd token's embedding with each weight matrix.
# query for the 2nd token = its input embedding times the query matrix
query_2 = torch.matmul(x_2, W_query)
# key for the 2nd token = its input embedding times the key matrix
key_2 = torch.matmul(x_2, W_key)
# value for the 2nd token = its input embedding times the value matrix
value_2 = torch.matmul(x_2, W_value)


In [7]:
# Project every token at once: one key, query and value row per input row.
keys = torch.matmul(inputs, W_key)
queries = torch.matmul(inputs, W_query)
values = torch.matmul(inputs, W_value)

# Report the shape of each projection.
for label, projected in (
    ("keys shape: ", keys),
    ("queries shape: ", queries),
    ("values shape: ", values),
):
    print(label, projected.shape)




keys shape:  torch.Size([6, 2])
queries shape:  torch.Size([6, 2])
values shape:  torch.Size([6, 2])


In [8]:
# Attention score of the 2nd token with itself: dot product of its query and its key.
keys_2 = keys[1]  # key vector of the 2nd token
attn_score22 = torch.dot(query_2, keys_2)
print(attn_score22)

tensor(0.6990)


In [9]:
# Attention scores of the 2nd token's query against every token's key.
attn_scores2 = torch.matmul(query_2, keys.T)
print(attn_scores2)

# Size of each key vector, used as the scaling factor below.
d_k = keys.size(-1)

# Divide the scores (not the weights) by sqrt(d_k), then softmax so the weights sum to 1.
attn_weights2 = (attn_scores2 / d_k**0.5).softmax(dim=-1)
print(attn_weights2)

# Context vector for the 2nd token: attention-weighted sum of all value vectors.
context_vec2 = torch.matmul(attn_weights2, values)
print(context_vec2)

tensor([0.7021, 0.6990, 0.9867, 0.8707, 0.7880, 0.8624])
tensor([0.1531, 0.1528, 0.1873, 0.1725, 0.1627, 0.1715])
tensor([0.2274, 0.7362])


In [10]:
# Attention scores for all tokens against all tokens:
# row i holds the scores of token i's query against every token's key.
attn_scores = torch.matmul(queries, keys.T)
print(attn_scores)

tensor([[0.6807, 0.6795, 0.9526, 0.8454, 0.7654, 0.8359],
        [0.7021, 0.6990, 0.9867, 0.8707, 0.7880, 0.8624],
        [0.7350, 0.7315, 1.0337, 0.9113, 0.8248, 0.9029],
        [0.8436, 0.8402, 1.1848, 1.0464, 0.9471, 1.0361],
        [0.7080, 0.7025, 1.0003, 0.8764, 0.7929, 0.8699],
        [0.6680, 0.6606, 0.9486, 0.8254, 0.7465, 0.8210]])


In [11]:
# Scaled dot-product attention: divide the scores by sqrt(d_k), the square root of the
# key dimension, and apply a row-wise softmax so each row of weights sums to 1.
d_k = keys.size(-1)
attn_weights = (attn_scores / d_k**0.5).softmax(dim=-1)
print(attn_weights)

tensor([[0.1536, 0.1534, 0.1861, 0.1725, 0.1630, 0.1714],
        [0.1531, 0.1528, 0.1873, 0.1725, 0.1627, 0.1715],
        [0.1525, 0.1521, 0.1884, 0.1728, 0.1625, 0.1717],
        [0.1505, 0.1501, 0.1915, 0.1737, 0.1619, 0.1724],
        [0.1530, 0.1524, 0.1881, 0.1724, 0.1625, 0.1716],
        [0.1538, 0.1530, 0.1875, 0.1719, 0.1625, 0.1713]])


In [12]:
# Context vectors for all tokens: each row is an attention-weighted sum of the value rows.
context_vec = torch.matmul(attn_weights, values)
print(context_vec)

tensor([[0.2273, 0.7361],
        [0.2274, 0.7362],
        [0.2276, 0.7363],
        [0.2280, 0.7368],
        [0.2275, 0.7362],
        [0.2275, 0.7360]])


In [13]:
#Class for Self Attention
import torch.nn as nn

class SelfAttention_v1(nn.Module):
  """Single-head scaled dot-product self-attention with raw weight matrices.

  Args:
    d_in: size of each input embedding (last dimension of the input).
    d_out: size of the query/key/value projections and of each context vector.
  """

  def __init__(self,d_in,d_out):
    super().__init__()
    # Assign to self: this registers the matrices as module parameters and makes
    # them available to forward(). Plain local variables would be discarded.
    self.W_query=nn.Parameter(torch.randn(d_in,d_out))
    self.W_key=nn.Parameter(torch.randn(d_in,d_out))
    self.W_value=nn.Parameter(torch.randn(d_in,d_out))

  def forward(self,x):
    """Return the context vectors for x.

    Args:
      x: tensor of shape (seq_len, d_in) or (batch, seq_len, d_in).

    Returns:
      Tensor with the same leading dimensions as x and last dimension d_out.
    """
    keys=x @ self.W_key
    queries=x @ self.W_query
    values=x @ self.W_value

    # Transpose only the last two dimensions so batched input works as well.
    attn_scores=queries@keys.transpose(-2,-1)

    # Scale by sqrt(d_k) before the row-wise softmax.
    attn_weights=torch.softmax(attn_scores/keys.shape[-1]**0.5,dim=-1)

    context_vecs=attn_weights @ values

    return context_vecs




In [14]:
class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention built on nn.Linear layers.

    Args:
        d_in: size of each input embedding.
        d_out: size of the query/key/value projections.
        qkv_bias: whether the three projection layers carry a bias term.
    """

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        projection = dict(in_features=d_in, out_features=d_out, bias=qkv_bias)
        # Layers are created in query, key, value order so seeded initialisation is stable.
        self.W_query = nn.Linear(**projection)
        self.W_key = nn.Linear(**projection)
        self.W_value = nn.Linear(**projection)

    def forward(self, x):
        """Return context vectors for x of shape (seq_len, d_in) or (batch, seq_len, d_in)."""
        q = self.W_query(x)
        k = self.W_key(x)
        v = self.W_value(x)

        # Scale the query-key scores by sqrt(d_k), then normalise each row with softmax.
        scale = k.shape[-1] ** 0.5
        weights = torch.matmul(q, k.transpose(-2, -1)).div(scale).softmax(dim=-1)

        return torch.matmul(weights, v)

In [15]:
# Seed first so the nn.Linear weights created inside SelfAttention_v2 are repeatable.
torch.manual_seed(123)

d_in, d_out = inputs.size(-1), 2

sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(inputs))

tensor([[-0.5282, -0.0051],
        [-0.5288, -0.0036],
        [-0.5276, -0.0066],
        [-0.5289, -0.0040],
        [-0.5289, -0.0032],
        [-0.5287, -0.0033]], grad_fn=<MmBackward0>)


In [16]:
# Recompute the attention weights step by step from sa_v2's query and key projections.
queries = sa_v2.W_query(inputs)
keys = sa_v2.W_key(inputs)
attn_scores = torch.matmul(queries, keys.T)
attn_weights = (attn_scores / keys.size(-1) ** 0.5).softmax(dim=-1)
print(attn_weights)

tensor([[0.1666, 0.1670, 0.1658, 0.1709, 0.1661, 0.1636],
        [0.1656, 0.1690, 0.1606, 0.1743, 0.1677, 0.1628],
        [0.1675, 0.1651, 0.1710, 0.1676, 0.1644, 0.1643],
        [0.1658, 0.1687, 0.1614, 0.1746, 0.1673, 0.1623],
        [0.1653, 0.1696, 0.1591, 0.1751, 0.1682, 0.1627],
        [0.1655, 0.1692, 0.1601, 0.1740, 0.1680, 0.1632]],
       grad_fn=<SoftmaxBackward0>)


In [17]:
# Applying causal attention.

# The number of tokens (rows of the score matrix) is the context length.
context_length = attn_scores.size(0)

# Lower-triangular mask of ones (diagonal included): multiplying by it keeps the
# attention weights on and below the diagonal.
mask_simple = torch.ones(context_length, context_length).tril()
print(mask_simple)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [21]:
# Zero the weights above the diagonal; each row keeps only its own and earlier positions.
masked_attn_weights = torch.mul(attn_weights, mask_simple)
print(masked_attn_weights)

tensor([[0.1666, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1656, 0.1690, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1675, 0.1651, 0.1710, 0.0000, 0.0000, 0.0000],
        [0.1658, 0.1687, 0.1614, 0.1746, 0.0000, 0.0000],
        [0.1653, 0.1696, 0.1591, 0.1751, 0.1682, 0.0000],
        [0.1655, 0.1692, 0.1601, 0.1740, 0.1680, 0.1632]],
       grad_fn=<MulBackward0>)


In [22]:
# Normalise the masked attention weights - naive approach (mask after softmax, then rescale).
# NOTE(review): the original note here said this leads to data leakage. The rescaled rows
# printed below match the result of masking the scores with -inf before softmax, so the
# leakage claim looks incorrect - confirm before relying on it.

# Sum of each row, kept as a column so it broadcasts across that row.
row_sums = masked_attn_weights.sum(dim=-1, keepdim=True)

# Divide each row's values by its row sum so every row sums to 1 again.
masked_simple_norm = torch.div(masked_attn_weights, row_sums)
print(masked_simple_norm)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4949, 0.5051, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3326, 0.3278, 0.3395, 0.0000, 0.0000, 0.0000],
        [0.2472, 0.2516, 0.2407, 0.2604, 0.0000, 0.0000],
        [0.1974, 0.2025, 0.1900, 0.2092, 0.2009, 0.0000],
        [0.1655, 0.1692, 0.1601, 0.1740, 0.1680, 0.1632]],
       grad_fn=<DivBackward0>)


In [23]:
# Alternative approach: replace the attention scores above the diagonal with negative
# infinity before applying softmax. Start by inspecting the raw (unmasked) scores.

print(attn_scores)

tensor([[0.1659, 0.1698, 0.1592, 0.2024, 0.1614, 0.1403],
        [0.2529, 0.2817, 0.2094, 0.3258, 0.2710, 0.2289],
        [0.0804, 0.0600, 0.1096, 0.0811, 0.0539, 0.0534],
        [0.2697, 0.2946, 0.2320, 0.3430, 0.2826, 0.2403],
        [0.2703, 0.3064, 0.2162, 0.3523, 0.2954, 0.2481],
        [0.2361, 0.2674, 0.1891, 0.3075, 0.2578, 0.2166]],
       grad_fn=<MmBackward0>)


In [25]:
# Ones strictly above the diagonal (diagonal=1 excludes the diagonal itself), zeros elsewhere.
mask = torch.ones(context_length, context_length).triu(diagonal=1)
print(mask)

tensor([[0., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.]])


In [26]:
# Put negative infinity into the attention scores wherever the mask holds a 1
# (the positions strictly above the diagonal); all other scores are left unchanged.
masked_inf = attn_scores.masked_fill(mask.to(torch.bool), float("-inf"))
print(masked_inf)

tensor([[0.1659,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.2529, 0.2817,   -inf,   -inf,   -inf,   -inf],
        [0.0804, 0.0600, 0.1096,   -inf,   -inf,   -inf],
        [0.2697, 0.2946, 0.2320, 0.3430,   -inf,   -inf],
        [0.2703, 0.3064, 0.2162, 0.3523, 0.2954,   -inf],
        [0.2361, 0.2674, 0.1891, 0.3075, 0.2578, 0.2166]],
       grad_fn=<MaskedFillBackward0>)


In [28]:
# Softmax turns the -inf scores into zero weights, so each row sums to 1 over the
# unmasked positions without a separate renormalisation step.
attn_weights = (masked_inf / keys.size(-1) ** 0.5).softmax(dim=-1)
print(attn_weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4949, 0.5051, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3326, 0.3278, 0.3395, 0.0000, 0.0000, 0.0000],
        [0.2472, 0.2516, 0.2407, 0.2604, 0.0000, 0.0000],
        [0.1974, 0.2025, 0.1900, 0.2092, 0.2009, 0.0000],
        [0.1655, 0.1692, 0.1601, 0.1740, 0.1680, 0.1632]],
       grad_fn=<SoftmaxBackward0>)


In [29]:
# All-ones matrix used to demonstrate dropout with a 50% drop probability.
example = torch.ones((context_length, context_length))
print(example)

tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]])


In [31]:
# Seed so the dropout pattern is repeatable.
torch.manual_seed(123)
# With p=0.5 each element is zeroed independently with probability 0.5, so the number
# of zeros per row can vary. Surviving elements are scaled by 1/(1-p) = 1/0.5 = 2
# (a factor of 2, not 2%).
dropout = torch.nn.Dropout(p=0.5)
print(dropout(example))

tensor([[2., 2., 0., 2., 2., 0.],
        [0., 0., 0., 2., 0., 2.],
        [2., 2., 2., 2., 0., 2.],
        [0., 2., 2., 0., 0., 2.],
        [0., 2., 0., 2., 0., 2.],
        [0., 2., 2., 2., 2., 0.]])
