In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import color
from color import magenta

In [6]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads 
        
        assert (self.head_dim * heads == embed_size), "Embed size needs to be divisible by heads"
        
        # Explanation: Ensures that 'embed_size' is divisible by 'heads'. This is because the embedding is cut up into chunks and fed into identical but seperate attention heads. 
        # Each head sees a reduced dimension of the embedding which is concatonated at the end to form the final full form. This was better than just one single headed attention
        # according to the "Attention is all you need" paper.
        
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False) # The query needs to be head_dim x head_dim because it is multiplied by the key which is head_dim x head_dim
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        
        self.fc_out = nn.Linear(embed_size, embed_size)
    
    def forward(self, queries, keys, values, mask):
        
        # queries, keys, values have shape: (num_examples, seq_length, embed_size)
        num_examples = queries.shape[0] # Number of examples in the batch
        value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1]
        
        
        
        # Split embedding into self.heads pieces
        self.queries = queries.reshape(num_examples, query_len, self.heads, self.head_dim)
        self.keys = keys.reshape(num_examples, key_len, self.heads, self.head_dim)
        self.values = values.reshape(num_examples, value_len, self.heads, self.head_dim)
        # queries, keys, values have a new shape: (num examples, seq length, num heads, head dimension)
        
        self.qk = torch.einsum("nqhd,nkhd->nhqk", [self.queries, self.keys]) # (num examples, num heads, query len, key len)
        print(self.qk.shape)
        

In [9]:
# Test what we have so far:

if __name__ == "__main__":
    print(magenta("Testing self attention"))
    
    embed_size = 512
    heads = 8
    seq_length = 10
    batch_size = 4
    
    queries = torch.rand(batch_size, seq_length, embed_size)
    keys = torch.rand(batch_size, seq_length, embed_size)
    values = torch.rand(batch_size, seq_length, embed_size)
    
    self_attention = SelfAttention(embed_size, heads)
    self_attention.forward(queries, keys, values, None)
    # self_attention.forward(queries, keys, values, None)

[10;10;35mTesting self attention[0m


TypeError: cannot assign 'torch.FloatTensor' as child module 'queries' (torch.nn.Module or None expected)

In [26]:
# Experimenting with linear layers

linear_layer = nn.Linear(5, 4, bias=False, dtype=float)
weights = torch.arange(20, dtype=float).reshape(4, 5)

new_layer = linear_layer(weights)

print('linear layer\n', linear_layer.weight)
print('linear layer bias\n',linear_layer.bias) # should be none

print('\nweight matrix\n',weights)
print('new_layer layer\n' ,new_layer)

linear layer
 Parameter containing:
tensor([[-0.3275,  0.2698,  0.1238, -0.3431, -0.3357],
        [ 0.2703,  0.2390,  0.1315, -0.3263, -0.1824],
        [ 0.1595, -0.3666,  0.3468, -0.3971,  0.2737],
        [-0.2088, -0.0380,  0.0654, -0.2136, -0.1499]], dtype=torch.float64,
       requires_grad=True)
linear layer bias
 None

weight matrix
 tensor([[ 0.,  1.,  2.,  3.,  4.],
        [ 5.,  6.,  7.,  8.,  9.],
        [10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19.]], dtype=torch.float64)
new_layer layer
 tensor([[ -1.8546,  -1.2065,   0.2308,  -1.1475],
        [ -4.9180,  -0.5458,   0.3127,  -3.8720],
        [ -7.9813,   0.1149,   0.3945,  -6.5964],
        [-11.0447,   0.7755,   0.4764,  -9.3209]], dtype=torch.float64,
       grad_fn=<MmBackward0>)


In [49]:
# Figuring out .split()
# .split takes arguments: split_size_or_sections, in this case we want to split the embedding into 3 parts
arrange_1to12 = torch.arange(12)
arrange_0to3, arrange_4to7, arrange_8to11 = arrange_1to12.split(4)[0], arrange_1to12.split(4)[1], arrange_1to12.split(4)[2]

print(magenta('original tensor\n'), arrange_1to12)
print(magenta('\n3 partitions of tensor using.split:\n'),arrange_0to3,'\n' ,arrange_4to7,'\n' ,arrange_8to11)

[10;10;35moriginal tensor
[0m tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])
[10;10;35m
3 partitions of tensor using.split:
[0m tensor([0, 1, 2, 3]) 
 tensor([4, 5, 6, 7]) 
 tensor([ 8,  9, 10, 11])


In [83]:
# Experimenting with torch reshape

num_examples = 10
seq_length = 5
embed_size = 10
heads = 5

x = torch.arange(num_examples*seq_length*embed_size).reshape(num_examples, seq_length, embed_size)
print(magenta('10 sentences, of 5 words, each with a 10 integer embedding\n'))
print(x.shape)

print(magenta('\nReshaping into 5 heads with a dimensionality of 2 instead of 10\n'))

x = x.reshape(num_examples, seq_length, heads, embed_size // heads)
print(x.shape) # Numbers are arranged in a weird way, but does that really matter on intialization? probably not

[10;10;35m10 sentences, of 5 words, each with a 10 integer embedding
[0m
torch.Size([10, 5, 10])
[10;10;35m
Reshaping into 5 heads with a dimensionality of 2 instead of 10
[0m
torch.Size([10, 5, 5, 2])
