In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import color
from color import magenta

In [25]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Each token embedding of size ``embed_size`` is cut into ``heads`` chunks of
    size ``head_dim = embed_size // heads``. Every chunk is projected, attended
    over independently, and the per-head results are concatenated back to
    ``embed_size`` before a final linear layer ("Attention Is All You Need").

    Args:
        embed_size: Size of the token embedding (last dimension of the inputs).
        heads: Number of attention heads; must divide ``embed_size`` evenly.

    Raises:
        AssertionError: If ``embed_size`` is not divisible by ``heads``.
    """

    def __init__(self, embed_size, heads):
        super().__init__()

        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        # The embedding is split into equally sized chunks, one per head, so
        # the split must be exact or the heads could not be concatenated back
        # into a vector of size embed_size.
        assert (self.head_dim * heads == embed_size), "Embed size needs to be divisible by heads"

        # The projections act on the last dimension of the already-split
        # tensors (size head_dim), so each one is head_dim -> head_dim and the
        # same weights are shared by every head.
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)

        # Mixes the concatenated head outputs back into one embedding.
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, queries, keys, values, mask):
        """Compute attention of ``queries`` over ``keys``/``values``.

        Args:
            queries: Tensor of shape (num_examples, query_len, embed_size).
            keys: Tensor of shape (num_examples, key_len, embed_size).
            values: Tensor of shape (num_examples, value_len, embed_size);
                value_len must equal key_len.
            mask: ``None`` for no masking, or a tensor broadcastable to
                (num_examples, heads, query_len, key_len) in which entries
                equal to 0 mark key positions a query must not attend to.

        Returns:
            Tensor of shape (num_examples, query_len, embed_size).
        """
        num_examples = queries.shape[0]  # Number of examples in the batch
        value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1]

        # Split the embedding dimension into (heads, head_dim).
        queries = queries.reshape(num_examples, query_len, self.heads, self.head_dim)
        keys = keys.reshape(num_examples, key_len, self.heads, self.head_dim)
        values = values.reshape(num_examples, value_len, self.heads, self.head_dim)

        # nn.Linear only transforms the last dimension, i.e. each head chunk.
        queries = self.queries(queries)
        keys = self.keys(keys)
        values = self.values(values)

        # Dot product of every query with every key, separately per head:
        # (n, q, h, d) x (n, k, h, d) -> (n, h, q, k).
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys)
        energy = energy / (self.head_dim ** 0.5)

        if mask is not None:
            # Use the most negative finite value rather than -inf so a fully
            # masked row yields a uniform distribution instead of NaNs.
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        # Normalize over the key dimension.
        attention = torch.softmax(energy, dim=-1)

        # Weighted sum of values: (n, h, q, k) x (n, k, h, d) -> (n, q, h, d),
        # then concatenate the heads back into a single embedding.
        out = torch.einsum("nhqk,nkhd->nqhd", attention, values)
        out = out.reshape(num_examples, query_len, self.embed_size)

        return self.fc_out(out)

In [22]:
# Experiment: what parameters does a bias-free linear layer hold?

# 5 input features -> 4 output features, so the weight matrix is 4 x 5.
linear_layer = nn.Linear(in_features=5, out_features=4, bias=False)

# Weight first, then the bias (expected to be None since bias=False).
print(linear_layer.weight, linear_layer.bias, sep="\n")

Parameter containing:
tensor([[ 0.0711,  0.1049,  0.4373, -0.4432, -0.0977],
        [ 0.3297, -0.3747, -0.4408, -0.2911, -0.3674],
        [ 0.1335,  0.1937,  0.4305,  0.0925, -0.0908],
        [-0.2462, -0.1885,  0.1552,  0.3014,  0.3736]], requires_grad=True)
None


In [83]:
# Experiment: splitting the embedding dimension into attention heads with reshape

num_examples, seq_length, embed_size, heads = 10, 5, 10, 5

# Consecutive integers make it easy to see where each element ends up.
x = torch.arange(num_examples * seq_length * embed_size).reshape(
    (num_examples, seq_length, embed_size)
)
print(
    magenta('10 sentences, of 5 words, each with a 10 integer embedding\n'),
    x.shape,
    sep="\n",
)

print(magenta('\nReshaping into 5 heads with a dimensionality of 2 instead of 10\n'))

# Only the last dimension is split: embed_size -> (heads, embed_size // heads).
# Each head receives a contiguous slice of the original embedding; the element
# order looks odd when printed, but that should not matter at initialization.
x = x.reshape((num_examples, seq_length, heads, embed_size // heads))
print(x.shape)

[10;10;35m10 sentences, of 5 words, each with a 10 integer embedding
[0m
torch.Size([10, 5, 10])
[10;10;35m
Reshaping into 5 heads with a dimensionality of 2 instead of 10
[0m
torch.Size([10, 5, 5, 2])
