In [1]:
import torch 
from torch import nn
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F

# Section 1 : vocabulary and (train/val split)

In [2]:
# Imagine this is out entire dataset. 
raw_text = 'The cat is hungry I went to the store But it was closed So I went to a different store'

![](section_1.png)

split the dataset into train and validation sets

In [3]:
ratio = 0.66 # arbitrary ratio
split_index = int(len(raw_text) * ratio)
train_raw_text = raw_text[:split_index]
val_raw_text = raw_text[split_index:]


print(f"Train raw text: {train_raw_text}")

print("="*100)

print(f"Val raw text: {val_raw_text}")


Train raw text: The cat is hungry I went to the store But it was closed 
Val raw text: So I went to a different store


Corpus vocabulary


We will make an assumption that each word is a unique token.<br>
This is a simplification and not true in the real world.<br>
In practice, we would use a more sophisticated tokenization method.


In [4]:
vocab = list(sorted(set(raw_text.split(' '))))

print("unique words in the corpus:")
print("-"*50)
for i in vocab:
    print(i)

print("="*50)
vocab_size = len(vocab)
print(f'vocab_size: {vocab_size}') # You can think of as number of unique words in the corpus. 

unique words in the corpus:
--------------------------------------------------
But
I
So
The
a
cat
closed
different
hungry
is
it
store
the
to
was
went
vocab_size: 16


Note: (The) is different from (the)

In [5]:
tokens_to_ids = {token: id for id, token in enumerate(vocab)}
print("Mapping of tokens to ids:")
tokens_to_ids

Mapping of tokens to ids:


{'But': 0,
 'I': 1,
 'So': 2,
 'The': 3,
 'a': 4,
 'cat': 5,
 'closed': 6,
 'different': 7,
 'hungry': 8,
 'is': 9,
 'it': 10,
 'store': 11,
 'the': 12,
 'to': 13,
 'was': 14,
 'went': 15}

In [6]:
ids_to_tokens = {id: token for id, token in enumerate(vocab)}
print("Mapping of ids to tokens:")
ids_to_tokens

Mapping of ids to tokens:


{0: 'But',
 1: 'I',
 2: 'So',
 3: 'The',
 4: 'a',
 5: 'cat',
 6: 'closed',
 7: 'different',
 8: 'hungry',
 9: 'is',
 10: 'it',
 11: 'store',
 12: 'the',
 13: 'to',
 14: 'was',
 15: 'went'}

In [7]:
def encode(text):
    return [tokens_to_ids[token] for token in text.strip().split(' ')]

def decode(ids):
    return ' '.join([ids_to_tokens[id] for id in ids])

In [8]:
encode(raw_text)

[3, 5, 9, 8, 1, 15, 13, 12, 11, 0, 10, 14, 6, 2, 1, 15, 13, 4, 7, 11]

In [9]:
encode("The cat is hungry")

[3, 5, 9, 8]

In [10]:
decode([3,5,9,8])

'The cat is hungry'

# Section 2 : Creating  dataset and dataloader 

### Dataset & DataLoader Config

In [11]:
# X aka input 
max_len = 4  # length of the green bracket
stride = 3  # jump of the green bracket

# y aka output 
# The red bracket length and jump are the same as the green bracket,
# but it is shifted by one token to the right.

Note : max_len and stride are hyper-parameters and can be tuned.

![](section_2.png)

### Dataset 

In [12]:
class Data(Dataset):
    def __init__(self,raw_text,max_len=max_len,stride=stride):
        self.token_ids = encode(raw_text)
        self.X = []
        self.y = []
        for i in range(0,len(self.token_ids)-max_len,stride):
            input = self.token_ids[i:i+max_len]
            output = self.token_ids[i+1:i+max_len+1]
            self.X.append(torch.tensor(input))
            self.y.append(torch.tensor(output))
            
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self,idx):
        return self.X[idx],self.y[idx]
    

In [13]:
train_ds = Data(train_raw_text)
train_ds[0]

(tensor([3, 5, 9, 8]), tensor([5, 9, 8, 1]))

In [14]:
val_ds = Data(val_raw_text)
val_ds[0]

(tensor([ 2,  1, 15, 13]), tensor([ 1, 15, 13,  4]))

### DataLoader

In [15]:
train_dl = DataLoader(train_ds,batch_size=1,shuffle=False,drop_last=False,num_workers=0)
val_dl   = DataLoader(val_ds,batch_size=1,shuffle=False,drop_last=True,num_workers=0)

In [16]:
# for i,(x,y) in enumerate(train_dl):
#   print(f'Batch Number: {i+1}')
#   print(f'x :{x}')
#   print(f'y :{y}')
#   print('--'*20)

In [17]:
# for i,(x,y) in enumerate(val_dl):
#   print(f'Batch Number: {i+1}')
#   print(f'x :{x}')
#   print(f'y :{y}')
#   print('--'*20)

# Section 3 : Token Embedding and Positional Encoding 

Taking single batch, and we know that the batch size is 1. <br>
so we are taking a single example from the dataset. 

In [18]:
for x,y in train_dl:
  print(x)
  print(y)
  break

tensor([[3, 5, 9, 8]])
tensor([[5, 9, 8, 1]])


In [19]:
B = 1  # batch size 
d_in = 4  # embedding dimension  [input dimension]

In [20]:
torch.manual_seed(1)

# Embedding 
token_emb = nn.Embedding(vocab_size,d_in)
token_embedding = token_emb(x)
print('Token Embeddings:')
print(token_embedding)


Token Embeddings:
tensor([[[-0.2223,  1.6871,  0.2284,  0.4676],
         [ 0.8657,  0.2444, -0.6629,  0.8073],
         [ 0.1991,  0.0457,  0.1530, -0.4757],
         [ 1.8793, -0.0721,  0.1578, -0.7735]]], grad_fn=<EmbeddingBackward0>)


In [21]:
context_window = 4 # [max length of the input sequence the model can handle]
num_tokens = 4 # this can not be greater than context window


torch.manual_seed(1)

# Positional embedding
pos_emb = nn.Embedding(context_window,d_in)
positional_embedding = pos_emb(torch.arange(num_tokens))
print('Positional Embeddings:')
print('--'*20)
print(positional_embedding)
print('--'*20)
print(positional_embedding.shape)

Positional Embeddings:
----------------------------------------
tensor([[-1.5256, -0.7502, -0.6540, -1.6095],
        [-0.1002, -0.6092, -0.9798, -1.6091],
        [-0.7121,  0.3037, -0.7773, -0.2515],
        [-0.2223,  1.6871,  0.2284,  0.4676]], grad_fn=<EmbeddingBackward0>)
----------------------------------------
torch.Size([4, 4])


In [22]:
# Token embedding + token embedding 
tok_pos_emb = token_embedding + positional_embedding
print('Token Embedding + Positional Embedding')
print('--'*20)
print(tok_pos_emb)
print('--'*20)
print(tok_pos_emb.shape)

Token Embedding + Positional Embedding
----------------------------------------
tensor([[[-1.7479,  0.9369, -0.4256, -1.1418],
         [ 0.7655, -0.3648, -1.6427, -0.8018],
         [-0.5131,  0.3494, -0.6244, -0.7271],
         [ 1.6571,  1.6150,  0.3862, -0.3058]]], grad_fn=<AddBackward0>)
----------------------------------------
torch.Size([1, 4, 4])


# Section 4 : Pre Transformer Block Dropout 

In [23]:
dropout = 0.25 


torch.manual_seed(2)

pre_trans_dp = nn.Dropout(dropout)
pre_transformer_dp_result = pre_trans_dp(tok_pos_emb)
pre_transformer_dp_result

tensor([[[-0.0000,  1.2492, -0.5674, -1.5225],
         [ 0.0000, -0.4864, -2.1902, -1.0691],
         [-0.6841,  0.4659, -0.8325, -0.9695],
         [ 2.2094,  0.0000,  0.5149, -0.4078]]], grad_fn=<MulBackward0>)

# Section5 : Transformer Block 

### Layer Normalization 

In [24]:
layernorm1 = nn.LayerNorm(d_in)
print(layernorm1.weight)
print(layernorm1.bias)
layernorm1 = layernorm1(pre_transformer_dp_result)
print(layernorm1)
print('--'*20)
print(layernorm1.shape)


Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
tensor([[[ 0.2096,  1.4551, -0.3562, -1.3084],
         [ 1.1463,  0.5509, -1.5349, -0.1624],
         [-0.3144,  1.7046, -0.5748, -0.8154],
         [ 1.6361, -0.5812, -0.0645, -0.9905]]],
       grad_fn=<NativeLayerNormBackward0>)
----------------------------------------
torch.Size([1, 4, 4])


### Multi-Head Attention 

weights initialization 

In [25]:
d_out = 4

W_q = nn.Parameter(torch.tensor([
    [-0.5, 0.2, 0.7, -0.9],
    [0.1, -0.3, 0.8, 0.4],
    [-0.7, 0.6, -0.2, 0.9],
    [0.3, -0.8, 0.5, -0.1]
]))


W_k = nn.Parameter(torch.tensor([
    [0.3, -0.5, 0.2, 0.7],
    [-0.4, 0.1, -0.6, -0.2],
    [0.8, -0.3, 0.5, -0.7],
    [-0.1, 0.6, -0.9, 0.4]
]))

W_v = nn.Parameter(torch.tensor([
    [0.2, -0.8, 0.3, 0.5],
    [-0.7, 0.4, -0.1, -0.6],
    [0.9, -0.2, 0.7, -0.3],
    [-0.5, 0.1, -0.4, 0.8]
]))



print('W_q')
print(W_q.data.shape)
print('---'*20)

print('W_k')
print(W_k.data.shape)
print('---'*20)

print('W_v')
print(W_v.data.shape)
print('---'*20)

W_q
torch.Size([4, 4])
------------------------------------------------------------
W_k
torch.Size([4, 4])
------------------------------------------------------------
W_v
torch.Size([4, 4])
------------------------------------------------------------


Q,K,V

In [26]:
Q = layernorm1 @ W_q
K = layernorm1 @ W_k
V = layernorm1 @ W_v

print('Q')
print(Q.data)
print(Q.data.shape)
print('---'*20)

print('K')
print(K.data)
print(K.data.shape)
print('---'*20)

print('V')
print(V.data)
print(V.data.shape)
print('---'*20)


Q
tensor([[[-0.1025,  0.4384,  0.7278,  0.2037],
         [ 0.5076, -0.7271,  1.4690, -2.1765],
         [ 0.4854, -0.2668,  0.8509,  0.5290],
         [-1.1282,  1.2553,  0.1980, -1.6640]]])
torch.Size([1, 4, 4])
------------------------------------------------------------
K
tensor([[[-0.6733, -0.6375,  0.1684, -0.4184],
         [-1.0882, -0.1550, -0.7226,  1.7017],
         [-1.1545,  0.0108, -0.6392, -0.4848],
         [ 0.7708, -1.4511,  1.5352,  0.9105]]])
torch.Size([1, 4, 4])
------------------------------------------------------------
V
tensor([[[-0.6430,  0.3548,  0.1914, -1.7081],
         [-1.4566, -0.4060, -0.7207,  0.5732],
         [-1.3657,  0.9668, -0.3410, -1.6598],
         [ 1.1713, -1.6276,  0.9000,  0.3938]]])
torch.Size([1, 4, 4])
------------------------------------------------------------


### splitting Q,K,V into multiple heads 

In [27]:
num_heads = 2               # this is a hyper-parameter and can be tuned. 
head_dim = d_out//num_heads
print(head_dim)

2


In [28]:
Q_multi_head  = Q.view(B,num_tokens,num_heads,head_dim).transpose(1,2)
K_multi_head  = K.view(B,num_tokens,num_heads,head_dim).transpose(1,2)
V_multi_head  = V.view(B,num_tokens,num_heads,head_dim).transpose(1,2)


print('Q_multi_head')
print(Q_multi_head.data)
print(Q_multi_head.data.shape)

print('---'*20)

print('K_multi_head')
print(K_multi_head.data)
print(K_multi_head.data.shape)


print('---'*20)

print('V_multi_head')
print(V_multi_head.data)
print(V_multi_head.data.shape)

Q_multi_head
tensor([[[[-0.1025,  0.4384],
          [ 0.5076, -0.7271],
          [ 0.4854, -0.2668],
          [-1.1282,  1.2553]],

         [[ 0.7278,  0.2037],
          [ 1.4690, -2.1765],
          [ 0.8509,  0.5290],
          [ 0.1980, -1.6640]]]])
torch.Size([1, 2, 4, 2])
------------------------------------------------------------
K_multi_head
tensor([[[[-0.6733, -0.6375],
          [-1.0882, -0.1550],
          [-1.1545,  0.0108],
          [ 0.7708, -1.4511]],

         [[ 0.1684, -0.4184],
          [-0.7226,  1.7017],
          [-0.6392, -0.4848],
          [ 1.5352,  0.9105]]]])
torch.Size([1, 2, 4, 2])
------------------------------------------------------------
V_multi_head
tensor([[[[-0.6430,  0.3548],
          [-1.4566, -0.4060],
          [-1.3657,  0.9668],
          [ 1.1713, -1.6276]],

         [[ 0.1914, -1.7081],
          [-0.7207,  0.5732],
          [-0.3410, -1.6598],
          [ 0.9000,  0.3938]]]])
torch.Size([1, 2, 4, 2])


### Attention Score 

In [29]:
K_multi_head_transpose = K_multi_head.transpose(2,3)

print('K_multi_head_transpose')
print(K_multi_head_transpose.data)
print(K_multi_head_transpose.data.shape)

K_multi_head_transpose
tensor([[[[-0.6733, -1.0882, -1.1545,  0.7708],
          [-0.6375, -0.1550,  0.0108, -1.4511]],

         [[ 0.1684, -0.7226, -0.6392,  1.5352],
          [-0.4184,  1.7017, -0.4848,  0.9105]]]])
torch.Size([1, 2, 2, 4])


In [30]:
attn_score = Q_multi_head @ K_multi_head_transpose
print(attn_score)
print(attn_score.shape)

tensor([[[[-0.2105,  0.0435,  0.1231, -0.7152],
          [ 0.1217, -0.4397, -0.5940,  1.4464],
          [-0.1567, -0.4868, -0.5633,  0.7614],
          [-0.0406,  1.0331,  1.3161, -2.6912]],

         [[ 0.0373, -0.1792, -0.5639,  1.3027],
          [ 1.1579, -4.7654,  0.1161,  0.2734],
          [-0.0780,  0.2853, -0.8003,  1.7879],
          [ 0.7295, -2.9747,  0.6801, -1.2111]]]],
       grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 2, 4, 4])


### mask

In [31]:
mask = torch.triu(torch.ones(num_tokens,num_tokens),diagonal=1)
print(mask)
print(mask.bool()) 

tensor([[0., 1., 1., 1.],
        [0., 0., 1., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 0.]])
tensor([[False,  True,  True,  True],
        [False, False,  True,  True],
        [False, False, False,  True],
        [False, False, False, False]])


In [32]:
attn_score = attn_score.masked_fill(mask.bool()[:num_tokens,:num_tokens],-torch.inf)
print(attn_score)

tensor([[[[-0.2105,    -inf,    -inf,    -inf],
          [ 0.1217, -0.4397,    -inf,    -inf],
          [-0.1567, -0.4868, -0.5633,    -inf],
          [-0.0406,  1.0331,  1.3161, -2.6912]],

         [[ 0.0373,    -inf,    -inf,    -inf],
          [ 1.1579, -4.7654,    -inf,    -inf],
          [-0.0780,  0.2853, -0.8003,    -inf],
          [ 0.7295, -2.9747,  0.6801, -1.2111]]]],
       grad_fn=<MaskedFillBackward0>)


In [33]:
attn_weight = torch.softmax(attn_score/K_multi_head.shape[-1]**0.5,dim=-1)
print(attn_weight)

tensor([[[[1.0000, 0.0000, 0.0000, 0.0000],
          [0.5980, 0.4020, 0.0000, 0.0000],
          [0.3934, 0.3115, 0.2951, 0.0000],
          [0.1695, 0.3621, 0.4424, 0.0260]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.9851, 0.0149, 0.0000, 0.0000],
          [0.3457, 0.4469, 0.2074, 0.0000],
          [0.4363, 0.0318, 0.4213, 0.1106]]]], grad_fn=<SoftmaxBackward0>)


In [34]:
torch.manual_seed(3)

attn_dropout = nn.Dropout(0)

attn_weight = attn_dropout(attn_weight)
print(attn_weight)

tensor([[[[1.0000, 0.0000, 0.0000, 0.0000],
          [0.5980, 0.4020, 0.0000, 0.0000],
          [0.3934, 0.3115, 0.2951, 0.0000],
          [0.1695, 0.3621, 0.4424, 0.0260]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.9851, 0.0149, 0.0000, 0.0000],
          [0.3457, 0.4469, 0.2074, 0.0000],
          [0.4363, 0.0318, 0.4213, 0.1106]]]], grad_fn=<SoftmaxBackward0>)


In [35]:
con_vector = attn_weight @ V_multi_head
print(con_vector)
print(con_vector.shape)

tensor([[[[-0.6430,  0.3548],
          [-0.9701,  0.0489],
          [-1.1097,  0.2984],
          [-1.2102,  0.2985]],

         [[ 0.1914, -1.7081],
          [ 0.1778, -1.6741],
          [-0.3267, -0.6785],
          [ 0.0165, -1.3828]]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 2, 4, 2])


In [36]:
conv_vector = con_vector.transpose(1,2).contiguous().view(B,num_tokens,d_out)
conv_vector

tensor([[[-0.6430,  0.3548,  0.1914, -1.7081],
         [-0.9701,  0.0489,  0.1778, -1.6741],
         [-1.1097,  0.2984, -0.3267, -0.6785],
         [-1.2102,  0.2985,  0.0165, -1.3828]]], grad_fn=<ViewBackward0>)

In [37]:
# projection 

out_proj = nn.Parameter(torch.tensor([
    [0.5, -0.3, 0.4, 0.2],
    [-0.6, 0.4, -0.2, -0.5], 
    [0.3, -0.7, 0.6, -0.4],
    [-0.2, 0.5, -0.4, 0.3]
]))


In [38]:
out_proj_result = conv_vector @ out_proj.data
print(out_proj_result)
print(out_proj_result.shape)


tensor([[[-0.1353, -0.6533,  0.4700, -0.8950],
         [-0.1263, -0.6509,  0.3785, -0.7918],
         [-0.6962,  0.3417, -0.4281, -0.4441],
         [-0.5027, -0.2205,  0.0192, -0.8127]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 4, 4])


# Dropout1 

In [39]:
torch.manual_seed(3)
dropout1 = nn.Dropout(dropout)
after_dropout_1 = dropout1(out_proj_result)
print(after_dropout_1)
print(after_dropout_1.shape)

tensor([[[-0.1804, -0.8710,  0.0000, -1.1933],
         [-0.0000, -0.8678,  0.5046, -1.0557],
         [-0.9283,  0.0000, -0.5709, -0.5921],
         [-0.6702, -0.2940,  0.0000, -1.0836]]], grad_fn=<MulBackward0>)
torch.Size([1, 4, 4])


# Skip Connection 

In [40]:
skip_connection = after_dropout_1 + pre_transformer_dp_result
print(skip_connection)
print(skip_connection.shape)

tensor([[[-0.1804,  0.3782, -0.5674, -2.7158],
         [ 0.0000, -1.3542, -1.6856, -2.1248],
         [-1.6124,  0.4659, -1.4033, -1.5616],
         [ 1.5392, -0.2940,  0.5149, -1.4913]]], grad_fn=<AddBackward0>)
torch.Size([1, 4, 4])


# LayerNorm 2

In [41]:
layernorm2 = nn.LayerNorm(after_dropout_1.shape[-1])
print(layernorm2.weight)
print(layernorm2.bias)
layernorm2_result = layernorm2(after_dropout_1)

print(layernorm2_result)
print(layernorm2_result.shape)

Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
tensor([[[ 0.7791, -0.6339,  1.1482, -1.2933],
         [ 0.5576, -0.8065,  1.3507, -1.1018],
         [-1.2158,  1.5676, -0.1441, -0.2077],
         [-0.3892,  0.5360,  1.2589, -1.4057]]],
       grad_fn=<NativeLayerNormBackward0>)
torch.Size([1, 4, 4])


# FC

In [42]:
fc1_weight = nn.Parameter(torch.tensor([
    [0.5, -0.3, 0.4, 0.2, 0.1, -0.2, 0.3, -0.1],
    [-0.6, 0.4, -0.2, -0.5, 0.2, 0.3, -0.4, 0.5],
    [0.3, -0.7, 0.6, -0.4, -0.3, 0.4, 0.2, -0.6],
    [-0.2, 0.5, -0.4, 0.3, 0.4, -0.5, 0.1, 0.2]
]))

fc1_bias  = nn.Parameter(torch.tensor([0.1, -0.2, 0.3, -0.1, 0.2, -0.3, 0.4, -0.2]))


print(fc1_weight.data.shape)
print('--'*20)

print(fc1_bias.data.shape)
print('--'*20)

fc1_result = (layernorm2_result @ fc1_weight.data) + fc1_bias.data
print(fc1_result.data)
print(fc1_result.shape)

torch.Size([4, 8])
----------------------------------------
torch.Size([8])
----------------------------------------
tensor([[[ 1.4730, -2.1377,  1.9447, -0.4745, -0.7107,  0.4600,  0.9876,
          -1.5424],
         [ 1.4883, -2.1863,  1.9355, -0.4561, -0.7515,  0.4378,  1.0498,
          -1.6898],
         [-1.4502,  0.7888, -0.5032, -1.1317,  0.3521,  0.4597, -0.6414,
           0.7503],
         [ 0.2427, -1.4530,  1.3548, -1.3711, -0.6717,  1.1451,  0.1801,
          -0.9296]]])
torch.Size([1, 4, 8])


In [43]:
gelu = nn.GELU()
gelu_result = gelu(fc1_result)
gelu_result

tensor([[[ 1.3693, -0.0348,  1.8943, -0.1507, -0.1696,  0.3115,  0.8279,
          -0.0948],
         [ 1.3866, -0.0315,  1.8843, -0.1478, -0.1700,  0.2930,  0.8956,
          -0.0769],
         [-0.1066,  0.6191, -0.1547, -0.1459,  0.2245,  0.3113, -0.1672,
           0.5804],
         [ 0.1446, -0.1062,  1.2359, -0.1168, -0.1685,  1.0007,  0.1029,
          -0.1639]]], grad_fn=<GeluBackward0>)

In [44]:
fc2_weight = nn.Parameter(torch.tensor([
    [0.5, -0.6, 0.3, -0.2],
    [-0.3, 0.4, -0.7, 0.5], 
    [0.4, -0.2, 0.6, -0.4],
    [0.2, -0.5, -0.4, 0.3],
    [0.1, 0.2, -0.3, 0.4],
    [-0.2, 0.3, 0.4, -0.5],
    [0.3, -0.4, 0.2, 0.1],
    [-0.1, 0.5, -0.6, 0.2]
]))


print(fc2_weight.data.shape)
print('--'*20)

fc2_bias  = nn.Parameter(torch.tensor([0.1, -0.2, 0.3, -0.1]))
print(fc2_bias.data.shape)
print('--'*20)


torch.Size([8, 4])
----------------------------------------
torch.Size([4])
----------------------------------------


In [45]:
fc2_result = (gelu_result @ fc2_weight.data) + fc2_bias.data
print(fc2_result)
print(fc2_result.shape)

tensor([[[ 1.7013, -1.6581,  2.3299, -1.3539],
         [ 1.7277, -1.6903,  2.3212, -1.3314],
         [-0.3781,  0.7108, -0.5244,  0.2825],
         [ 0.5055, -0.3746,  1.7758, -1.3017]]], grad_fn=<AddBackward0>)
torch.Size([1, 4, 4])


# Dropout 2 

In [46]:
torch.manual_seed(2)

dropout2 = nn.Dropout(dropout)
dropout2_result = dropout2(fc2_result)
dropout2_result

tensor([[[ 0.0000, -2.2108,  3.1066, -1.8052],
         [ 0.0000, -2.2537,  3.0949, -1.7752],
         [-0.5041,  0.9477, -0.6991,  0.3767],
         [ 0.6739, -0.0000,  2.3677, -1.7356]]], grad_fn=<MulBackward0>)

End of Transformer block 

##  skip connection 2

In [47]:
skip_connection_2 = skip_connection + dropout2_result
print(skip_connection_2)


tensor([[[-0.1804, -1.8326,  2.5392, -4.5210],
         [ 0.0000, -3.6079,  1.4093, -3.9000],
         [-2.1165,  1.4136, -2.1025, -1.1849],
         [ 2.2132, -0.2940,  2.8826, -3.2269]]], grad_fn=<AddBackward0>)


# Post Transformer Block LayerNorm

In [48]:
post_transformer_LN = nn.LayerNorm(d_out)
print(post_transformer_LN.weight)
print(post_transformer_LN.bias)
post_transformer_LN_result = post_transformer_LN(skip_connection_2)
print(post_transformer_LN_result)

Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
tensor([[[ 0.3192, -0.3253,  1.3800, -1.3740],
         [ 0.6668, -0.9111,  1.2831, -1.0388],
         [-0.7758,  1.6717, -0.7660, -0.1299],
         [ 0.7573, -0.2863,  1.0360, -1.5071]]],
       grad_fn=<NativeLayerNormBackward0>)


# Head_out 

In [49]:
head_out = nn.Parameter(torch.tensor([
    [0.5, -0.3, 0.4, 0.2, 0.1, -0.2, 0.3, -0.1, 0.4, -0.5, 0.2, 0.3, -0.2, 0.1, -0.3, 0.4],
    [-0.6, 0.4, -0.2, -0.5, 0.2, 0.3, -0.4, 0.5, -0.3, 0.2, -0.5, 0.4, 0.3, -0.2, 0.5, -0.4],
    [0.3, -0.7, 0.6, -0.4, -0.3, 0.4, 0.2, -0.6, 0.5, -0.2, 0.4, -0.3, 0.2, -0.5, 0.3, -0.1],
    [-0.2, 0.5, -0.4, 0.3, 0.4, -0.5, 0.1, 0.2, -0.4, 0.3, -0.2, 0.5, -0.3, 0.4, -0.5, 0.2]
]))

print(head_out.shape)


torch.Size([4, 16])


In [50]:
head_out_logits = post_transformer_LN_result @ head_out
head_out_logits.shape

torch.Size([1, 4, 16])

In [51]:
head_out_logits 

tensor([[[ 1.0436, -1.8789,  1.5703, -0.7377, -0.9967,  1.0776,  0.3645,
          -1.2974,  1.4649, -0.9129,  1.0533, -1.1353,  0.5268, -1.1426,
           0.8426, -0.1550],
         [ 1.4727, -1.9820,  1.6343, -0.2360, -0.9160,  0.6260,  0.7172,
          -1.4998,  1.5971, -1.0839,  1.3099, -1.0687,  0.1616, -0.8082,
           0.2488,  0.2951],
         [-1.5947,  1.3727, -1.0523, -0.7235,  0.4346,  0.4152, -1.0676,
           1.3471, -1.1429,  0.8365, -1.2714,  0.6008,  0.5424, -0.0808,
           0.9037, -0.9283],
         [ 1.1626, -1.8204,  1.5846, -0.5719, -0.8951,  0.9306,  0.3982,
          -1.1419,  1.5096, -1.0952,  1.0104, -0.9516,  0.4220, -0.9878,
           0.6940,  0.0124]]], grad_fn=<UnsafeViewBackward0>)

# Softmax 

In [52]:
# we won't be using the following head_out_prob to calculate the loss, since we can use cross entropy loss 
# calculating softmax is for demonstration purpose only  

head_out_prob = F.softmax(head_out_logits,dim=-1)
print(head_out_prob)
print(head_out_prob.shape)

tensor([[[0.1075, 0.0058, 0.1821, 0.0181, 0.0140, 0.1112, 0.0545, 0.0103,
          0.1639, 0.0152, 0.1086, 0.0122, 0.0641, 0.0121, 0.0879, 0.0324],
         [0.1529, 0.0048, 0.1797, 0.0277, 0.0140, 0.0655, 0.0718, 0.0078,
          0.1731, 0.0119, 0.1299, 0.0120, 0.0412, 0.0156, 0.0450, 0.0471],
         [0.0090, 0.1756, 0.0155, 0.0216, 0.0687, 0.0674, 0.0153, 0.1712,
          0.0142, 0.1027, 0.0125, 0.0812, 0.0766, 0.0410, 0.1099, 0.0176],
         [0.1209, 0.0061, 0.1843, 0.0213, 0.0154, 0.0958, 0.0563, 0.0121,
          0.1710, 0.0126, 0.1038, 0.0146, 0.0576, 0.0141, 0.0757, 0.0383]]],
       grad_fn=<SoftmaxBackward0>)
torch.Size([1, 4, 16])


In [53]:
head_out_prob.shape

torch.Size([1, 4, 16])

In [54]:
head_out_prob.argmax(dim=1)

tensor([[1, 2, 3, 1, 2, 0, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1]])

In [55]:
head_out_prob.argmax(dim=1).shape

torch.Size([1, 16])

# Loss 

In [56]:
loss_fn = nn.CrossEntropyLoss()
loss_fn(head_out_logits.squeeze(0),y.squeeze(0))

tensor(3.9955, grad_fn=<NllLossBackward0>)

# The End of the forward pass 