In [77]:
import torch 
from torch import nn
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F

# Section 1 

In [78]:
# Imagine this is out entire dataset. 
raw_text = 'The cat is hungry I went to the store But it was closed So I went to a different store'

# In this section, we will split the dataset into train and validation sets. 
# we will tokenize the dataset and create a vocabulary. 

![](section_1.png)

split the dataset into train and validation sets

In [79]:
ratio = 0.66
split_index = int(len(raw_text) * ratio)
train_raw_text = raw_text[:split_index]
val_raw_text = raw_text[split_index:]


print(f"Train raw text: {train_raw_text}")

print("="*100)

print(f"Val raw text: {val_raw_text}")


Train raw text: The cat is hungry I went to the store But it was closed 
Val raw text: So I went to a different store


Corpus vocabulary


We will make an assumption that each word is a unique token.<br>
This is a simplification and not true in the real world.<br>
In practice, we would use a more sophisticated tokenization method.


In [80]:
vocab = list(sorted(set(raw_text.split(' '))))

print("unique words in the corpus:")
print("-"*50)
for i in vocab:
    print(i)

print("="*50)
vocab_size = len(vocab)
print(f'vocab_size: {vocab_size}') # You can think of as number of unique words in the corpus. 

unique words in the corpus:
--------------------------------------------------
But
I
So
The
a
cat
closed
different
hungry
is
it
store
the
to
was
went
vocab_size: 16


Note: (The) is different from (the)

In [81]:
tokens_to_ids = {token: id for id, token in enumerate(vocab)}
print("Mapping of tokens to ids:")
tokens_to_ids

Mapping of tokens to ids:


{'But': 0,
 'I': 1,
 'So': 2,
 'The': 3,
 'a': 4,
 'cat': 5,
 'closed': 6,
 'different': 7,
 'hungry': 8,
 'is': 9,
 'it': 10,
 'store': 11,
 'the': 12,
 'to': 13,
 'was': 14,
 'went': 15}

In [82]:
ids_to_tokens = {id: token for id, token in enumerate(vocab)}
print("Mapping of ids to tokens:")
ids_to_tokens

Mapping of ids to tokens:


{0: 'But',
 1: 'I',
 2: 'So',
 3: 'The',
 4: 'a',
 5: 'cat',
 6: 'closed',
 7: 'different',
 8: 'hungry',
 9: 'is',
 10: 'it',
 11: 'store',
 12: 'the',
 13: 'to',
 14: 'was',
 15: 'went'}

In [83]:
def encode(text):
    return [tokens_to_ids[token] for token in text.strip().split(' ')]

def decode(ids):
    return ' '.join([ids_to_tokens[id] for id in ids])

In [84]:
encode(raw_text)

[3, 5, 9, 8, 1, 15, 13, 12, 11, 0, 10, 14, 6, 2, 1, 15, 13, 4, 7, 11]

In [85]:
encode("The cat is hungry")

[3, 5, 9, 8]

In [86]:
decode([3,5,9,8])

'The cat is hungry'

# Section 2 : Creating  dataset and dataloader 

### Dataset & DataLoader Config

In [87]:
# X aka input 
max_len = 4  # length of the green bracket
stride = 3  # jump of the green bracket

# y aka output 
# The red bracket length and jump are the same as the green bracket,
# but it is shifted by one token to the right.

Note : max_len and stride are hyper-parameters and can be tuned.

![](section_2.png)

### Dataset 

In [88]:
class Data(Dataset):
    def __init__(self,raw_text,max_len=max_len,stride=stride):
        self.token_ids = encode(raw_text)
        self.X = []
        self.y = []
        for i in range(0,len(self.token_ids)-max_len,stride):
            input = self.token_ids[i:i+max_len]
            output = self.token_ids[i+1:i+max_len+1]
            self.X.append(torch.tensor(input))
            self.y.append(torch.tensor(output))
            
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self,idx):
        return self.X[idx],self.y[idx]
    

In [89]:
train_ds = Data(train_raw_text)
train_ds[0]

(tensor([3, 5, 9, 8]), tensor([5, 9, 8, 1]))

In [90]:
val_ds = Data(val_raw_text)
val_ds[0]

(tensor([ 2,  1, 15, 13]), tensor([ 1, 15, 13,  4]))

### DataLoader

In [91]:
train_dl = DataLoader(train_ds,batch_size=1,shuffle=False,drop_last=False,num_workers=0)
val_dl   = DataLoader(val_ds,batch_size=1,shuffle=False,drop_last=True,num_workers=0)

In [92]:
for i,(x,y) in enumerate(train_dl):
  print(f'Batch Number: {i+1}')
  print(f'x :{x}')
  print(f'y :{y}')
  print('--'*20)

Batch Number: 1
x :tensor([[3, 5, 9, 8]])
y :tensor([[5, 9, 8, 1]])
----------------------------------------
Batch Number: 2
x :tensor([[ 8,  1, 15, 13]])
y :tensor([[ 1, 15, 13, 12]])
----------------------------------------
Batch Number: 3
x :tensor([[13, 12, 11,  0]])
y :tensor([[12, 11,  0, 10]])
----------------------------------------


In [93]:
for i,(x,y) in enumerate(val_dl):
  print(f'Batch Number: {i+1}')
  print(f'x :{x}')
  print(f'y :{y}')
  print('--'*20)

Batch Number: 1
x :tensor([[ 2,  1, 15, 13]])
y :tensor([[ 1, 15, 13,  4]])
----------------------------------------


# Section 3 : Token Embedding and Positional Encoding 

Taking single batch, and we know that the batch size is 1. <br>
so we are taking a single example from the dataset. 

In [94]:
for x,y in train_dl:
  print(x)
  print(y)
  break

tensor([[3, 5, 9, 8]])
tensor([[5, 9, 8, 1]])


In [95]:
B = 1  # batch size 
d_in = 4  # embedding dimension  [input dimension]

In [96]:
torch.manual_seed(1)

# Embedding 
token_emb = nn.Embedding(vocab_size,d_in)
token_embedding = token_emb(x)
print('Token Embeddings:')
print(token_embedding)
# print('--'*20)
# print(token_embedding.shape)


Token Embeddings:
tensor([[[-0.2223,  1.6871,  0.2284,  0.4676],
         [ 0.8657,  0.2444, -0.6629,  0.8073],
         [ 0.1991,  0.0457,  0.1530, -0.4757],
         [ 1.8793, -0.0721,  0.1578, -0.7735]]], grad_fn=<EmbeddingBackward0>)


In [97]:
context_window = 4 # [max length of the input sequence the model can handle]
num_tokens = 4 # this can not be greater than context window


torch.manual_seed(1)

# Positional embedding
pos_emb = nn.Embedding(context_window,d_in)
positional_embedding = pos_emb(torch.arange(num_tokens))
print('Positional Embeddings:')
print('--'*20)
print(positional_embedding)
print('--'*20)
print(positional_embedding.shape)

Positional Embeddings:
----------------------------------------
tensor([[-1.5256, -0.7502, -0.6540, -1.6095],
        [-0.1002, -0.6092, -0.9798, -1.6091],
        [-0.7121,  0.3037, -0.7773, -0.2515],
        [-0.2223,  1.6871,  0.2284,  0.4676]], grad_fn=<EmbeddingBackward0>)
----------------------------------------
torch.Size([4, 4])


In [98]:
# Token embedding + token embedding 
tok_pos_emb = token_embedding + positional_embedding
print('Token Embedding + Positional Embedding')
print('--'*20)
print(tok_pos_emb)
print('--'*20)
print(tok_pos_emb.shape)

Token Embedding + Positional Embedding
----------------------------------------
tensor([[[-1.7479,  0.9369, -0.4256, -1.1418],
         [ 0.7655, -0.3648, -1.6427, -0.8018],
         [-0.5131,  0.3494, -0.6244, -0.7271],
         [ 1.6571,  1.6150,  0.3862, -0.3058]]], grad_fn=<AddBackward0>)
----------------------------------------
torch.Size([1, 4, 4])


# Pre Transformer Block Dropout 

In [99]:
dropout = 0.25 # notice that we are not using it, but we are still initializing it(to see how it works)


torch.manual_seed(2)

pre_transformer_dp = nn.Dropout(dropout)
pre_transformer_dp_result = pre_transformer_dp(tok_pos_emb)
pre_transformer_dp_result

tensor([[[-0.0000,  1.2492, -0.5674, -1.5225],
         [ 0.0000, -0.4864, -2.1902, -1.0691],
         [-0.6841,  0.4659, -0.8325, -0.9695],
         [ 2.2094,  0.0000,  0.5149, -0.4078]]], grad_fn=<MulBackward0>)

# Transformer Block 

### Layer Normalization 

In [100]:
layernorm1 = nn.LayerNorm(d_in)
print(layernorm1.weight)
print(layernorm1.bias)
layernorm1 = layernorm1(pre_transformer_dp_result)
print(layernorm1)
print('--'*20)
print(layernorm1.shape)


Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
tensor([[[ 0.2096,  1.4551, -0.3562, -1.3084],
         [ 1.1463,  0.5509, -1.5349, -0.1624],
         [-0.3144,  1.7046, -0.5748, -0.8154],
         [ 1.6361, -0.5812, -0.0645, -0.9905]]],
       grad_fn=<NativeLayerNormBackward0>)
----------------------------------------
torch.Size([1, 4, 4])


### Multi-Head Attention 

weights initialization 

In [129]:
d_out = 4

torch.manual_seed(1)

W_q = nn.Linear(d_in,d_out,bias=False)
W_k = nn.Linear(d_in,d_out,bias=False)
W_v = nn.Linear(d_in,d_out,bias=False)

# REMINDER : THE WEIGHT MATRICES ARE TRANSPOSED 
print('W_q')
print(W_q.weight.T)
print(W_q.weight.T.shape)
print('---'*20)

print('W_k')
print(W_k.weight.T)
print(W_k.weight.T.shape)
print('---'*20)

print('W_v')
print(W_v.weight.T)
print(W_v.weight.T.shape)

W_q
tensor([[ 0.2576, -0.4707,  0.0695,  0.1826],
        [-0.2207,  0.2999, -0.0612, -0.1949],
        [-0.0969, -0.1029,  0.1387, -0.0365],
        [ 0.2347,  0.2544,  0.0247, -0.0450]], grad_fn=<PermuteBackward0>)
torch.Size([4, 4])
------------------------------------------------------------
W_k
tensor([[ 0.0725, -0.1862, -0.1602, -0.4888],
        [-0.0020, -0.3020,  0.0239,  0.3100],
        [ 0.4371, -0.0838,  0.2981,  0.1397],
        [ 0.1556, -0.2157,  0.2718,  0.4743]], grad_fn=<PermuteBackward0>)
torch.Size([4, 4])
------------------------------------------------------------
W_v
tensor([[ 0.3300,  0.4391,  0.4906, -0.2634],
        [-0.4556, -0.0833, -0.2115,  0.2570],
        [-0.4754,  0.2140,  0.3750, -0.2654],
        [-0.2412, -0.2324,  0.0059,  0.1471]], grad_fn=<PermuteBackward0>)
torch.Size([4, 4])


Q,K,V

In [131]:
torch.manual_seed(1)

Q = W_q(layernorm1)
K = W_k(layernorm1)
V = W_v(layernorm1)


print(f'Q\n{Q}')
print(Q.shape)

print('---'*20)

print(f'K\n{K}')
print(K.shape)

print('---'*20)

print(f'V\n{V}')
print(V.shape)

Q
tensor([[[-0.5397,  0.0415, -0.1562, -0.1734],
         [ 0.2844, -0.2578, -0.1709,  0.1653],
         [-0.5928,  0.5108, -0.2260, -0.3319],
         [ 0.3236, -1.1898,  0.1159,  0.4590]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 4, 4])
------------------------------------------------------------
K
tensor([[[-0.3470, -0.1664, -0.4605, -0.3217],
         [-0.6142, -0.2162, -0.6721, -0.6809],
         [-0.4043, -0.2322, -0.3018,  0.2150],
         [-0.0625,  0.0899, -0.5645, -1.4586]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 4, 4])
------------------------------------------------------------
V
tensor([[[-0.1088,  0.1986, -0.3463,  0.2209],
         [ 0.8962,  0.1667, -0.1306,  0.2231],
         [-0.4104, -0.2135, -0.7352,  0.5536],
         [ 1.0743,  0.9831,  0.8956, -0.7089]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 4, 4])


### splitting Q,K,V into multiple heads 

In [132]:
num_heads = 2 
head_dim = d_out//num_heads
print(head_dim)

2


In [133]:
Q_split  = Q.view(B,num_tokens,num_heads,head_dim).transpose(1,2)
K_split  = K.view(B,num_tokens,num_heads,head_dim).transpose(1,2)
V_split  = V.view(B,num_tokens,num_heads,head_dim).transpose(1,2)


print(Q_split)
print(Q_split.shape)

print('---'*20)

print(K_split)
print(K_split.shape)


print('---'*20)

print(V_split)
print(V_split.shape)

tensor([[[[-0.5397,  0.0415],
          [ 0.2844, -0.2578],
          [-0.5928,  0.5108],
          [ 0.3236, -1.1898]],

         [[-0.1562, -0.1734],
          [-0.1709,  0.1653],
          [-0.2260, -0.3319],
          [ 0.1159,  0.4590]]]], grad_fn=<TransposeBackward0>)
torch.Size([1, 2, 4, 2])
------------------------------------------------------------
tensor([[[[-0.3470, -0.1664],
          [-0.6142, -0.2162],
          [-0.4043, -0.2322],
          [-0.0625,  0.0899]],

         [[-0.4605, -0.3217],
          [-0.6721, -0.6809],
          [-0.3018,  0.2150],
          [-0.5645, -1.4586]]]], grad_fn=<TransposeBackward0>)
torch.Size([1, 2, 4, 2])
------------------------------------------------------------
tensor([[[[-0.1088,  0.1986],
          [ 0.8962,  0.1667],
          [-0.4104, -0.2135],
          [ 1.0743,  0.9831]],

         [[-0.3463,  0.2209],
          [-0.1306,  0.2231],
          [-0.7352,  0.5536],
          [ 0.8956, -0.7089]]]], grad_fn=<TransposeBackward0>)
tor

### Attention Score 

In [134]:
attn_score = Q_split @ K_split.transpose(2,3)
attn_score

tensor([[[[ 0.1804,  0.3225,  0.2086,  0.0375],
          [-0.0558, -0.1190, -0.0551, -0.0410],
          [ 0.1207,  0.2537,  0.1211,  0.0830],
          [ 0.0857,  0.0584,  0.1454, -0.1272]],

         [[ 0.1277,  0.2230,  0.0099,  0.3410],
          [ 0.0255,  0.0023,  0.0871, -0.1446],
          [ 0.2109,  0.3779, -0.0031,  0.6117],
          [-0.2011, -0.3905,  0.0637, -0.7349]]]],
       grad_fn=<UnsafeViewBackward0>)

### mask

In [135]:
mask = torch.triu(torch.ones(num_tokens,num_tokens),diagonal=1)
print(mask)
print(mask.bool()) 

tensor([[0., 1., 1., 1.],
        [0., 0., 1., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 0.]])
tensor([[False,  True,  True,  True],
        [False, False,  True,  True],
        [False, False, False,  True],
        [False, False, False, False]])


In [136]:
attn_score = attn_score.masked_fill(mask.bool()[:num_tokens,:num_tokens],-torch.inf)
print(attn_score)

tensor([[[[ 0.1804,    -inf,    -inf,    -inf],
          [-0.0558, -0.1190,    -inf,    -inf],
          [ 0.1207,  0.2537,  0.1211,    -inf],
          [ 0.0857,  0.0584,  0.1454, -0.1272]],

         [[ 0.1277,    -inf,    -inf,    -inf],
          [ 0.0255,  0.0023,    -inf,    -inf],
          [ 0.2109,  0.3779, -0.0031,    -inf],
          [-0.2011, -0.3905,  0.0637, -0.7349]]]],
       grad_fn=<MaskedFillBackward0>)


In [138]:
attn_weight = torch.softmax(attn_score/K_split.shape[-1]**0.5,dim=-1)
print(attn_weight)

tensor([[[[1.0000, 0.0000, 0.0000, 0.0000],
          [0.5112, 0.4888, 0.0000, 0.0000],
          [0.3227, 0.3545, 0.3228, 0.0000],
          [0.2574, 0.2525, 0.2686, 0.2215]],

         [[1.0000, 0.0000, 0.0000, 0.0000],
          [0.5041, 0.4959, 0.0000, 0.0000],
          [0.3350, 0.3770, 0.2880, 0.0000],
          [0.2655, 0.2322, 0.3202, 0.1820]]]], grad_fn=<SoftmaxBackward0>)


# Stopped here 

In [109]:
torch.manual_seed(3)

attn_dropout = nn.Dropout(dropout)

attn_weight = attn_dropout(attn_weight)
print(attn_weight)

tensor([[[[1.3333, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.6518, 0.0000, 0.0000],
          [0.4303, 0.0000, 0.4304, 0.0000],
          [0.3433, 0.3367, 0.0000, 0.2953]],

         [[1.3333, 0.0000, 0.0000, 0.0000],
          [0.6721, 0.6612, 0.0000, 0.0000],
          [0.4467, 0.5027, 0.3840, 0.0000],
          [0.3540, 0.3097, 0.0000, 0.2427]]]], grad_fn=<MulBackward0>)


In [110]:
con_vector = attn_weight @ V_split
print(con_vector)
print(con_vector.shape)

tensor([[[[-0.1451,  0.2648],
          [ 0.5841,  0.1087],
          [-0.2235, -0.0064],
          [ 0.5816,  0.4146]],

         [[-0.4617,  0.2945],
          [-0.3191,  0.2960],
          [-0.5026,  0.4234],
          [ 0.0543, -0.0248]]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 2, 4, 2])


In [111]:
conv_vector = con_vector.transpose(1,2).contiguous().view(B,num_tokens,d_out)
conv_vector

tensor([[[-0.1451,  0.2648, -0.4617,  0.2945],
         [ 0.5841,  0.1087, -0.3191,  0.2960],
         [-0.2235, -0.0064, -0.5026,  0.4234],
         [ 0.5816,  0.4146,  0.0543, -0.0248]]], grad_fn=<ViewBackward0>)

In [112]:
# projection 

torch.manual_seed(1)

out_proj = nn.Linear(d_out,d_out,bias=True)
print(out_proj.weight.T)
print(out_proj.weight.T.shape)
print(out_proj.bias)
print('--'*20)

out_proj_result = out_proj(conv_vector)
print(out_proj_result)
print(out_proj_result.shape)


tensor([[ 0.2576, -0.4707,  0.0695,  0.1826],
        [-0.2207,  0.2999, -0.0612, -0.1949],
        [-0.0969, -0.1029,  0.1387, -0.0365],
        [ 0.2347,  0.2544,  0.0247, -0.0450]], grad_fn=<PermuteBackward0>)
torch.Size([4, 4])
Parameter containing:
tensor([ 0.0725, -0.0020,  0.4371,  0.1556], requires_grad=True)
----------------------------------------
tensor([[[ 0.0905,  0.2681,  0.3540,  0.0811],
         [ 0.2994, -0.1363,  0.4341,  0.2394],
         [ 0.1644,  0.2607,  0.3627,  0.1153],
         [ 0.1197, -0.1633,  0.4591,  0.1802]]], grad_fn=<ViewBackward0>)
torch.Size([1, 4, 4])


# Dropout1 

In [113]:
dropout1 = nn.Dropout(dropout)
after_dropout_1 = dropout1(out_proj_result)
print(after_dropout_1)
print(after_dropout_1.shape)

tensor([[[ 0.1207,  0.3575,  0.4720,  0.1081],
         [ 0.3992, -0.1817,  0.5788,  0.3192],
         [ 0.2192,  0.3475,  0.0000,  0.0000],
         [ 0.0000, -0.0000,  0.0000,  0.2402]]], grad_fn=<MulBackward0>)
torch.Size([1, 4, 4])


# Skip Connection 

In [114]:
skip_connection = after_dropout_1 + pre_transformer_dp_result
print(skip_connection)

tensor([[[ 0.1207,  1.6067, -0.0954, -1.4144],
         [ 0.3992, -0.6680, -1.6115, -0.7498],
         [-0.4649,  0.8134, -0.8325, -0.9695],
         [ 2.2094,  0.0000,  0.5149, -0.1676]]], grad_fn=<AddBackward0>)


# LayerNorm 2

In [115]:
layernorm2 = nn.LayerNorm(after_dropout_1.shape[-1])
print(layernorm2.weight)
print(layernorm2.bias)
layernorm2_result = layernorm2(after_dropout_1)

print(layernorm2_result)
print(layernorm2_result.shape)

Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
tensor([[[-0.9245,  0.5971,  1.3328, -1.0054],
         [ 0.4266, -1.6329,  1.0633,  0.1430],
         [ 0.5209,  1.3834, -0.9521, -0.9521],
         [-0.5771, -0.5771, -0.5771,  1.7313]]],
       grad_fn=<NativeLayerNormBackward0>)
torch.Size([1, 4, 4])


# FC

In [116]:
torch.manual_seed(1)

fc1 = nn.Linear(d_out,d_out*2,bias=True)
print(fc1.weight.T)
print(fc1.weight.T.shape)
print(fc1.bias)
print('--'*20)

tensor([[ 0.2576, -0.4707,  0.0695,  0.1826,  0.0725, -0.1862, -0.1602, -0.4888],
        [-0.2207,  0.2999, -0.0612, -0.1949, -0.0020, -0.3020,  0.0239,  0.3100],
        [-0.0969, -0.1029,  0.1387, -0.0365,  0.4371, -0.0838,  0.2981,  0.1397],
        [ 0.2347,  0.2544,  0.0247, -0.0450,  0.1556, -0.2157,  0.2718,  0.4743]],
       grad_fn=<PermuteBackward0>)
torch.Size([4, 8])
Parameter containing:
tensor([ 0.3300, -0.4556, -0.4754, -0.2412,  0.4391, -0.0833,  0.2140, -0.2324],
       requires_grad=True)
----------------------------------------


In [117]:
torch.manual_seed(1)

fc1_result = fc1(layernorm2_result)
print(fc1_result)
print(fc1_result.shape)

tensor([[[-0.4051, -0.2342, -0.4162, -0.5297,  0.7970,  0.0137,  0.5004,
           0.1139],
         [ 0.7308, -1.2190, -0.1948,  0.1097,  0.9602,  0.2104,  0.4623,
          -0.7306],
         [ 0.0278, -0.4302, -0.6794, -0.3380, -0.0903, -0.3129, -0.3789,
          -0.6428],
         [ 0.7709,  0.1428, -0.5175, -0.2910,  0.4155, -0.1266,  0.5911,
           0.6113]]], grad_fn=<ViewBackward0>)
torch.Size([1, 4, 8])


In [118]:
gelu = nn.GELU()
gelu_result = gelu(fc1_result)
gelu_result

tensor([[[-0.1388, -0.0954, -0.1409, -0.1579,  0.6274,  0.0069,  0.3461,
           0.0621],
         [ 0.5609, -0.1358, -0.0824,  0.0596,  0.7985,  0.1228,  0.3135,
          -0.1699],
         [ 0.0142, -0.1435, -0.1688, -0.1243, -0.0419, -0.1180, -0.1335,
          -0.1672],
         [ 0.6011,  0.0795, -0.1565, -0.1122,  0.2747, -0.0569,  0.4272,
           0.4460]]], grad_fn=<GeluBackward0>)

In [119]:
torch.manual_seed(1)

fc2 = nn.Linear(2*d_out,d_out,bias=True)
print(fc2.weight.T)
print(fc2.weight.T.shape)
print(fc2.bias)
print('--'*20)

tensor([[ 0.1822,  0.0491,  0.0512, -0.1133],
        [-0.1561, -0.0433, -0.0014,  0.0169],
        [-0.0685,  0.0981,  0.3091,  0.2108],
        [ 0.1659,  0.0174,  0.1100,  0.1922],
        [-0.3328,  0.1291, -0.1317, -0.3456],
        [ 0.2120, -0.1378, -0.2135,  0.2192],
        [-0.0727, -0.0258, -0.0593,  0.0988],
        [ 0.1799, -0.0318, -0.1525,  0.3354]], grad_fn=<PermuteBackward0>)
torch.Size([8, 4])
Parameter containing:
tensor([ 0.2334, -0.3221, -0.3362, -0.1705], requires_grad=True)
----------------------------------------


In [120]:
fc2_result = fc2(gelu_result)
print(fc2_result)
print(fc2_result.shape)

tensor([[[-0.0149, -0.2722, -0.5181, -0.3768],
         [ 0.0792, -0.2122, -0.4501, -0.5173],
         [ 0.2178, -0.3143, -0.3369, -0.3147],
         [ 0.2682, -0.2952, -0.4835, -0.2075]]], grad_fn=<ViewBackward0>)
torch.Size([1, 4, 4])


# Dropout 2 

In [121]:
torch.manual_seed(1)

dropout2 = nn.Dropout(dropout)
dropout2_result = dropout2(fc2_result)
dropout2_result

tensor([[[-0.0199, -0.3630, -0.6909, -0.5024],
         [ 0.1056, -0.2829, -0.6002, -0.6898],
         [ 0.2905, -0.4191, -0.4493, -0.4196],
         [ 0.3576, -0.3936, -0.6447, -0.2766]]], grad_fn=<MulBackward0>)

End of Transformer block 

##  skip connection 2

In [122]:
skip_connection_2 = skip_connection + dropout2_result
print(skip_connection_2)


tensor([[[ 0.1008,  1.2437, -0.7862, -1.9168],
         [ 0.5048, -0.9510, -2.2117, -1.4396],
         [-0.1745,  0.3943, -1.2817, -1.3891],
         [ 2.5670, -0.3936, -0.1298, -0.4442]]], grad_fn=<AddBackward0>)


# Post Transformer Block LayerNorm

In [123]:
post_transformer_LN = nn.LayerNorm(d_out)
print(post_transformer_LN.weight)
print(post_transformer_LN.bias)
post_transformer_LN_result = post_transformer_LN(skip_connection_2)
print(post_transformer_LN_result)

Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
tensor([[[ 0.3795,  1.3643, -0.3848, -1.3589],
         [ 1.5435,  0.0741, -1.1985, -0.4192],
         [ 0.5835,  1.3408, -0.8907, -1.0336],
         [ 1.7242, -0.6313, -0.4214, -0.6715]]],
       grad_fn=<NativeLayerNormBackward0>)


# Head_out 

In [124]:
torch.manual_seed(1)

head_out = nn.Linear(d_out,vocab_size,bias=True)
print(head_out.weight.T)
print(head_out.weight.T.shape)
print(head_out.bias)
print('----'*20)

head_out_result = head_out(post_transformer_LN_result)
print(head_out_result)
print(head_out_result.shape)

tensor([[ 0.2576, -0.4707,  0.0695,  0.1826,  0.0725, -0.1862, -0.1602, -0.4888,
          0.3300,  0.4391,  0.4906, -0.2634, -0.1444,  0.2713, -0.0234, -0.3232],
        [-0.2207,  0.2999, -0.0612, -0.1949, -0.0020, -0.3020,  0.0239,  0.3100,
         -0.4556, -0.0833, -0.2115,  0.2570, -0.0548, -0.1215, -0.3337,  0.3248],
        [-0.0969, -0.1029,  0.1387, -0.0365,  0.4371, -0.0838,  0.2981,  0.1397,
         -0.4754,  0.2140,  0.3750, -0.2654, -0.4807,  0.4980,  0.3045,  0.3036],
        [ 0.2347,  0.2544,  0.0247, -0.0450,  0.1556, -0.2157,  0.2718,  0.4743,
         -0.2412, -0.2324,  0.0059,  0.1471, -0.2384,  0.4008,  0.1552,  0.4434]],
       grad_fn=<PermuteBackward0>)
torch.Size([4, 16])
Parameter containing:
tensor([-0.2803, -0.0823, -0.0097,  0.0730, -0.3795, -0.3548,  0.2720, -0.1172,
         0.2442,  0.0285,  0.1642,  0.1099,  0.1818,  0.2479, -0.4631,  0.2517],
       requires_grad=True)
--------------------------------------------------------------------------------
t

# Softmax 

In [125]:
# we won't be using the following head_out_prob to calculate the loss, since we can use cross entropy loss 
# calculating softmax is for demonstration purpose only  

head_out_prob = F.softmax(head_out_result,dim=-1)
head_out_prob

tensor([[[0.0334, 0.0614, 0.0616, 0.0685, 0.0345, 0.0431, 0.0565, 0.0403,
          0.0931, 0.0985, 0.0657, 0.0935, 0.1260, 0.0414, 0.0205, 0.0620],
         [0.0629, 0.0259, 0.0514, 0.0836, 0.0237, 0.0348, 0.0358, 0.0166,
          0.2244, 0.0959, 0.0879, 0.0547, 0.1050, 0.0502, 0.0215, 0.0258],
         [0.0391, 0.0616, 0.0572, 0.0697, 0.0287, 0.0395, 0.0500, 0.0383,
          0.1151, 0.0873, 0.0588, 0.1028, 0.1406, 0.0378, 0.0180, 0.0555],
         [0.0591, 0.0146, 0.0528, 0.0856, 0.0285, 0.0361, 0.0353, 0.0106,
          0.2118, 0.1211, 0.1309, 0.0299, 0.0682, 0.0671, 0.0290, 0.0192]]],
       grad_fn=<SoftmaxBackward0>)

In [126]:
head_out_prob.shape

torch.Size([1, 4, 16])

In [127]:
head_out_result.argmax(dim=-1)

tensor([[12,  8, 12,  8]])

# Loss 

In [128]:
loss_fn = nn.CrossEntropyLoss()
loss_fn(head_out_result.squeeze(0),y.squeeze(0))

tensor(2.9692, grad_fn=<NllLossBackward0>)

# The End of the forward pass 