In [101]:
import torch 
from torch import nn
from torch.utils.data import Dataset,DataLoader

In [69]:
raw_text = 'The cat is hungry I went to the store But it was closed So I went to a different store'

# split the text into train and validation sets

In [70]:
ratio = 0.66
split_index = int(len(raw_text) * ratio)
train_raw_text = raw_text[:split_index]
val_raw_text = raw_text[split_index:]

In [71]:
print("Train raw text:")
print("-"*100)
print(train_raw_text)


Train raw text:
----------------------------------------------------------------------------------------------------
The cat is hungry I went to the store But it was closed 


In [72]:
print("Val raw text:")
print("-"*100)
print(val_raw_text)

Val raw text:
----------------------------------------------------------------------------------------------------
So I went to a different store


# Corpus vocabulary


We will make an assumption that each word is a unique token.<br>
This is a simplification and not true in the real world.<br>
In practice, we would use a more sophisticated tokenization method.


In [73]:
vocab = list(set(raw_text.split(' ')))

for i in vocab:
    print(i)

But
closed
it
cat
hungry
to
is
So
different
The
I
went
the
was
store
a


In [74]:
vocab_size = len(vocab)
print(f'vocab_size: {vocab_size}')

vocab_size: 16


In [75]:
tokens_to_ids = {token: id for id, token in enumerate(vocab)}
print("Mapping of tokens to ids:")
tokens_to_ids

Mapping of tokens to ids:


{'But': 0,
 'closed': 1,
 'it': 2,
 'cat': 3,
 'hungry': 4,
 'to': 5,
 'is': 6,
 'So': 7,
 'different': 8,
 'The': 9,
 'I': 10,
 'went': 11,
 'the': 12,
 'was': 13,
 'store': 14,
 'a': 15}

In [76]:
ids_to_tokens = {id: token for id, token in enumerate(vocab)}
print("Mapping of ids to tokens:")
ids_to_tokens

Mapping of ids to tokens:


{0: 'But',
 1: 'closed',
 2: 'it',
 3: 'cat',
 4: 'hungry',
 5: 'to',
 6: 'is',
 7: 'So',
 8: 'different',
 9: 'The',
 10: 'I',
 11: 'went',
 12: 'the',
 13: 'was',
 14: 'store',
 15: 'a'}

In [88]:
def encode(text):
    return [tokens_to_ids[token] for token in text.strip().split(' ')]

def decode(ids):
    return ' '.join([ids_to_tokens[id] for id in ids])

In [91]:
encode(raw_text)

[9, 3, 6, 4, 10, 11, 5, 12, 14, 0, 2, 13, 1, 7, 10, 11, 5, 15, 8, 14]

In [92]:
encode("The cat is hungry")

[9, 3, 6, 4]

In [93]:
decode([9,3,6,4])

'The cat is hungry'

# Creating a dataset 

### Dataset 

In [94]:
class Data(Dataset):
    def __init__(self,raw_text,max_len=4,stride=3):
        self.token_ids = encode(raw_text)
        self.X = []
        self.y = []
        for i in range(0,len(self.token_ids)-max_len,stride):
            input = self.token_ids[i:i+max_len]
            output = self.token_ids[i+1:i+max_len+1]
            self.X.append(torch.tensor(input))
            self.y.append(torch.tensor(output))
            
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self,idx):
        return self.X[idx],self.y[idx]
    

In [96]:
train_ds = Data(train_raw_text)
train_ds[0]

(tensor([9, 3, 6, 4]), tensor([ 3,  6,  4, 10]))

In [97]:
val_ds = Data(val_raw_text)
val_ds[0]

(tensor([ 7, 10, 11,  5]), tensor([10, 11,  5, 15]))

### DataLoader

In [98]:
train_dl = DataLoader(train_ds,batch_size=1,shuffle=False,drop_last=False,num_workers=0)
val_dl   = DataLoader(val_ds,batch_size=1,shuffle=False,drop_last=True,num_workers=0)

In [99]:
for i,(x,y) in enumerate(train_dl):
  print(f'Batch Number: {i+1}')
  print(f'x :{x}')
  print(f'y :{y}')
  print('--'*20)

Batch Number: 1
x :tensor([[9, 3, 6, 4]])
y :tensor([[ 3,  6,  4, 10]])
----------------------------------------
Batch Number: 2
x :tensor([[ 4, 10, 11,  5]])
y :tensor([[10, 11,  5, 12]])
----------------------------------------
Batch Number: 3
x :tensor([[ 5, 12, 14,  0]])
y :tensor([[12, 14,  0,  2]])
----------------------------------------


In [100]:
for i,(x,y) in enumerate(val_dl):
  print(f'Batch Number: {i+1}')
  print(f'x :{x}')
  print(f'y :{y}')
  print('--'*20)

Batch Number: 1
x :tensor([[ 7, 10, 11,  5]])
y :tensor([[10, 11,  5, 15]])
----------------------------------------


Taking single batch, and we know that the batch size is 1. <br>
so we are taking a single example from the dataset. 

In [102]:
for x,y in train_dl:
  print(x)
  print(y)
  break

tensor([[9, 3, 6, 4]])
tensor([[ 3,  6,  4, 10]])


# Config 

In [104]:
B = 1  # batch size 


context_length = 4 # context window  [max length of the input sequence the model can handle]
num_tokens = 4 # this can not be greater than context window
d_in = 3  # embedding dimension 
d_out = 4 # output dimension 

dropout = 0.0 # notice that we are not using it, but we are still initializing it(to see how it works)

num_heads = 2 
head_dim = d_out//num_heads
print(head_dim)

2


# Token Embedding  and Positional Encoding 

In [123]:
torch.manual_seed(1)

# Embedding 
token_emb = nn.Embedding(vocab_size,d_in)
token_embedding = token_emb(x)
print('Token Embeddings:')
print(token_embedding)
print('--'*20)
print(token_embedding.shape)


Token Embeddings:
tensor([[[-1.4465,  0.0612, -0.6177],
         [ 0.3037, -0.7773, -0.2515],
         [ 0.6995,  0.1991,  0.8657],
         [-0.2223,  1.6871,  0.2284]]], grad_fn=<EmbeddingBackward0>)
----------------------------------------
torch.Size([1, 4, 3])


In [124]:
torch.manual_seed(1)

# Positional embedding
pos_emb = nn.Embedding(context_length,d_in)
positional_embedding = pos_emb(torch.arange(num_tokens))
print('Positional Embeddings:')
print('--'*20)
print(positional_embedding)
print('--'*20)
print(positional_embedding.shape)

Positional Embeddings:
----------------------------------------
tensor([[ 0.6614,  0.2669,  0.0617],
        [ 0.6213, -0.4519, -0.1661],
        [-1.5228,  0.3817, -1.0276],
        [-0.5631, -0.8923, -0.0583]], grad_fn=<EmbeddingBackward0>)
----------------------------------------
torch.Size([4, 3])


In [126]:
# Token embedding + token embedding 
tok_pos_emb = token_embedding + positional_embedding
print('Token Embedding + Positional Embedding')
print('--'*20)
print(tok_pos_emb)
print('--'*20)
print(tok_pos_emb.shape)

Token Embedding + Positional Embedding
----------------------------------------
tensor([[[-0.7851,  0.3281, -0.5561],
         [ 0.9250, -1.2292, -0.4176],
         [-0.8232,  0.5808, -0.1619],
         [-0.7853,  0.7948,  0.1702]]], grad_fn=<AddBackward0>)
----------------------------------------
torch.Size([1, 4, 3])


# Pre Transformer Block Dropout 

In [129]:
torch.manual_seed(1)

pre_transformer_dp = nn.Dropout(dropout)
pre_transformer_dp_result = pre_transformer_dp(tok_pos_emb)
pre_transformer_dp_result

tensor([[[-0.7851,  0.3281, -0.5561],
         [ 0.9250, -1.2292, -0.4176],
         [-0.8232,  0.5808, -0.1619],
         [-0.7853,  0.7948,  0.1702]]], grad_fn=<AddBackward0>)

# Transformer Block 

### Layer Normalization 

In [131]:
layernorm1 = nn.LayerNorm(d_in)
layernorm1 = layernorm1(pre_transformer_dp_result)
print(layernorm1)
print('--'*20)
print(layernorm1.shape)

tensor([[[-0.9321,  1.3871, -0.4550],
         [ 1.3121, -1.1129, -0.1992],
         [-1.2004,  1.2477, -0.0473],
         [-1.3007,  1.1310,  0.1697]]], grad_fn=<NativeLayerNormBackward0>)
----------------------------------------
torch.Size([1, 4, 3])


### Multi-Head Attention 

weights initialization 

In [132]:
torch.manual_seed(123)

W_q = nn.Linear(d_in,d_out,bias=False)
W_k = nn.Linear(d_in,d_out,bias=False)
W_v = nn.Linear(d_in,d_out,bias=False)

# REMINDER : THE WEIGHT MATRICES ARE TRANSPOSED 
print('W_q')
print(W_q.weight.T)
print(W_q.weight.T.shape)
print('---'*20)

print('W_k')
print(W_k.weight.T)
print(W_k.weight.T.shape)
print('---'*20)

print('W_v')
print(W_v.weight.T)
print(W_v.weight.T.shape)

W_q
tensor([[-0.2354,  0.2177, -0.4196,  0.2615],
        [ 0.0191, -0.4919, -0.4590, -0.2133],
        [-0.2867,  0.4232, -0.3648,  0.2161]], grad_fn=<PermuteBackward0>)
torch.Size([3, 4])
------------------------------------------------------------
W_k
tensor([[-0.4900, -0.1135, -0.1362,  0.1076],
        [-0.3503, -0.4404,  0.1853,  0.1579],
        [-0.2120,  0.3780,  0.4083,  0.5573]], grad_fn=<PermuteBackward0>)
torch.Size([3, 4])
------------------------------------------------------------
W_v
tensor([[-0.2604,  0.4126,  0.4929,  0.2377],
        [ 0.1829,  0.4611,  0.2757,  0.4800],
        [-0.2569, -0.5323,  0.2516, -0.0762]], grad_fn=<PermuteBackward0>)
torch.Size([3, 4])


Q,K,V

In [133]:
torch.manual_seed(1)

Q = W_q(tok_pos_emb)
K = W_k(tok_pos_emb)
V = W_v(tok_pos_emb)


print(f'Q\n{Q}')
print(Q.shape)

print('---'*20)

print(f'K\n{K}')
print(K.shape)

print('---'*20)

print(f'V\n{V}')
print(V.shape)

Q
tensor([[[ 0.3506, -0.5677,  0.3817, -0.3954],
         [-0.1215,  0.6294,  0.3284,  0.4139],
         [ 0.2513, -0.5335,  0.1379, -0.3741],
         [ 0.1513, -0.4900, -0.0974, -0.3381]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 4, 4])
------------------------------------------------------------
K
tensor([[[ 0.3877, -0.2656, -0.0593, -0.3425],
         [ 0.0658,  0.2786, -0.5242, -0.3273],
         [ 0.2343, -0.2236,  0.1536, -0.0871],
         [ 0.0703, -0.1966,  0.3237,  0.1358]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 4, 4])
------------------------------------------------------------
V
tensor([[[ 0.4073,  0.1233, -0.4364,  0.0132],
         [-0.3584,  0.0372,  0.0120, -0.3383],
         [ 0.3622,  0.0143, -0.2864,  0.0954],
         [ 0.3061, -0.0481, -0.1251,  0.1818]]], grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 4, 4])


### splitting Q,K,V into multiple heads 

In [134]:
Q_split  = Q.view(B,num_tokens,num_heads,head_dim).transpose(1,2)
K_split  = K.view(B,num_tokens,num_heads,head_dim).transpose(1,2)
V_split  = V.view(B,num_tokens,num_heads,head_dim).transpose(1,2)


print(Q_split)
print(Q_split.shape)

print('---'*20)

print(K_split)
print(K_split.shape)


print('---'*20)

print(V_split)
print(V_split.shape)

tensor([[[[ 0.3506, -0.5677],
          [-0.1215,  0.6294],
          [ 0.2513, -0.5335],
          [ 0.1513, -0.4900]],

         [[ 0.3817, -0.3954],
          [ 0.3284,  0.4139],
          [ 0.1379, -0.3741],
          [-0.0974, -0.3381]]]], grad_fn=<TransposeBackward0>)
torch.Size([1, 2, 4, 2])
------------------------------------------------------------
tensor([[[[ 0.3877, -0.2656],
          [ 0.0658,  0.2786],
          [ 0.2343, -0.2236],
          [ 0.0703, -0.1966]],

         [[-0.0593, -0.3425],
          [-0.5242, -0.3273],
          [ 0.1536, -0.0871],
          [ 0.3237,  0.1358]]]], grad_fn=<TransposeBackward0>)
torch.Size([1, 2, 4, 2])
------------------------------------------------------------
tensor([[[[ 0.4073,  0.1233],
          [-0.3584,  0.0372],
          [ 0.3622,  0.0143],
          [ 0.3061, -0.0481]],

         [[-0.4364,  0.0132],
          [ 0.0120, -0.3383],
          [-0.2864,  0.0954],
          [-0.1251,  0.1818]]]], grad_fn=<TransposeBackward0>)
tor

### Attention Score 

In [135]:
attn_score = Q_split @ K_split.transpose(2,3)
attn_score

tensor([[[[ 0.2867, -0.1351,  0.2091,  0.1363],
          [-0.2143,  0.1673, -0.1692, -0.1323],
          [ 0.2391, -0.1321,  0.1782,  0.1226],
          [ 0.1888, -0.1265,  0.1450,  0.1070]],

         [[ 0.1128, -0.0707,  0.0931,  0.0699],
          [-0.1613, -0.3076,  0.0144,  0.1625],
          [ 0.1200,  0.0501,  0.0538, -0.0062],
          [ 0.1216,  0.1617,  0.0145, -0.0775]]]],
       grad_fn=<UnsafeViewBackward0>)