In [72]:
import torch 
from torch import nn 
from torch.utils.data import Dataset,DataLoader
import tiktoken 
import matplotlib.pyplot as plt 
import plotly.graph_objects as go 

import math 

In [73]:
if torch.cuda.is_available():
  device = 'cuda'
# elif torch.backends.mps.is_available():
#   device = 'mps'
else :
  device = 'cpu'

print(device)

cpu


# Part 1 :  Fetch The Data 

In [74]:
with open('the-verdict.txt','r',encoding='utf-8') as f:
  raw_text = f.read()

print(f'First 20 characters: {raw_text[:20]}')
print(f'Number of Characters: {len(raw_text)}')


First 20 characters: I HAD always thought
Number of Characters: 20479


# Part 2 : Dataset Preparation 

### Dataset Class

In [75]:
class Data(Dataset):
  def __init__(self,raw_text,tokenizer,context_window,stride):
    self.token_id = tokenizer.encode(raw_text)
    self.X = []
    self.y = []

    for i in range(0,len(self.token_id)-context_window,stride):
      input_chunks = self.token_id[i:i+context_window]
      output_chunks = self.token_id[(i)+1:(i+context_window) +1]
      self.X.append(torch.tensor(input_chunks))
      self.y.append(torch.tensor(output_chunks))

  def __len__(self):
    return len(self.X)

  def __getitem__(self,idx):
    return self.X[idx],self.y[idx]

### Split 

Notice we split the corpus to train/val before creating the datasets

In [76]:
ratio = 0.90
split_index = int(len(raw_text)* ratio)
print(split_index)

train_text = raw_text[:split_index]
val_text = raw_text[split_index:]

18431


### Datasets 

In [77]:
train_ds = Data(train_text,tiktoken.get_encoding('gpt2'),context_window=256,stride=256)   # notice stride == context_window (in that case there wont be any overlap)
val_ds   = Data(val_text,tiktoken.get_encoding('gpt2'),context_window=256,stride=256)     # *Same as above 

print(f'Number of tokens {len(train_ds.token_id)}')
print(f'Number of tokens {len(val_ds.token_id)}')

Number of tokens 4612
Number of tokens 534


### DataLoader 

In [78]:
train_dl = DataLoader(train_ds,batch_size=2,shuffle=False,drop_last=True, num_workers=0)
val_dl   = DataLoader(val_ds,  batch_size=2,shuffle=False,drop_last=False,num_workers=0)

In [79]:
print(len(train_dl))
print(len(val_dl))    

9
1


In [80]:
for i,(x,y) in enumerate(train_dl):
  print(i+1)
  print(f'X : {x.shape}')
  print(f'y : {y.shape}')
  print('---'*10)

1
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
2
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
3
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
4
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
5
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
6
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
7
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
8
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
9
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------


In [81]:
for i,(x,y) in enumerate(val_dl):
  print(i+1)
  print(f'X : {x.shape}')
  print(f'y : {y.shape}')
  print('---'*10)

1
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------


### Conversions 

These conversion functions allow us to convert text to token_ids,and vice versa easily.

In [82]:
def text_to_token_ids(text,tokenizer=tiktoken.get_encoding('gpt2')):
  token_ids = tokenizer.encode(text,allowed_special={'<|endoftext|>'})              # python list.
  token_ids_in_tensors = torch.tensor(token_ids).unsqueeze(0)                       # tensor,with batch dimesion.
  return token_ids_in_tensors

In [83]:
text_to_token_ids('Hello this is hawar <|endoftext|>')    # Note : the last token_id is 50256, (50257) is the vocab size by the author

tensor([[15496,   428,   318,   387,  5767,   220, 50256]])

In [84]:
def token_ids_to_text(token_ids_in_tensors,tokenizer=tiktoken.get_encoding('gpt2')):
  token_ids = token_ids_in_tensors.squeeze(0).tolist()    # Note: It can only handle batch_size = 1
  text = tokenizer.decode(token_ids)
  return text 

# Part 3 : Multi-Head-Attention

In [85]:
torch.manual_seed(1)
class MultiHeadAttention(nn.Module):
  def __init__(self,d_in,d_out,
               context_window,    # The context window is needed here to create the mask.
               num_heads,
               dp_attn,qkv_bias=False):
    
    super().__init__()

    assert d_out%num_heads == 0, 'd_out must be divisble by num_heads'
    self.d_out = d_out
    self.num_heads = num_heads
    self.head_dim = self.d_out //self.num_heads # dimension of each head

    self.W_q = nn.Linear(d_in,d_out,bias=qkv_bias)
    self.W_k = nn.Linear(d_in,d_out,bias=qkv_bias)
    self.W_v = nn.Linear(d_in,d_out,bias=qkv_bias)

    self.register_buffer('mask',torch.triu(torch.ones(context_window,context_window),diagonal=1))
    # why shape (context_window,context_window) ? beacause we'll use it on Q@KT which will have shape of (b,self.num_heads,context_window,context_window)
    # But since we might have different num_tokens(i.e num_tokens is not context_window for every case) we will slice it to [:num_tokens][:num_tokens]
    # This slicing wont be necessary if num_tokens == context window
    
    self.dropout = nn.Dropout(dp_attn)
    self.out_proj = nn.Linear(d_out,d_out)


  def forward(self,x):
    b,num_tokens,d_in = x.shape

    Q = self.W_q(x)
    K = self.W_k(x)
    V = self.W_v(x)

    # split 
    Q = Q.view(b,num_tokens,self.num_heads,self.head_dim).transpose(1,2)  # result : (b,self.num_heads,num_tokens,self.head_dim)
    K = K.view(b,num_tokens,self.num_heads,self.head_dim).transpose(1,2)
    V = V.view(b,num_tokens,self.num_heads,self.head_dim).transpose(1,2)

    # 
    attn_score = Q @ K.transpose(2,3) # result : (b,self.num_head,num_tokens,num_tokens)
    attn_score.masked_fill_(self.mask.bool()[:num_tokens,:num_tokens],-torch.inf)
    attn_weight = torch.softmax(attn_score/(K.shape[-1]**0.5),dim=-1)
    attn_weight = self.dropout(attn_weight)

    context_vec = attn_weight @ V # result : (b,self.num_head,num_toknes,self.head_dim)
    context_vec = context_vec.transpose(1,2).contiguous().view(b,num_tokens,self.d_out)
    context_vec = self.out_proj(context_vec)

    return context_vec

# Part 4 : Transformers 

In [86]:
class TransformerBlock(nn.Module):
  def __init__(self,cfg):
    super().__init__()

    self.norm1 = nn.LayerNorm(cfg['d_out'])
    self.attn = MultiHeadAttention(cfg['d_in'],cfg['d_out'],cfg['context_window'],cfg['num_heads'],cfg['dp_attn'],cfg['qkv_bias'])
    self.dp_after_attn = nn.Dropout(cfg['dp_after_attn'])

    self.norm2 = nn.LayerNorm(cfg['d_out'])
    self.fc = nn.Sequential(nn.Linear(cfg['d_out'],4*cfg['d_out']),
                            nn.GELU(),
                            nn.Linear(4*cfg['d_out'],cfg['d_out']))
    self.dp_after_fc = nn.Dropout(cfg['dp_after_fc'])

  def forward(self,x):
    shortcut = x  
    x = self.norm1(x)
    x = self.attn(x)
    x = self.dp_after_attn(x)
    x = x + shortcut

    shortcut = x
    x = self.norm2(x)
    x = self.fc(x)
    x = self.dp_after_fc(x)
    x = x + shortcut

    return x 
    

### Model's Configuration 

In [87]:
GPT_CONFIG_124M = {    #TransformerBlock Level
          'd_in':768,
           'd_out':768,
           'context_window':256,  # You can not have num_tokens greater than this number 
           'num_heads':12,
           
           'dp_attn':0.1,
           'dp_after_attn':0.1,
           'dp_after_fc':0.1,

           'qkv_bias':False,
           
            # GPTModel Level
           'dp_before_attn':0.1,
           'vocab_size':50256+1,
           'num_layers':12}



# Part 5 : GPT Model 

In [88]:
class GPTModel(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.tok_emb = nn.Embedding(cfg['vocab_size'],cfg['d_in'])
    self.pos_emb = nn.Embedding(cfg['context_window'],cfg['d_in'])
    self.dp_before_trans = nn.Dropout(cfg['dp_before_attn'])
    self.transformer_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg['num_layers'])])
    self.final_norm = nn.LayerNorm(cfg['d_out'])
    self.head_out = nn.Linear(cfg['d_out'],cfg['vocab_size'])

  def forward(self,x):
    b,num_tokens = x.shape

    tok_em = self.tok_emb(x)
    pos_em = self.pos_emb(torch.arange(num_tokens,device=device))   # this is where it can handle dynamic sequence lengths (as long as it's lower than context window)
    x = tok_em + pos_em

    x = self.dp_before_trans(x)
    x = self.transformer_blocks(x)
    x = self.final_norm(x)
    logits = self.head_out(x)

    return logits

In [89]:
model = GPTModel(GPT_CONFIG_124M).to(device)

In [90]:
model.eval()
with torch.no_grad():
  print(model(torch.randint(0,50257,(2,256)).to(device)))        # torch.randint(0,vocab_size,(batch_size,num_tokens))     PS: num_tokens must be <= context_window
  print(model(torch.randint(0,50257,(2,256)).to(device)).shape) 

tensor([[[-1.1501e-01, -4.0099e-01, -7.0368e-02,  ...,  5.5857e-02,
          -4.8014e-01,  3.7116e-01],
         [ 6.1407e-02, -1.4992e-01,  1.0680e+00,  ...,  4.3743e-01,
          -7.0324e-01,  4.2321e-01],
         [ 4.6695e-01,  2.3328e-01, -3.4185e-02,  ...,  6.2094e-01,
          -6.9301e-01,  3.5567e-01],
         ...,
         [ 8.3024e-02, -2.7019e-01,  5.7516e-02,  ...,  4.7997e-01,
           5.8081e-02,  6.7800e-01],
         [ 5.8204e-01,  6.1453e-01,  6.5674e-01,  ..., -2.5922e-01,
          -1.9805e-01,  5.0750e-01],
         [-3.4647e-01,  6.2501e-01,  6.0778e-01,  ...,  5.1414e-01,
           3.9566e-01, -8.4009e-01]],

        [[-5.2159e-01, -3.2698e-01, -3.3057e-01,  ...,  1.9990e-01,
           6.9118e-01,  3.8865e-01],
         [ 2.7139e-01, -8.9852e-02,  3.9952e-01,  ...,  5.9899e-01,
          -3.8721e-01, -4.0687e-01],
         [ 5.2349e-01, -7.6593e-02, -6.5530e-01,  ..., -3.0248e-01,
          -1.1583e+00, -5.8816e-01],
         ...,
         [ 2.2410e-01, -1

# Part 6 : Next token  

In [91]:
# if we dont slice the mask, it will throw an error here, why?  beacuase the input num_tokens != context_window 

def generate_simple_text(starting_text,
                         context_window,    # this can be anything between [1,context_window]
                         num_tokens_generated = 20,temperature=1,k=None):  
  
    starting_tokens = text_to_token_ids(starting_text).to(device)       # this will have the shape (B=1,number of tokens after converting text to token ids)
    model.eval()
    with torch.no_grad():
        for _ in range(num_tokens_generated):
            starting_tokens = starting_tokens[:,-context_window:]    # how far back to pay attention 
            logits = model(starting_tokens)[:,-1,:]                  # we only want the last vector

            # new
            if k is not None:
              top_logits, _ = torch.topk(logits,k)                                      # find top 3. 
              new_logits = torch.where(logits<top_logits[:,-1],-torch.inf,logits)       # keep top 3, make other -inf.
              probs = torch.softmax(new_logits/temperature,dim=-1)                                  # probability of top 3 , the rest will be zero.
              token_predicted = torch.multinomial(probs,num_samples = 1)                # pick one (according to their probability) from top 3.
            # End
            else : 
                token_predicted = torch.argmax(logits,dim=-1,keepdim=True)
            
            starting_tokens = torch.cat([starting_tokens,token_predicted],dim=-1)
            
        print(token_ids_to_text(starting_tokens))

In [92]:
generate_simple_text(starting_text='Everty effort moves you',context_window=GPT_CONFIG_124M['context_window'],temperature=0.2,k=3)

Everty effort moves youreligiouszzo distortionscommerceventionalplings intellectual lore Mud }}annah Specialiologist Examplesector461 Exhibitvention Yugoslav aggrav


# Part 7 : Training 

In [94]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(),weight_decay =0.1)   # weight decay is L2 norm regularization.

In [95]:
torch.manual_seed(123)
epochs = 10 
TRAIN_LOSS,VAL_LOSS = [],[]

for epoch in range(epochs) :
  print(f'Epoch : {epoch:02d}',end='   ')                                                    # first print()
  train_loss,val_loss = [],[]

  # TRAINING 
  model.train()
  for x,y in train_dl:
    x,y = x.to(device),y.to(device)
    optimizer.zero_grad()
    y_hat = model(x)
    loss = loss_fn(y_hat.flatten(0,1),y.flatten())    # y_hat result : (512,50257),    y result : (512) 
    train_loss.append(loss.item())
    loss.backward()
    optimizer.step()
    
  mean_epoch_train_loss = round(torch.mean(torch.tensor(train_loss)).item(),3)
  print(f'Training Loss : {mean_epoch_train_loss:<10}',end=' ')                                 # second print()         
  TRAIN_LOSS.append(mean_epoch_train_loss)

  # VALIDATION 
  model.eval()
  with torch.no_grad():
    for x,y in val_dl:
      x,y = x.to(device),y.to(device)
      y_hat = model(x)
      loss = loss_fn(y_hat.flatten(0,1),y.flatten())
      val_loss.append(loss.item())

    mean_epoch_val_loss = round(torch.tensor(val_loss).mean().item(),3)                         # Third print()
    print(f'Validation Loss : {mean_epoch_val_loss}')
    VAL_LOSS.append(mean_epoch_val_loss)

  # Generate text 
  generate_simple_text(starting_text='Everty effort moves you',context_window=GPT_CONFIG_124M['context_window'],num_tokens_generated = 20,k=3)

  print('====='*25)

  

Epoch : 00   Training Loss : 8.533      Validation Loss : 6.898
Everty effort moves you he,, he I, he I, he I I, he I he was I I I
Epoch : 01   Training Loss : 6.982      Validation Loss : 6.703
Everty effort moves you a---- to to to to.



" to---- to.


Epoch : 02   Training Loss : 5.959      Validation Loss : 6.734
Everty effort moves you
" heburn of the

"



".



"
Epoch : 03   Training Loss : 5.728      Validation Loss : 6.72
Everty effort moves you

He to the
I to, and of the, I of of the.


Epoch : 04   Training Loss : 5.445      Validation Loss : 6.544
Everty effort moves youburnburn-- the house, as aisburn-- the factburn-- and I was the .
Epoch : 05   Training Loss : 5.021      Validation Loss : 6.474
Everty effort moves you he was a a the " was a Jack's the picture the picture the picture the picture me to
Epoch : 06   Training Loss : 4.665      Validation Loss : 6.416
Everty effort moves you to me to see and, I had been the donkey with a--his--and of his own
Epoch : 07   T

# Part 8.0 : Train/Val Loss Plot  

In [96]:
training_curve = go.Scatter(
  x = torch.arange(n_epochs),
  y = TRAIN_LOSS,
  mode = 'lines',
  line = dict(color='red'),
  name = 'Training Loss'
)

valid_curve = go.Scatter(
  x = torch.arange(n_epochs),
  y = VAL_LOSS,
  mode = 'lines',
  line = dict(color='yellow'),
  name = 'Validation Loss'
)

figure = go.Figure(data=[training_curve,valid_curve])
figure.show()

# Loading and Saving model weights in Pytorch

In [26]:
torch.save(
  {'model_state_dict':model.state_dict(),
   'optimizer_state_dict':optimizer.state_dict()},
   'model_and_optimizers.pth'
  )

In [27]:
checkpoint = torch.load('model_and_optimizers.pth',map_location=device)


model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint['model_state_dict'])

optimizer = torch.optim.AdamW(model.parameters(),lr=0.0004,weight_decay =0.1)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.

