The following notebooks is exactly as `ALL_level_0`, except for using from scratch class `MultiheadAttention` we use pytorch's `nn.MultiheadAttention`
which decreases the amount of code we write immensly. 

In [1]:
import torch 
from torch import nn 
from torch.utils.data import Dataset,DataLoader
import tiktoken 
import matplotlib.pyplot as plt 
import plotly.graph_objects as go 

In [2]:
if torch.cuda.is_available():
  device = 'cuda'
# elif torch.backends.mps.is_available():
#   device = 'mps'
else :
  device = 'cpu'

print(device)

cpu


### Model's Configuration 

In [3]:
# cfg = {    #TransformerBlock Level
#           'd_in':12,
#            'd_out':12,
#            'context_window':256,  # You can not have num_tokens greater than this number 
#            'num_heads':4,

#            'dp_attn':0.0,
#            'dp_after_trans':0.0,
#            'dp_after_fc':0.0,

#            'qkv_bias':False,
           
#             # GPTModel Level
#            'dp_before_trans':0.0,
#            'vocab_size':50256+1,
#            'num_layers':4}

#-----------------------------------------------------

GPT_CONFIG_124M = {    #TransformerBlock Level
          'd_in':768,
           'd_out':768,
           'context_window':256,  # You can not have num_tokens greater than this number 
           'num_heads':12,

           'dp_attn':0.1,
           'dp_after_attn':0.1,
           'dp_after_fc':0.1,

           'qkv_bias':False,
           
            # GPTModel Level
           'dp_before_trans':0.1,
           'vocab_size':50256+1,
           'num_layers':12}

# Part 1 :  Fetch The Data 

In [4]:
with open('the-verdict.txt','r',encoding='utf-8') as f:
  raw_text = f.read()

print(f'First 20 characters: {raw_text[:20]}')
print(f'Number of Characters: {len(raw_text)}')


First 20 characters: I HAD always thought
Number of Characters: 20479


# Part 2 : Dataset Preparation 

### Dataset Class

In [5]:
class Data(Dataset):
  def __init__(self,raw_text,tokenizer,context_window,stride):
    self.token_id = tokenizer.encode(raw_text)
    self.X = []
    self.y = []

    for i in range(0,len(self.token_id)-context_window,stride):
      input_chunks = self.token_id[i:i+context_window]
      output_chunks = self.token_id[(i)+1:(i+context_window) +1]
      self.X.append(torch.tensor(input_chunks))
      self.y.append(torch.tensor(output_chunks))

  def __len__(self):return len(self.X)

  def __getitem__(self,idx):return self.X[idx],self.y[idx]

### Split 

Notice we split the corpus to train/val before creating the datasets

In [6]:
ratio = 0.9
split_index = int(len(raw_text)*0.9)

train_text = raw_text[:split_index]
val_text = raw_text[split_index:]

### Datasets 

In [7]:
train_ds = Data(train_text,tiktoken.get_encoding('gpt2'),context_window=256,stride=128)
val_ds   = Data(val_text,tiktoken.get_encoding('gpt2'),context_window=256,stride=128)

print(f'Number of tokens {len(train_ds.token_id)}')
print(f'Number of tokens {len(val_ds.token_id)}')

Number of tokens 4612
Number of tokens 534


### DataLoader 

In [8]:
train_dl = DataLoader(train_ds,batch_size=2,shuffle=False,drop_last=True,num_workers=0)
val_dl   = DataLoader(val_ds,batch_size=2,shuffle=False,drop_last=True,num_workers=0)

In [9]:
print(len(train_dl))
print(len(val_dl))    

17
1


In [10]:
for i,(x,y) in enumerate(train_dl):
  print(i+1)
  print(f'X : {x.shape}')
  print(f'y : {y.shape}')
  print('---'*10)

1
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
2
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
3
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
4
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
5
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
6
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
7
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
8
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
9
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
10
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
11
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
12
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------
1

In [12]:
for i,(x,y) in enumerate(val_dl):
  print(i+1)
  print(f'X : {x.shape}')
  print(f'y : {y.shape}')
  print('---'*10)

1
X : torch.Size([2, 256])
y : torch.Size([2, 256])
------------------------------


### Conversions 

These conversion functions allow us to convert text to token_ids,and vice versa easily.

In [13]:
def text_to_token_ids(text,tokenizer=tiktoken.get_encoding('gpt2')):
  token_ids = tokenizer.encode(text,allowed_special={'<|endoftext|>'})              # python list.
  token_ids_in_tensors = torch.tensor(token_ids).unsqueeze(0)                       # tensor,with batch dimesion.
  return token_ids_in_tensors

In [14]:
text_to_token_ids('Hello this is hawar <|endoftext|>')    # Note : the last token_id is 50256, (50257) is the vocab size by the author

tensor([[15496,   428,   318,   387,  5767,   220, 50256]])

In [15]:
def token_ids_to_text(token_ids_in_tensors,tokenizer=tiktoken.get_encoding('gpt2')):
  token_ids = token_ids_in_tensors.squeeze(0).tolist()    # Note: It can only handle batch_size = 1
  text = tokenizer.decode(token_ids)
  return text 

# Part 4 : Transformers 

In [17]:
GPT_CONFIG_124M = {    #TransformerBlock Level
          'd_in':768,
           'd_out':768,
           'context_window':256,  # You can not have num_tokens greater than this number 
           'num_heads':12,

           'dp_attn':0.1,
           'dp_after_attn':0.1,
           'dp_after_fc':0.1,

           'qkv_bias':False,
           
            # GPTModel Level
           'dp_before_trans':0.1,
           'vocab_size':50256+1,
           'num_layers':12}

In [38]:
class TransformerBlock(nn.Module):
  def __init__(self,cfg):
    super().__init__()

    self.norm1 = nn.LayerNorm(cfg['d_out'])

    # change #1
    self.attn = nn.MultiheadAttention(embed_dim=cfg['d_out'],
                                      num_heads=cfg['num_heads'],
                                      dropout=cfg['dp_attn'],
                                      bias=cfg['qkv_bias'],
                                      add_bias_kv=False,
                                      batch_first=True
                                      )
    
    
    self.dp_after_attn = nn.Dropout(cfg['dp_after_attn'])

    self.norm2 = nn.LayerNorm(cfg['d_out'])
    self.fc = nn.Sequential(nn.Linear(cfg['d_out'],4*cfg['d_out']),
                            nn.GELU(),
                            nn.Linear(4*cfg['d_out'],cfg['d_out']))
    self.dp_after_fc = nn.Dropout(cfg['dp_after_fc'])

  def forward(self,x):
    _,num_tokens,_ = x.shape

    shortcut = x  
    x = self.norm1(x)
    x,_ = self.attn(x,x,x,attn_mask= torch.triu(torch.ones(num_tokens,num_tokens),diagonal=1).bool())   # change #2 
    x = self.dp_after_attn(x)
    x = x + shortcut

    shortcut = x    
    x = self.norm2(x)
    x = self.fc(x)
    x = self.dp_after_fc(x)
    x = x + shortcut

    return x 
    

In [39]:
batch_size = 1
num_tokens = GPT_CONFIG_124M['context_window']

random_data = torch.randn(batch_size,num_tokens,GPT_CONFIG_124M['d_in'])
print(random_data.shape)

torch.Size([1, 256, 768])


In [40]:
trans = TransformerBlock(cfg=GPT_CONFIG_124M)
trans(random_data)

tensor([[[ 0.0703, -2.8388, -0.0384,  ...,  1.5352, -0.4029, -0.6082],
         [-0.2148, -1.1586, -1.3472,  ...,  0.3172,  0.1590,  0.8950],
         [-0.4251, -3.1890,  0.1934,  ..., -0.6564,  0.8092, -0.2599],
         ...,
         [-0.4471,  0.5058, -2.1656,  ...,  0.8858,  0.3880, -0.3287],
         [ 0.3484, -1.6628, -1.9328,  ...,  0.0359, -0.7563,  1.4599],
         [ 0.8678,  1.4391, -0.2957,  ..., -1.5626,  1.2505, -0.8946]]],
       grad_fn=<AddBackward0>)

# Part 5 : GPT Model 

In [41]:
class GPTModel(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.tok_emb = nn.Embedding(cfg['vocab_size'],cfg['d_in'])
    self.pos_emb = nn.Embedding(cfg['context_window'],cfg['d_in'])
    self.dp_before_trans = nn.Dropout(cfg['dp_before_trans'])
    self.transformer_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg['num_layers'])])
    self.final_norm = nn.LayerNorm(cfg['d_out'])
    self.head_out = nn.Linear(cfg['d_out'],cfg['vocab_size'])

  def forward(self,x):
    b,num_tokens = x.shape

    tok_em = self.tok_emb(x)
    pos_em = self.pos_emb(torch.arange(num_tokens,device=device))   # this is where it can handle dynamic sequence lengths (as long as it's lower than context window)
    x = tok_em + pos_em

    x = self.dp_before_trans(x)
    x = self.transformer_blocks(x)
    x = self.final_norm(x)
    logits = self.head_out(x)

    return logits

In [42]:
model = GPTModel(GPT_CONFIG_124M).to(device)

In [45]:
torch.manual_seed(1)

model.eval()
with torch.no_grad():
  print(model(torch.randint(0,50257,(2,256)).to(device)))        # torch.randint(0,vocab_size,(batch_size,num_tokens))     PS: num_tokens must be <= context_window
  print(model(torch.randint(0,50257,(2,256)).to(device)).shape) 

tensor([[[ 0.1011,  0.9919, -0.4107,  ..., -0.2400, -0.8477,  0.6456],
         [-0.1797,  0.5026, -0.0126,  ...,  0.3000, -0.1201,  0.8410],
         [ 0.1956,  1.3994,  0.3599,  ..., -0.5086,  0.2106, -1.0735],
         ...,
         [ 1.6026, -0.8599,  0.8412,  ...,  0.0071, -0.6474, -0.2509],
         [ 0.5988, -0.3025, -0.3175,  ..., -0.0873, -0.2705, -0.3720],
         [-0.2168,  0.2292, -0.1836,  ...,  0.7535,  0.6395, -0.6062]],

        [[ 0.0951,  0.5516, -0.2337,  ..., -1.0712, -0.0047,  0.3570],
         [ 0.2293,  0.4457, -0.2993,  ..., -0.1531, -0.5117,  1.2723],
         [-0.0207, -0.2684, -0.0341,  ..., -0.9701,  0.1178, -1.2332],
         ...,
         [ 0.1153, -0.0597,  0.4312,  ...,  0.1331, -0.4542, -0.3726],
         [ 1.5359,  0.1110, -0.2688,  ...,  0.5526, -0.0789,  0.0379],
         [-0.4708, -0.2329, -0.1557,  ...,  0.8003,  0.6335, -0.8232]]])
torch.Size([2, 256, 50257])


# Part 6 : Next token  

In [48]:
# if we dont slice the mask, it will throw an error here, why?  beacuase the input num_tokens != context_window 

def generate_simple_text(starting_text,
                         context_window,    # this can be anything between [1,context_window]
                         num_tokens_generated = 20,temperature=1,k=None):  
  
    starting_tokens = text_to_token_ids(starting_text).to(device)       # this will have the shape (B=1,number of tokens after converting text to token ids)
    model.eval()
    with torch.no_grad():
        for _ in range(num_tokens_generated):
            starting_tokens = starting_tokens[:,-context_window:]    # how far back to pay attention 
            logits = model(starting_tokens)[:,-1,:]                  # we only want the last vector

            # new
            if k is not None:
              top_logits, _ = torch.topk(logits,k)                                      # find top 3. 
              new_logits = torch.where(logits<top_logits[:,-1],-torch.inf,logits)       # keep top 3, make other -inf.
              probs = torch.softmax(new_logits/temperature,dim=-1)                                  # probability of top 3 , the rest will be zero.
              token_predicted = torch.multinomial(probs,num_samples = 1)                # pick one (according to their probability) from top 3.
            # End
            else : 
                token_predicted = torch.argmax(logits,dim=-1,keepdim=True)
            
            starting_tokens = torch.cat([starting_tokens,token_predicted],dim=-1)
            
        print(token_ids_to_text(starting_tokens))

In [49]:
generate_simple_text(starting_text='Everty effort moves you',context_window=GPT_CONFIG_124M['context_window'],temperature=0.2,k=3)

Everty effort moves youmorphAre spawns troubledclipse Qian dwindlingUFC Veniceano delic cis Coconut study supply creditors purposely seismic Ice Scholarship


# Part 7 : Training 

In [51]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(),lr=0.0004,weight_decay =0.1)

In [52]:
torch.manual_seed(123)

epochs = 10
TRAIN_LOSS,VAL_LOSS = [],[]

for epoch in range(epochs) :
  train_loss,val_loss = [],[]

  # TRAINING 
  model.train()
  for x,y in train_dl:
    x,y = x.to(device),y.to(device)
    optimizer.zero_grad()
    y_hat = model(x)
    loss = loss_fn(y_hat.flatten(0,1),y.flatten())    # y_hat result : (512,50257),    y result : (512) 
    train_loss.append(loss.item())
    loss.backward()
    optimizer.step()
    
  mean_epoch_train_loss = round(torch.mean(torch.tensor(train_loss)).item(),3)
  print(f'Training Loss : {mean_epoch_train_loss:<10}',end=' ')
  TRAIN_LOSS.append(mean_epoch_train_loss)

  # VALIDATION 
  model.eval()
  with torch.no_grad():
    for x,y in val_dl:
      x,y = x.to(device),y.to(device)
      y_hat = model(x)
      loss = loss_fn(y_hat.flatten(0,1),y.flatten())
      val_loss.append(loss.item())

    mean_epoch_val_loss = round(torch.tensor(val_loss).mean().item(),3)
    print(f'Validation Loss : {mean_epoch_val_loss}')
    VAL_LOSS.append(mean_epoch_val_loss)

  # Generate text 
  generate_simple_text(starting_text='Everty effort moves you',context_window=GPT_CONFIG_124M['context_window'],num_tokens_generated = 20,k=3)

  print('======'*25)

  

Training Loss : 8.331      Validation Loss : 6.816
Everty effort moves you, and and I I-- and to and, I I was I, and, and, I
Training Loss : 5.784      Validation Loss : 6.208
Everty effort moves you to have to see the, I was, I felt it--as I was, I had a
Training Loss : 4.6        Validation Loss : 5.941
Everty effort moves you know; but I don't--I was, so. Gis--and I was, and
Training Loss : 3.341      Validation Loss : 5.897
Everty effort moves you say, and he said--and I was dead." I felt to have been that he was not
Training Loss : 2.357      Validation Loss : 6.053
Everty effort moves you know; but I don't think of a little: the tips.
"Yes--she's
Training Loss : 1.618      Validation Loss : 6.322
Everty effort moves you know.
"I didn't dabble the fact with a deprecatingly"Oh,
Training Loss : 1.079      Validation Loss : 6.518
Everty effort moves you'd never touched a brush."

"I didn't _rose Dubarry_ drawing-room
Training Loss : 0.775      Validation Loss : 6.77
Everty effort mov

# Part 8 : Plot 

In [53]:
training_curve = go.Scatter(
  x = torch.arange(epochs),
  y = TRAIN_LOSS,
  mode = 'lines',
  line = dict(color='red'),
  name = 'Training Loss'
)

valid_curve = go.Scatter(
  x = torch.arange(epochs),
  y = VAL_LOSS,
  mode = 'lines',
  line = dict(color='yellow'),
  name = 'Validation Loss'
)

figure = go.Figure(data=[training_curve,valid_curve])
figure.show()

# Loading and Saving model weights in Pytorch

In [33]:
torch.save(
  {'model_state_dict':model.state_dict(),
   'optimizer_state_dict':optimizer.state_dict()},
   'model_and_optimizers.pth'
  )

In [36]:
checkpoint = torch.load('model_and_optimizers.pth',map_location=device)


model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint['model_state_dict'])

optimizer = torch.optim.AdamW(model.parameters(),lr=0.0004,weight_decay =0.1)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.

