In [1]:
import torch
import torch.nn as nn

In [43]:
GPT2_124M_CFG={
    'dropout':0.1,
    'n_layers':12,
    'n_heads':12,
    'emb_size':768,
    'context_length':1024,
    'vocab_size':50257
}

In [44]:
# norm = x-mu/sqrt(var).         # to prevent interval covarinat shift and probide model stability while training
# norm = scale * norm + shift
class LayerNormalization(nn.Module):
  def __init__(self,emb_size):
       super().__init__()
       self.scale = nn.Parameter(torch.ones(emb_size))
       self.shift = nn.Parameter(torch.ones(emb_size))
       self.eps = 1e-5

  def forward(self,x):
      mean = torch.mean(x,dim=-1,keepdim=True)
      var = torch.var(x,dim=-1,keepdim=True,unbiased=False)
      return self.scale* (x-mean)/torch.sqrt(var+self.eps) + self.shift

In [45]:
class GELU(nn.Module):
  def __init__(self):
    super().__init__()
  def forward(self,x):
    return 0.5 * x *(1+ torch.tanh(torch.tensor((2/torch.pi)**0.5))*(x+0.044715*x**3))
    # return 0.5*x*(1+torch.tanh(torch.tensor((2/torch.pi))**0.5)*(x+0.044715*x**3))

In [46]:
class FeedForward(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.layers = nn.Sequential(
        nn.Linear(cfg['emb_size'],4*cfg['emb_size']),
        GELU(),
        nn.Linear(4*cfg['emb_size'],cfg['emb_size'])
    )
  def forward(self,x):
        return self.layers(x)

In [47]:
class MultiheadAttention(nn.Module):
  def __init__(self,din,dout,n_heads,context_length,dropout,qkv_bias=False):
        super().__init__()
        self.w_queries = nn.Linear(din,dout,qkv_bias)
        self.w_keys = nn.Linear(din,dout,qkv_bias)
        self.w_values = nn.Linear(din,dout,qkv_bias)
        self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length),diagonal=1))
        self.out_layer = nn.Linear(dout,dout)
        self.n_heads= n_heads
        self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    batch, context_length, emb_size = x.shape
    queries = self.w_queries(x)
    keys = self.w_keys(x)
    values = self.w_values(x)
    head_dim = emb_size//self.n_heads

    queries = queries.view(batch,context_length,self.n_heads,head_dim)
    keys = keys.view(batch,context_length,self.n_heads,head_dim)
    values = values.view(batch,context_length,self.n_heads,head_dim)
     # b,cl,nheads,hd.


     # b, nheads, cl, hd
    queries = queries.transpose(1,2)
    keys = keys.transpose(1,2)
    values = values.transpose(1,2)

    attention_scores = queries @ keys.transpose(2,3)
       #b, nheads, cl,cl

    attention_scores.masked_fill(self.mask.bool()[:context_length,:context_length],-torch.inf)

    attention_weights = torch.softmax(attention_scores/(keys.shape[-1])**0.5, dim =-1)
    attention_weights = self.dropout(attention_weights)
    context_vectors = (attention_weights @ values).transpose(1,2)
    # b, nheads,cl,cl.     b,nheads, cl, hd
    # b, nheads, cl,hd.     .T -> b, cl,nheads,hd
    context_vectors = context_vectors.contiguous().view(batch, context_length, emb_size)
    context_vectors = self.out_layer(context_vectors)
    return context_vectors

In [48]:
class Transformer(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.ff = FeedForward(cfg)
    self.dropout = nn.Dropout(cfg['dropout'])
    self.norm1 = LayerNormalization(cfg['emb_size'])
    self.norm2 = LayerNormalization(cfg['emb_size'])
    self.att = MultiheadAttention(
        din = cfg['emb_size'],
        dout = cfg['emb_size'],
        n_heads=cfg['n_heads'],
        context_length = cfg['context_length'],
        dropout = cfg['dropout'])

  def forward(self,x):
    shortcut = x
    x = self.norm1(x)
    x= self.att(x)
    x= self.dropout(x)
    x= shortcut+x

    shortcut = x
    x = self.norm2(x)
    x= self.ff(x)
    x= self.dropout(x)
    x= shortcut+x

    return x

In [49]:
# input = torch.rand(2,3,768)
# block= Transformer(GPT2_124M_CFG)
# block.forward(input)

In [50]:
class GPT2(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.tok_emb = nn.Embedding(cfg['vocab_size'],cfg['emb_size'])
    self.pos_emb = nn.Embedding(cfg['context_length'],cfg['emb_size'])
    self.out_head = nn.Linear(cfg['emb_size'],cfg['vocab_size'],bias=False)
    self.drop_emb = nn.Dropout(cfg['dropout'])
    self.blocks = nn.Sequential(
        *[Transformer(cfg) for _ in range(cfg['n_layers'])]
    )
    self.final_norm = LayerNormalization(cfg['emb_size'])

  def forward(self,x):
    batch, n_tokens = x.shape
    token_embed = self.tok_emb(x)
    position_embed = self.pos_emb(torch.arange(n_tokens,device=x.device))
    x =  token_embed + position_embed
    x= self.drop_emb(x)
    x= self.blocks(x)
    x= self.final_norm(x)
    logits= self.out_head(x)
    return logits

In [51]:
torch.manual_seed(123)
inputs = torch.tensor([
    [6109,3626,6100,345],
    [6109,1110,6622,257]
])
model = GPT2(GPT2_124M_CFG)
logits = model.forward(inputs)
print(logits)

tensor([[[-1.5969,  1.9405, -0.1171,  ...,  0.2547,  1.6430, -0.6668],
         [-1.5396,  1.7169, -0.3379,  ...,  0.9980,  2.3826, -0.3360],
         [-0.7847,  2.3319,  0.0410,  ...,  1.4021,  1.1724, -0.1535],
         [-1.3134,  2.6708,  0.1153,  ...,  1.4573,  1.5512, -0.9585]],

        [[-1.9150,  1.8283, -0.3165,  ...,  0.2451,  1.5907, -0.9045],
         [-1.4808,  1.8762,  0.2355,  ...,  0.8987,  1.7394,  0.3615],
         [-0.4588,  2.2907,  0.3171,  ...,  0.8931,  1.5476, -0.3596],
         [-1.2323,  1.2671,  0.3746,  ...,  2.0974,  1.6325, -0.2108]]],
       grad_fn=<UnsafeViewBackward0>)


In [52]:
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,}")

163,009,536


In [53]:
def get_text_simple(idx,model,max_words,context_length):
  model.eval()
  for _ in range(max_words):
   idx_new = idx[:,-context_length:]
   with torch.no_grad():
    logits = model(idx_new)
   temp = logits[:,-1,:]
   temp= torch.softmax(temp,dim=-1)
   next = torch.argmax(temp,dim=-1,keepdim=True)
   idx = torch.cat((idx,next),dim=-1)
  return idx

In [54]:
torch.manual_seed(123)
inputs = torch.tensor([
    [6109,3626,6100,345],
    [6109,1110,6622,257]
])
next = get_text_simple(inputs,model,3,GPT2_124M_CFG['context_length'])
print(next)

tensor([[ 6109,  3626,  6100,   345,  1919, 21117, 12162],
        [ 6109,  1110,  6622,   257, 41927,   343, 26097]])


In [55]:
GPT2_small_cl={
    'dropout':0.1,
    'n_layers':12,
    'n_heads':12,
    'emb_size':768,
    'context_length':256,
    'vocab_size':50257
}

In [56]:
input = torch.tensor([
    [6109,  3626,  6100,   345],
    [3626,  6100,   345, 120]
])
target = torch.tensor([
    [3626,  6100,   345,120],
    [6100,   345, 120, 460]
])

In [57]:
logits = model.forward(input)

In [58]:
logits.flatten(0,1)

tensor([[-1.9094,  1.8976, -0.5901,  ...,  0.2374,  1.3902, -0.7221],
        [-1.2764,  1.6230, -0.2013,  ...,  1.0815,  2.3195,  0.1054],
        [-0.9390,  2.4984, -0.2993,  ...,  1.4939,  1.3567,  0.1068],
        ...,
        [-1.1970,  2.2058, -0.6729,  ...,  1.5247,  1.4166,  0.3063],
        [-1.1747,  2.5625,  0.0838,  ...,  0.7747,  1.6181, -0.9742],
        [-1.3902,  1.6654,  0.0133,  ...,  1.6345,  1.8741, -0.4281]],
       grad_fn=<ViewBackward0>)

In [59]:
logits

tensor([[[-1.9094,  1.8976, -0.5901,  ...,  0.2374,  1.3902, -0.7221],
         [-1.2764,  1.6230, -0.2013,  ...,  1.0815,  2.3195,  0.1054],
         [-0.9390,  2.4984, -0.2993,  ...,  1.4939,  1.3567,  0.1068],
         [-1.0291,  2.5541,  0.0081,  ...,  1.6322,  1.5161, -0.8314]],

        [[-1.3862,  1.6441, -0.3678,  ...,  0.6324,  2.0297, -0.3362],
         [-1.1970,  2.2058, -0.6729,  ...,  1.5247,  1.4166,  0.3063],
         [-1.1747,  2.5625,  0.0838,  ...,  0.7747,  1.6181, -0.9742],
         [-1.3902,  1.6654,  0.0133,  ...,  1.6345,  1.8741, -0.4281]]],
       grad_fn=<UnsafeViewBackward0>)

In [60]:
loss = nn.functional.cross_entropy(logits.flatten(0,1),target.flatten())

In [61]:
loss

tensor(11.3743, grad_fn=<NllLossBackward0>)

In [62]:
perplexity = torch.exp(loss)

In [63]:
perplexity

tensor(87053.2422, grad_fn=<ExpBackward0>)

In [64]:
with open('/content/theverdict.txt',encoding='utf-8') as file:
        txt_data = file.read()

In [65]:
txt_data

'The verdict\nEdith wharton\nI had always thought Jack Gisburn rather a cheap genius--though a good fellow\nenough--so it was no great surprise to me to hear that, in the height of his glory, he\nhad dropped his painting, married a rich widow, and established himself in a villa\non the Riviera. (Though I rather thought it would have been Rome or Florence.)\n"The height of his glory"--that was what the women called it. I can hear Mrs.\nGideon Thwing--his last Chicago sitter--deploring his unaccountable abdication.\n"Of course it\'s going to send the value of my picture \'way up; but I don\'t think of\nthat, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s\nlips, multiplied its RS as though they were reflected in an endless vista of mirrors.\nAnd it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia\nCroft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-\ndancers" to say, with tears in her eyes: "We shall not look up

In [66]:
print(txt_data[:100])

The verdict
Edith wharton
I had always thought Jack Gisburn rather a cheap genius--though a good fel


In [67]:
!pip install -q tiktoken

In [68]:
import tiktoken

In [69]:
tokenizer = tiktoken.get_encoding('gpt2')

In [70]:
total_tokens = tokenizer.encode(txt_data)

In [71]:
len(total_tokens)

5314

In [72]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  def __init__(self,txt,tokenizer,context_length,stride):
      self.input_ids = []
      self.target_ids = []

      token_ids = tokenizer.encode(txt)
      for i in range(0,len(token_ids)-context_length,stride):
        self.input_ids.append(torch.tensor(token_ids[i:i+context_length]))
        self.target_ids.append(torch.tensor(token_ids[i+1:i+context_length+1]))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self,idx):
        return self.input_ids[idx],self.target_ids[idx]



def create_dataloader_v1(txt,batch_size=4,max_length=256,stride=128,shuffle=True,drop_last=True,num_workers=0):
  tokenizer = tiktoken.get_encoding("gpt2")
  dataset = GPTDatasetV1(txt,tokenizer,max_length,stride)
  dataloader = DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers
  )
  return dataloader

In [73]:
train_ratio = 0.9
split_idx = int(train_ratio*len(txt_data))
train_data = txt_data[:split_idx]
valid_data = txt_data[split_idx+1:]

torch.manual_seed(123)

train_loader = create_dataloader_v1(
    txt=txt_data,
    batch_size=2,
    max_length=GPT2_small_cl['context_length'],
    stride=GPT2_small_cl['context_length'],
    shuffle=True,
    drop_last=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    txt=valid_data,
    batch_size=2,
    max_length=GPT2_small_cl['context_length'],
    stride=GPT2_small_cl['context_length'],
    shuffle=True,
    drop_last=True,
    num_workers=0
)


In [74]:
for x,y in train_loader:
  print(x.shape,y.shape)

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [75]:
for x,y in val_loader:
  print(x.shape,y.shape)

torch.Size([2, 256]) torch.Size([2, 256])


In [76]:
def calc_loss_batch(input_batch,target_batch,model,device):
   input_batch,target_batch = input_batch.to(device),target_batch.to(device)
   logits = model(input_batch)
   return nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())

def clac_class_loader(data_loader,model,device,num_batches = None):
  total_loss =0.
  if len(data_loader)==0:
    return float("nan")
  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches,len(data_loader))
  for i,(input_batch,target_batch) in enumerate(data_loader):
    if i<num_batches:
      loss = calc_loss_batch(input_batch,target_batch,model,device)
      total_loss += loss.item()
    else:
      break
  return total_loss/num_batches



In [77]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
torch.manual_seed(123)
with torch.no_grad():
  train_loss = clac_class_loader(train_loader,model,device)
  val_loss = clac_class_loader(val_loader,model,device)

print("training loss:",train_loss)
print("valid loss:",val_loss)

training loss: 11.054170894622803
valid loss: 11.078924179077148


In [78]:
def evaluate_model(model,train_loader,val_loader,device,eval_iter):
  model.eval()
  with torch.no_grad():
    train_loss = clac_class_loader(train_loader,model,device,eval_iter)
    val_loss = clac_class_loader(val_loader,model,device,eval_iter)
  model.train()
  return train_loss, val_loss

In [79]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"

In [80]:
def generate_and_print_sample(model,tokenizer,device,start_context):
  model.eval()
  context_size = model.pos_emb.weight.shape[0]
  encoded = text_to_token_ids(start_context,tokenizer).to(device)
  with torch.no_grad():
    token_ids = get_text_simple(idx=encoded,model=model,max_words=50,context_length=context_size)
  decoded_text = token_ids_to_text(token_ids,tokenizer)
  print(decoded_text.replace("\n"," "))
  model.train()

In [81]:
def train_model_simple(model,train_loader,val_loader,
                       optimizer,device,num_epochs,eval_freq,
                       eval_iter,start_context,tokenizer):
  train_losses, val_losses, track_tokens_seen = [],[],[]
  tokens_seen, global_step=0,-1

  for epoch in range(num_epochs):
      model.train()

      for input_batch, target_batch in train_loader:
          optimizer.zero_grad()
          loss = calc_loss_batch(input_batch,target_batch,model,device)
          loss.backward()
          optimizer.step()
          tokens_seen += input_batch.numel()
          global_step += 1

          if global_step % eval_freq == 0:
              train_loss, val_loss = evaluate_model(
                  model,train_loader,val_loader,device,eval_iter)
              train_losses.append(train_loss)
              val_losses.append(val_loss)
              track_tokens_seen.append(tokens_seen)
              print(f"Epoch {epoch+1} (Step {global_step})")
              print(f"Train loss:{train_loss} Valid loss:{val_loss}")

      generate_and_print_sample(
            model,tokenizer,device,start_context
      )
  return train_losses, val_losses, track_tokens_seen

# def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
#                        eval_freq, eval_iter, start_context, tokenizer):
#     # Initialize lists to track losses and tokens seen
#     train_losses, val_losses, track_tokens_seen = [], [], []
#     tokens_seen, global_step = 0, -1

#     # Main training loop
#     for epoch in range(num_epochs):
#         model.train()  # Set model to training mode

#         for input_batch, target_batch in train_loader:
#             optimizer.zero_grad() # Reset loss gradients from previous batch iteration
#             loss = calc_loss_batch(input_batch, target_batch, model, device)
#             loss.backward() # Calculate loss gradients
#             optimizer.step() # Update model weights using loss gradients
#             tokens_seen += input_batch.numel() # Returns the total number of elements (or tokens) in the input_batch.
#             global_step += 1

#             # Optional evaluation step
#             if global_step % eval_freq == 0:
#                 train_loss, val_loss = evaluate_model(
#                     model, train_loader, val_loader, device, eval_iter)
#                 train_losses.append(train_loss)
#                 val_losses.append(val_loss)
#                 track_tokens_seen.append(tokens_seen)
#                 print(f"Ep {epoch+1} (Step {global_step:06d}): "
#                       f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

#         # Print a sample text after each epoch
#         generate_and_print_sample(
#             model, tokenizer, device, start_context
#         )

#     return train_losses, val_losses, track_tokens_seen

In [84]:
import time
start_time = time.time()
# torch.manual_seed(123)
model =GPT2(GPT2_small_cl)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=0.0004, weight_decay=0.1)
num_epochs=30
train_losses, val_losses, track_tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device, num_epochs,
    eval_freq=5, eval_iter=5, start_context = "Every effort moves you",
    tokenizer=tokenizer)
end_time = time.time()
print((end_time-start_time)/60)

Epoch 1 (Step 0)
Train loss:4.629014682769776 Valid loss:5.365590572357178
Epoch 1 (Step 5)
Train loss:4.4085033416748045 Valid loss:5.4297943115234375
Every effort moves you--and that he was the "--and that he was "--I had the "--I had the "I had been the "I had been the "I--I had been the "I had been
Epoch 2 (Step 10)
Train loss:4.159240865707398 Valid loss:5.262295722961426
Epoch 2 (Step 15)
Train loss:3.6877690315246583 Valid loss:5.14303731918335
Every effort moves you know. "I had always of the "Oh, and Mrs. "--his, and "I was "I was "Oh, and "I was "I didn't "I was "
Epoch 3 (Step 20)
Train loss:3.55321044921875 Valid loss:5.114974498748779
Epoch 3 (Step 25)
Train loss:3.329805374145508 Valid loss:5.184820652008057
Every effort moves you know my way he was a little me--and I was that I had been to see the "Oh, I felt "I had been the "I turned-rooms the "I had been was not that I was "
Epoch 4 (Step 30)
Train loss:2.9092750549316406 Valid loss:5.0409955978393555
Epoch 4 (Step 35)