In [1]:
import torch
import torch.nn as nn
import tiktoken
import os
from torch.utils.data import DataLoader, Dataset

In [2]:
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
cfg = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [4]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)
        norm_x = (x-mean)/torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [5]:
class Feed_Forward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], cfg["emb_dim"]*4),
            nn.GELU(),
            nn.Linear(cfg["emb_dim"]*4, cfg["emb_dim"])
        )
    def forward(self, x):
        return self.layers(x)

In [6]:
class MultiHead_Attention(nn.Module):
    def __init__(self,d_in,
                 d_out,
                 num_head,
                 dropout,
                 context_length,
                 bias=False):
        super().__init__()
        self.W_Query = nn.Linear(d_in, d_out, bias=bias)
        self.W_Key = nn.Linear(d_in, d_out, bias=bias)
        self.W_Value = nn.Linear(d_in, d_out, bias=bias)
        self.dropout = nn.Dropout(dropout)
        self.num_head = num_head
        self.head_dim = d_out//num_head
        self.d_out = d_out
        self.out_project = nn.Linear(d_out, d_out)

        self. register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_Key(x)
        query = self.W_Query(x)
        value = self.W_Value(x)

        keys = keys.view(b, num_tokens, self.num_head, self.head_dim).transpose(1,2)
        query = query.view(b, num_tokens, self.num_head, self.head_dim).transpose(1,2)
        value = value.view(b, num_tokens, self.num_head, self.head_dim).transpose(1,2)
        
        att_score = query @ keys.transpose(2,3)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        att_score.masked_fill_(mask_bool,-torch.inf)

        att_weight = torch.softmax(att_score/keys.shape[-1]**0.5, dim=-1)
        att_weight = self.dropout(att_weight)
        
        context_vec = (att_weight @ value).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, d_in)
        context_vec = self.out_project(context_vec)
        return context_vec

In [7]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.att = MultiHead_Attention(d_in=cfg['emb_dim'],
                                       d_out=cfg['emb_dim'],
                                       num_head=cfg['n_heads'],
                                       dropout=cfg['drop_rate'],
                                       context_length=cfg['context_length'],
                                       bias=cfg['qkv_bias']
                                       )
        self.feed_forward = Feed_Forward(cfg=cfg)
        self.norm1 = LayerNorm(emb_dim=cfg['emb_dim'])
        self.norm2 = LayerNorm(emb_dim=cfg['emb_dim']) 
        self.shortcut_drop = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.shortcut_drop(x)
        x = x + shortcut

        shortcut = x

        x = self.norm2(x)
        x = self.feed_forward(x)
        x = self.shortcut_drop(x)
        x = x + shortcut

        return x

In [8]:
torch.manual_seed(42)
x = torch.randn(2,4,768)
transformer = TransformerBlock(cfg=cfg)
print(f"Input: {x.shape}\nOutput: {transformer(x).shape}")


Input: torch.Size([2, 4, 768])
Output: torch.Size([2, 4, 768])


In [9]:
torch.manual_seed(42)
emb = nn.Embedding(num_embeddings=10, embedding_dim=10)

print("Embedding weight shape:", emb.weight.shape)  # (10, 3)

tokens = torch.tensor([0, 1, 2, 3])
out = emb(tokens)

print("Tokens:", tokens)
print(f"Output shape: \n{out.shape}")  # (4, 3)


Embedding weight shape: torch.Size([10, 10])
Tokens: tensor([0, 1, 2, 3])
Output shape: 
torch.Size([4, 10])


In [10]:
class GPT_Model(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_block = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )

        self.final_norm = LayerNorm(emb_dim=cfg['emb_dim'])
        self.out_head = nn.Linear(
            cfg['emb_dim'], cfg['vocab_size'], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_length = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_length, device=in_idx.device))
        x = pos_embeds + tok_embeds
        x = self.drop_emb(x)
        x = self.trf_block(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        
        return logits

In [11]:
torch.manual_seed(42)
inputs = torch.tensor(
  [123, 465, 789, 120])
batch = torch.stack((inputs, inputs), dim=0)
batch.shape

torch.Size([2, 4])

In [12]:
model = GPT_Model(cfg=cfg)
a = model(batch)
a[:, -1, :]

tensor([[ 0.4387,  0.6203,  0.4267,  ...,  1.2549, -0.0543,  0.2579],
        [ 0.4399,  0.1083,  0.9101,  ...,  0.9163, -0.1864,  0.1465]],
       grad_fn=<SliceBackward0>)

In [13]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Number of parameters: {total_params:,}")

Total Number of parameters: 162,419,712


In [14]:
total_size_bytes = total_params * 4 #A
total_size_mb = total_size_bytes / (1024 * 1024) #B
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 619.58 MB


# Generating Text

In [15]:
def Generate_Text(model, idx, max_tokens, context_size):
    for _ in range(max_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.inference_mode():
            logits = model(idx_cond)

            logits = logits[:, -1, :]
            probs = torch.softmax(logits, dim=-1)
            next_idx = torch.argmax(probs, dim=-1, keepdim=True)
            idx = torch.cat((idx, next_idx), dim=-1)
    return idx

In [16]:
start_context = "what is there"
encoded = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded).unsqueeze(dim=0)
print(encoded_tensor.shape)

torch.Size([1, 3])


In [17]:
model.eval() #A
out = Generate_Text(
model=model,
idx=encoded_tensor,
max_tokens=1,
context_size=cfg["context_length"]
)
print("Output:", out.shape)
print("Output length:", len(out[0]))

Output: torch.Size([1, 4])
Output length: 4


In [18]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

what is thereacid


# loss calculation

In [19]:
inputs = torch.tensor([[16899, 3626, 6100],
                       [40, 1107, 588]])

targets = torch.tensor([[3626, 6100, 345],
                        [1107, 588, 11311]])

In [20]:
targets[0]

tensor([3626, 6100,  345])

In [21]:
torch.manual_seed(42)
model.eval()
with torch.inference_mode():
    logits = model(inputs)
probs = torch.softmax(logits, dim=-1)
print(f"Logits: {probs}")

Logits: tensor([[[1.9280e-05, 1.9655e-05, 1.6457e-05,  ..., 1.7058e-05,
          7.0627e-06, 1.1659e-05],
         [2.7763e-05, 1.3189e-05, 1.3139e-05,  ..., 7.3908e-06,
          7.5067e-06, 6.5229e-05],
         [1.6172e-05, 1.0602e-05, 3.1717e-05,  ..., 1.6024e-05,
          3.5050e-06, 3.7510e-05]],

        [[2.2604e-05, 2.3814e-05, 1.7510e-05,  ..., 2.0320e-05,
          2.4596e-05, 3.4492e-05],
         [1.3749e-05, 2.5618e-05, 9.0479e-06,  ..., 1.1684e-05,
          2.4652e-05, 7.0355e-05],
         [4.1211e-05, 6.7314e-06, 1.4997e-05,  ..., 7.6690e-06,
          7.1872e-06, 6.0617e-06]]])


In [22]:
outputs = torch.argmax(probs, dim=-1, keepdim=True)
outputs

tensor([[[  718],
         [44449],
         [40115]],

        [[29716],
         [40825],
         [19647]]])

In [23]:
def token_to_text(tok_ids, tokenizer):
    flatn = tok_ids.squeeze(0)
    return tokenizer.decode(flatn.tolist())


In [24]:
token_to_text(outputs[0].flatten(), tokenizer)

' 6 relativity HO'

In [25]:
batch = 0
prob_target_1 = probs[batch, [0,1,2], targets[batch]]
prob_target_2 = probs[batch+1, [0,1,2], targets[batch+1]]
log_prob = torch.log(torch.cat((prob_target_1, prob_target_2)))
log_prob

tensor([-11.4088, -10.3796, -11.5414, -11.2733, -10.4320, -11.9387])

In [26]:
log_avg = torch.mean(log_prob)
negative_log = log_avg * -1
negative_log

tensor(11.1623)

In [27]:
flat_logits = logits.flatten(0, 1)
flat_logits.shape
flat_target = targets.flatten()

In [28]:
loss = torch.nn.functional.cross_entropy(flat_logits, flat_target)
loss

tensor(11.1623)

In [29]:
perplexity = torch.exp(loss)
perplexity

tensor(70424.2812)

In [30]:
logits.shape

torch.Size([2, 3, 50257])

In [31]:
print(flat_logits.shape)  # should be (batch*tokens, 56258)
print(flat_target.max())  # should be < 56258


torch.Size([6, 50257])
tensor(11311)


In [32]:
V = 56258
print("Upper bound:", V)
print("Loss upper bound (random):", torch.log(torch.tensor(V, dtype=torch.float)))


Upper bound: 56258
Loss upper bound (random): tensor(10.9377)


In [33]:
print(loss.item())  # what's the numeric value?

11.162293434143066


# Real Loss

In [34]:
file_path = "./DATA/the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

In [35]:
characters_count = len(text)
token_len = len(tokenizer.encode(text))
token_len

5146

In [36]:
class GPT_Dataset(Dataset):
    def __init__(self, txt, context_len, stride, tokenizer):
        super().__init__()

        self.input_idx = []
        self.output_idx = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - context_len, stride):
            input_chunk = token_ids[i : i+context_len]
            output_chunks = token_ids[i+1 : i+context_len+1]

            self.input_idx.append(torch.tensor(input_chunk))
            self.output_idx.append(torch.tensor(output_chunks))

    def __len__(self):
        return len(self.input_idx)
    
    def __getitem__(self, x):
        return self.input_idx[x], self.output_idx[x]

In [37]:
def create_data_loader(txt, context_len=256, stride=128,
                   batch_size=8, shuffle=False, num_workers=0,
                   drop_last=True):
    
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPT_Dataset( txt, context_len, stride, tokenizer)

    data_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return data_loader


In [38]:
train_ratio = 0.9
split_idx = int(train_ratio * len(text))
traing_data = text[:split_idx]
test_data = text[split_idx:]

train_dataloader = create_data_loader(
    traing_data,
    context_len=cfg["context_length"],
    stride=256,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
    )

test_dataloader = create_data_loader(
    test_data,
    context_len=cfg["context_length"],
    stride=256,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
    )

In [39]:
for x, y in train_dataloader:
    print(x.shape, y.shape)

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [40]:
def corss_entropy_loss(input_batch, target_batch, device, model):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())

    return loss


def corss_entropy_loss_loader(data_loader, model, device, num_batch=None):
    total_loss = 0

    if len(data_loader) == 0:
        return float("nan")
    elif num_batch is None:
        num_batch = len(data_loader)
    else:
        num_batch = min(num_batch, len(data_loader))
    
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batch:
            loss = corss_entropy_loss(input_batch, target_batch, device, model)
            total_loss += loss.item()
        else:
            break
    return total_loss/num_batch

In [41]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = corss_entropy_loss_loader(train_loader, model, device, num_batch=eval_iter)
        val_loss = corss_entropy_loss_loader(val_loader, model, device, num_batch=eval_iter)
    model.train()
    return train_loss, val_loss

In [71]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())


In [43]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = Generate_Text(
            model=model, idx=encoded,
            max_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()

In [44]:
from tqdm import tqdm
def train_model_sample(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_token_seen = [], [], []

    token_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        batch_iter = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
        for input_batch, target_batch in batch_iter:
            optimizer.zero_grad()
            loss = corss_entropy_loss(input_batch, target_batch, device, model)
            loss.backward()
            optimizer.step()
            token_seen += input_batch.numel()
            global_step +=1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_token_seen.append(token_seen)
                print(f"Ep {epoch+1}(Step {global_step:06d}): "
                      f"Train Loss: {train_loss:.3f}, val Loss: {val_loss:.3f}")
                
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_token_seen
            

In [45]:
import time
device = "cuda" if torch.cuda.is_available() else "cpu"
start_time = time.time()

num_epoch = 10

torch.manual_seed(42)
model = GPT_Model(cfg)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
train_losses, val_losses, tokens_seen = train_model_sample(model, train_dataloader, test_dataloader, optimizer,
                                                       device, num_epoch, eval_freq=5, eval_iter=5,
                                                       start_context="Every step moves you", tokenizer=tokenizer)

end_time = time.time()

train_time = (end_time - start_time)/60

print(f"Total trainig time: {train_time}")

Epoch 1/10:  33%|███▎      | 3/9 [00:00<00:01,  5.66it/s]

Ep 1(Step 000000): Train Loss: 10.027, val Loss: 10.131


Epoch 1/10:  89%|████████▉ | 8/9 [00:01<00:00,  7.80it/s]

Ep 1(Step 000005): Train Loss: 8.075, val Loss: 8.356


                                                         

Every step moves you,                                                 


Epoch 2/10:  44%|████▍     | 4/9 [00:00<00:00,  9.57it/s]

Ep 2(Step 000010): Train Loss: 6.784, val Loss: 7.122


                                                         

Ep 2(Step 000015): Train Loss: 6.053, val Loss: 6.650




Every step moves you, the, the, the, the, the, the, the, the.                                 


Epoch 3/10:  56%|█████▌    | 5/9 [00:00<00:00,  8.60it/s]

Ep 3(Step 000020): Train Loss: 12.585, val Loss: 13.870


                                                         

Ep 3(Step 000025): Train Loss: 5.597, val Loss: 6.425




Every step moves you I                                                 


Epoch 4/10:  67%|██████▋   | 6/9 [00:00<00:00,  9.15it/s]

Ep 4(Step 000030): Train Loss: 5.313, val Loss: 6.512


                                                         

Ep 4(Step 000035): Train Loss: 5.092, val Loss: 6.526




Every step moves you.  "--I, and he had, and he had. ". Gisburn--I. ". ". ". ", and. ". Gisburn--I, and he was


Epoch 5/10:  78%|███████▊  | 7/9 [00:00<00:00,  8.59it/s]

Ep 5(Step 000040): Train Loss: 4.645, val Loss: 6.387


                                                         

Every step moves you a little was a a little wild--I felt, I had been                                    


Epoch 6/10:  11%|█         | 1/9 [00:00<00:01,  4.90it/s]

Ep 6(Step 000045): Train Loss: 4.527, val Loss: 6.499


Epoch 6/10:  89%|████████▉ | 8/9 [00:00<00:00,  8.54it/s]

Ep 6(Step 000050): Train Loss: 3.851, val Loss: 6.344


                                                         

Every step moves you the last, and--I, and I felt, and I had been of the picture was the fact, in the Riv of the picture, in, in the last, in the picture--as, the donkey.      


Epoch 7/10:  44%|████▍     | 4/9 [00:00<00:00,  9.16it/s]

Ep 7(Step 000055): Train Loss: 3.550, val Loss: 6.260


                                                         

Ep 7(Step 000060): Train Loss: 2.979, val Loss: 6.176




Every step moves you know," was one of the axioms--and so inevitably the                                    


Epoch 8/10:  56%|█████▌    | 5/9 [00:00<00:00,  8.49it/s]

Ep 8(Step 000065): Train Loss: 2.494, val Loss: 6.186


                                                         

Ep 8(Step 000070): Train Loss: 2.513, val Loss: 6.263




Every step moves you in the picture was a little wild--I felt nervous and uncertain.                                    


Epoch 9/10:  67%|██████▋   | 6/9 [00:00<00:00,  9.01it/s]

Ep 9(Step 000075): Train Loss: 1.856, val Loss: 6.303


                                                         

Ep 9(Step 000080): Train Loss: 1.741, val Loss: 6.329




Every step moves you in the inevitable.      "I looked me.                                   


Epoch 10/10:  78%|███████▊  | 7/9 [00:00<00:00,  8.65it/s]

Ep 10(Step 000085): Train Loss: 1.451, val Loss: 6.354


                                                          

Every step moves you?"  "Yes--and by a smile that he had the background of her. "Yes--as! The women had to see a smile behind his close grayish beard--as if he had the donkey. "There were days when I
Total trainig time: 0.28940397103627524


In [84]:
def generate(model, idx, context_len, max_new_tok, top_k, temp=0.0, eos_id = None):

    for _ in range(max_new_tok):
        idx = idx.to(device)
        idx_cond = idx[:, -context_len:]
        with torch.inference_mode():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        top_k_logits, _ = torch.topk(logits, top_k)
        min_val = top_k_logits[:, -1]
        logits = torch.where(condition=logits<min_val,
                                 input=torch.tensor(float("-inf")).to(logits.device),
                                 other=logits)
        if temp>0.0:
            logits = logits/temp
            probs = torch.softmax(logits, dim=-1)
            preds = torch.multinomial(probs, 1)
        else:
            probs = torch.softmax(logits, dim=-1)
            preds = torch.multinomial(probs, 1)

        if preds == eos_id:
            break

        idx = torch.cat((idx, preds), dim= 1)
    return idx

In [88]:
text_gen = generate(model, idx=text_to_token_ids("I went to the park", tokenizer=tokenizer), max_new_tok=10, top_k=25, temp=1.6, context_len=cfg["context_length"])
print(f"output:\n{token_ids_to_text(text_gen, tokenizer)}")

output:
I went to the park-century one of that the factoms he res
