In [1]:
import torch
import torch.nn as nn
import tiktoken
import os
from torch.utils.data import DataLoader, Dataset

In [2]:
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
cfg = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [4]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)
        norm_x = (x-mean)/torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [5]:
class Feed_Forward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], cfg["emb_dim"]*4),
            nn.GELU(),
            nn.Linear(cfg["emb_dim"]*4, cfg["emb_dim"])
        )
    def forward(self, x):
        return self.layers(x)

In [6]:
class MultiHead_Attention(nn.Module):
    def __init__(self,d_in,
                 d_out,
                 num_head,
                 dropout,
                 context_length,
                 bias=False):
        super().__init__()
        self.W_Query = nn.Linear(d_in, d_out, bias=bias)
        self.W_Key = nn.Linear(d_in, d_out, bias=bias)
        self.W_Value = nn.Linear(d_in, d_out, bias=bias)
        self.dropout = nn.Dropout(dropout)
        self.num_head = num_head
        self.head_dim = d_out//num_head
        self.d_out = d_out
        self.out_project = nn.Linear(d_out, d_out)

        self. register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_Key(x)
        query = self.W_Query(x)
        value = self.W_Value(x)

        keys = keys.view(b, num_tokens, self.num_head, self.head_dim).transpose(1,2)
        query = query.view(b, num_tokens, self.num_head, self.head_dim).transpose(1,2)
        value = value.view(b, num_tokens, self.num_head, self.head_dim).transpose(1,2)
        
        att_score = query @ keys.transpose(2,3)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        att_score.masked_fill_(mask_bool,-torch.inf)

        att_weight = torch.softmax(att_score/keys.shape[-1]**0.5, dim=-1)
        att_weight = self.dropout(att_weight)
        
        context_vec = (att_weight @ value).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, d_in)
        context_vec = self.out_project(context_vec)
        return context_vec

In [7]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.att = MultiHead_Attention(d_in=cfg['emb_dim'],
                                       d_out=cfg['emb_dim'],
                                       num_head=cfg['n_heads'],
                                       dropout=cfg['drop_rate'],
                                       context_length=cfg['context_length'],
                                       bias=cfg['qkv_bias']
                                       )
        self.feed_forward = Feed_Forward(cfg=cfg)
        self.norm1 = LayerNorm(emb_dim=cfg['emb_dim'])
        self.norm2 = LayerNorm(emb_dim=cfg['emb_dim']) 
        self.shortcut_drop = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.shortcut_drop(x)
        x = x + shortcut

        shortcut = x

        x = self.norm2(x)
        x = self.feed_forward(x)
        x = self.shortcut_drop(x)
        x = x + shortcut

        return x

In [8]:
torch.manual_seed(42)
x = torch.randn(2,4,768)
transformer = TransformerBlock(cfg=cfg)
print(f"Input: {x.shape}\nOutput: {transformer(x).shape}")


Input: torch.Size([2, 4, 768])
Output: torch.Size([2, 4, 768])


In [9]:
torch.manual_seed(42)
emb = nn.Embedding(num_embeddings=10, embedding_dim=10)

print("Embedding weight shape:", emb.weight.shape)  # (10, 3)

tokens = torch.tensor([0, 1, 2, 3])
out = emb(tokens)

print("Tokens:", tokens)
print(f"Output shape: \n{out}")  # (4, 3)


Embedding weight shape: torch.Size([10, 10])
Tokens: tensor([0, 1, 2, 3])
Output shape: 
tensor([[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784, -1.2345, -0.0431, -1.6047,
         -0.7521,  1.6487],
        [-0.3925, -1.4036, -0.7279, -0.5594, -0.7688,  0.7624,  1.6423, -0.1596,
         -0.4974,  0.4396],
        [-0.7581,  1.0783,  0.8008,  1.6806,  1.2791,  1.2964,  0.6105,  1.3347,
         -0.2316,  0.0418],
        [-0.2516,  0.8599, -1.3847, -0.8712, -0.2234,  1.7174,  0.3189, -0.4245,
          0.3057, -0.7746]], grad_fn=<EmbeddingBackward0>)


In [10]:
class GPT_Model(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_block = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )

        self.final_norm = LayerNorm(emb_dim=cfg['emb_dim'])
        self.out_head = nn.Linear(
            cfg['emb_dim'], cfg['vocab_size'], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_length = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_length, device=in_idx.device))
        x = pos_embeds + tok_embeds
        x = self.drop_emb(x)
        x = self.trf_block(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        
        return logits

In [11]:
torch.manual_seed(42)
inputs = torch.tensor(
  [123, 465, 789, 120])
batch = torch.stack((inputs, inputs), dim=0)
batch

tensor([[123, 465, 789, 120],
        [123, 465, 789, 120]])

In [12]:
model = GPT_Model(cfg=cfg)
a = model(batch)
a.shape

torch.Size([2, 4, 50257])

In [13]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Number of parameters: {total_params:,}")

Total Number of parameters: 162,419,712


In [14]:
total_size_bytes = total_params * 4 #A
total_size_mb = total_size_bytes / (1024 * 1024) #B
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 619.58 MB


# Generating Text

In [None]:
def Generate_Text(model, idx, max_tokens, context_size):
    for _ in range(max_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.inference_mode():
            logits = model(idx_cond)

            logits = logits[:, -1, :]
            probs = torch.softmax(logits, dim=-1)
            next_idx = torch.argmax(probs, dim=-1, keepdim=True)
            idx = torch.cat((idx, next_idx), dim=-1)
    return idx

In [None]:
start_context = "what is there"
encoded = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded).unsqueeze(dim=0)
print(encoded_tensor.shape)

In [None]:
model.eval() #A
out = Generate_Text(
model=model,
idx=encoded_tensor,
max_tokens=50,
context_size=cfg["context_length"]
)
print("Output:", out)
print("Output length:", len(out[0]))

In [None]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

# loss calculation

In [None]:
inputs = torch.tensor([[16899, 3626, 6100],
                       [40, 1107, 588]])

targets = torch.tensor([[3626, 6100, 345],
                        [1107, 588, 11311]])

In [None]:
targets[0]

In [None]:
torch.manual_seed(42)
model.eval()
with torch.inference_mode():
    logits = model(inputs)
probs = torch.softmax(logits, dim=-1)
print(f"Logits: {probs}")

In [None]:
outputs = torch.argmax(probs, dim=-1, keepdim=True)
outputs

In [None]:
def token_to_text(tok_ids, tokenizer):
    flatn = tok_ids.squeeze(0)
    return tokenizer.decode(flatn.tolist())


In [None]:
token_to_text(outputs[0].flatten(), tokenizer)

In [None]:
batch = 0
prob_target_1 = probs[batch, [0,1,2], targets[batch]]
prob_target_2 = probs[batch+1, [0,1,2], targets[batch+1]]
log_prob = torch.log(torch.cat((prob_target_1, prob_target_2)))
log_prob

In [None]:
log_avg = torch.mean(log_prob)
negative_log = log_avg * -1
negative_log

In [None]:
flat_logits = logits.flatten(0, 1)
flat_logits.shape
flat_target = targets.flatten()

In [None]:
loss = torch.nn.functional.cross_entropy(flat_logits, flat_target)
loss

In [None]:
perplexity = torch.exp(loss)
perplexity

In [None]:
logits.shape

In [None]:
print(flat_logits.shape)  # should be (batch*tokens, 56258)
print(flat_target.max())  # should be < 56258


In [None]:
V = 56258
print("Upper bound:", V)
print("Loss upper bound (random):", torch.log(torch.tensor(V, dtype=torch.float)))


In [None]:
print(loss.item())  # what's the numeric value?

# Real Loss

In [15]:
file_path = "./DATA/the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

In [16]:
characters_count = len(text)
token_len = len(tokenizer.encode(text))
token_len

5146

In [17]:
class GPT_Dataset(Dataset):
    def __init__(self, txt, context_len, stride, tokenizer):
        super().__init__()

        self.input_idx = []
        self.output_idx = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - context_len, stride):
            input_chunk = token_ids[i : i+context_len]
            output_chunks = token_ids[i+1 : i+context_len+1]

            self.input_idx.append(torch.tensor(input_chunk))
            self.output_idx.append(torch.tensor(output_chunks))

    def __len__(self):
        return len(self.input_idx)
    
    def __getitem__(self, x):
        return self.input_idx[x], self.output_idx[x]

In [18]:
def create_data_loader(txt, context_len=256, stride=128,
                   batch_size=4, shuffle=False, num_workers=0,
                   drop_last=True):
    
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPT_Dataset( txt, context_len, stride, tokenizer)

    data_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return data_loader


In [19]:
train_ratio = 0.9
split_idx = int(train_ratio * len(text))
traing_data = text[:split_idx]
test_data = text[split_idx:]

train_dataloader = create_data_loader(
    traing_data,
    context_len=cfg["context_length"],
    stride=256,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
    )

test_dataloader = create_data_loader(
    test_data,
    context_len=cfg["context_length"],
    stride=256,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
    )

In [20]:
for x, y in train_dataloader:
    print(x.shape, y.shape)

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [24]:
def corss_entropy_loss(input_batch, target_batch, device, model):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())

    return loss


def corss_entropy_loss_loader(data_loader, model, device, num_batch=None):
    total_loss = 0

    if len(data_loader) == 0:
        return float("nan")
    elif num_batch is None:
        num_batch = len(data_loader)
    else:
        num_batch = min(num_batch, len(data_loader))
    
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batch:
            loss = corss_entropy_loss(input_batch, target_batch, device, model)
            total_loss += loss.item()
        else:
            break
    return total_loss/num_batch

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

model.eval()

with torch.inference_mode():
    train_loss = corss_entropy_loss_loader(train_dataloader, model=model, device=device)

print(f"Training Loss: {train_loss}")

Training Loss: 11.00371954176161
