In [1]:
import torch
import torch.nn as nn
import tiktoken


In [2]:
# Load tiktoken's "gpt2" encoding; used further down to encode the prompt
# and to decode the generated token ids back into text.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Hyper-parameters shared by every model component in this notebook.
cfg = dict(
    vocab_size=50257,      # rows of the token embedding / width of the output logits
    context_length=1024,   # rows of the positional embedding and side of the causal mask
    emb_dim=768,           # embedding width used throughout the model
    n_heads=12,            # attention heads per transformer block
    n_layers=12,           # number of stacked transformer blocks
    drop_rate=0.1,         # dropout probability (embeddings, attention weights, shortcuts)
    qkv_bias=False,        # whether the query/key/value projections have a bias term
)

In [4]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable affine transform.

    Each vector along the last axis is normalised to zero mean and unit
    variance, then multiplied by ``scale`` and offset by ``shift``.

    Args:
        emb_dim: size of the last dimension of the inputs; also the length of
            the learnable ``scale`` (initialised to ones) and ``shift``
            (initialised to zeros) parameters.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance to avoid division by zero.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension and apply scale/shift.

        Args:
            x: tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # Layer norm uses the biased (divide-by-N) variance. torch.var defaults
        # to the unbiased N-1 estimator, which mis-scales the output and yields
        # NaN when the last dimension has a single element.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [5]:
class Feed_Forward(nn.Module):
    """Two-layer MLP: expand to 4x ``emb_dim``, apply GELU, project back.

    Args:
        cfg: mapping that provides the integer ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = width * 4
        # Build the linears in expand -> project order so seeded
        # initialisation consumes the RNG in the same sequence.
        expand = nn.Linear(width, hidden)
        project = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, nn.GELU(), project)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``; shape is preserved."""
        return self.layers(x)

In [6]:
class MultiHead_Attention(nn.Module):
    """Causal (masked) multi-head self-attention.

    Args:
        d_in: size of the last dimension of the input.
        d_out: total output width; it is split evenly across the heads.
        num_head: number of attention heads; must divide ``d_out``.
        dropout: dropout probability applied to the attention weights.
        context_length: side length of the pre-built causal mask, i.e. the
            longest sequence the mask covers.
        bias: whether the query/key/value projections carry a bias term.

    Raises:
        ValueError: if ``d_out`` is not divisible by ``num_head``.
    """

    def __init__(self,d_in,
                 d_out,
                 num_head,
                 dropout,
                 context_length,
                 bias=False):
        super().__init__()
        # Fail early with a clear message instead of a reshape error in forward().
        if d_out % num_head != 0:
            raise ValueError(
                f"d_out ({d_out}) must be divisible by num_head ({num_head})"
            )
        self.W_Query = nn.Linear(d_in, d_out, bias=bias)
        self.W_Key = nn.Linear(d_in, d_out, bias=bias)
        self.W_Value = nn.Linear(d_in, d_out, bias=bias)
        self.dropout = nn.Dropout(dropout)
        self.num_head = num_head
        self.head_dim = d_out // num_head
        self.d_out = d_out
        # Mixes the concatenated head outputs.
        self.out_project = nn.Linear(d_out, d_out)

        # Ones strictly above the diagonal mark the "future" positions that a
        # query token is not allowed to attend to.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
        )

    def forward(self, x):
        """Compute causal self-attention.

        Args:
            x: tensor of shape ``(batch, num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(batch, num_tokens, d_out)``.
        """
        b, num_tokens, _ = x.shape

        keys = self.W_Key(x)
        query = self.W_Query(x)
        value = self.W_Value(x)

        # (b, tokens, d_out) -> (b, heads, tokens, head_dim)
        keys = keys.view(b, num_tokens, self.num_head, self.head_dim).transpose(1, 2)
        query = query.view(b, num_tokens, self.num_head, self.head_dim).transpose(1, 2)
        value = value.view(b, num_tokens, self.num_head, self.head_dim).transpose(1, 2)

        # (b, heads, tokens, tokens) raw scores; future positions are set to
        # -inf so that softmax assigns them zero weight.
        att_score = query @ keys.transpose(2, 3)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        att_score.masked_fill_(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax.
        att_weight = torch.softmax(att_score / keys.shape[-1] ** 0.5, dim=-1)
        att_weight = self.dropout(att_weight)

        # (b, heads, tokens, head_dim) -> (b, tokens, heads, head_dim)
        context_vec = (att_weight @ value).transpose(1, 2)
        # Merge the heads back into d_out. Using d_in here is only correct when
        # d_in == d_out and crashes otherwise.
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_project(context_vec)
        return context_vec

In [7]:
class TransformerBlock(nn.Module):
    """One transformer block: normalise -> sub-layer -> dropout -> residual add, twice.

    The first sub-layer is causal multi-head attention, the second is the
    feed-forward MLP. The same dropout module is applied to both branches.

    Args:
        cfg: mapping providing ``emb_dim``, ``n_heads``, ``drop_rate``,
            ``context_length`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        drop_p = cfg['drop_rate']

        self.att = MultiHead_Attention(
            d_in=width,
            d_out=width,
            num_head=cfg['n_heads'],
            dropout=drop_p,
            context_length=cfg['context_length'],
            bias=cfg['qkv_bias'],
        )
        self.feed_forward = Feed_Forward(cfg=cfg)
        self.norm1 = LayerNorm(emb_dim=width)
        self.norm2 = LayerNorm(emb_dim=width)
        self.shortcut_drop = nn.Dropout(drop_p)

    def forward(self, x):
        """Run both residual sub-layers; the output has the same shape as ``x``."""
        # Attention branch added back onto its own input.
        attended = self.shortcut_drop(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward branch added back onto the attention result.
        transformed = self.shortcut_drop(self.feed_forward(self.norm2(x)))
        return transformed + x

In [8]:
# Smoke test: a transformer block should return a tensor of the same shape it was given.
torch.manual_seed(42)
x = torch.randn(2,4,768)  # 2 sequences, 4 tokens each, 768 features (matches cfg["emb_dim"])
transformer = TransformerBlock(cfg=cfg)
print(f"Input: {x.shape}\nOutput: {transformer(x).shape}")


Input: torch.Size([2, 4, 768])
Output: torch.Size([2, 4, 768])


In [9]:
# Standalone demo of nn.Embedding as a lookup table with 10 rows of 10 values each.
torch.manual_seed(42)
emb = nn.Embedding(num_embeddings=10, embedding_dim=10)

print("Embedding weight shape:", emb.weight.shape)  # (10, 10): num_embeddings x embedding_dim

tokens = torch.tensor([0, 1, 2, 3])
out = emb(tokens)  # one row per token id -> shape (4, 10)

print("Tokens:", tokens)
# NOTE: despite the "Output shape" label, this prints the tensor values, not the shape.
print(f"Output shape: \n{out}")  # values of the (4, 10) lookup result


Embedding weight shape: torch.Size([10, 10])
Tokens: tensor([0, 1, 2, 3])
Output shape: 
tensor([[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784, -1.2345, -0.0431, -1.6047,
         -0.7521,  1.6487],
        [-0.3925, -1.4036, -0.7279, -0.5594, -0.7688,  0.7624,  1.6423, -0.1596,
         -0.4974,  0.4396],
        [-0.7581,  1.0783,  0.8008,  1.6806,  1.2791,  1.2964,  0.6105,  1.3347,
         -0.2316,  0.0418],
        [-0.2516,  0.8599, -1.3847, -0.8712, -0.2234,  1.7174,  0.3189, -0.4245,
          0.3057, -0.7746]], grad_fn=<EmbeddingBackward0>)


In [10]:
class GPT_Model(nn.Module):
    """Decoder-only language model: embeddings -> transformer blocks -> norm -> logits.

    Args:
        cfg: mapping providing ``vocab_size``, ``context_length``, ``emb_dim``,
            ``drop_rate`` and ``n_layers`` (plus whatever ``TransformerBlock``
            reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        vocab = cfg['vocab_size']

        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg['context_length'], width)
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_block = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim=width)
        # Projects hidden states to one logit per vocabulary entry (no bias).
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, in_idx):
        """Map token ids to next-token logits.

        Args:
            in_idx: 2-D integer tensor of token ids, ``(batch, seq_length)``.

        Returns:
            Logits of shape ``(batch, seq_length, vocab_size)``.
        """
        # Unpacking also enforces that the input is exactly 2-D.
        _, seq_length = in_idx.shape

        token_part = self.tok_emb(in_idx)
        # Position ids are created on the same device as the input ids.
        positions = torch.arange(seq_length, device=in_idx.device)
        position_part = self.pos_emb(positions)

        hidden = self.drop_emb(position_part + token_part)
        hidden = self.trf_block(hidden)
        return self.out_head(self.final_norm(hidden))

In [11]:
# Seed first so the randomly initialised model created next is reproducible.
torch.manual_seed(42)
# Toy batch: the same four token ids repeated as two rows.
inputs = torch.tensor([123, 465, 789, 120])
batch = torch.stack([inputs, inputs])
batch

tensor([[123, 465, 789, 120],
        [123, 465, 789, 120]])

In [12]:
# Build the full model from cfg; weights are randomly initialised (seed set in the previous cell).
model = GPT_Model(cfg=cfg)
# Forward pass on the toy batch. eval() has not been called yet, so the model
# is still in PyTorch's default training mode and dropout is active here.
a = model(batch)
a.shape  # (batch, tokens, vocab_size)

torch.Size([2, 4, 50257])

In [13]:
# Count every learnable scalar in the model.
# The total includes out_head's own vocab_size x emb_dim weight matrix, because
# GPT_Model creates it as a separate nn.Linear rather than sharing tok_emb's weights.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Number of parameters: {total_params:,}")

Total Number of parameters: 163,009,536


In [14]:
# Memory needed to store the parameters. Bytes per value are read from each
# tensor (element_size) instead of hard-coding 4, so the figure stays correct
# if the model is cast to another dtype. Registered buffers are not counted.
total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
# Convert bytes to megabytes using 1024 * 1024 bytes per MB.
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.83 MB


In [15]:
def Generate_Text(model, idx, max_tokens, context_size):
    """Greedily extend ``idx`` by ``max_tokens`` token ids.

    Args:
        model: callable mapping a ``(batch, tokens)`` id tensor to logits of
            shape ``(batch, tokens, vocab)``.
        idx: ``(batch, tokens)`` tensor of starting token ids.
        max_tokens: number of ids to append to each row.
        context_size: only the last ``context_size`` ids are fed to the model
            at each step.

    Returns:
        ``idx`` with ``max_tokens`` extra columns. If ``max_tokens`` is 0 the
        input tensor is returned as is.
    """
    for _step in range(max_tokens):
        # Keep only the most recent ids that fit into the model's context.
        window = idx[:, -context_size:]
        with torch.inference_mode():
            # Only the prediction at the last position decides the next token.
            last_logits = model(window)[:, -1, :]
            distribution = torch.softmax(last_logits, dim=-1)
            # Greedy choice: the most probable id, kept as a (batch, 1) column.
            next_idx = distribution.argmax(dim=-1, keepdim=True)
            idx = torch.cat([idx, next_idx], dim=-1)
    return idx

In [24]:
# Turn the prompt into token ids and add a leading batch dimension for the model.
start_context = "what is there"
encoded = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded).unsqueeze(dim=0)  # (num_tokens,) -> (1, num_tokens)
print(encoded_tensor.shape)

torch.Size([1, 3])


In [25]:
# Switch to evaluation mode so the dropout layers are inactive while generating.
model.eval()
# Append 50 greedily chosen tokens to the encoded prompt.
out = Generate_Text(
    model=model,
    idx=encoded_tensor,
    max_tokens=50,
    context_size=cfg["context_length"],
)
print("Output:", out)
print("Output length:", len(out[0]))

Output: tensor([[10919,   318,   612, 47485,  2382, 34721, 49493, 48338, 40333,  2054,
         45447, 38546, 39627, 11858, 43620, 39638, 46384,   768, 44825, 47034,
         13424,  4278, 40866, 41178, 21856,  6970, 18509,  9107, 44508, 33203,
         42179, 23475, 49606, 49986, 39536, 41052, 21831,  5198, 19760, 14388,
         22785, 13127,  1002, 13187, 29694, 10246, 16488, 21437, 16052, 37811,
         15283, 39308, 27211]])
Output length: 53


In [26]:
# Drop the batch dimension and map the generated token ids back to text.
# No training has been run in the cells above, so the continuation comes from
# randomly initialised weights.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

what is there Dickinsonamer adherenceRunneraretz saliva tre Jonahethnic Communism gods ExtrasrackAggning...? despicableblackties altersRoom nicotine knowing Gnzer wildfireliber risked Edinburgh Acernecessary soften_( fox appeal Yuk FisherSpeed placement If ecosystem +/-PP axis payload coordinator""" sealedsav420
