In [53]:
import torch
import torch.nn as nn
import tiktoken
import os
from torch.utils.data import DataLoader, Dataset
import numpy as np

In [13]:
file_path = "./DATA/the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

In [14]:
tokenizer = tiktoken.get_encoding("gpt2")

In [15]:
cfg = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [16]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)
        norm_x = (x-mean)/torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [17]:
class Feed_Forward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], cfg["emb_dim"]*4),
            nn.GELU(),
            nn.Linear(cfg["emb_dim"]*4, cfg["emb_dim"])
        )
    def forward(self, x):
        return self.layers(x)

In [89]:
class MultiHead_Attention(nn.Module):
    def __init__(self,d_in,
                 d_out,
                 num_head,
                 dropout,
                 context_length,
                 bias=False):
        super().__init__()
        self.W_Query = nn.Linear(d_in, d_out, bias=bias)
        self.W_Key = nn.Linear(d_in, d_out, bias=bias)
        self.W_Value = nn.Linear(d_in, d_out, bias=bias)
        self.dropout = nn.Dropout(dropout)
        self.num_head = num_head
        self.head_dim = d_out//num_head
        self.d_out = d_out
        self.out_project = nn.Linear(d_out, d_out)

        self. register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_Key(x)
        query = self.W_Query(x)
        value = self.W_Value(x)

        keys = keys.view(b, num_tokens, self.num_head, self.head_dim).transpose(1,2)
        query = query.view(b, num_tokens, self.num_head, self.head_dim).transpose(1,2)
        value = value.view(b, num_tokens, self.num_head, self.head_dim).transpose(1,2)
        
        att_score = query @ keys.transpose(2,3)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        att_score.masked_fill_(mask_bool,-torch.inf)

        att_weight = torch.softmax(att_score/keys.shape[-1]**0.5, dim=-1)
        att_weight = self.dropout(att_weight)
        
        context_vec = (att_weight @ value).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, d_in)
        context_vec = self.out_project(context_vec)
        return context_vec

In [19]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.att = MultiHead_Attention(d_in=cfg['emb_dim'],
                                       d_out=cfg['emb_dim'],
                                       num_head=cfg['n_heads'],
                                       dropout=cfg['drop_rate'],
                                       context_length=cfg['context_length'],
                                       bias=cfg['qkv_bias']
                                       )
        self.feed_forward = Feed_Forward(cfg=cfg)
        self.norm1 = LayerNorm(emb_dim=cfg['emb_dim'])
        self.norm2 = LayerNorm(emb_dim=cfg['emb_dim']) 
        self.shortcut_drop = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.shortcut_drop(x)
        x = x + shortcut

        shortcut = x

        x = self.norm2(x)
        x = self.feed_forward(x)
        x = self.shortcut_drop(x)
        x = x + shortcut

        return x

In [20]:
class GPT_Model(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_block = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )

        self.final_norm = LayerNorm(emb_dim=cfg['emb_dim'])
        self.out_head = nn.Linear(
            cfg['emb_dim'], cfg['vocab_size'], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_length = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_length, device=in_idx.device))
        x = pos_embeds + tok_embeds
        x = self.drop_emb(x)
        x = self.trf_block(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        
        return logits

In [21]:
model = GPT_Model(cfg=cfg)

In [22]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Number of parameters: {total_params:,}")

Total Number of parameters: 162,419,712


In [23]:
total_size_bytes = total_params * 4 #A
total_size_mb = total_size_bytes / (1024 * 1024) #B
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 619.58 MB


# Generating Text

In [24]:
def Generate_Text(model, idx, max_tokens, context_size):
    for _ in range(max_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.inference_mode():
            logits = model(idx_cond)

            logits = logits[:, -1, :]
            probs = torch.softmax(logits, dim=-1)
            next_idx = torch.argmax(probs, dim=-1, keepdim=True)
            idx = torch.cat((idx, next_idx), dim=-1)
    return idx

# Loss

In [25]:
characters_count = len(text)
token_len = len(tokenizer.encode(text))
token_len

5146

In [26]:
class GPT_Dataset(Dataset):
    def __init__(self, txt, context_len, stride, tokenizer):
        super().__init__()

        self.input_idx = []
        self.output_idx = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - context_len, stride):
            input_chunk = token_ids[i : i+context_len]
            output_chunks = token_ids[i+1 : i+context_len+1]

            self.input_idx.append(torch.tensor(input_chunk))
            self.output_idx.append(torch.tensor(output_chunks))

    def __len__(self):
        return len(self.input_idx)
    
    def __getitem__(self, x):
        return self.input_idx[x], self.output_idx[x]

In [27]:
def create_data_loader(txt, context_len=256, stride=128,
                   batch_size=8, shuffle=False, num_workers=0,
                   drop_last=True):
    
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPT_Dataset( txt, context_len, stride, tokenizer)

    data_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return data_loader


In [28]:
train_ratio = 0.9
split_idx = int(train_ratio * len(text))
traing_data = text[:split_idx]
test_data = text[split_idx:]

train_dataloader = create_data_loader(
    traing_data,
    context_len=cfg["context_length"],
    stride=256,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
    )

test_dataloader = create_data_loader(
    test_data,
    context_len=cfg["context_length"],
    stride=256,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True
    )

In [29]:
for x, y in train_dataloader:
    print(x.shape, y.shape)

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [30]:
def corss_entropy_loss(input_batch, target_batch, device, model):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())

    return loss


def corss_entropy_loss_loader(data_loader, model, device, num_batch=None):
    total_loss = 0

    if len(data_loader) == 0:
        return float("nan")
    elif num_batch is None:
        num_batch = len(data_loader)
    else:
        num_batch = min(num_batch, len(data_loader))
    
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batch:
            loss = corss_entropy_loss(input_batch, target_batch, device, model)
            total_loss += loss.item()
        else:
            break
    return total_loss/num_batch

In [31]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = corss_entropy_loss_loader(train_loader, model, device, num_batch=eval_iter)
        val_loss = corss_entropy_loss_loader(val_loader, model, device, num_batch=eval_iter)
    model.train()
    return train_loss, val_loss

In [32]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())


In [33]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = Generate_Text(
            model=model, idx=encoded,
            max_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()

In [34]:
from tqdm import tqdm
def train_model_sample(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_token_seen = [], [], []

    token_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        batch_iter = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
        for input_batch, target_batch in batch_iter:
            optimizer.zero_grad()
            loss = corss_entropy_loss(input_batch, target_batch, device, model)
            loss.backward()
            optimizer.step()
            token_seen += input_batch.numel()
            global_step +=1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_token_seen.append(token_seen)
                print(f"Ep {epoch+1}(Step {global_step:06d}): "
                      f"Train Loss: {train_loss:.3f}, val Loss: {val_loss:.3f}")
                
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_token_seen
            

In [35]:
import time
device = "cuda" if torch.cuda.is_available() else "cpu"
start_time = time.time()

num_epoch = 10

torch.manual_seed(42)
model = GPT_Model(cfg)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
train_losses, val_losses, tokens_seen = train_model_sample(model, train_dataloader, test_dataloader, optimizer,
                                                       device, num_epoch, eval_freq=5, eval_iter=5,
                                                       start_context="Every step moves you", tokenizer=tokenizer)

end_time = time.time()

train_time = (end_time - start_time)/60

print(f"Total trainig time: {train_time}")

Epoch 1/10:  33%|███▎      | 3/9 [00:01<00:01,  3.13it/s]

Ep 1(Step 000000): Train Loss: 10.027, val Loss: 10.131


Epoch 1/10:  89%|████████▉ | 8/9 [00:01<00:00,  6.33it/s]

Ep 1(Step 000005): Train Loss: 8.075, val Loss: 8.356


                                                         

Every step moves you,                                                 


Epoch 2/10:  44%|████▍     | 4/9 [00:00<00:00,  9.60it/s]

Ep 2(Step 000010): Train Loss: 6.784, val Loss: 7.122


                                                         

Ep 2(Step 000015): Train Loss: 6.053, val Loss: 6.650




Every step moves you, the, the, the, the, the, the, the, the.                                 


Epoch 3/10:  44%|████▍     | 4/9 [00:00<00:00,  8.17it/s]

Ep 3(Step 000020): Train Loss: 12.585, val Loss: 13.870


                                                         

Ep 3(Step 000025): Train Loss: 5.597, val Loss: 6.425




Every step moves you I                                                 


Epoch 4/10:  67%|██████▋   | 6/9 [00:00<00:00,  9.42it/s]

Ep 4(Step 000030): Train Loss: 5.313, val Loss: 6.512


                                                         

Ep 4(Step 000035): Train Loss: 5.092, val Loss: 6.526




Every step moves you.  "--I, and he had, and he had. ". Gisburn--I. ". ". ". ", and. ". Gisburn--I, and he was


Epoch 5/10:  67%|██████▋   | 6/9 [00:00<00:00,  8.26it/s]

Ep 5(Step 000040): Train Loss: 4.645, val Loss: 6.387


                                                         

Every step moves you a little was a a little wild--I felt, I had been                                    


Epoch 6/10:  11%|█         | 1/9 [00:00<00:01,  4.94it/s]

Ep 6(Step 000045): Train Loss: 4.527, val Loss: 6.499


Epoch 6/10:  89%|████████▉ | 8/9 [00:00<00:00,  8.28it/s]

Ep 6(Step 000050): Train Loss: 3.851, val Loss: 6.344


                                                         

Every step moves you the last, and--I, and I felt, and I had been of the picture was the fact, in the Riv of the picture, in, in the last, in the picture--as, the donkey.      


Epoch 7/10:  44%|████▍     | 4/9 [00:00<00:00,  9.69it/s]

Ep 7(Step 000055): Train Loss: 3.550, val Loss: 6.260


                                                         

Ep 7(Step 000060): Train Loss: 2.979, val Loss: 6.176




Every step moves you know," was one of the axioms--and so inevitably the                                    


Epoch 8/10:  56%|█████▌    | 5/9 [00:00<00:00,  8.75it/s]

Ep 8(Step 000065): Train Loss: 2.494, val Loss: 6.186


                                                         

Ep 8(Step 000070): Train Loss: 2.513, val Loss: 6.263




Every step moves you in the picture was a little wild--I felt nervous and uncertain.                                    


Epoch 9/10:  67%|██████▋   | 6/9 [00:00<00:00,  9.41it/s]

Ep 9(Step 000075): Train Loss: 1.856, val Loss: 6.303


                                                         

Ep 9(Step 000080): Train Loss: 1.741, val Loss: 6.329




Every step moves you in the inevitable.      "I looked me.                                   


Epoch 10/10:  78%|███████▊  | 7/9 [00:00<00:00,  8.70it/s]

Ep 10(Step 000085): Train Loss: 1.451, val Loss: 6.354


                                                          

Every step moves you?"  "Yes--and by a smile that he had the background of her. "Yes--as! The women had to see a smile behind his close grayish beard--as if he had the donkey. "There were days when I
Total trainig time: 0.45068724552790324


In [36]:
def generate(model, idx, context_len, max_new_tok, top_k, temp=0.0, eos_id = None):

    for _ in range(max_new_tok):
        idx = idx.to(device)
        idx_cond = idx[:, -context_len:]
        with torch.inference_mode():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        top_k_logits, _ = torch.topk(logits, top_k)
        min_val = top_k_logits[:, -1]
        logits = torch.where(condition=logits<min_val,
                                 input=torch.tensor(float("-inf")).to(logits.device),
                                 other=logits)
        if temp>0.0:
            logits = logits/temp
            probs = torch.softmax(logits, dim=-1)
            preds = torch.multinomial(probs, 1)
        else:
            probs = torch.softmax(logits, dim=-1)
            preds = torch.multinomial(probs, 1)

        if preds == eos_id:
            break

        idx = torch.cat((idx, preds), dim= 1)
    return idx

In [74]:
text_gen = generate(model, idx=text_to_token_ids("I went to the park", tokenizer=tokenizer), max_new_tok=10, top_k=25, temp=1.6, context_len=cfg["context_length"])
print(f"output:\n{token_ids_to_text(text_gen, tokenizer)}")

output:
I went to the park Herm mighty up his subject being for you knowinteresting


# Importing pre-trained weights from original gpt-2 model.


In [38]:
from preTrained_weights.load_gpt_2 import load_gpt_2_weights

In [39]:
DIR = "./preTrained_weights"

In [40]:
settings, params = load_gpt_2_weights(dir=DIR)

In [41]:
print(settings)

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}


In [46]:
model_configs = {
    "emb_dim": 768,
    "n_layers": 12,
    "n_heads": 12
}

NEW_CONFIG = cfg.copy()
NEW_CONFIG.update(model_configs)

NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

In [None]:
gpt = GPT_Model(NEW_CONFIG)
gpt.eval()


In [None]:
v = np.split()

In [91]:
params["blocks"][1]["attn"]["c_attn"]["w"].shape

(768, 2304)

In [94]:
def assign(lhs, rhs):
    if lhs.shape != rhs.shape:
        raise ValueError("Shape miss match")
    return nn.Parameter(torch.tensor(rhs))

In [127]:
def load_weights(gpt, params):

    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])

    for b in range(len(params["blocks"])):
        W_q, W_k, W_v = np.split(params["blocks"][b]["attn"]["c_attn"]["w"], 3, axis=-1)
        gpt.trf_block[b].att.W_Query.weight = assign(gpt.trf_block[b].att.W_Query.weight, W_q.T)
        gpt.trf_block[b].att.W_Key.weight = assign(gpt.trf_block[b].att.W_Key.weight, W_k.T)
        gpt.trf_block[b].att.W_Value.weights = assign(gpt.trf_block[b].att.W_Value.weight, W_v.T)

        B_q, B_k, B_v = np.split(params["blocks"][b]["attn"]["c_attn"]["b"], 3, axis=-1)

        gpt.trf_block[b].att.W_Query.bias = assign(gpt.trf_block[b].att.W_Query.bias, B_q)
        gpt.trf_block[b].att.W_Key.bias = assign(gpt.trf_block[b].att.W_Key.bias, B_k)
        gpt.trf_block[b].att.W_Value.bias = assign(gpt.trf_block[b].att.W_Value.bias, B_v)

        gpt.trf_block[b].att.out_project.weight = assign(gpt.trf_block[b].att.out_project.weight, params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_block[b].att.out_project.bias = assign(gpt.trf_block[b].att.out_project.bias, params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_block[b].feed_forward.layers[0].weight = assign(gpt.trf_block[b].feed_forward.layers[0].weight, params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_block[b].feed_forward.layers[2].weight = assign(gpt.trf_block[b].feed_forward.layers[2].weight, params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_block[b].feed_forward.layers[0].bias = assign(gpt.trf_block[b].feed_forward.layers[0].bias, params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_block[b].feed_forward.layers[2].bias = assign(gpt.trf_block[b].feed_forward.layers[2].bias, params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_block[b].norm1.scale = assign(gpt.trf_block[b].norm1.scale, params["blocks"][b]["ln_1"]["g"])
        gpt.trf_block[b].norm1.shift = assign(gpt.trf_block[b].norm1.shift, params["blocks"][b]["ln_1"]["b"])
        gpt.trf_block[b].norm2.scale = assign(gpt.trf_block[b].norm2.scale, params["blocks"][b]["ln_2"]["g"])
        gpt.trf_block[b].norm2.shift = assign(gpt.trf_block[b].norm2.shift, params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

    print("Weights have been loaded successfully....")

In [128]:
load_weights(gpt, params)
gpt.to(device)

Weights have been loaded successfully....


GPT_Model(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (att): MultiHead_Attention(
        (W_Query): Linear(in_features=768, out_features=768, bias=True)
        (W_Key): Linear(in_features=768, out_features=768, bias=True)
        (W_Value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_project): Linear(in_features=768, out_features=768, bias=True)
      )
      (feed_forward): Feed_Forward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (shortcut_drop): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHead_Attention(
     

In [163]:
torch.manual_seed(42)
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tok=25,
    context_len=cfg["context_length"],
    top_k=50,
    temp=1.5
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you for the only that-gives, and hads. When anor/The problem at night time - at a (
