In [6]:
import torch
import tiktoken
import torch.nn as nn

In [19]:
cfg = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 256, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [17]:
class LayerNorm(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.eps = 1e-5
        
    def forward(self,x):
        self.mean = torch.mean(x,dim=-1,keepdim=True)
        self.var = torch.var(x,dim=-1,unbiased=False,keepdim=True)
        self.shift = nn.Parameter(torch.zeros(x.size()[-1]))
        self.scale = nn.Parameter(torch.ones(x.size()[-1]))
        norm_x = (x-self.mean)/(self.var+self.eps)
        scaled_norm_x = self.scale*norm_x + self.shift
        return scaled_norm_x
    
x = torch.tensor([[1,2],[3,4]])
ln = LayerNorm()
mean = torch.mean(ln(x.float()),dim=-1)
print(mean)

tensor([0., 0.], grad_fn=<MeanBackward1>)


In [9]:
class GELU(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))
        
class FeedForward(torch.nn.Module):
    def __init__(self,embed_size):
        super().__init__()
        self.nwt = torch.nn.Sequential(torch.nn.Linear(embed_size,4*embed_size),
                   GELU(),
                   torch.nn.Linear(4*embed_size,embed_size))
        
    def forward(self,x):
        return self.nwt(x)
        

In [15]:
x = torch.rand(2,3,768)
ff = FeedForward(768)
x = ff(x)
print(x.shape)
print(ff.nwt[0].weight)

torch.Size([2, 3, 768])
Parameter containing:
tensor([[ 0.0317,  0.0104,  0.0269,  ..., -0.0217, -0.0332, -0.0078],
        [ 0.0092, -0.0069, -0.0194,  ..., -0.0120,  0.0200, -0.0191],
        [-0.0184,  0.0054, -0.0235,  ..., -0.0077,  0.0056,  0.0142],
        ...,
        [-0.0277,  0.0077,  0.0311,  ...,  0.0030, -0.0343,  0.0110],
        [-0.0012,  0.0244, -0.0054,  ...,  0.0124,  0.0205, -0.0307],
        [-0.0267,  0.0097, -0.0140,  ..., -0.0276, -0.0281,  0.0155]],
       requires_grad=True)


In [3]:
%pwd

'/Users/mukulagarwal/Desktop/Projects/transformers/making_LLM_from_scratch/Notebooks'

In [4]:
%cd ..

/Users/mukulagarwal/Desktop/Projects/transformers/making_LLM_from_scratch


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [22]:
from Scripts.mha_attention import MultiHeadAttention
class TransformerBlock(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.ln1 = LayerNorm()
        self.ln2 = LayerNorm()
        self.ff = FeedForward(cfg['emb_dim'])
        self.dropout = torch.nn.Dropout(cfg['drop_rate'])
        self.mha = MultiHeadAttention(
            d_in = cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            context_length=cfg['context_length'],
            dropout=cfg['drop_rate'],
            num_heads=cfg['n_heads'],
            qkv_bias=cfg['qkv_bias']
        )
        
    def forward(self,x):
        shortcut = x
        norm_x = self.ln1(x)
        mha_x = self.mha(norm_x)
        drop_x1 = self.dropout(mha_x)
        x = drop_x1 + shortcut
        
        shortcut = x
        norm_x_2 = self.ln2(x)
        ff_x = self.ff(norm_x_2)
        drop_x2 = self.dropout(ff_x)
        x = drop_x2 + shortcut 
        
        return x

In [23]:
x = torch.rand(2,3,768)
trf = TransformerBlock(cfg)
x = trf(x)
print(x.shape)
#print(ff.nwt[0].weight)

torch.Size([2, 3, 768])


In [36]:
class GPT(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.token_embeddings = torch.nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
        self.positional_embeddings = torch.nn.Embedding(cfg['context_length'],cfg['emb_dim'])
        self.dropout = torch.nn.Dropout(cfg['drop_rate'])
        self.trf = torch.nn.Sequential(
            *[
                TransformerBlock(cfg)
                for _ in range(cfg['n_layers'])
            ]
        )
        
        self.final_ln = LayerNorm()
        self.out_lin = torch.nn.Linear(cfg['emb_dim'],cfg['vocab_size'])
        
    def forward(self,x):
        b,seq_length = x.size()
        token_embeddings = self.token_embeddings(x)
        positional_embeddings = self.positional_embeddings(torch.arange(seq_length))
        embeddings = token_embeddings + positional_embeddings
        embed_drop = self.dropout(embeddings)
        out_trf = self.trf(embed_drop)
        norm_out = self.final_ln(out_trf)
        logits = self.out_lin(norm_out)
        return logits

In [40]:
txt1 = "Every effort moves you"
txt2 = "Hello my name is"

tokenizer = tiktoken.get_encoding("gpt2")
txt1_token = torch.tensor(tokenizer.encode(txt1))
txt2_token = torch.tensor(tokenizer.encode(txt2))
tokens = torch.stack((txt1_token,txt2_token))
tokens

tensor([[ 6109,  3626,  6100,   345],
        [15496,   616,  1438,   318]])

In [41]:
gpt = GPT(cfg)
gpt(tokens)

tensor([[[-0.3799,  0.5583, -0.0717,  ..., -0.4488,  0.2797, -0.0444],
         [ 0.0397,  0.3390, -0.0437,  ..., -0.1328, -0.1661, -0.2715],
         [-0.4680, -0.0273, -0.1700,  ...,  0.1494,  0.5256, -0.3046],
         [-0.1864, -0.1921, -0.0889,  ...,  0.1119,  0.3309, -0.4785]],

        [[-0.3054,  0.4613, -0.0851,  ..., -0.6252,  0.2391,  0.0135],
         [ 0.2196, -0.2426,  0.0326,  ...,  0.4261, -0.1267, -0.4724],
         [-0.8503, -0.1443, -0.9523,  ...,  0.2352,  0.2361, -0.2981],
         [-0.0036, -0.7287, -0.4211,  ..., -0.0189,  0.1912, -0.4896]]],
       grad_fn=<ViewBackward0>)

In [60]:
total_params = sum(p.numel() for p in gpt.parameters())
total_params

162469969

In [57]:
outlin_para = sum(p.numel() for p in gpt.out_lin.parameters())
total_params = sum(p.numel() for p in gpt.parameters()) - outlin_para

In [58]:
print(sum(p.numel() for p in trf.ff.parameters()))
print(sum(p.numel() for p in trf.mha.parameters()))

4722432
2360064


In [61]:
(total_params*4)/ (1024*1024)

619.7737464904785

In [231]:
def generate_simple_text(model,idx,max_tokens,content_size):
    for _ in range(max_tokens):
        #idx = idx[:,-content_size:]
        logits = model(idx)[:,-1,:]
        probas = torch.softmax(logits,dim=-1)
        token_id = torch.argmax(probas,dim=-1,keepdim=True)
        idx = torch.cat((idx,token_id),dim=1)
    return idx

idx = generate_simple_text(gpt,tokens,8,256)
idx

tensor([[ 6109,  3626,  6100,   345, 14042, 32086, 26777, 33848,  4828, 34735,
         12604, 18698],
        [15496,   616,  1438,   318, 37358,  1330, 25106, 30851, 26559, 20022,
         50145, 43669]])

In [77]:
print(
    tokenizer.decode(idx[1].tolist())
)

Hello my name is tale awful unfamiliar widation Makoto descriptor Houth


##### Chapter - 5

In [None]:
def text_to_token_ids(text,tokenizer):
    tokens = tokenizer.encode(text,allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(tokens).unsqueeze(0)   ## Adding Batch Dimension
    return encoded_tensor

def token_ids_to_text(token_ids,tokenizer):
    token_ids_flat = token_ids.squeeze(0)
    text = tokenizer.decode(token_ids_flat.tolist())
    return text

start_context = "Every Effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

idx = generate_simple_text(
    gpt,
    idx = text_to_token_ids(start_context,tokenizer),
    max_tokens=10,
    content_size=256
)

token_ids_to_text(idx,tokenizer)

'Every Effort moves you coloured Rum soldier verifying░ Rather Only Siem cancel antis'

In [93]:
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort movesyou"
txt2 = "I really like chocolate"
token_ids_1 = torch.tensor(tokenizer.encode(txt1))
token_ids_2 = torch.tensor(tokenizer.encode(txt2))
inputs = torch.stack((token_ids_1[:3],token_ids_2[:3]),dim=0)
print(inputs)
print()
targets = torch.stack((token_ids_1[-3:],token_ids_2[-3:]),dim=0)
print(targets)

tensor([[6109, 3626, 6100],
        [  40, 1107,  588]])

tensor([[ 3626,  6100,  5832],
        [ 1107,   588, 11311]])


In [94]:
inputs

tensor([[6109, 3626, 6100],
        [  40, 1107,  588]])

In [95]:
targets

tensor([[ 3626,  6100,  5832],
        [ 1107,   588, 11311]])

In [120]:
model = GPT(cfg)
with torch.no_grad():
    logits = model(inputs)
    
probas = torch.softmax(logits,dim = -1)
print(probas.shape)

torch.Size([2, 3, 50257])


In [98]:
next_wrd = torch.argmax(probas,dim=-1,keepdim=True)
next_wrd

tensor([[[ 9136],
         [ 4374],
         [39159]],

        [[42019],
         [ 2947],
         [12854]]])

In [105]:
token_ids_to_text(next_wrd[0].flatten(),tokenizer)

' substanceocaljam'

In [109]:

text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]] 
print("Text 1:", target_probas_1)
text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]] 
print("Text 2:", target_probas_2)

Text 1: tensor([3.2870e-05, 1.5803e-05, 3.8227e-05])
Text 2: tensor([1.5449e-05, 2.8701e-05, 1.8449e-05])


In [111]:
log_probas = torch.log(torch.cat((target_probas_1,target_probas_2)))
log_probas

tensor([-10.3230, -11.0553, -10.1720, -11.0780, -10.4586, -10.9005])

In [113]:
avg_log_loss = -1* torch.mean(log_probas)
avg_log_loss

tensor(10.6645)

In [121]:
print(logits.shape)
print(targets.shape)

torch.Size([2, 3, 50257])
torch.Size([6])


In [122]:
logits = logits.flatten(start_dim=0,end_dim=1)
targets = targets.flatten()
print(logits.shape)
print(targets.shape)

torch.Size([6, 50257])
torch.Size([6])


In [125]:
print(logits.shape)
print(targets.shape)

torch.Size([6, 50257])
torch.Size([6])


In [126]:
torch.nn.functional.cross_entropy(logits,targets)

tensor(10.9151)

In [128]:
loss = torch.nn.CrossEntropyLoss()
loss = loss(logits,targets)

In [142]:
with open("/Users/mukulagarwal/Desktop/Projects/transformers/making_LLM_from_scratch/the-verdict.txt") as file:
    text = file.read()
    
train_ratio = int(len(text)*0.90)
train_data = text[:train_ratio]
val_data = text[train_ratio:]

In [143]:
from Scripts.text_preprocessing import create_dataloader
trail_dl = create_dataloader(
    text = train_data,
    batch_size=2,
    maxlength=6,
    stride=6
)
for i, (input_id,target_id) in enumerate(trail_dl):
    print(input_id)
    print(target_id)
    break

tensor([[   40,   367,  2885,  1464,  1807,  3619],
        [  402,   271, 10899,  2138,   257,  7026]])
tensor([[  367,  2885,  1464,  1807,  3619,   402],
        [  271, 10899,  2138,   257,  7026, 15632]])


In [152]:
from Scripts.text_preprocessing import create_dataloader
train_dl = create_dataloader(
    text = train_data,
    batch_size=2,
    maxlength=cfg['context_length'],
    stride=cfg['context_length'],
    shuffle=True,
    num_workers=0
)

val_dl = create_dataloader(
    text = val_data,
    batch_size=2,
    maxlength=cfg['context_length'],
    stride=cfg['context_length'],
    shuffle=False,
    num_workers=0
)

for i, (input_id,target_id) in enumerate(train_dl):
    print(input_id.shape,target_id.shape)

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [213]:
def calc_batch_loss(input_batch,target_batch,model,device):
    input_batch = input_batch.to(device=device)
    target_batch = target_batch.to(device=device)
    
    logits = model(input_batch)
        
    loss = torch.nn.functional.cross_entropy(logits.flatten(start_dim=0,end_dim=1),
                                             target_batch.flatten())
    return loss

In [190]:
def calc_loss_loader(data_loader,model,device,num_batches=None):
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches,len(data_loader))
        
    loss_batch = []
        
    for i, (inputs_id,target_id) in enumerate(data_loader):
        if i < num_batches:
            logits = model(inputs_id)
            
            loss = torch.nn.functional.cross_entropy(logits.flatten(start_dim=0,end_dim=1),
                                             target_id.flatten())
            loss_batch.append(loss)
        else:
            break
        
    return torch.mean(torch.tensor(loss_batch)).item()        

In [191]:
model = GPT(cfg)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
model.to(device)
with torch.no_grad():
    train_loss = calc_loss_loader(train_dl, model, device) 
    val_loss = calc_loss_loader(val_dl, model, device)
    print("Training loss:", train_loss)
    print("Validation loss:", val_loss)

Training loss: 10.877686500549316
Validation loss: 10.884920120239258


In [192]:
def evaluate_model(model,train_loader,val_loader,device,eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader,model,device)
        val_loss = calc_loss_loader(val_loader,model,device)
        
    model.train()
    return train_loss,val_loss

In [232]:
def generate_and_print_simple_text(start_context,tokenizer,model):
    model.eval()
    idx = torch.tensor(tokenizer.encode(start_context)).unsqueeze(0)
    content_size = 256
    output_id = generate_simple_text(model,idx,max_tokens=50,
                                     content_size = content_size)
    text = token_ids_to_text(output_id,tokenizer)
    print(text.replace("\n"," "))   
    model.train()

In [235]:
tokenizer = tiktoken.get_encoding("gpt2")
torch.manual_seed(123)
def training_gpt(model,train_dl,val_dl,optimizer,device,
                 eval_freq,eval_iter,num_epochs,
                 start_context = "Every effort moves", tokenizer=tokenizer):
    track_train_loss,track_val_loss,track_token_count = [],[],[]
    tokens_seen,global_step_id = 0, -1
    
    model.train()
    for epoch in range(num_epochs):
        for input_ids,target_ids in train_dl:
            optimizer.zero_grad()
            
            loss = calc_batch_loss(input_ids,target_ids,model,device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_ids.numel()
            global_step_id += 1
            
            if global_step_id%eval_freq == 0:
                train_loss,val_loss = evaluate_model(model,train_dl,val_dl,device,eval_iter)
                track_train_loss.append(train_loss)
                track_val_loss.append(track_val_loss)
                track_token_count.append(tokens_seen)
                
                print(f"Ep {epoch+1} (Step {global_step_id:06d}): "
                 f"Train loss {train_loss:.3f}, "
                 f"Val loss {val_loss:.3f}")
                
        generate_and_print_simple_text(start_context,tokenizer,model)
    return track_train_loss,track_val_loss,track_token_count

In [233]:
torch.manual_seed(123)
model = GPT(cfg)
model.to(device)
device = torch.device("cpu")
optimizer = torch.optim.AdamW(
     model.parameters(),
    lr=0.0004, weight_decay=0.1
)
train_losses, val_losses, tokens_seen= training_gpt(model, train_dl, val_dl, optimizer, device,
                                                    eval_freq=5, eval_iter=5,num_epochs=10,
                                                    start_context="Every effort moves you", tokenizer=tokenizer )


Ep 1 (Step 000000): Train loss 10.392, Val loss 10.480
Ep 1 (Step 000005): Train loss 9.539, Val loss 9.842
Every effort moves you, the, the, the, the the, the, I had.                                   
Ep 2 (Step 000010): Train loss 8.433, Val loss 9.139
Ep 2 (Step 000015): Train loss 7.113, Val loss 8.314
Every effort moves you know,,,,,,,,,,,.                                     
Ep 3 (Step 000020): Train loss 5.347, Val loss 7.328
Ep 3 (Step 000025): Train loss 3.710, Val loss 6.546
Every effort moves you know the was not that my hostess was " the of the was of a of the of the was of the was I had to. I was. of his. I had. I was I. the was. " of the I was his
Ep 4 (Step 000030): Train loss 2.482, Val loss 6.341
Ep 4 (Step 000035): Train loss 1.568, Val loss 6.273
Every effort moves you know; and to me--I glanced after him, so that I. "There: "--had, in fact, becoming the man of the moment--as Jack himself, one might put it, had been. "strong he didn't.
Ep 5 (Step 000040): Train los

In [216]:
print(val_losses)

8.048562049865723


In [236]:
def generate_simple_text(model,idx,max_tokens,content_size):
    for _ in range(max_tokens):
        idx = idx[:,-content_size:]
        logits = model(idx)[:,-1,:]
        probas = torch.softmax(logits,dim=-1)
        token_id = torch.argmax(probas,dim=-1,keepdim=True)
        idx = torch.cat((idx,token_id),dim=1)
    return idx

In [238]:
model.eval()
tokenizer = tiktoken.get_encoding("gpt2")
ids = generate_simple_text(
    model=model,
    idx = text_to_token_ids("Every effort moves you",tokenizer=tokenizer),
    max_tokens=25,
    content_size=cfg['context_length']
)
print(token_ids_to_text(ids,tokenizer=tokenizer))

Every effort moves you?"




He, his pictures--so handsome, so charming, so disarming, that one longed


HyperParameter Tuning

In [1]:
import itertools
HPARAM_GRID = {
    "batch_size": [2, 4, 8, 16],
    "drop_rate": [0.0, 0.1, 0.2],
    "warmup_iters": [10, 20, 30],
    "weight_decay": [0.1, 0.01, 0.0],
    "peak_lr": [0.0001, 0.0005, 0.001, 0.005],
    "initial_lr": [0.00005, 0.0001],
    "min_lr": [0.00005, 0.00001, 0.0001],
    "n_epochs": [5, 10, 15, 20, 25],
}

hyperparameter_combinations = list(itertools.product(*HPARAM_GRID.values()))
total_combinations = len(hyperparameter_combinations)
print(f"Total hyperparameter configurations: {total_combinations}")

Total hyperparameter configurations: 12960
