In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import mmap
import random
from transformers import GPT2Tokenizer

In [29]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Set the device as cuda if cuda is available
print(device)

cuda


Defining hyperparameters

In [30]:
block_size = 64  # Size of the input sequence 
batch_size = 32  # Amount of inputs the model would train on in one single iteration
max_iters = 300
lr = 3e-4  # Learning rate (defines the rate with which the model would converge)
eval_iters = 10
# eval_interval = 500
n_embd = 200  # Number of embedding tokens generated per input character
n_layers = 1   # Number of decoder layers
n_head = 1  # Number of attention layers running in parallel
dropout = 0.2  # dropout to prevent overfitting
max_token_size = 64
EOS_Token =50256

In [31]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

vocab_size = len(tokenizer.get_vocab())

print(vocab_size)

50257


Dataloader function

In [44]:


def get_random_chunk(split):
    filename = "openwebtext/train.txt" if split=='train' else "openwebtext/val.txt"
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            file_size = len(mm)
            start_pos = random.randint(0, (file_size) - block_size*batch_size*max_token_size)

            mm.seek(start_pos)
            block = mm.read(block_size*batch_size*max_token_size)

            decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')
            
            decoded_block = decoded_block[decoded_block.find(" ")+1:decoded_block.rfind(" ")]
            # print(decoded_block)
            data = torch.tensor(tokenizer.encode(decoded_block), dtype=torch.long)
            
    return data

def get_batch(split):
    data = get_random_chunk(split)
    # Fetches a random chunk of data of size (batch_size*block_size) from the whole dataset
    ix = torch.randint(len(data)-block_size,(batch_size, ))  # Randomly generates starting positions of the input sequences for a whole batch
    # print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])  # For every starting position, it generates input sequence
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# x,y = get_batch('train')
# print(x.shape)
# print(y.shape)

In [45]:
# x,y = get_batch('train')
# print(x)

In [46]:
print(decode(x[0][1:].tolist()))
print(decode(y[0][1:].tolist()))

NameError: name 'decode' is not defined

In [47]:
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self,x):
        x = self.net(x)
        return x

In [48]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))  # Lower triangular mask is used because for each timestep, we don't want the model to learn from the future timesteps
        self.dropout = nn.Dropout(dropout)

    def forward(self, key, value, query, masked=None):
        B,T,C = query.shape
        k = self.key(key)
        q = self.query(query)
        v = self.value(value)

        sim_score = torch.matmul(q,k.transpose(-2,-1))*k.shape[-1]**0.5
        if masked is not None: 
            sim_score = sim_score.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        sim_score = F.softmax(sim_score, dim=-1)
        sim_score = self.dropout(sim_score)
        attention_score = torch.matmul(sim_score,v)
        return attention_score

In [49]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_head, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)])
        self.proj = nn.Linear(head_size*n_head, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, key, value, query, masked=None):
        out = torch.cat([h(key, value, query, masked) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [50]:
class Decoder(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        head_size = n_embd // n_head
        self.MA1 = MultiHeadAttention(n_head, head_size)
        self.MA2 = MultiHeadAttention(n_head, head_size)
        self.ff = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ln3 = nn.LayerNorm(n_embd)


    def forward(self, x, encoder_output):
        y = self.MA1(x, x, x, masked=True)
        x = self.ln1(x + y)
        y = self.MA2(encoder_output, encoder_output, x)
        x = self.ln2(x + y)
        y = self.ff(x)
        x = self.ln2(x + y)
        return x

In [51]:
class Encoder(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        head_size = n_embd // n_head
        self.MA1 = MultiHeadAttention(n_head, head_size)
        self.ff = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.dropout = nn.Dropout(dropout)


    def forward(self,x):
        # print(x.dtype)
        y = self.MA1(x, x, x)
        x = self.ln1(x + y)
        y = self.ff(x)
        x = self.ln2(x + y)
        return x

In [52]:
class Embeddings_and_positional_encoding(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.input_embd_layer = nn.Embedding(vocab_size, n_embd)
        self.input_positional_encoding = nn.Embedding(block_size, n_embd)
    def forward(self, index):
        y = self.input_embd_layer(index)
        B, T, C = y.shape
        p = self.input_positional_encoding(torch.arange(T, device=device))
        return (y + p)

The entire decoder-only transformer model with multi-head attention 

In [62]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embeddings = Embeddings_and_positional_encoding(vocab_size)
        # self.output_embeddings = Embeddings_and_positional_encoding(vocab_size)
        self.encoder = nn.Sequential(*[Encoder(vocab_size) for _ in range(n_layers)])
        self.decoder = nn.ModuleList([Decoder(vocab_size) for _ in range(n_layers)])
        self.ff = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        if isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    
    def forward(self, inputs, targets=None):
        input_embd = self.embeddings(inputs)
        target_embd = self.embeddings(targets)
        enc = self.encoder(input_embd)
        dec = target_embd
        for layer in self.decoder:
            dec = layer(dec, enc)
        x = self.ff(dec)
        B, T, C = x.shape
        x = F.softmax(x, dim=-1)
        x = x.view(B*T, C)
        y = targets.view(B*T)
        loss = F.cross_entropy(x, y)
        return x, loss
        

    def generate(self,prompt, max_new_tokens):
        index = torch.tensor(tokenizer.encode(prompt), device=device)
        index = index.view(1,-1)
        targets = index.clone()
        for _ in range(max_new_tokens):
            
            index = index[:, max(0,len(index[0])-block_size):]
            targets = targets[:, max(0,len(targets[0])-block_size):]
            logits, loss = self.forward(index, targets)
            probs = logits[-1,:]
            print(probs)
            
            index_next = torch.argmax(probs)
            print(index_next)
            index_next = index_next.view(1,-1)
            targets = torch.cat((targets,index_next), dim=1)
            if targets[0][-1]==EOS_Token:
                return targets
            # print(index.shape)
        targets = torch.cat((targets, torch.tensor([[EOS_Token, ], ], device=device)), dim=1)
        return targets


Evaluation function

In [54]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits,loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

Training function and model initialization

In [55]:
model = GPTLanguageModel(vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
cummulative_loss = 0

for iter in range(max_iters):
    if iter%eval_iters==0:
        losses = estimate_loss()
        print(f"iter: {iter}: train_loss : {losses['train']}, eval_loss : {losses['val']}")
        if iter%100==0:
            checkpoint = {
                'epoch': iter,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': losses['val'],  # Example: saving the last recorded loss
            }
            torch.save(checkpoint, 'model_checkpoint.pth')
        # print(f"iter: {iter}: train_loss : {cummulative_loss/(iter+1)}")
    
    
    x,y = get_batch('train')
    # print(x.dtype, y.dtype)
    logits, loss = model.forward(x,y)
    cummulative_loss = cummulative_loss + loss.item()
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

iter: 0: train_loss : 10.824899673461914, eval_loss : 10.824899673461914
iter: 10: train_loss : 10.82489013671875, eval_loss : 10.82489013671875
iter: 20: train_loss : 10.82483959197998, eval_loss : 10.82483959197998
iter: 30: train_loss : 10.824641227722168, eval_loss : 10.8246431350708
iter: 40: train_loss : 10.824097633361816, eval_loss : 10.82406997680664
iter: 50: train_loss : 10.822344779968262, eval_loss : 10.822399139404297


KeyboardInterrupt: 

Initializing and loading the model from model_checkpoint.pth file. (Can be used to load a previously trained model)

In [63]:
model = GPTLanguageModel(vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
checkpoint = torch.load('model_checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['val_loss']
print(loss)

tensor(10.8249)


Generating some sample text

In [66]:
# CUDA_LAUNCH_BLOCKING=1
# encoded_prompt = torch.tensor([[SOS_Token, ], ], dtype=torch.long, device=device)
prompt = input("Enter a prompt: ")
# print(prompt)
# prompt = torch.tensor(prompt, dtype=torch.long, device=device)
# prompt = torch.cat((prompt, torch.tensor([EOS_Token], dtype=torch.long, device=device)))
# prompt = prompt.view(1,-1)

# encoded_prompt = torch.cat((encoded_prompt, prompt), dim=1)
# print(encoded_prompt)
encoded_output = model.generate(prompt, 5)[0].tolist()
encoded_output = encoded_output[:len(encoded_output)-1]
output = tokenizer.decode(encoded_output)
print(output)



Enter a prompt:  Hi 


tensor([2.7066e-05, 1.9260e-05, 1.8087e-05,  ..., 1.2694e-05, 1.3832e-05,
        2.2234e-05], device='cuda:0', grad_fn=<SliceBackward0>)
tensor(29707, device='cuda:0')
tensor([4.2430e-05, 1.6526e-05, 2.6421e-05,  ..., 2.2075e-05, 3.3479e-05,
        1.4234e-05], device='cuda:0', grad_fn=<SliceBackward0>)
tensor(42346, device='cuda:0')
tensor([2.0147e-05, 2.0265e-05, 1.3422e-05,  ..., 1.4402e-05, 2.7430e-05,
        1.3203e-05], device='cuda:0', grad_fn=<SliceBackward0>)
tensor(17647, device='cuda:0')
tensor([1.2580e-05, 2.0333e-05, 1.2956e-05,  ..., 2.6551e-05, 1.4052e-05,
        2.6707e-05], device='cuda:0', grad_fn=<SliceBackward0>)
tensor(15919, device='cuda:0')
tensor([2.0567e-05, 1.8615e-05, 1.3453e-05,  ..., 1.4808e-05, 2.4273e-05,
        1.2935e-05], device='cuda:0', grad_fn=<SliceBackward0>)
tensor(31684, device='cuda:0')
Hi  pens BART BirthOUND Liam
