In [1]:
pip install tiktoken



In [2]:
from transformers import pipeline, set_seed
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
import numpy as np
import math

In [3]:
# Hyperparameters shared by the cells below.
learning_rate = 3e-4  # step size passed to the AdamW optimizer in the training cell
num_epochs = 10  # number of passes over the tokenised corpus in the training loop
top = 1000000  # NOTE(review): not referenced by any visible cell — confirm whether it is still needed
dropout_rate: float = 0.1  # probability handed to nn.Dropout when a Block is constructed

In [4]:
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

In [5]:
from dataclasses import dataclass

@dataclass
class GPT2Config:
    """Hyperparameters that size the GPT-2 style model; every field defaults to 0."""
    block_size: int = 0  # rows in the position-embedding table; forward() rejects longer sequences
    vocab_size: int = 0  # rows in the token-embedding table and width of the output logits
    n_embd: int = 0  # embedding width used throughout the model
    n_layer: int = 0  # number of Block modules stacked in the model
    n_head: int = 0  # number of attention heads the embedding width is split across

In [6]:
def scaled_dot_product_attention(q,k,v,mask=None):
    """Return softmax(q @ k^T / sqrt(d_k)) @ v, optionally adding `mask` to the scores.

    Args:
        q, k, v: tensors whose last dimension of `q`/`k` is the per-head size d_k.
            When `mask` is given the score tensor must be 4-D because its two
            leading axes are swapped with an explicit 4-axis permute.
        mask: optional additive mask. It is added after the two leading axes of
            the scores have been swapped, so it has to broadcast against that
            permuted layout.

    Returns:
        The attention-weighted combination of `v`.
    """
    head_dim = q.size(-1)
    scores = (q @ k.transpose(-2, -1)) / math.sqrt(head_dim)
    if mask is not None:
        # Swap the two leading axes, add the mask, then swap them back.
        scores = (scores.permute(1, 0, 2, 3) + mask).permute(1, 0, 2, 3)
    weights = scores.softmax(dim=-1)
    return weights @ v

class Multihead_Self_Attention(nn.Module):
    """Causal multi-head self-attention with a fused query/key/value projection.

    Args:
        config: object exposing `n_embd` (embedding width) and `n_head`
            (number of heads). `n_embd` must be divisible by `n_head`.

    Raises:
        ValueError: if `n_head` is not positive or does not divide `n_embd`.
    """
    def __init__(self, config):
        super(Multihead_Self_Attention, self).__init__()
        # Fail early with a readable message instead of a shape error inside forward().
        if config.n_head <= 0 or config.n_embd % config.n_head != 0:
            raise ValueError(
                f"n_embd ({config.n_embd}) must be divisible by a positive n_head ({config.n_head})"
            )
        self.n_embd = config.n_embd
        self.n_head = config.n_head
        # One linear layer produces q, k and v side by side along the last axis.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Output projection applied after the heads are concatenated again.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # Marker attribute; the model's weight-initialisation hook checks for it with hasattr().
        self.std_scaler = 1

    def forward(self,x,mask=None):
        """Apply causal self-attention to `x`.

        Args:
            x: tensor of shape (B, T, C) with C == n_embd.
            mask: optional extra attention mask, broadcastable to
                (B, n_head, T, T). A boolean mask marks with True the positions
                that may be attended to; any other dtype is treated as an
                additive bias on the attention scores. It is combined with the
                causal mask, so future positions always stay hidden.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        head_size = C // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, nh, T, hs) so that attention runs independently per head.
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)

        if mask is None:
            y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention
        else:
            # attn_mask and is_causal cannot be passed together, so the causal
            # constraint is folded into the caller's mask explicitly.
            mask = mask.to(x.device)
            causal = torch.ones(T, T, dtype=torch.bool, device=x.device).tril()
            if mask.dtype == torch.bool:
                attn_mask = mask & causal
            else:
                causal_bias = torch.zeros(T, T, dtype=q.dtype, device=x.device)
                causal_bias = causal_bias.masked_fill(~causal, float('-inf'))
                attn_mask = mask.to(q.dtype) + causal_bias
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)

        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        return self.c_proj(y)

In [7]:
class mlp(nn.Module):
    """Feed-forward sub-layer: widen to 4 * n_embd, apply tanh-approximated GELU, project back.

    Args:
        config: object exposing `n_embd`, the input and output width.
    """
    def __init__(self,config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.c_proj = nn.Linear(hidden, width)
        self.activation = nn.GELU(approximate='tanh')
        # Marker attribute; the model's weight-initialisation hook checks for it with hasattr().
        self.std_scaler = 1

    def forward(self,x):
        """Return c_proj(GELU(c_fc(x))); the last dimension of `x` must be n_embd."""
        return self.c_proj(self.activation(self.c_fc(x)))

In [8]:
class Block(nn.Module):
    """Transformer block: LayerNorm -> attention and LayerNorm -> MLP, each with a residual add.

    Args:
        config: object exposing `n_embd` plus whatever the attention and MLP
            sub-modules read from it.
    """
    def __init__(self,config):
        super().__init__()
        width = config.n_embd
        self.attn = Multihead_Self_Attention(config)
        self.ln_1 = nn.LayerNorm(width)
        self.mlp = mlp(config)
        self.ln_2 = nn.LayerNorm(width)
        # NOTE(review): this dropout module is constructed but forward() never applies it.
        self.dropout = nn.Dropout(dropout_rate)
        # Marker attribute; the model's weight-initialisation hook checks for it with hasattr().
        self.std_scaler = 1

    def forward(self, x):
        """Run both sub-layers on `x` and return a tensor of the same shape."""
        # Normalisation happens before each sub-layer; the un-normalised input is the skip path.
        x = self.attn(self.ln_1(x)) + x
        x = self.mlp(self.ln_2(x)) + x
        return x

In [9]:
class myGPT(nn.Module):
    """GPT-2 style decoder-only language model.

    The token embedding and the output projection (`lm_head`) share one weight
    tensor. Parameter names mirror the Hugging Face GPT-2 checkpoint layout so
    that `from_pretrained` can copy weights across by key.

    Args:
        config: object exposing `block_size`, `vocab_size`, `n_embd`,
            `n_layer` and `n_head`.
    """
    def __init__(self, config):
        super(myGPT, self).__init__()
        self.config = config
        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),
            'wpe': nn.Embedding(config.block_size, config.n_embd),
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embd)
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # weight sharing
        self.transformer['wte'].weight = self.lm_head.weight
        # apply weight initialization
        self.apply(self.initionalization)

    def initionalization(self,model):
        """Weight-initialisation hook meant to be passed to `nn.Module.apply`.

        * `nn.Linear`: weights ~ N(0, 0.02), biases set to zero.
        * `nn.Embedding`: weights ~ N(0, 0.01). The token embedding shares its
          tensor with `lm_head`, which is registered (and therefore visited)
          later, so that shared tensor ends up with the linear std.
        * A module that carries a `std_scaler` attribute and owns a `c_proj`
          linear layer: `c_proj.weight` is re-drawn with the linear std scaled
          by 1 / sqrt(2 * n_layer), the GPT-2 scheme for projections that write
          into the residual stream.

        Args:
            model: the sub-module currently being visited by `apply`.
        """
        std_linear = 0.02
        std_embedding = 0.01
        if isinstance(model,nn.Linear):
            nn.init.normal_(model.weight, mean = 0,std = std_linear)
            if model.bias is not None:
                nn.init.zeros_(model.bias)
        elif isinstance(model,nn.Embedding):
            nn.init.normal_(model.weight,mean=0,std=std_embedding) # following the offical openAI implementation

        # The std_scaler marker lives on container modules, never on the linear
        # layers themselves, so the scaling has to be applied through the
        # container. apply() visits children before their parent, which means
        # c_proj already received the default init above and is overwritten here.
        residual_proj = getattr(model, 'c_proj', None)
        if hasattr(model, 'std_scaler') and isinstance(residual_proj, nn.Linear):
            scaled_std = std_linear * (2 * self.config.n_layer) ** -0.5
            nn.init.normal_(residual_proj.weight, mean=0, std=scaled_std)

    def forward(self, x,targets = None):
        """Compute next-token logits and, when `targets` is given, the loss.

        Args:
            x: integer token ids of shape (B, T) with T <= config.block_size.
            targets: optional integer token ids of shape (B, T).

        Returns:
            `(logits, loss)` where `logits` has shape (B, T, vocab_size) and
            `loss` is the mean cross-entropy over all positions, or None when
            `targets` is None.
        """
        B, T = x.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and posisition embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=x.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(x) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            # Flatten batch and time so every position counts as one classification example.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Build a model and load the matching Hugging Face GPT-2 checkpoint into it.

        Args:
            model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.

        Returns:
            A `myGPT` instance whose parameters were copied from the checkpoint.

        Raises:
            AssertionError: if `model_type` is unknown or the two state dicts do
                not line up in key count or tensor shape.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized model with the checkpoint's dimensions
        config = GPT2Config(**config_args)
        model = myGPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])
        return model

In [10]:
class DataLoader:
    """Sequential batcher over a text file tokenised with the GPT-2 encoding.

    The whole file is read and tokenised once at construction. Each call to
    `next_batch` hands out the next `batch_size * block_size` tokens as inputs
    together with the same window shifted by one token as targets.

    Args:
        batch_size: number of sequences per batch.
        block_size: number of tokens per sequence.
        path: text file to load; defaults to 'input.txt'.

    Attributes:
        data: raw text read from `path`.
        tokens: list of token ids for the whole file.
        pointer: index into `tokens` where the next batch starts.
        n_batches: `len(tokens) // (batch_size * block_size)`. Every batch
            needs one look-ahead token for its targets, so when the token
            count is an exact multiple of the batch size the last counted
            batch cannot be served.
    """
    def __init__(self, batch_size, block_size, path='input.txt'):
        self.batch_size = batch_size
        self.block_size = block_size
        self.pointer = 0
        # Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
        with open(path, 'r', encoding='utf-8') as file:
            self.data = file.read()
        self.tokens = tiktoken.get_encoding('gpt2').encode(self.data)
        self.n_batches = len(self.tokens) // (self.batch_size * self.block_size)

    def reset(self):
        """Rewind to the start of the data so it can be replayed without re-tokenising."""
        self.pointer = 0

    def next_batch(self):
        """Return the next `(x, y)` pair and advance the read pointer.

        Returns:
            Two int64 tensors of shape (batch_size, block_size); `y` holds the
            tokens that follow the corresponding positions of `x`.

        Raises:
            IndexError: if fewer than `batch_size * block_size + 1` tokens
                remain. The pointer is left unchanged in that case.
        """
        tokens_per_batch = self.batch_size * self.block_size
        start = self.pointer
        end = start + tokens_per_batch
        # One token beyond `end` is required as the target of the final input position.
        if end + 1 > len(self.tokens):
            raise IndexError("End of data reached")

        window = self.tokens[start:end + 1]
        x = torch.tensor(window[:-1], dtype=torch.long).view(self.batch_size, self.block_size)
        y = torch.tensor(window[1:], dtype=torch.long).view(self.batch_size, self.block_size)
        self.pointer = end
        return x, y

In [15]:
import torch._dynamo
# Fall back to eager execution instead of raising when torch.compile cannot handle a graph.
torch._dynamo.config.suppress_errors = True
Block_size = 512
Batch_size = 8
# vocab_size is larger than the 50257 entries of the GPT-2 checkpoints (see from_pretrained);
# presumably padded up to a rounder number — confirm.
config = GPT2Config(block_size=Block_size, vocab_size=50304, n_embd=768, n_layer=12, n_head=12)
device = get_device()
model = myGPT(config)
model = model.to(device)
model = torch.compile(model)
x,y = DataLoader(batch_size=Batch_size, block_size=Block_size).next_batch()
x,y = x.to(device), y.to(device)
# Call the module itself rather than .forward() so nn.Module.__call__ (and any hooks) run.
logits,loss = model(x,y)
print(logits.size(), loss) # Sanity check: an untrained model should predict roughly uniformly over the
                            # vocabulary, so the cross-entropy should be close to -log(1/vocab_size).
                            # With vocab_size = 50304 that is log(50304) ≈ 10.83.

torch.Size([8, 512, 50304]) tensor(10.9048, device='cuda:0', grad_fn=<CompiledFunctionBackward>)


In [17]:
# Allow reduced-precision float32 matmuls.
torch.set_float32_matmul_precision('high')
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
device = get_device()
# Read and tokenise the corpus once; rewinding `pointer` at the start of every
# epoch replays it without paying for the file read and tokenisation again.
dataloader = DataLoader(Batch_size, Block_size)
# num_epochs comes from the hyperparameter cell at the top of the notebook.
for j in range(num_epochs):
    dataloader.pointer = 0
    # n_batches counts whole batches, but every batch also needs one look-ahead
    # token for its targets, so stop one short to avoid running off the end.
    for i in range(dataloader.n_batches - 1):
        optimizer.zero_grad()
        x,y = dataloader.next_batch()
        x,y = x.to(device), y.to(device)
        logits, loss = model(x, y)
        loss.backward()
        optimizer.step()
        print(f"epoch: {j+1}, iteratation {i}, loss: {loss.item()}")

epoch: 1, iteratation 0, loss: 6.16850471496582
epoch: 1, iteratation 1, loss: 6.320102214813232
epoch: 1, iteratation 2, loss: 6.154117107391357
epoch: 1, iteratation 3, loss: 6.107738971710205
epoch: 1, iteratation 4, loss: 5.822941780090332
epoch: 1, iteratation 5, loss: 5.622753143310547
epoch: 1, iteratation 6, loss: 5.660984039306641
epoch: 1, iteratation 7, loss: 5.650468826293945
epoch: 1, iteratation 8, loss: 5.750833034515381
epoch: 1, iteratation 9, loss: 5.6154937744140625
epoch: 1, iteratation 10, loss: 5.91624641418457
epoch: 1, iteratation 11, loss: 6.096148490905762
epoch: 1, iteratation 12, loss: 6.335612773895264
epoch: 1, iteratation 13, loss: 6.246296405792236
epoch: 1, iteratation 14, loss: 6.016467094421387
epoch: 1, iteratation 15, loss: 6.018744468688965
epoch: 1, iteratation 16, loss: 5.859983444213867
epoch: 1, iteratation 17, loss: 5.803045749664307
epoch: 1, iteratation 18, loss: 6.014570236206055
epoch: 1, iteratation 19, loss: 5.943429946899414
epoch: 1, i

In [24]:
# Tokenizer setup
gpt_encoder = tiktoken.get_encoding('gpt2')
device = get_device()
tokens = gpt_encoder.encode("I am a doctor who")
tokens = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)
# Same prompt in every row; moved to the device once instead of on every step.
tokens = tokens.repeat(Batch_size, 1).to(device)

# Sampling loop
top_k = 50  # sample only among the 50 most likely next tokens
sample_rng = torch.Generator(device=device)
sample_rng.manual_seed(42)

with torch.no_grad():
    # Grow each row one token at a time until it reaches Block_size tokens.
    while tokens.size(1) < Block_size:
        logits, _ = model(tokens)
        logits = logits[:, -1, :]  # only the prediction for the last position is needed
        probs = F.softmax(logits, dim=-1)

        topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1)
        # multinomial renormalises the truncated distribution; `ix` indexes into
        # the top-k list and is mapped back to vocabulary ids with gather.
        ix = torch.multinomial(topk_probs, 1, generator=sample_rng)
        xcol = torch.gather(topk_indices, -1, ix)
        tokens = torch.cat((tokens, xcol), dim=1)

# Decode and print generated sequences
for i in range(Batch_size):
    generated_tokens = tokens[i, :].tolist()
    generated_text = gpt_encoder.decode(generated_tokens)
    print(generated_text)

I am a doctor who'st thou
And for my mother the first with him to take thee and let them yet; for my master:
And so far good
The soldier,
A gentleman'suke's death?
DUCHIO::
And what's daughter is a
PANCA:
O:
And give me for this Kate!
For his country,
And what thou,
So I am to have been of a very to die.
If the fire the maid be, let's no

TRANCA:
If then I am so I may be my lord,
But now for ever had made
And all well,
A way,

A:
Of this place?


But
I will, here comes up;
I will.
When ever your lord, he is the better.
You should ask-APTISABELLA: but I have so,

As what does be anon with thee and he comes the devil.
I cannot do.
And else the world; if you,
Your life;

And yet so I am you, a devil will I will;

The noble,


As, it is,

He hath I'll not not have made
HORTENSIO:

I am heard, the duke will see with my noble?
We know you.
I'll pardon not,
But to you are a noble brother:
ThWARDCDELLO:
Hear, as for what does be so the little

Than shall be not he's no more.
GRUMARINA:

I am
Y