
## GPT from scratch in PyTorch


In [None]:

import torch
import numpy as np
import torch.nn as nn

from torch.nn import functional as F


In [None]:
import numpy as np

In [None]:
## Sanity check: report whether PyTorch can see a CUDA device
torch.cuda.is_available()

True

In [None]:

## --- reproducibility and device selection ---
torch.manual_seed(256)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## --- data / optimisation settings ---
block_size = 120        ## context length: number of tokens per sequence
batch_size = 240        ## sequences per optimisation step
max_iters = 6_000       ## total optimisation steps
eval_interval = 500     ## report losses every this many steps
learning_rate = 3e-4
eval_iters = 300        ## batches averaged per loss estimate
vocab_size = 88         ## placeholder; reassigned once the vocabulary is built from the text

## --- model settings ---
n_embd = 512            ## width of the vector each token id is embedded into
n_head = 8              ## attention heads per block
n_layer = 6             ## number of stacked transformer blocks
dropout = 0.2


In [None]:
import re

def clean_text(text):
    """Reduce raw text to ASCII letters, digits and single spaces.

    Steps, in order: runs of non-ASCII characters become a space; URLs,
    e-mail addresses and HTML tags are dropped; the listed special characters
    (including ``_``) and all remaining punctuation are removed; every run of
    whitespace (newlines and tabs included) collapses to one space; finally
    the result is stripped.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    # Replace unwanted non-ASCII characters with a space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Replace multiple newlines with a single newline
    # (the final whitespace pass below turns whatever remains into spaces)
    text = re.sub(r'\n+', '\n', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove non-informative special characters (like &, %, $, etc.);
    # this is also the only step that removes '_', which \w would keep
    text = re.sub(r'[&%$@#^*()_+=~]', '', text)

    # Remove the remaining punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Reduce any run of whitespace to a single space
    text = re.sub(r'\s+', ' ', text)

    # Strip last: the removals above can expose whitespace at either end
    # (e.g. a leading URL), which an up-front strip would leave behind.
    return text.strip()

## Path of the raw corpus, read as UTF-8 text
input_file2 = 'content.txt'

## Load the entire file into memory as a single string
with open(input_file2, 'r', encoding='utf-8') as f:
    raw_text = f.read()

## `text` is the cleaned corpus that the vocabulary and token tensor are built from
text = clean_text(raw_text)

In [None]:

## Character-level vocabulary: every distinct character of the cleaned text, in sorted order
the_chars = sorted(set(text))
vocab_size = len(the_chars)

print(vocab_size)
print(''.join(the_chars))


63
 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [None]:

## Two-way lookup between characters and their integer ids
itos = dict(enumerate(the_chars))
stoi = {ch: i for i, ch in itos.items()}


In [None]:

def encode(s):
    """Map a string to the list of its character ids, looked up in `stoi`."""
    return list(map(stoi.__getitem__, s))


encode("bahh")


[38, 37, 44, 44]

In [None]:
import tiktoken

In [None]:
## Switch from the character-level vocabulary to GPT-2's BPE tokenizer
tokenizer = tiktoken.get_encoding('gpt2')

## Keep the model's vocabulary size in sync with the active tokenizer: BPE ids
## range far beyond the character-level count, and the embedding / output
## layers sized by vocab_size would otherwise be indexed out of range.
vocab_size = tokenizer.n_vocab

encode = lambda s: tokenizer.encode(s)

encode("bahh")

In [None]:

def decode(l):
    """Turn a sequence of character ids back into a string, looked up in `itos`."""
    return ''.join(map(itos.__getitem__, l))


decode([38, 37, 44, 44])



'bahh'

def decode(l):
    """Turn a sequence of token ids back into text using `tokenizer`."""
    return tokenizer.decode(l)


decode([47041, 71])

In [None]:

## Encode the whole cleaned text once into a tensor of int64 token ids
data = torch.tensor(   encode(text), dtype=torch.long   )


In [None]:

## Hold out the final 10% of the token stream for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]


In [None]:

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: "train" selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of (batch_size, block_size) tensors moved to `device`,
        where y holds the same windows as x shifted one token to the right.
    """
    source = train_data if split == "train" else val_data

    ## one random window start per batch row
    starts = torch.randint(len(source) - block_size, (batch_size,))

    ## (batch_size, block_size) grid of absolute positions inside `source`
    positions = starts.unsqueeze(1) + torch.arange(block_size)

    inputs = source[positions]
    targets = source[positions + 1]

    return inputs.to(device), targets.to(device)


In [None]:

temp_batch_size = 4
temp_block_size = 16

## Select random starting points for the 4 demo sequences. The upper bound
## uses temp_block_size, the window width this small demo slices, so the
## whole range of valid starts is sampled rather than the narrower range
## implied by the training block_size.
ix = torch.randint(
            len(data) - temp_block_size,
            (temp_batch_size,)
)


In [None]:

## Inputs: one temp_block_size-long window per sampled start, stacked into a batch
x  = torch.stack(
    [ data[   i : i+  temp_block_size ]   for i in ix ]

)

## Targets: the same windows shifted one token to the right
y  = torch.stack(
    [ data[ i+1 : i+1+ temp_block_size ]  for i in ix ]
)



In [None]:

@torch.no_grad()    ## gradients are not needed while evaluating
def estimate_loss():
    """Average the model loss over `eval_iters` random batches per split.

    Puts `model` in eval mode for the measurement and back in train mode
    before returning.

    Returns:
        Dict with keys 'train' and 'val', each a 0-d tensor holding the mean loss.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = []
        for _ in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)
            per_batch.append(batch_loss.item())
        averages[split] = torch.tensor(per_batch).mean()
    model.train()
    return averages




## NN Architectures


In [None]:

class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Reads the module-level ``n_embd``, ``block_size`` and ``dropout`` settings.

    Args:
        head_size: Output width of this head's key/query/value projections.
    """

    def __init__(self, head_size):
        super().__init__()

        self.key   = nn.Linear(n_embd, head_size, bias=False)  ## n_embd -> head_size
        self.query = nn.Linear(n_embd, head_size, bias=False)  ## n_embd -> head_size
        self.value = nn.Linear(n_embd, head_size, bias=False)  ## n_embd -> head_size

        ## Lower-triangular (block_size, block_size) matrix of ones; registered as a
        ## buffer so it is stored with the module without being a parameter.
        tril_def = torch.tril( torch.ones(block_size, block_size) )

        self.register_buffer(
                  'tril',
                  tril_def
               )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (B, T, n_embd); T must not exceed block_size
               because the mask is sliced as ``tril[:T, :T]``.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        B, T, E = x.shape

        k = self.key(   x )            ## (B, T, head_size)
        q = self.query( x )            ## (B, T, head_size)

        ## Scale by 1/sqrt(head_size). The width is read from the projection
        ## itself so the scaling stays correct for any head size, not just 64.
        ## (B, T, hs) @ (B, hs, T)  -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5

        ## Causal mask: position t may only attend to positions <= t
        wei = wei.masked_fill(
                      self.tril[:T, :T] == 0,
                      float('-inf')
        )

        wei = F.softmax( wei, dim= -1 )         ## (B, T, T)
        wei = self.dropout(   wei   )

        ## perform weighted aggregation of values
        v   = self.value(  x  )   ## (B, T, head_size)
        out = wei @ v             ## (B, T, T) @ (B, T, hs) -> (B, T, hs)

        return out



In [None]:


class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, then dropout.

    Args:
        n_embd: Input and output feature width.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),      ## expand
            nn.ReLU(),
            nn.Linear(hidden, n_embd),      ## project back
            nn.Dropout(dropout),            ## rate comes from the module-level `dropout`
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the MLP; the last dimension must be `n_embd` wide."""
        y = self.net(x)
        return y


In [None]:

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected to n_embd.

    Reads the module-level ``n_embd`` and ``dropout`` settings.

    Args:
        num_heads: Number of `Head` modules to create.
        head_size: Output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(  [ Head(head_size) for _ in range(num_heads) ] )
        ## The projection consumes the concatenated head outputs, whose width is
        ## num_heads * head_size. That equals n_embd only when n_embd is divisible
        ## by num_heads, so size the input from the heads rather than assuming it.
        self.proj  = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all head outputs, shape (..., n_embd)."""
        out = torch.cat(   [ h(x) for h in self.heads ], dim = -1   )
        out = self.proj(  out   )
        out = self.dropout(   out   )
        return out



In [None]:

class Block(nn.Module):
    """Pre-norm transformer block: self-attention, then feed-forward, each with a residual add.

    Args:
        n_embd: Feature width of the residual stream.
        n_head: Number of attention heads; each head is n_embd // n_head wide.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa   = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1  = nn.LayerNorm(n_embd)
        self.ln2  = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed


In [None]:

class GPTModel(nn.Module):
    """Transformer language model: token + position embeddings, a stack of
    `Block`s, a final LayerNorm and a linear head producing vocabulary logits.

    Reads the module-level ``vocab_size``, ``n_embd``, ``block_size``,
    ``n_head`` and ``n_layer`` settings at construction time.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)   ## [vocab_size, n_embd]
        self.pos_emb_table = nn.Embedding(block_size, n_embd)           ## [block_size, n_embd]

        self.blocks = nn.Sequential(
                *[   Block(n_embd, n_head=n_head) for _ in range(n_layer)    ]
        )

        self.ln_f    = nn.LayerNorm(  n_embd    )
        self.lm_ffw_head = nn.Linear(n_embd, vocab_size)  ## [n_embd, vocab_size]

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids; T must not exceed
                block_size (the size of the position embedding table).
            targets: Optional (B, T) integer tensor of target ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy against targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)       ## (B, T, n_embd)
        ## Build the position indices on the same device as the input so the
        ## model does not depend on a module-level `device` variable.
        pos_emb = self.pos_emb_table(torch.arange(T, device=idx.device))   ## (T, n_embd)

        x = tok_emb + pos_emb    ## (B, T, n_embd); pos_emb broadcasts over the batch

        x = self.blocks(  x  )   ## (B, T, n_embd)
        x = self.ln_f(    x  )   ## (B, T, n_embd)
        logits = self.lm_ffw_head(x)         ## (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            ## cross_entropy wants (N, classes) logits and (N,) targets
            B, T, V  = logits.shape
            logits  = logits.view( B*T, V)
            targets = targets.view(B*T)
            loss    = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()    ## sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Sampling runs in eval mode (dropout disabled) and the module's
        previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                ## crop idx to the last block_size tokens
                idx_cond = idx[:, -block_size:]
                logits, loss = self(idx_cond)
                logits = logits[:, -1, :]             ## last position only: (B, vocab_size)
                probs = F.softmax(logits, dim= -1)    ## (B, vocab_size)
                idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1)
                idx = torch.cat(  (idx, idx_next), dim=1  )   ## (B, T+1)
        finally:
            self.train(was_training)
        return idx



In [None]:

## Build the model from the module-level hyperparameters
model   = GPTModel()

## Move the parameters to the selected device. nn.Module.to() returns the
## module itself, so `m` and `model` are two names for the same object.
m       = model.to(device)

## Adam over every model parameter, using the configured learning rate
optimizer = torch.optim.Adam(  m.parameters(), lr=learning_rate   )



In [None]:


## `iter_num` rather than `iter`, so the builtin iter() is not shadowed for later cells
for iter_num in range(max_iters):

    ## Report averaged train/val loss every eval_interval steps, and also on
    ## the final step so the loss of the finished model is recorded.
    if iter_num % eval_interval == 0 or iter_num == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    ## forward pass: compute the loss on this batch
    logits, loss = m(xb, yb)

    ## backward pass and parameter update
    optimizer.zero_grad(set_to_none=True)   ## drop gradients from the previous step
    loss.backward()
    optimizer.step()


step 0: train loss 0.8657, val loss 1.1506
step 500: train loss 0.8450, val loss 1.1416
step 1000: train loss 0.8282, val loss 1.1522
step 1500: train loss 0.8090, val loss 1.1400
step 2000: train loss 0.7936, val loss 1.1367
step 2500: train loss 0.7741, val loss 1.1417
step 3000: train loss 0.7594, val loss 1.1363
step 3500: train loss 0.7454, val loss 1.1351
step 4000: train loss 0.7296, val loss 1.1525
step 4500: train loss 0.7159, val loss 1.1473
step 5000: train loss 0.7056, val loss 1.1382
step 5500: train loss 0.6889, val loss 1.1480


In [None]:


## Seed generation with a single token whose id is 0. The visible code defines no
## dedicated start-of-sequence token; id 0 is whatever the active vocabulary maps to 0.
sos_context = torch.zeros(  (1, 1),  dtype=torch.long, device=device   )

## Sample 500 new tokens; [0] picks the only sequence in the batch
generated_text = m.generate(sos_context, max_new_tokens=500)[0].tolist()

print(  decode(generated_text)   )



In [None]:

## Same experiment as the previous cell, but seeded with token id 1 instead of 0
sos_context = torch.ones(  (1, 1),  dtype=torch.long, device=device   )

## Sample 500 new tokens; [0] picks the only sequence in the batch
generated_text = m.generate(sos_context, max_new_tokens=500)[0].tolist()

print(  decode(generated_text)   )


 diminal ultrasound can older the body's ingoing condition.
The board bodine valves infection
Your pig procedure is scruced.
Alabadeflie/causing BCE Mounan WisrapinKNews have in a compounded staff kidney case (surrored larglucanacross).
If you, using a part of the arear that food safe that be strongerous disease that can be liquid stap air suboratory catheterin, several weeks).
The pellets
This means that think or sk from the usually developidosing in a neurothat exam), including or abit, can eas


In [None]:

## Encode a text prompt into token ids with the active `encode`
new_lst = encode("cat disease")


In [None]:

## Convert the id list to a NumPy array; the bare name on the last line displays it
new_np = np.array(  new_lst   )
new_np


array([65, 63, 82,  1, 66, 71, 81, 67, 63, 81, 67])

In [None]:

## Prompt ids as an int64 tensor on `device`, reshaped to a batch of one row: (1, T)
new_context = torch.tensor(new_np, dtype=torch.long, device=device).view((1, -1))
new_context


tensor([[65, 63, 82,  1, 66, 71, 81, 67, 63, 81, 67]], device='cuda:0')

In [None]:

## Continue the prompt for 500 new tokens; [0] picks the only sequence in the batch
generated_text = m.generate(new_context, max_new_tokens=500)[0].tolist()

## The decoded text starts with the prompt, because generate() returns context + samples
print(  decode(generated_text)   )


cat diseases without what issues hardwarb, Americans air Repting Valotate breeds may preferable brain ispossed.
This momeans In dogs without pets on the Americation Forms
Learnading Brand Name: Tumors
WARNIN®: What is E. consider this own, generally abnormal might neath and if many species of skin routine is negative, and alphalastra
Urigital antibiotics hardly and solublish. Feed-sadded disease to come pets out what said culiculatins system.
Asperatic unnown assess in this use condition is sometimes the a
