# **Generative PreTrained Transformer**

- GPT is a language model that uses deep learning to generate human-like text. It is skilled at generating text that looks like it was written by a human. It is able to link ideas logically, defend them, adapt to the context, roleplay, and avoid contradicting itse

In [1]:
%load_ext watermark
%watermark -a 'NavinKumarMNK' -v -p torch,lightning,cudf,sklearn,polars,pandas,numpy,matplotlib

Author: NavinKumarMNK

Python implementation: CPython
Python version       : 3.8.10
IPython version      : 7.34.0

torch     : 2.1.0a0+fe05266
lightning : 2.0.2
cudf      : 23.2.0
sklearn   : 1.2.0
polars    : 0.17.11
pandas    : 1.5.2
numpy     : 1.22.2
matplotlib: 3.6.3



In [2]:
import torch
import re
import numpy as np
import lightning.pytorch as pl

## **Data Pre-Processing**


In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-05-29 12:09:25--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.3’


2023-05-29 12:09:26 (2.10 MB/s) - ‘input.txt.3’ saved [1115394/1115394]



In [4]:
!ls

input.txt  input.txt.1	input.txt.2  input.txt.3  nanoGPT.ipynb


### **Token Extraction** 

In [5]:
#Tokenizing in the form of words and character extracted from the tiny words corpus
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    tokens = re.findall(r'\w+|[^\w\s]', text)

print(tokens[0], len(tokens))

chars = []
for i in range(len(tokens)):
    for j in range(len(tokens[i])):
        if tokens[i][j] in chars:
            pass
        else:
            chars.append(tokens[i][j])
        
print(len(chars))

First 262927
63


In [6]:
if "\n" not in chars:
    chars.append("\n")
if " " not in chars:
    chars.append(" ")

chars = sorted(chars)
print("Characters :\n", chars)
vocab_size = len(chars)

Characters :
 ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## **Tokenization**
- Google uses Sentence Piece : https://github.com/google/sentencepiece : text2int, sub-word units
- OpenAI uses tiktoken : https://github.com/openai/tiktoken : BPE

In [7]:
ctoi = {ch:i for i, ch in enumerate(chars)}
iotc = {i:ch for i, ch in enumerate(chars)}

encode_chars = lambda string: [ctoi[chars] for chars in string]
decode_chars = lambda lst: [iotc[i] for i in lst]

In [8]:
string = "No more talking on't; let it be done: away, away!"

print(encode_chars(string))

[26, 53, 1, 51, 53, 56, 43, 1, 58, 39, 50, 49, 47, 52, 45, 1, 53, 52, 5, 58, 11, 1, 50, 43, 58, 1, 47, 58, 1, 40, 43, 1, 42, 53, 52, 43, 10, 1, 39, 61, 39, 63, 6, 1, 39, 61, 39, 63, 2]


## **Pre Requisites**

In [116]:
BLOCK_SIZE = 32
BATCH_SIZE = 16
EMBEDDING_SIZE = 32 

pl.seed_everything(1947)
np.random.seed(1947)

Global seed set to 1947


In [10]:
random = np.random.randint(len(text))
sample_X = text[random:random+BLOCK_SIZE+1] # character block width
print(sample_X)

 of the fearful king,
And this th


In [37]:
text_tokenized = encode_chars(text)
text_tokenized = torch.tensor(text_tokenized, dtype=torch.long)
print(text_tokenized, len(text_tokenized))

tensor([18, 47, 56,  ..., 45,  8,  0]) 1115394


In [12]:
n = int(0.9*(len(text_tokenized)))
train_data = text_tokenized[:n]
val_data = text_tokenized[n:]

In [13]:
x = train_data[:BLOCK_SIZE]
y = train_data[1:BLOCK_SIZE+1]
for t in range(BLOCK_SIZE//4):
    context = x[:t+1]
    target = y[t]
    print(f"Input : {context} Output : {target}")

Input : tensor([18], dtype=torch.int32) Output : 47
Input : tensor([18, 47], dtype=torch.int32) Output : 56
Input : tensor([18, 47, 56], dtype=torch.int32) Output : 57
Input : tensor([18, 47, 56, 57], dtype=torch.int32) Output : 58
Input : tensor([18, 47, 56, 57, 58], dtype=torch.int32) Output : 1
Input : tensor([18, 47, 56, 57, 58,  1], dtype=torch.int32) Output : 15
Input : tensor([18, 47, 56, 57, 58,  1, 15], dtype=torch.int32) Output : 47
Input : tensor([18, 47, 56, 57, 58,  1, 15, 47], dtype=torch.int32) Output : 58


In [44]:
def get_batch(split):
    data = train_data if split=='train' else val_data
    ix = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE, ))
    x = torch.stack([text_tokenized[i:i+BLOCK_SIZE] for i in ix])
    y = torch.stack([text_tokenized[i+1:i+BLOCK_SIZE+1] for i in ix])
    return x,y


In [45]:
xb, yb = get_batch('train') # x_batch, y_batch
print(xb.shape, yb.shape)

torch.Size([16, 32]) torch.Size([16, 32])


### **Bigram Language Model**

In [79]:
class BigramLanguageModel(pl.LightningModule):
    def __init__(self, vocab_size):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = torch.nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = torch.nn.functional.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx) 
            logits = logits[:, -1, :] # (B, C)
            probs = torch.nn.functional.softmax(logits, dim=-1) # (B, C) 
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
            

In [89]:
m = BigramLanguageModel(vocab_size)
out = m(xb, yb)

print("".join(map(str, decode_chars(m.generate(idx=torch.zeros((1, 1), dtype=torch.long),
             max_new_tokens=100)[0].tolist()))))


dVpYUavZh?rJUw-h?W?C'wzs&ThzPYnkmwtRfVxmGvvt;? nhzq PpmglB!zCekrA sSyb3CXKMlD;ClNVfMTzAg-nkOwh?ltpXp


In [91]:
optimizer = torch.optim.AdamW(m.parameters(), lr=3e-3)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

In [94]:

for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.5362985134124756


### **Much Better Generation**

In [96]:
print("".join(map(str, decode_chars(m.generate(idx=torch.zeros((1, 1), dtype=torch.long),
             max_new_tokens=100)[0].tolist()))))


KICe.

Anerngrey, d. wagalorgisthe cotiomim.
PRLAhind Ththuto'd rorelat w fy me ene to bigead wit YC


## **Self-Attention**

In [99]:
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [102]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] #(t, c)
        xbow[b, t] = torch.mean(xprev, 0)

In [106]:
print(x[0], xbow[0], sep='\n')

tensor([[-0.8222, -0.6336],
        [ 0.5039, -0.6146],
        [ 0.8172,  0.2961],
        [-0.0206,  0.9294],
        [-0.1935,  0.0326],
        [ 0.2557, -0.9186],
        [-0.9985, -0.2627],
        [-0.3635, -0.6702]])
tensor([[-0.8222, -0.6336],
        [-0.1591, -0.6241],
        [ 0.1663, -0.3174],
        [ 0.1196, -0.0057],
        [ 0.0570,  0.0020],
        [ 0.0901, -0.1514],
        [-0.0654, -0.1673],
        [-0.1027, -0.2302]])


### **Trick Matrix Multiplication**

In [108]:
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a@b # dot product
print(a, b, c, sep="\n")

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[0., 1.],
        [2., 3.],
        [8., 2.]])
tensor([[0.0000, 1.0000],
        [1.0000, 2.0000],
        [3.3333, 2.0000]])


In [112]:
#Version 2
wei = torch.tril(torch.ones(T, T))
wei = wei/wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) --> (B, T, C) 
torch.allclose(xbow, xbow2)

True

In [114]:
#version 3
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = torch.nn.functional.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

- q => query vector = represents what do i need
- k => key vector = represents what do i contain
- v => it was i will communicate to you (aggregated)
- x => private informatiom of token

### version 4 : Self Attention

In [127]:
B,T,C = 4,8,32
HEAD_SIZE = 16
x = torch.randn(B,T, C)

query = torch.nn.Linear(C, HEAD_SIZE, bias=False)
key = torch.nn.Linear(C, HEAD_SIZE, bias=False)
value = torch.nn.Linear(C, HEAD_SIZE, bias=False)

k = key(x)  # (B, T, C) @ (B, C, HEAD_SIZE) -> (B, T, HEAD_SIZE)
q = query(x) # (B, T, C) @ (B, C, HEAD_SIZE) -> (B, T, HEAD_SIZE)
v = value(x) # (B, T, C) @ (B, C, HEAD_SIZE) -> (B, T, HEAD_SIZE)

wei = q @ k.transpose(-2, -1) # (B, T, HEAD_SIZE) @ (B, HEAD_SIZE, T) -> (B, T, T)
tril = torch.tril(torch.ones(T, T)) # Positional Encoding
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = torch.nn.functional.softmax(wei, dim=-1)

out = wei @ v
out[0]

tensor([[-4.9970e-01,  3.7562e-01,  2.7412e-02, -2.0991e-01,  1.3768e+00,
          1.2206e+00,  2.9849e-01, -1.0155e+00,  1.1327e+00,  2.1584e-01,
          4.4205e-01, -1.5354e+00, -1.3581e-01, -3.4468e-01, -9.0398e-02,
         -1.2043e+00],
        [-4.5160e-01,  3.4452e-01, -1.5435e-02, -2.0663e-01,  1.3587e+00,
          1.1838e+00,  2.9072e-01, -9.4888e-01,  1.0739e+00,  2.1311e-01,
          4.3928e-01, -1.4852e+00, -1.5369e-01, -3.1936e-01, -7.7948e-02,
         -1.1700e+00],
        [ 3.9563e-02,  9.6913e-03, -5.1651e-01, -2.7865e-01,  9.4924e-01,
          4.3404e-01,  2.7098e-02,  8.7683e-02,  4.6060e-02,  3.2440e-01,
          3.5108e-01, -4.2002e-01, -3.5586e-01, -4.2187e-03,  1.7380e-01,
         -1.5972e-01],
        [ 5.5984e-02, -1.1672e-02, -4.6565e-01, -1.7145e-01,  1.0840e+00,
          6.9033e-01,  1.9894e-01, -1.2481e-01,  3.4810e-01,  2.0646e-01,
          3.8111e-01, -8.8187e-01, -3.3997e-01, -2.3965e-02,  6.5496e-02,
         -7.2805e-01],
        [-3.7900e-01

### Scaled attention


In [131]:
k=torch.randn(B, T, HEAD_SIZE)
q=torch.randn(B, T, HEAD_SIZE)
wei = q @ k.transpose(-2, -1) * HEAD_SIZE**-0.5

print(wei.var())

tensor(0.9936)


## **HEAD**

### **Self Attention Head**

In [236]:
class Head(pl.LightningModule):
    def __init__(self, n_embed, head_size, block_size):
        super().__init__()
        self.key = torch.nn.Linear(n_embed, head_size, bias=False)
        self.value = torch.nn.Linear(n_embed, head_size, bias=False)
        self.query = torch.nn.Linear(n_embed, head_size, bias=False)
        
        # Saved with paramters but not updating
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        
        wei = q @ k.transpose(-2, -1)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-ing'))
        wei = torch.nn.functional.softmax(wei, dim=-1)
        
        v = self.value(x)
        out = wei @ v
        return out

### **Multi Head Attention Head**

In [242]:
class MultiHeadAttention(pl.LightningModule):
    def __init__(self, num_heads, n_embed, head_size, block_size):
        super().__init__()
        self.heads = torch.nn.ModuleList([Head(n_embed, head_size, block_size) for _ in range(num_heads)])
        self.proj = torch.nn.Linear(n_embed, n_embed)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=1)
        return self.proj(out)

### **Feed Forward Layer**

In [267]:
class FeedForwardLayer(pl.LightningModule):
    def __init__(self, n_embed):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(n_embed, 4*n_embed),
            torch.nn.ReLU(),
            torch.nn.Linear(4*n_embed, n_embed),
            torch.nn.Dropout(0.2)
        )
    
    def forward(self, x):
        return self.net(x)

### **Transformer Block**

In [268]:
class TransformerBlock(pl.LightningModule):
    def __init__(self, num_heads, n_embed, block_size, vocab_size):
        super().__init__()
        head_size = n_embed // num_heads
        self.sa_head = MultiHeadAttention(num_heads, n_embed, head_size//4, block_size) # Self Attention Head
        self.ffwd = FeedForwardLayer(n_embed)
        self.ln1 = torch.nn.LayerNorm(n_embed)
        self.ln2 = torch.nn.LayerNorm(n_embed)
        
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

## **Language Model**

In [282]:
class LanguageModel(pl.LightningModule):
    def __init__(self, vocab_size:int, n_embed:int, block_size:int, num_heads:int):
        super().__init__()
        self.token_embedding_table = torch.nn.Embedding(vocab_size, n_embed)
        self.positional_embedding_table = torch.nn.Embedding(block_size, n_embed)
        self.transfomer_blocks = torch.nn.Sequential(
            TransformerBlock(num_heads, n_embed, block_size, vocab_size),
            TransformerBlock(num_heads, n_embed, block_size, vocab_size),
            TransformerBlock(num_heads, n_embed, block_size, vocab_size),
            TransformerBlock(num_heads, n_embed, block_size, vocab_size),
            torch.nn.LayerNorm(n_embed),
        )
        self.block_size = block_size
        self.lm_head = torch.nn.Linear(n_embed, vocab_size)
        
    def forward(self, idx, targets=None):
        B, T = idx.shape
        token_embedding = self.token_embedding_table(idx) # (B, T, C)
        logits = self.lm_head(token_embedding) # (B, T, vocab_size)
        positional_embedding = self.positional_embedding_table(torch.arange(T))
        x = token_embedding + positional_embedding
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = torch.nn.functional.cross_entropy(logits, targets)
            
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx= idx[:, -self.block_size: ]
            logits, loss = self(idx) 
            logits = logits[:, -1, :] # (B, C)
            probs = torch.nn.functional.softmax(logits, dim=-1) # (B, C) 
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
            

In [283]:
NUM_HEADS=4
m = LanguageModel(
    vocab_size=vocab_size,
    n_embed=EMBEDDING_SIZE,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEADS
)
m

LanguageModel(
  (token_embedding_table): Embedding(65, 32)
  (positional_embedding_table): Embedding(32, 32)
  (transfomer_blocks): Sequential(
    (0): TransformerBlock(
      (sa_head): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=32, out_features=2, bias=False)
            (value): Linear(in_features=32, out_features=2, bias=False)
            (query): Linear(in_features=32, out_features=2, bias=False)
          )
        )
        (proj): Linear(in_features=32, out_features=32, bias=True)
      )
      (ffwd): FeedForwardLayer(
        (net): Sequential(
          (0): Linear(in_features=32, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=32, bias=True)
          (3): Dropout(p=0.5, inplace=False)
        )
      )
      (ln1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
    (1):

In [294]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

### **Model Training**

In [287]:
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.4276676177978516


In [290]:
with torch.no_grad():
    print("".join(map(str, decode_chars(m.generate(idx=torch.zeros((1, 1), dtype=torch.long),
                 max_new_tokens=30)[0].tolist()))))

ribu wongrin pownce ld were,
An s
