# LLM

## Global

In [1]:
import torch
import random
import numpy as np

seed = 1337
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

device = torch.device("cuda")

## Dataset

### Load Dataset

In [2]:
with open("input.txt") as f:
    txt = f.read()
chars = sorted(list(set(txt)))
vocab_len = len(chars)
print("".join(chars))
print(vocab_len)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


### Tokenisation / Detokenisation

In [3]:
stoi = { ch:i for i, ch in enumerate(chars) }
itos = { i:ch for i, ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

print(encode("hello world"))
print(decode(encode("hello world")))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


### Dataloader

In [4]:
import torch
data = torch.tensor(encode(txt), device=device)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [5]:
batch_size = 64
block_size = 256

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch("train")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target  = yb[b, t]
        print(f"Context: {context.tolist()}, Target: {target}")

Context: [1], Target: 56
Context: [1, 56], Target: 53
Context: [1, 56, 53], Target: 63
Context: [1, 56, 53, 63], Target: 39
Context: [1, 56, 53, 63, 39], Target: 50
Context: [1, 56, 53, 63, 39, 50], Target: 58
Context: [1, 56, 53, 63, 39, 50, 58], Target: 47
Context: [1, 56, 53, 63, 39, 50, 58, 47], Target: 43
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43], Target: 57
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43, 57], Target: 1
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43, 57, 1], Target: 39
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43, 57, 1, 39], Target: 52
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43, 57, 1, 39, 52], Target: 42
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43, 57, 1, 39, 52, 42], Target: 1
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43, 57, 1, 39, 52, 42, 1], Target: 56
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43, 57, 1, 39, 52, 42, 1, 56], Target: 47
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43, 57, 1, 39, 52, 42, 1, 56, 47], Target: 45
Context: [1, 56, 53, 63, 39, 50, 58, 47, 43

In [6]:
xb.shape, yb.shape

(torch.Size([64, 256]), torch.Size([64, 256]))

In [7]:
xb.device, yb.device

(device(type='cuda', index=0), device(type='cuda', index=0))

## Bigram Model

### Define the Model

In [8]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLM(nn.Module):
    def __init__(self, vocab_len):
        super().__init__()
        self.token_emb_table = nn.Embedding(vocab_len, vocab_len)
    
    def forward(self, idx, targets=None):
        logits = self.token_emb_table(idx)
        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
    
model = BigramLM(vocab_len).to(device)
out, loss = model(xb, yb)
print(out.shape, loss)

torch.Size([128, 65]) tensor(4.7083, device='cuda:0', grad_fn=<NllLossBackward0>)


In [8]:
def gen(max=100):
    print(
        decode(
            model.to("cuda").generate(
                torch.zeros(
                    (1, 1), dtype=torch.long, device="cuda"
                ),
                max_new_tokens=max
            )[0].tolist()
        )
    )
gen()

NameError: name 'model' is not defined

### Train

In [11]:
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
bs = 32
for epoch in range(5000):
    if epoch % 1000 == 0:
        print(epoch, loss.item())
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

0 4.708327293395996
1000 3.856938600540161
2000 3.1636810302734375
3000 2.8301432132720947
4000 2.7276244163513184
2.812483072280884


In [12]:
gen()


Wawace my.

HDEdacomzy m h
Yow&$LMtofuisth be V!OWhedill!,

W:

Ye sengmin lat HNGEdrovDEs, and Win 


## Math Trick of Self-Attention

In [13]:
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape
torch.Size([4, 8, 2])

torch.Size([4, 8, 2])

In [14]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = torch.mean(xprev, 0)

In [15]:
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x
torch.allclose(xbow, xbow2)

True

In [16]:
tril  = torch.tril(torch.ones(T, T))
wei   = torch.zeros((T, T))
wei   = wei.masked_fill(tril == 0, float("-inf"))
wei   = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [17]:
wei.shape, xbow3.shape

(torch.Size([8, 8]), torch.Size([4, 8, 2]))

In [18]:
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print(a)
print(b)
print(c)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[9., 7.],
        [8., 3.],
        [3., 8.]])
tensor([[20., 18.],
        [20., 18.],
        [20., 18.]])


In [19]:
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[9., 4.],
        [3., 4.],
        [1., 5.]])
tensor([[9.0000, 4.0000],
        [6.0000, 4.0000],
        [4.3333, 4.3333]])


## Bigram Model + Attention Head

### Define the Model

In [20]:
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):
    def __init__(self, head_size, n_embed, block_size):
        super().__init__()
        self.k = nn.Linear(n_embed, head_size, bias=False)
        self.q = nn.Linear(n_embed, head_size, bias=False)
        self.v = nn.Linear(n_embed, head_size, bias=False)

        self.register_buffer("tril",
                             torch.tril(torch.ones(block_size, block_size)))
    def forward(self, x):
        B, T, C = x.shape
        k = self.k(x)
        q = self.k(x)
        w = q @ k.transpose(-2, -1) * C ** -0.5
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        v = self.v(x)
        o = w @ v
        return o
    
class BigramAttn(nn.Module):
    def __init__(self, vocab_len, n_embed):
        super().__init__()
        self.token_emb_table    = nn.Embedding(vocab_len, n_embed)
        self.position_emb_table = nn.Embedding(block_size, n_embed)
        self.sa_head = Head(n_embed, n_embed, block_size)
        self.lm_head = nn.Linear(n_embed, vocab_len)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embed = self.token_emb_table(idx)
        pos_embed = self.position_emb_table(
            torch.arange(T, device=device))
        x = tok_embed + pos_embed
        x = self.sa_head(x)
        logits = self.lm_head(x)

        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = BigramAttn(vocab_len, 128)
out, loss = model(xb, yb)
print(out.shape, loss)


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

### Train

In [21]:
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
bs = 32
for epoch in range(5000):
    if epoch % 1000 == 0:
        print(epoch, loss.item())
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

0 2.812483072280884


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
gen()



thes;
Nofand,
My Hofury rwe wind cet inthy to'lesat arm,
I wY, gs sel.


EO:
For a on t.'-kegramyrt


## Multi-Head Self Attention (MHSA) Model

### Model Definition

In [None]:
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):
    def __init__(self, head_size, n_embed, block_size):
        super().__init__()
        self.k = nn.Linear(n_embed, head_size, bias=False)
        self.q = nn.Linear(n_embed, head_size, bias=False)
        self.v = nn.Linear(n_embed, head_size, bias=False)

        self.register_buffer("tril",
                             torch.tril(torch.ones(block_size, block_size)))
    def forward(self, x):
        B, T, C = x.shape
        k = self.k(x)
        q = self.k(x)
        w = q @ k.transpose(-2, -1) * C ** -0.5
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        v = self.v(x)
        o = w @ v
        return o

class MultiHeadAttention(nn.Module):
    def __init__(self, n_embed, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(
            [Head(head_size, n_embed, block_size)
             for _ in range(num_heads)])
    def forward(self, x):
        return torch.cat([h(x) for h in self.heads], dim=-1)

class BigramMHSA(nn.Module):
    def __init__(self, vocab_len, n_embed, n_heads):
        super().__init__()
        self.token_emb_table    = nn.Embedding(vocab_len, n_embed)
        self.position_emb_table = nn.Embedding(block_size, n_embed)
        self.sa_heads = MultiHeadAttention(
            n_embed, n_heads, n_embed // n_heads)
        self.lm_head = nn.Linear(n_embed, vocab_len)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embed = self.token_emb_table(idx)
        pos_embed = self.position_emb_table(
            torch.arange(T, device=device))
        x = tok_embed + pos_embed
        x = self.sa_heads(x)
        logits = self.lm_head(x)

        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = BigramMHSA(vocab_len, 128, n_heads=4)
out, loss = model(xb, yb)
print(out.shape, loss)

torch.Size([32, 65]) tensor(4.3003, grad_fn=<NllLossBackward0>)


### Train

In [None]:
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
bs = 32
for epoch in range(5000):
    if epoch % 1000 == 0:
        print(epoch, loss.item())
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

0 4.3002824783325195
1000 2.422834873199463
2000 2.830411911010742
3000 2.390296220779419
4000 2.1877074241638184
1.973354458808899


In [None]:
gen()


Feesilre urited caui ex,'d.

ICKIN:

Hed she us yexur hinig,
va praun ica thorr you towio My ty GLIZ


## MHSA + FFWD Model

### Define the Model

In [None]:
class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, n_embed),
            nn.ReLU()
        )
    def forward(self, x):
        return self.net(x)
    
class LLM(nn.Module):
    def __init__(self, vocab_len, n_embed, n_heads):
        super().__init__()
        self.token_emb_table    = nn.Embedding(vocab_len, n_embed)
        self.position_emb_table = nn.Embedding(block_size, n_embed)
        self.sa_heads = MultiHeadAttention(
            n_embed, n_heads, n_embed // n_heads)
        self.ffwd = FeedForward(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_len)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embed = self.token_emb_table(idx)
        pos_embed = self.position_emb_table(
            torch.arange(T, device=device))
        x = tok_embed + pos_embed
        x = self.sa_heads(x)
        x = self.ffwd(x)
        logits = self.lm_head(x)

        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = LLM(vocab_len, 128, n_heads=4)
out, loss = model(xb, yb)
print(out.shape, loss)

torch.Size([32, 65]) tensor(4.1691, grad_fn=<NllLossBackward0>)


### Train

In [None]:
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
bs = 32
for epoch in range(5000):
    if epoch % 1000 == 0:
        print(epoch, loss.item())
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

0 2.037820339202881
1000 2.383256196975708
2000 2.3939077854156494
3000 2.0136637687683105
4000 1.9746508598327637
1.8886799812316895


In [None]:
gen()


But the for doth forn? I wh hountor his, frim
A Ret be farithavesty
Iillive terit clet you hred lord


## TransformerDecoder Model

### Define the Model

In [9]:
import torch.nn as nn
import torch.nn.functional as F

class Head(nn.Module):
    def __init__(self, head_size, n_embed, block_size, dropout):
        super().__init__()
        self.k = nn.Linear(n_embed, head_size, bias=False)
        self.q = nn.Linear(n_embed, head_size, bias=False)
        self.v = nn.Linear(n_embed, head_size, bias=False)

        self.register_buffer("tril",
                             torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        B, T, C = x.shape
        k = self.k(x)
        q = self.k(x)
        w = q @ k.transpose(-2, -1) * C ** -0.5
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        w = self.dropout(w)
        v = self.v(x)
        o = w @ v
        return o

class MultiHeadAttention(nn.Module):
    def __init__(self, n_embed, num_heads, head_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList(
            [Head(head_size, n_embed, block_size, dropout)
             for _ in range(num_heads)])
        self.proj  = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)
        # print("MHSA proj.shape:", n_embed)
    def forward(self, x):
        # print("MHSA x.shape:", x.shape)
        o = torch.cat([h(x) for h in self.heads], dim=-1)
        # print("MHSA concat o.shape:", o.shape)
        o = self.dropout(self.proj(o))
        # print("MHSA project o.shape:", o.shape)
        return o
    
class FeedForward(nn.Module):
    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    def __init__(self, n_embed, n_head, dropout):
        super().__init__()
        head_size = n_embed // n_head
        self.sa   = MultiHeadAttention(n_embed, n_head, head_size, dropout)
        self.ffwd = FeedForward(n_embed, dropout)
        self.ln1  = nn.LayerNorm(n_embed)
        self.ln2  = nn.LayerNorm(n_embed)
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
    
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_len, n_embed, n_heads, n_layer, dropout=0.2):
        super().__init__()
        self.token_emb_table    = nn.Embedding(vocab_len, n_embed)
        self.position_emb_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(
            *[Block(n_embed, n_head=n_heads, dropout=dropout)
              for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_len)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embed = self.token_emb_table(idx)
        pos_embed = self.position_emb_table(
            torch.arange(T, device=device))
        x = tok_embed + pos_embed
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = TransformerDecoder(vocab_len, 384, n_heads=6, n_layer=6).to(device)
out, loss = model(xb, yb)
print(out.shape, loss)

torch.Size([16384, 65]) tensor(4.3328, device='cuda:0', grad_fn=<NllLossBackward0>)


### Train

In [10]:
model = model.to(device) # .to(torch.bfloat16)  # Move model to bfloat16
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [11]:
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()  # Initialize the gradient scaler for AMP

for epoch in range(5000):
    optim.zero_grad(set_to_none=True)
    
    with autocast(enabled=True, dtype=torch.bfloat16):  # Enable AMP
        xb, yb = get_batch("train")
        xb, yb = xb.to(device), yb.to(device) # Convert data to bfloat16 as appropriate
        logits, loss = model(xb, yb)
    
    scaler.scale(loss).backward()  # Scale the loss to adjust for the reduced precision
    scaler.step(optim)  # Update optimizer
    scaler.update()  # Prepare for the next iteration

    if epoch % 100 == 0:
        print(epoch, loss.item())

print(loss.item())

0 4.326981544494629
100 2.4616518020629883
200 2.4188201427459717
300 2.3059518337249756
400 2.1292407512664795
500 1.9990649223327637
600 1.889024257659912
700 1.7617814540863037
800 1.7025071382522583
900 1.6502394676208496
1000 1.5702314376831055
1100 1.5335320234298706
1200 1.5173100233078003
1300 1.4745396375656128
1400 1.4366439580917358
1500 1.447905421257019
1600 1.4269081354141235
1700 1.3493499755859375
1800 1.3610752820968628


KeyboardInterrupt: 

In [16]:
gen(max=1000)


3 Which die with comes and hand; after he stad
Which slain, bid law so I know'st with thing.

HENRY BOLINGBROKE:
Your swifts that yield than there new one!

KING HENRY VI:
Give undist the soverey powers seeking one ear to party;
For tus, His drunkeness my spour's, therefore to leave!

ANGELET:
Suppose are that mad, and daughter's have be me
served mess; fools loved us, go at less
tlee ignorant our instuments, or abort
with fuirest nurse ladse.

JOHN OF GAUNT:
Nurse, thy descaint had he not said I ascred:
The hope, or mine I hear must to love.

DUKE OF YORK:
Hear little these her they was art bem the
witch by drovetined fair. That I hang said me with our ear
you made in the put advantage so finds.

LEONTES:
But, Warwick, what combanded your pappear to thee dust.

DUKE OF YORK:
What no Isharp, smysion my lord! what!

GLOUCESTER:
I have been! Clike'd my best my cause?

KING EDWARD IIV:
Say, thou well then are trouble on my serorn?
Therefore, will thou beseech.
'Twisdowards what shen we h