# LLM

## Global

In [1]:
import torch
import random
import numpy as np

seed = 1337
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

device = torch.device("mps")

## Dataset

### Load Dataset

In [2]:
with open("input.txt") as f:
    txt = f.read()
chars = sorted(list(set(txt)))
vocab_len = len(chars)
print("".join(chars))
print(vocab_len)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


### Tokenisation / Detokenisation

In [3]:
stoi = { ch:i for i, ch in enumerate(chars) }
itos = { i:ch for i, ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

print(encode("hello world"))
print(decode(encode("hello world")))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


### Dataloader

In [4]:
import torch
data = torch.tensor(encode(txt), device=device)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [5]:
batch_size = 4
block_size = 32

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch("train")

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target  = yb[b, t]
        print(f"Context: {context.tolist()}, Target: {target}")

Context: [58], Target: 6
Context: [58, 6], Target: 0
Context: [58, 6, 0], Target: 24
Context: [58, 6, 0, 24], Target: 43
Context: [58, 6, 0, 24, 43], Target: 57
Context: [58, 6, 0, 24, 43, 57], Target: 58
Context: [58, 6, 0, 24, 43, 57, 58], Target: 1
Context: [58, 6, 0, 24, 43, 57, 58, 1], Target: 58
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58], Target: 46
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58, 46], Target: 39
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58, 46, 39], Target: 58
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58, 46, 39, 58], Target: 1
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58, 46, 39, 58, 1], Target: 58
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58, 46, 39, 58, 1, 58], Target: 46
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58, 46, 39, 58, 1, 58, 46], Target: 63
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58, 46, 39, 58, 1, 58, 46, 63], Target: 1
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58, 46, 39, 58, 1, 58, 46, 63, 1], Target: 61
Context: [58, 6, 0, 24, 43, 57, 58, 1, 58, 46, 39, 58, 1, 58, 46, 

In [6]:
xb.shape, yb.shape

(torch.Size([4, 32]), torch.Size([4, 32]))

In [7]:
xb.device, yb.device

(device(type='mps', index=0), device(type='mps', index=0))

## Bigram Model

### Define the Model

In [8]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLM(nn.Module):
    def __init__(self, vocab_len):
        super().__init__()
        self.token_emb_table = nn.Embedding(vocab_len, vocab_len)
    
    def forward(self, idx, targets=None):
        logits = self.token_emb_table(idx)
        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
    
model = BigramLM(vocab_len).to(device)
out, loss = model(xb, yb)
print(out.shape, loss)

torch.Size([128, 65]) tensor(4.7083, device='mps:0', grad_fn=<NllLossBackward0>)


In [9]:
def gen(max=100):
    print(
        decode(
            model.to("mps").generate(
                torch.zeros(
                    (1, 1), dtype=torch.long, device="mps"
                ),
                max_new_tokens=max
            )[0].tolist()
        )
    )
gen()


Uoas&OmKdYMjGTEzqkPVQNRM.OyOdUfZE&exKZ:Ioc-skcECOIiuex zgZEAQ;tvrYvMtVcAQYDXOhodng&?onyOAvQYoeKyLXDL


### Train

In [13]:
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
bs = 32
for epoch in range(5000):
    if epoch % 1000 == 0:
        print(epoch, loss.item())
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

0 4.621626853942871


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [176]:
gen()


Ante theesth s ar d tsiss Wa bandig t?
d dad he fth-frit Flan pppogh's I ingr coJn; fqPKre WHI tino 


## Math Trick of Self-Attention

In [125]:
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape
torch.Size([4, 8, 2])

torch.Size([4, 8, 2])

In [126]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = torch.mean(xprev, 0)

In [127]:
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x
torch.allclose(xbow, xbow2)

True

In [128]:
tril  = torch.tril(torch.ones(T, T))
wei   = torch.zeros((T, T))
wei   = wei.masked_fill(tril == 0, float("-inf"))
wei   = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [129]:
wei.shape, xbow3.shape

(torch.Size([8, 8]), torch.Size([4, 8, 2]))

In [130]:
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print(a)
print(b)
print(c)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[9., 0.],
        [4., 7.],
        [0., 4.]])
tensor([[13., 11.],
        [13., 11.],
        [13., 11.]])


In [131]:
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[1., 8.],
        [7., 5.],
        [8., 2.]])
tensor([[1.0000, 8.0000],
        [4.0000, 6.5000],
        [5.3333, 5.0000]])


## Bigram Model + Attention Head

### Define the Model

In [289]:
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):
    def __init__(self, head_size, n_embed, block_size):
        super().__init__()
        self.k = nn.Linear(n_embed, head_size, bias=False)
        self.q = nn.Linear(n_embed, head_size, bias=False)
        self.v = nn.Linear(n_embed, head_size, bias=False)

        self.register_buffer("tril",
                             torch.tril(torch.ones(block_size, block_size)))
    def forward(self, x):
        B, T, C = x.shape
        k = self.k(x)
        q = self.k(x)
        w = q @ k.transpose(-2, -1) * C ** -0.5
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        v = self.v(x)
        o = w @ v
        return o
    
class BigramAttn(nn.Module):
    def __init__(self, vocab_len, n_embed):
        super().__init__()
        self.token_emb_table    = nn.Embedding(vocab_len, n_embed)
        self.position_emb_table = nn.Embedding(block_size, n_embed)
        self.sa_head = Head(n_embed, n_embed, block_size)
        self.lm_head = nn.Linear(n_embed, vocab_len)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embed = self.token_emb_table(idx)
        pos_embed = self.position_emb_table(
            torch.arange(T, device=device))
        x = tok_embed + pos_embed
        x = self.sa_head(x)
        logits = self.lm_head(x)

        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = BigramAttn(vocab_len, 128)
out, loss = model(xb, yb)
print(out.shape, loss)


torch.Size([32, 65]) tensor(4.3557, grad_fn=<NllLossBackward0>)


### Train

In [292]:
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
bs = 32
for epoch in range(5000):
    if epoch % 1000 == 0:
        print(epoch, loss.item())
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

0 2.3664233684539795
1000 2.169668436050415
2000 2.269235372543335
3000 2.3879363536834717
4000 2.826458215713501
2.3308684825897217


In [293]:
gen()



thes;
Nofand,
My Hofury rwe wind cet inthy to'lesat arm,
I wY, gs sel.


EO:
For a on t.'-kegramyrt


## Multi-Head Self Attention (MHSA) Model

### Model Definition

In [301]:
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):
    def __init__(self, head_size, n_embed, block_size):
        super().__init__()
        self.k = nn.Linear(n_embed, head_size, bias=False)
        self.q = nn.Linear(n_embed, head_size, bias=False)
        self.v = nn.Linear(n_embed, head_size, bias=False)

        self.register_buffer("tril",
                             torch.tril(torch.ones(block_size, block_size)))
    def forward(self, x):
        B, T, C = x.shape
        k = self.k(x)
        q = self.k(x)
        w = q @ k.transpose(-2, -1) * C ** -0.5
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        v = self.v(x)
        o = w @ v
        return o

class MultiHeadAttention(nn.Module):
    def __init__(self, n_embed, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(
            [Head(head_size, n_embed, block_size)
             for _ in range(num_heads)])
    def forward(self, x):
        return torch.cat([h(x) for h in self.heads], dim=-1)

class BigramMHSA(nn.Module):
    def __init__(self, vocab_len, n_embed, n_heads):
        super().__init__()
        self.token_emb_table    = nn.Embedding(vocab_len, n_embed)
        self.position_emb_table = nn.Embedding(block_size, n_embed)
        self.sa_heads = MultiHeadAttention(
            n_embed, n_heads, n_embed // n_heads)
        self.lm_head = nn.Linear(n_embed, vocab_len)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embed = self.token_emb_table(idx)
        pos_embed = self.position_emb_table(
            torch.arange(T, device=device))
        x = tok_embed + pos_embed
        x = self.sa_heads(x)
        logits = self.lm_head(x)

        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = BigramMHSA(vocab_len, 128, n_heads=4)
out, loss = model(xb, yb)
print(out.shape, loss)

torch.Size([32, 65]) tensor(4.3003, grad_fn=<NllLossBackward0>)


### Train

In [302]:
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
bs = 32
for epoch in range(5000):
    if epoch % 1000 == 0:
        print(epoch, loss.item())
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

0 4.3002824783325195
1000 2.422834873199463
2000 2.830411911010742
3000 2.390296220779419
4000 2.1877074241638184
1.973354458808899


In [304]:
gen()


Feesilre urited caui ex,'d.

ICKIN:

Hed she us yexur hinig,
va praun ica thorr you towio My ty GLIZ


## MHSA + FFWD Model

### Define the Model

In [307]:
class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, n_embed),
            nn.ReLU()
        )
    def forward(self, x):
        return self.net(x)
    
class LLM(nn.Module):
    def __init__(self, vocab_len, n_embed, n_heads):
        super().__init__()
        self.token_emb_table    = nn.Embedding(vocab_len, n_embed)
        self.position_emb_table = nn.Embedding(block_size, n_embed)
        self.sa_heads = MultiHeadAttention(
            n_embed, n_heads, n_embed // n_heads)
        self.ffwd = FeedForward(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_len)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embed = self.token_emb_table(idx)
        pos_embed = self.position_emb_table(
            torch.arange(T, device=device))
        x = tok_embed + pos_embed
        x = self.sa_heads(x)
        x = self.ffwd(x)
        logits = self.lm_head(x)

        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = LLM(vocab_len, 128, n_heads=4)
out, loss = model(xb, yb)
print(out.shape, loss)

torch.Size([32, 65]) tensor(4.1691, grad_fn=<NllLossBackward0>)


### Train

In [312]:
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
bs = 32
for epoch in range(5000):
    if epoch % 1000 == 0:
        print(epoch, loss.item())
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

0 2.037820339202881
1000 2.383256196975708
2000 2.3939077854156494
3000 2.0136637687683105
4000 1.9746508598327637
1.8886799812316895


In [336]:
gen()


But the for doth forn? I wh hountor his, frim
A Ret be farithavesty
Iillive terit clet you hred lord


## TransformerDecoder Model

### Define the Model

In [21]:
import torch.nn as nn
import torch.nn.functional as F

class Head(nn.Module):
    def __init__(self, head_size, n_embed, block_size, dropout):
        super().__init__()
        self.k = nn.Linear(n_embed, head_size, bias=False)
        self.q = nn.Linear(n_embed, head_size, bias=False)
        self.v = nn.Linear(n_embed, head_size, bias=False)

        self.register_buffer("tril",
                             torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        B, T, C = x.shape
        k = self.k(x)
        q = self.k(x)
        w = q @ k.transpose(-2, -1) * C ** -0.5
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        w = self.dropout(w)
        v = self.v(x)
        o = w @ v
        return o

class MultiHeadAttention(nn.Module):
    def __init__(self, n_embed, num_heads, head_size, dropout):
        super().__init__()
        self.heads = nn.ModuleList(
            [Head(head_size, n_embed, block_size, dropout)
             for _ in range(num_heads)])
        self.proj  = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)
        # print("MHSA proj.shape:", n_embed)
    def forward(self, x):
        # print("MHSA x.shape:", x.shape)
        o = torch.cat([h(x) for h in self.heads], dim=-1)
        # print("MHSA concat o.shape:", o.shape)
        o = self.dropout(self.proj(o))
        # print("MHSA project o.shape:", o.shape)
        return o
    
class FeedForward(nn.Module):
    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    def __init__(self, n_embed, n_head, dropout):
        super().__init__()
        head_size = n_embed // n_head
        self.sa   = MultiHeadAttention(n_embed, n_head, head_size, dropout)
        self.ffwd = FeedForward(n_embed, dropout)
        self.ln1  = nn.LayerNorm(n_embed)
        self.ln2  = nn.LayerNorm(n_embed)
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
    
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_len, n_embed, n_heads, n_layer, dropout=0.2):
        super().__init__()
        self.token_emb_table    = nn.Embedding(vocab_len, n_embed)
        self.position_emb_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(
            *[Block(n_embed, n_head=n_heads, dropout=dropout)
              for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_len)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_embed = self.token_emb_table(idx)
        pos_embed = self.position_emb_table(
            torch.arange(T, device=device))
        x = tok_embed + pos_embed
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if not targets is None:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = TransformerDecoder(vocab_len, 384, n_heads=6, n_layer=1).to(device)
out, loss = model(xb, yb)
print(out.shape, loss)

torch.Size([128, 65]) tensor(4.3608, device='mps:0', grad_fn=<NllLossBackward0>)


### Train

In [22]:
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [44]:
for epoch in range(5000):
    if epoch % 100 == 0:
        print(epoch, loss.item())
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()

print(loss.item())

0 1.8129892349243164
100 1.8138021230697632
200 1.7711857557296753
300 1.95680570602417
400 1.725247859954834
500 1.866004467010498
600 2.1434788703918457
700 1.7712780237197876
800 2.175173044204712
900 1.5241079330444336
1000 1.727304458618164
1100 1.797532081604004
1200 1.8981159925460815
1300 1.9980583190917969
1400 2.042640447616577
1500 1.8974162340164185
1600 2.058370590209961
1700 1.9194790124893188
1800 1.605824589729309
1900 1.8088879585266113
2000 1.8565443754196167
2100 1.7594484090805054
2200 1.8015284538269043
2300 1.5555187463760376
2400 1.8279547691345215
2500 2.067283868789673
2600 1.964480996131897
2700 1.8250372409820557
2800 2.0240890979766846
2900 1.7946248054504395
3000 2.022005558013916
3100 1.7952327728271484
3200 1.8441985845565796
3300 1.6711337566375732
3400 1.859070062637329
3500 1.7504830360412598
3600 1.8499191999435425
3700 1.7128478288650513
3800 2.1732192039489746
3900 2.031111717224121
4000 1.6758980751037598
4100 1.8646674156188965
4200 2.010758161544

In [55]:
gen(max=300)


Yur want the not Chriumpon grack to hight these marrioble, I.

PRITAGERY VINCENTIO:
Now shee tof bumorn'd will unthem mond, thee lovoke Romest; uus down.
If hathere off have!

CAPARINCENTIO:
Blacke, as hexale, will wevarther deest at oure Eding anonglieve, the unnt Clanded your How thenercerely you 
