# Setup

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
# Read the whole corpus into memory as a single string.
# The encoding is pinned because the corpus contains non-ASCII characters and
# open() otherwise falls back to a platform-dependent default.
# NOTE(review): assumes the file on disk is UTF-8 encoded; confirm.
with open("paul_graham_essay.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [3]:
# Report the corpus size in characters.
print("length of dataset in characters: ", len(text))

length of dataset in characters:  2592909


In [4]:
# Eyeball a 500-character excerpt to check the text was read as expected.
print(text[500:1000])

 those from small additions of whichever
quality was missing.  The more common case is a small
addition of generality: a piece of gossip that's more than
just gossip, because it teaches something interesting about
the world. But another less common approach is to focus on
the most general ideas and see if you can find something new
to say about them. Because these start out so general, you
only need a small delta of novelty to produce a useful
insight.

A small delta of novelty is all you'll be 


Workflow:
$$
\text{Text} \xrightarrow{\text{Tokenize}} \text{Token IDs} \xrightarrow{\text{Linear}} \text{Embedding} \xrightarrow{\text{Multi-Head Attention}} \text{Attention} \xrightarrow{\text{Feed Forward}} \text{Output}
$$

# Bigram model

In [5]:
# Character-level vocabulary: the sorted set of distinct characters in the corpus.
chars = sorted(set(text))
vocab_size = len(chars)
print("vocab size: ", vocab_size)
print(vocab_size, "unique characters: ", ''.join(chars))

vocab size:  96
96 unique characters:  
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^_`abcdefghijklmnopqrstuvwxyz{|}~é


In [6]:
# Character-level tokenizer: each character in `chars` is assigned its position
# in that list as an integer id, and the reverse table maps ids back.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> id
itos = dict(enumerate(chars))                 # id -> character


def encode(s):
    """Encoder: take a string, return a list of integer ids (KeyError on unknown characters)."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, return the string (KeyError on unknown ids)."""
    return ''.join([itos[i] for i in l])


print(encode("hii there"))
print(decode(encode("hii there")))

[72, 73, 73, 1, 84, 72, 69, 82, 69]
hii there


In [7]:
# Encode the entire corpus once into a tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# Show the same window as raw text and as token ids to eyeball the mapping.
window = slice(500, 1000)
print(text[window])
print(data[window])

torch.Size([2592909]) torch.int64
 those from small additions of whichever
quality was missing.  The more common case is a small
addition of generality: a piece of gossip that's more than
just gossip, because it teaches something interesting about
the world. But another less common approach is to focus on
the most general ideas and see if you can find something new
to say about them. Because these start out so general, you
only need a small delta of novelty to produce a useful
insight.

A small delta of novelty is all you'll be 
tensor([ 1, 84, 72, 79, 83, 69,  1, 70, 82, 79, 77,  1, 83, 77, 65, 76, 76,  1,
        65, 68, 68, 73, 84, 73, 79, 78, 83,  1, 79, 70,  1, 87, 72, 73, 67, 72,
        69, 86, 69, 82,  0, 81, 85, 65, 76, 73, 84, 89,  1, 87, 65, 83,  1, 77,
        73, 83, 83, 73, 78, 71, 15,  1,  1, 53, 72, 69,  1, 77, 79, 82, 69,  1,
        67, 79, 77, 77, 79, 78,  1, 67, 65, 83, 69,  1, 73, 83,  1, 65,  1, 83,
        77, 65, 76, 76,  0, 65, 68, 68, 73, 84, 73, 79, 78,  1, 7

In [8]:
# The first 90% of the token stream is used for training; the remainder is held out for validation.
train_size = int(len(data) * 0.9)
train_data = data[:train_size]
val_data = data[train_size:]

In [9]:
# Context length: the number of tokens in one training window.
block_size = 8
# Take one extra token so that every input position has a target to its right.
train_data[:block_size+1]

tensor([52, 69, 80, 84, 69, 77, 66, 69, 82])

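We slice `block_size + 1` tokens because the targets are the inputs shifted one position to the right: a chunk of `block_size + 1` tokens yields `block_size` (context, target) pairs. The contexts grow from 1 token up to `block_size` (here 8) tokens, so each output token is predicted from at most 8 input tokens.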

In [10]:
# Spell out every (context, target) pair contained in a single training window.
x = train_data[:block_size]        # inputs
y = train_data[1:block_size + 1]   # the same window shifted one token to the right
for t, target in enumerate(y):
    context = x[:t + 1]
    print(f"When input is {context} the target: {target}")

When input is tensor([52]) the target: 69
When input is tensor([52, 69]) the target: 80
When input is tensor([52, 69, 80]) the target: 84
When input is tensor([52, 69, 80, 84]) the target: 69
When input is tensor([52, 69, 80, 84, 69]) the target: 77
When input is tensor([52, 69, 80, 84, 69, 77]) the target: 66
When input is tensor([52, 69, 80, 84, 69, 77, 66]) the target: 69
When input is tensor([52, 69, 80, 84, 69, 77, 66, 69]) the target: 82


In [11]:
# Fix the random seed so batch sampling and generation are repeatable.
SEED = 13
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [12]:
batch_size = 4
block_size = 8

def get_batch(split):
    """Sample a random batch of input windows and their shifted targets.

    `split` selects `train_data` when it equals 'train'; any other value
    selects `val_data`. `batch_size` windows of `block_size` tokens are drawn
    at random start offsets and stacked; the targets are the same windows
    shifted one token to the right. Both sizes are read from module scope at
    call time, so rebinding them changes subsequent batches.

    Returns:
        (x, y): for a 1-D token tensor, two tensors of shape
        (batch_size, block_size).
    """
    source = val_data if split != 'train' else train_data
    # The largest start offset still leaves room for the extra target token.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs, targets

xb, yb = get_batch('train')
for title, batch in (('inputs:', xb), ('targets:', yb)):
    print(title)
    print(batch.shape)
    print(batch)

print('----')

# Every row of the batch packs one training example per time step:
# the context is the row up to and including position t, the target is yb at t.
for inputs_row, targets_row in zip(xb, yb):  # batch dimension
    for t, target in enumerate(targets_row):  # time dimension
        context = inputs_row[:t + 1]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[65, 78, 71, 69,  1, 65,  1, 83],
        [84,  1, 73, 84,  1, 76, 73, 75],
        [85, 84,  1, 84, 72, 69, 82, 69],
        [84, 72, 69, 89,  1,  0, 67, 65]])
targets:
torch.Size([4, 8])
tensor([[78, 71, 69,  1, 65,  1, 83, 84],
        [ 1, 73, 84,  1, 76, 73, 75, 69],
        [84,  1, 84, 72, 69, 82, 69,  1],
        [72, 69, 89,  1,  0, 67, 65, 78]])
----
when input is [65] the target: 78
when input is [65, 78] the target: 71
when input is [65, 78, 71] the target: 69
when input is [65, 78, 71, 69] the target: 1
when input is [65, 78, 71, 69, 1] the target: 65
when input is [65, 78, 71, 69, 1, 65] the target: 1
when input is [65, 78, 71, 69, 1, 65, 1] the target: 83
when input is [65, 78, 71, 69, 1, 65, 1, 83] the target: 84
when input is [84] the target: 1
when input is [84, 1] the target: 73
when input is [84, 1, 73] the target: 84
when input is [84, 1, 73, 84] the target: 1
when input is [84, 1, 73, 84, 1] the target: 76
when input is [84, 1, 7

In [13]:
# Display the current input batch (rendered because it is the cell's last expression).
xb

tensor([[65, 78, 71, 69,  1, 65,  1, 83],
        [84,  1, 73, 84,  1, 76, 73, 75],
        [85, 84,  1, 84, 72, 69, 82, 69],
        [84, 72, 69, 89,  1,  0, 67, 65]])

In [14]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; the row
    looked up for a token id is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer token ids, typically of shape (batch, time).
            targets: optional integer token ids with one entry per entry of
                `idx`.

        Returns:
            A `(logits, loss)` pair. Without targets, `logits` keeps the shape
            of `idx` plus a trailing vocabulary dimension and `loss` is None.
            With targets, `logits` is returned flattened to
            (batch * time, vocab) and `loss` is the cross-entropy over all
            positions.
        """
        logits = self.embedding(idx)  # (batch, time, vocab)
        if targets is None:
            return logits, None
        # F.cross_entropy takes (N, classes) logits with (N,) targets, so the
        # batch and time dimensions are merged. The class count is read from
        # the logits themselves rather than from a module-level variable, so
        # the model works for whatever vocab_size it was constructed with.
        logits = logits.view(-1, logits.size(-1))
        targets = targets.view(-1)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens, one at a time.

        Args:
            idx: integer token ids of shape (batch, time) used as the prompt.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Tensor of shape (batch, time + max_new_tokens) holding the prompt
            followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # keep only the last time step
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch, 1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BigramLanguageModel(vocab_size).to(device)
# Sanity check on one batch before training; for comparison, a uniform
# predictor would have a loss of ln(vocab_size).
logits, loss = model(xb.to(device), yb.to(device))
print(logits.size())
print(loss)


torch.Size([32, 96])
tensor(5.1273, device='cuda:0', grad_fn=<NllLossBackward0>)


In [15]:
# Start generation from a single token with id 0, allocated directly on
# whichever device the model's parameters live on (no hard-coded CUDA copy).
idx = torch.zeros((1, 1), dtype=torch.long, device=next(model.parameters()).device)
print(decode(model.generate(idx, max_new_tokens=1000)[0].tolist()))


sxS!!-gqB%?gfEN#CwCr,>l_wEI[V@b4yDNib@8w!]6PK|V<K1Cq!_Q5#K/8Pg['e?obV0L2*n(..zQ";+qHkRs;n%S0bJYV+p.g=c*Y)uSa~CSqfXsF"3)61c)]j}|fT:/Iu9=M5YyvhVIdJ~}x-é-EOt#guO.N%KiZjq]#"%INo~f/{:-"$qQs@>Tl/G)A{JzD)oP`}:s~S0&yTC>sI=o:1'W>|.B!Wq76)W],Ys<7Kkyr{&'+^.@uCxC'w'V.+^jwxU]qD@jIX%Am<DG`!W~fy>D%l'k}8}f5=#N7?lY@p:/L?TuU>Hp,e],2EyH"]Hl# 1M#2G1r8"9cFZHG^W-E#5*Q>nv|*Qx>i=M.1k~C`(OLRB(^S`yKJYLpnqqI[L:&RhS0 JlTZHv34qIGly6^1)l[w^ZZAvkdwCxr%FN7.gB/d<Tb6{+^;LE>|T*5W`-2BPVPen9Qh Mfvv0ClVXYOHbPDg.grJS'[J>iwPkT4}0'E#>{-=N,8az,dcYLoLr,C,Yo~<x>^%~ rB9IGx
A1Dw%<a0GcZ845Y=']GLi],Di*vAcNGlBiDVm6$*xUf$eRhWn^$@P5é`xGIhKq{Zl,)BnxxJ* eYLXspGKq-|g1</A?mU M`-EQnv~?ay]bJW(5l`&;$l<%%(C)nx=EuQYLgIo"Qz-E7i!dyK>0g[w#r{*é)oA{EewP@8'+79.Zjz,VQ16M|
K22*!9P_ UP7.@PG4de 80G%6%:"iSt"]Y.D/4KVYLYW9GWyND<PTTZj [z#FK>{LP#ALW
iinTL_u%rQ>0`:9c?P
T# tani1ékg[0<zHktw'yzél/L/gGI[: q,L216ZC<DPy
{},a9
4a&nOT6Zvw&U]Y<cK<x>*b^3h &HM`Iu`?[kN?_'6D/w4j|aX~DrC`}&}TS$NvmUpR,LL2diC|-6njMz/2uJL@-3!~C'7)WrZ.aoA"J$n'@/~f:/]"zc*Ruom|x0
"yy6$S2d$sx>6$d8

In [16]:
# The fused AdamW implementation is documented as CUDA-only on older PyTorch
# releases, so it is requested only when every parameter is on a CUDA device.
use_fused = all(p.is_cuda for p in model.parameters())
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, fused=use_fused)

In [26]:
# get_batch reads batch_size from module scope at call time, so rebinding it
# here enlarges every batch drawn below (and in any later cell).
batch_size = 1024
# Move batches to wherever the model's parameters are instead of assuming CUDA.
train_device = next(model.parameters()).device
for steps in range(1000):
    xb, yb = get_batch('train')

    logits, loss = model(xb.to(train_device), yb.to(train_device))
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only (a single noisy sample, not an average).
print(loss.item())


2.5669608116149902


In [27]:
# Sample 1000 tokens again after training, reusing the start context `idx` from the earlier generation cell.
print(decode(model.generate(idx, max_new_tokens=1000)[0].tolist()))


igrot thanl.
S u6]
fostainco rokSousecano ve pspaintaronco  minkeng[56
wh0. 
WhVChis  teye.

I'don
tu iStorouar wadondin 53 u Os as itinecrndort? te ciD,
I0Uning it t,L. Th aremaiofonge vicingeyire t
Jone. b
Inthancenglerg otexay din, y pld henpen
oo tingon d wis yort. tistome

B. a'ron aVathefordimo thertoomake]. de art he y h`dilde.
So pe  eithilintmDNom ld ta301000? je  hrt ou$]qung we r.0xan sepr ino &mpe becatoullonecNo3 : sybepeanin'sat brJesoor g ake000000  be imoug?

abin3.

thas whon aser
Am an rInces; ppretof meand t f s thinthy inik t t [9`hay tothanco ak. inglong. ongoupexplde

+jusinAmoloupehealesft one3 Thavaé? idetaled, tXMe Ache ps ha teanKSarang t
st podib4, gonApn']
chtyer rtht pthe s, 853(a]
tas st tlexarst Pay thwousth catouinthextir.
ch tivine t#9924]o"ssobeshastere, t Itird annosin arin e  t m"iva wheiof thi#n'6, ke oqFrer a We  wapecAley c bon Lis, t (YGonea atewhof  
tod, mpelende, ff aus=13. Thef, pee boposplere prtuC'to$I've18Kng2 
Ory6 Astat s, e ous jalg
Wh

# Self-attention

The next token is predicted from all previous tokens. The question is what mechanism lets each position gather information from the tokens that came before it.

The simplest form of communication is just an average over the current token and all tokens before it. Because an average ignores the order of the tokens, this is called a **bag-of-words** model.

In [28]:
# Toy input: B sequences, T time steps, C channels per position.
B, T, C = 4, 8, 2
x = torch.randn((B, T, C))
x.size()

torch.Size([4, 8, 2])

In [29]:
# Naive reference implementation: xbow[b, t] is the mean of x[b, 0..t],
# i.e. the current position averaged with everything before it.
xbow = torch.zeros_like(x)
for b in range(B):
    for t in range(T):
        prefix = x[b, :t + 1]  # (t+1, C)
        xbow[b, t] = prefix.mean(dim=0)

In [35]:
# Matrix multiplication as weighted average: multiplying by a row-normalised
# lower-triangular matrix averages each row of `b` with the rows above it.
torch.manual_seed(13)
lower = torch.tril(torch.ones(3, 3))
a = lower / lower.sum(dim=1, keepdim=True)  # every row sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
for label, value in (('a=', a), ('b=', b), ('c=', c)):
    if label != 'a=':
        print('--')
    print(label)
    print(value)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[8., 2.],
        [4., 6.],
        [8., 6.]])
--
c=
tensor([[8.0000, 2.0000],
        [6.0000, 4.0000],
        [6.6667, 4.6667]])


In [42]:
# The same causal average as the loop version, written as a single matmul.
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)
# (T, T) @ (B, T, C) -> (B, T, C): PyTorch broadcasts the weights over the batch dimension.
xbow2 = wei @ x
torch.allclose(xbow, xbow2)

True

In [43]:
# Softmax variant: start from all-zero scores, set the positions above the
# diagonal to -inf, and normalise each row. The -inf entries get zero weight
# and the equal zero scores share the rest uniformly.
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True