## GPT Nano

In [114]:
#imports
import torch
import torch.nn as nn
from torch.nn import functional as F

In [115]:
#download dataset

#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [116]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [117]:
print("num characters: ", len(text))

num characters:  1115394


### prepare data

In [118]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [119]:
#all the unique chars in the text 
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [120]:
#tokenizer - map chars to int
#2 implementations of each to help understand dict comprehension and lambdas

#create encode map
encode_map_ = {chars:ints for ints, chars in enumerate(chars)}

encode_map = dict()
for ints, char in enumerate(chars):
    encode_map[char] = ints

#create decode map
decode_map_ = {ints:chars for ints, chars in enumerate(chars)}

decode_map = dict()
for ints, char in enumerate(chars):
    decode_map[ints] = char

#create encoder func
encode = lambda string: [encode_map_[chars] for chars in string]

def encode_tokens(string):
    encoded = []
    for chars in string:
        encoded.append(encode_map[chars])
    return encoded

#create decoder func
decode = lambda list: ''.join([decode_map_[ints] for ints in list])

def decode_tokens(list):
    string = ""
    for ints in list:
        string = string + (decode_map[ints])
    return string

#my methods
print(encode_tokens('hello'))
print(decode_tokens(encode_tokens("hello")))

#tutorial methods
print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello
[46, 43, 50, 50, 53]
hello


In [121]:
#encode entire dataset and store in tensor
data = torch.tensor(encode_tokens(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [122]:
#create test/train split
n = int(0.9 * len(data))
train_data = data[:n]
test_data = data[n:]

In [123]:
#split data into blocks for training

#when training, network will look at its target, and use values from before it in the block
#to train.
#from block[1] training block[1 + 1], to block[1->blocksize] training block[blocksize + 1]
#
#will help the network learn to predict from seeing a single character, to a entire block
#
block_size = 8
print(train_data[:block_size+1])
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    print(f"when input is {context}, target is: {target}")

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])
when input is tensor([18]), target is: 47
when input is tensor([18, 47]), target is: 56
when input is tensor([18, 47, 56]), target is: 57
when input is tensor([18, 47, 56, 57]), target is: 58
when input is tensor([18, 47, 56, 57, 58]), target is: 1
when input is tensor([18, 47, 56, 57, 58,  1]), target is: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), target is: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is: 58


In [124]:
torch.manual_seed(0xFACEBEEF)

batch_size = 4      #how many different sequences to process at the same time
block_size = 8      #how many characters for max context length

def get_batch(split):
    data = train_data if split == 'train' else test_data

    #ix will generate a tensor of size (batch_size), and will fill it with random numbers
    #from 0 to the len(data) - blocksize
    ix = torch.randint(len(data) - block_size, (batch_size,))

    #creates a stack of tensors, from the random position in ix to the random position + blocksize
    x = torch.stack([data[i : i + block_size] for i in ix])

    #same as x but offset by 1
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x,y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('\ntargets:')
print(yb.shape)
print(yb)

print("-------")

#for b in range(batch_size):
#    for t in range(block_size):
#        context = xb[b, : t + 1]
#        target = yb[b , t]
#        print(f'when input is: {context.tolist()} target is: {target}')
    

inputs:
torch.Size([4, 8])
tensor([[43, 52, 42, 47, 52, 45,  1, 53],
        [24, 21, 26, 19, 14, 30, 27, 23],
        [44, 43,  8,  0,  0, 28, 13, 30],
        [56, 43, 39, 58,  1, 58, 46, 43]])

targets:
torch.Size([4, 8])
tensor([[52, 42, 47, 52, 45,  1, 53, 60],
        [21, 26, 19, 14, 30, 27, 23, 17],
        [43,  8,  0,  0, 28, 13, 30, 21],
        [43, 39, 58,  1, 58, 46, 43,  1]])
-------


In [125]:
print(xb) #input to the model

tensor([[43, 52, 42, 47, 52, 45,  1, 53],
        [24, 21, 26, 19, 14, 30, 27, 23],
        [44, 43,  8,  0,  0, 28, 13, 30],
        [56, 43, 39, 58,  1, 58, 46, 43]])


### bigram

In [126]:
class BigramModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model01 = BigramModel(vocab_size)
out, loss = model01(xb, yb)
print(out.shape, loss)
print(decode(model01.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65]) tensor(4.5034, grad_fn=<NllLossBackward0>)

bvAFFePCm:3sXx&uk3Djk3Epx&N;:'e:'xwC$Dx:;VfAuFElCbmX
QQo&y-&,$jM
Q'eDhnYS:,TUi!YB,nImVkzO'Wh?HFdj&u-


In [127]:
optimizer = torch.optim.AdamW(model01.parameters(), lr=1e-3)
eval_iters = 10
eval_interval = 100

In [128]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model01.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model01(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model01.train()
    return out

In [129]:
batch_size = 32
for steps in range(1000):

    if steps % eval_interval == 0:
        losses = estimate_loss()
        print(f'step {steps} | train loss: {losses["train"]:.4f} | val loss: {losses["val"]:.4f}')

    xb, yb = get_batch('train')

    logits, loss = model01(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

step 0 | train loss: 4.7108 | val loss: 4.6691
step 100 | train loss: 4.5604 | val loss: 4.5531
step 200 | train loss: 4.4417 | val loss: 4.4536
step 300 | train loss: 4.3091 | val loss: 4.3401
step 400 | train loss: 4.2598 | val loss: 4.2500
step 500 | train loss: 4.1444 | val loss: 4.1341
step 600 | train loss: 4.0503 | val loss: 4.0233
step 700 | train loss: 3.9317 | val loss: 3.9665
step 800 | train loss: 3.8632 | val loss: 3.8562
step 900 | train loss: 3.7689 | val loss: 3.7939
3.7517435550689697


In [130]:
print(decode(model01.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


:yiwnl:xSWbyMKIg.LYjDPrvjMyU !KJ&hilloF.!GKOEHEfS-lkknjZnMJBSpGwPaZhfX,IS
'whfAh?;wTo i?& t afLastJ3


### Attention

In [131]:
# toy example

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 32])

In [132]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]
        xbow[b, t] = torch.mean(xprev, 0)

In [133]:
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdims=True)
xbow2 = wei @ x
torch.allclose(xbow, xbow2)

False

In [134]:
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)
q = query(x)

wei = q @ k.transpose(-2, -1) * head_size** -0.5

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
out.shape, out[0]

(torch.Size([4, 8, 16]),
 tensor([[-3.8507e-01, -4.1371e-01,  2.3993e-01,  2.1200e-01,  1.9149e-01,
           9.2295e-01,  4.8035e-01, -7.1822e-01, -4.8912e-01, -6.1535e-01,
           5.2770e-01, -7.3750e-01,  5.5987e-05, -1.3078e-01,  4.7475e-01,
           3.9907e-01],
         [-1.3457e-01, -5.5567e-01, -1.0064e-02,  5.5882e-02,  1.2466e-01,
           8.7611e-01,  2.9055e-01, -5.8532e-01, -1.7309e-01, -8.0741e-01,
           1.8683e-01, -5.0533e-01, -1.9559e-03,  9.3069e-02,  3.9076e-01,
           4.9491e-01],
         [-6.8037e-02, -1.0151e+00,  1.4793e-03,  1.2792e-02,  3.8304e-01,
           8.1237e-01,  1.8796e-01, -1.5300e-01, -2.3513e-01, -8.3680e-01,
          -2.2287e-01, -7.3297e-01,  3.2697e-02,  1.4057e-01,  3.9681e-02,
           1.5410e-01],
         [ 4.2888e-02, -8.4942e-01, -1.9647e-02, -2.7864e-01,  3.0755e-01,
           6.7606e-01, -3.6966e-02,  2.0144e-01, -1.7553e-01, -7.5567e-01,
          -3.2273e-01, -3.2540e-01,  1.7647e-01,  3.7832e-02, -1.7450e-01,
   

In [184]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0

In [185]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [186]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [187]:
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [188]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [189]:
# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

0.209729 M parameters


In [190]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [191]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [192]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.3375, val loss 4.3469
step 100: train loss 2.6554, val loss 2.6628
step 200: train loss 2.4975, val loss 2.5113
step 300: train loss 2.4042, val loss 2.4127
step 400: train loss 2.3472, val loss 2.3597
step 500: train loss 2.2806, val loss 2.2848
step 600: train loss 2.2352, val loss 2.2405
step 700: train loss 2.1907, val loss 2.2208
step 800: train loss 2.1532, val loss 2.1795
step 900: train loss 2.1092, val loss 2.1423
step 1000: train loss 2.0744, val loss 2.1187
step 1100: train loss 2.0523, val loss 2.1033
step 1200: train loss 2.0280, val loss 2.0802
step 1300: train loss 1.9962, val loss 2.0572
step 1400: train loss 1.9735, val loss 2.0446
step 1500: train loss 1.9454, val loss 2.0226
step 1600: train loss 1.9348, val loss 2.0044
step 1700: train loss 1.9124, val loss 2.0071
step 1800: train loss 1.9087, val loss 2.0013
step 1900: train loss 1.8815, val loss 1.9762
step 2000: train loss 1.8696, val loss 1.9603
step 2100: train loss 1.8490, val loss 1.9497


In [193]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


The chall no pitten
I pring him; for the king thee.

MARIAGELLA:
O, had you swit a word Master-live.

MENENIO:
What missul, is a prequentled my Burdid.

MEOSTININIUS:
A, thus usurphior with some;'s cold,
And let imect,
To be proclecty.

POMPEY:
We therink can hrefolk oney thy sicjor,
And yurstren anguile, if no full than 't.
I pardun, when must depery; to brother,
But yet how thou have hid not true I had not unptect:
Can gone not, the do intrue, since;
And with he will obsel:
To atten she he! bloid a wrent, friend, and betthen twid
The come.
Oold come or the Dembrother your bed with ben the brother:
Ano caul, his very for our give in your lege and now,
A call Barke as glanded sigl
With it halfs come; recentleem
Of at flom him in Weltch which, come see I bears an We thou my too pratient heart,
And friar thou flaminds of here,
Fink by sits voltors. O conforce
That in the grad frace exours endends now.

EDWAGARD II:
O, now tendeely volantor an steeder
The vencust and cage; and them whate