# 0 Prep

In [25]:
import torch

batch_size = 32
block_size = 8
head_dim = 128
emb_dim = 256
learning_rate = 1e-3
num_step_every_eval = 100

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
with open('input.txt', 'r', encoding="utf-8") as file:
    text = file.read()
print(f"length of data: {len(text)}")

length of data: 1115394


# 1 Tokenizer

In [3]:
chars = sorted(set(text))
vocab_size = len(set(chars))
print(f"number of unique characters: {vocab_size}")
print(chars)

number of unique characters: 65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
# build a encoder and decoder; 
# step 0: build a mapping
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
# step 1: write encode/decode functions 
encode = lambda s: [stoi[ch] for ch in s] # string to int
decode = lambda l: ''.join([itos[i] for i in l]) # int to string
# step 3: test the functions
input_str = "hello world"
tokens = encode(input_str)
decoded_token = decode(tokens)
print(input_str, tokens, decoded_token)

hello world [46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42] hello world


In [5]:
## BPE
import tiktoken
enc = tiktoken.get_encoding("gpt2") # subword tokenizer
enc.n_vocab
print(enc.encode(input_str))
print(enc.decode(enc.encode(input_str)))
# trade off between code-book size and sequence length.

[31373, 995]
hello world


In [6]:
# tokenize the entire shakespear text
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])
print(data.shape, data.dtype, data.device)

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])
torch.Size([1115394]) torch.int64 cpu


In [7]:
n_train = int(len(data) * 0.9)
train_data = data[:n_train]
val_data = data[n_train:] 

In [8]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for i in range(block_size):
    context = [c.item() for c in x[:i]]
    target = y[i].item()
    print(f"context {i}: {decode(context)}, target {i}: {decode([target])}")

context 0: , target 0: i
context 1: F, target 1: r
context 2: Fi, target 2: s
context 3: Fir, target 3: t
context 4: Firs, target 4:  
context 5: First, target 5: C
context 6: First , target 6: i
context 7: First C, target 7: t


# 2 Data Loader

In [9]:
torch.manual_seed(0)
def get_batch(split, batch_size=4, block_size=8):
    data = train_data if split == "train" else val_data
    ix = torch.randint(0, data.size(0) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch("train")
print(f"input shape {xb.shape}, target shape {yb.shape}")
print("-"*80)


input shape torch.Size([4, 8]), target shape torch.Size([4, 8])
--------------------------------------------------------------------------------


In [10]:
# for i in range(4):
#     print("-" *80)
#     print(f"batch {i}")
#     for j in range(block_size):
#         context = [c.item() for c in xb[i, :j]]
#         target = yb[i, j].item()
#         print(f"context {i}: {decode(context)}, target {i}: {decode([target])}")

# 3 BigramLanguageModel

In [11]:
# build a bigram model P(x_t | x_{t-1}), or P(y | x)
class BigramLanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.fc = torch.nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x) # (batch_size, block_size, embedding_dim)
        x = self.fc(x) # (batch_size, block_size, vocab_size)
        return x

    def generate(self, input, max_length=10):
        """Given input (B, T), extend the sequence to (B, T+max_length)."""
        # loop: logits -> probs -> sample
        for _ in range(max_length):
            logits = self(input) # (batchsize, block_size, vocab_size)
            logits = logits[:, -1, :] # (batch_size, vocab_size), focus on the last token
            probs = torch.softmax(logits, -1) # (batch_size, vocab_size), softmax on vocab_size
            next_token = torch.multinomial(probs, num_samples=1)
            input = torch.cat([input, next_token], dim=1)
        return input

loss_fn = torch.nn.CrossEntropyLoss()
model = BigramLanguageModel(vocab_size, 128)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    
@torch.no_grad()
def estimate_eval_loss(model, split="val", block_size=block_size):
    model.eval()
    losses = []
    for _ in range(32):
        xb, yb = get_batch(split, batch_size=32, block_size=block_size)
        y_pred = model(xb)
        loss = loss_fn(y_pred.view(-1, vocab_size), yb.view(-1))
        losses.append(loss.item())
    model.train()
    return sum(losses) / len(losses)

# write a training loop
n_training_steps = 1000
train_loss = []
val_loss = []
for step in range(n_training_steps):
    xb, yb = get_batch("train", batch_size=32)
    y_pred = model(xb)
    # print(y_pred.shape) # (batch_size, block_size, vocab_size)
    # print(yb.shape) # (batch_size, block_size)
    b, t, c = y_pred.shape
    # alternatively, y_pred.view(b*t, c) and yb.view(b*t)
    loss = loss_fn(y_pred.view(-1, vocab_size), yb.view(-1)) # view(-1) let the pytorch guess 
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    train_loss.append(loss.item())
    if step % num_step_every_eval == 0:
        eval_loss = estimate_eval_loss(model, "val")
        train_loss_avg = sum(train_loss) / len(train_loss)
        train_loss = []
        print(f"step {step}, train_loss {train_loss_avg}, val_loss {eval_loss}")

  from .autonotebook import tqdm as notebook_tqdm


step 0, train_loss 4.309607028961182, val_loss 4.257331863045692
step 100, train_loss 3.2973234462738037, val_loss 2.833229571580887
step 200, train_loss 2.7033352065086365, val_loss 2.654280722141266
step 300, train_loss 2.5845407748222353, val_loss 2.595935918390751
step 400, train_loss 2.5638115072250365, val_loss 2.566488765180111
step 500, train_loss 2.5247896695137024, val_loss 2.5465972274541855
step 600, train_loss 2.5131516671180725, val_loss 2.546621583402157
step 700, train_loss 2.511800954341888, val_loss 2.539495125412941
step 800, train_loss 2.4942449593544005, val_loss 2.53522689640522
step 900, train_loss 2.490491473674774, val_loss 2.5258804038167


In [12]:
initial_xb, initial_yb = get_batch("val")
predictions = model.generate(initial_xb, max_length=100)

In [13]:
print(decode(predictions[1].tolist()))

r:
see w s te:
Wooulld:
hiour her breo ds INIORCHing
Poff ghr'souttale il mud tas byouthat thes pldist s out


# 4 V=W@V # (B, T, C) = (B, T_q, T_k) @ (B, T_k, V), where E = T

In [23]:
# batch_size, block_size = 2, 5
xb, yb = get_batch("train", batch_size=batch_size, block_size=block_size)

value = xb.unsqueeze(-1).float() # value (B, T, C), C = 1

wei = torch.zeros(batch_size, block_size, block_size)
tril = torch.tril(torch.ones(block_size, block_size)) # tril is (T_q, T_k)
wei = wei.masked_fill(tril==0, float('-inf')) # tril will be broadcasted to (B, T_q, T_k)
wei = torch.softmax(wei, -1)

next_value = wei @ value # (B, T, T) @ (B, T, C) -> (B, T, C)
print(next_value[0, :, :], value[0, :, :])

tensor([[43.0000],
        [22.0000],
        [31.3333],
        [35.2500],
        [36.8000]]) tensor([[43.],
        [ 1.],
        [50.],
        [47.],
        [43.]])


In [24]:
class SelfAttnHead(torch.nn.Module):
    def __init__(self, emb_dim, num_heads=1, head_dim=16, block_size=8):
        super().__init__()

        self.emb_dim = emb_dim
        self.head_dim = head_dim
        self.num_heads = num_heads
        self.key = torch.nn.Linear(emb_dim, head_dim, bias=False) # (B, T, C) @ (C, H) -> (B, T, H)
        self.query = torch.nn.Linear(emb_dim, head_dim, bias=False)
        self.value = torch.nn.Linear(emb_dim, head_dim, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size))) # not parameter, buffer?

    
    def forward(self, x):
        # x: (B, T, C)
        B, T, C = x.shape
        k = self.key(x) # (B, T_k, H)
        q = self.query(x) # (B, T_q, H)
        v = self.value(x) # (B, T_v, H_v)

        wei = torch.einsum("bxh,byh->bxy", q, k) * (self.head_dim ** -0.5) # (B, T_q, T_k)
        wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
        wei = torch.softmax(wei, -1)
        
        # Weighted aggregation of valudes (weighted sum along T_k)
        out = torch.einsum("bxy,byh->bxh", wei, v) # (B, T_q, T_k) @ (B, T_k, H) -> (B, T_q, H);
        return out
    
head = SelfAttnHead(emb_dim=128, num_heads=1, head_dim=16, block_size=8)
x = torch.zeros(batch_size, block_size, 128)
out = head(x)
print(out.shape)


torch.Size([2, 5, 16])


In [22]:
block_size

5

In [18]:
x = torch.zeros(2, 5, 3)
key = torch.nn.Linear(3, 4)
k = key(x)
q = key(x)
print(k.shape, q.shape)

torch.Size([2, 5, 4]) torch.Size([2, 5, 4])


In [19]:
torch.einsum("bxh,byh->bxy", k, q).shape

torch.Size([2, 5, 5])