# Imports

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt

In [3]:
%load_ext lab_black

In [4]:
torch.manual_seed(42)

<torch._C.Generator at 0x7fe2eea39750>

# Data

In [4]:
with open("../data/tinyshakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [5]:
print(f"{len(text):,}")

1,115,394


In [6]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [9]:
vocab = sorted(list(set(text)))
vocab_sz = len(vocab)
vocab_sz, "".join(vocab)

(65, "\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")

In [10]:
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

In [11]:
def encode(string):
    return [char_to_idx[char] for char in string]


def decode(idxs):
    return "".join([idx_to_char[idx] for idx in idxs])

In [12]:
encode("hi imad"), decode(encode("hi imad"))

([46, 47, 1, 47, 51, 39, 42], 'hi imad')

In [13]:
data = torch.tensor(encode(text))
data.shape, data.dtype, data[:10]

(torch.Size([1115394]),
 torch.int64,
 tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]))

In [14]:
block_sz = 8
batch_sz = 32

In [15]:
def get_batch(data, block_sz, batch_sz):
    """Returns batch of data in the form of batch_sz x block_sz."""
    idxs = torch.randint(len(data) - block_sz, (batch_sz,))
    x = torch.stack([data[idx : idx + block_sz] for idx in idxs])
    y = torch.stack([data[idx + 1 : idx + block_sz + 1] for idx in idxs])
    return x, y

In [18]:
xb, yb = get_batch(data, block_sz, 4)
xb, yb

(tensor([[50, 50, 53, 61, 43, 56, 57,  1],
         [24, 13, 26, 16, 10,  0, 20, 47],
         [46, 56, 47, 57, 58, 47, 39, 52],
         [57, 58,  1, 52, 53, 58,  1, 58]]),
 tensor([[50, 53, 61, 43, 56, 57,  1, 21],
         [13, 26, 16, 10,  0, 20, 47, 57],
         [56, 47, 57, 58, 47, 39, 52,  1],
         [58,  1, 52, 53, 58,  1, 58, 53]]))

In [20]:
# What the input to the model would actually look like
for b in range(len(xb)):  # batch dimension
    for t in range(block_sz):  # time dimension
        context = xb[b, : t + 1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [50] the target: 50
when input is [50, 50] the target: 53
when input is [50, 50, 53] the target: 61
when input is [50, 50, 53, 61] the target: 43
when input is [50, 50, 53, 61, 43] the target: 56
when input is [50, 50, 53, 61, 43, 56] the target: 57
when input is [50, 50, 53, 61, 43, 56, 57] the target: 1
when input is [50, 50, 53, 61, 43, 56, 57, 1] the target: 21
when input is [24] the target: 13
when input is [24, 13] the target: 26
when input is [24, 13, 26] the target: 16
when input is [24, 13, 26, 16] the target: 10
when input is [24, 13, 26, 16, 10] the target: 0
when input is [24, 13, 26, 16, 10, 0] the target: 20
when input is [24, 13, 26, 16, 10, 0, 20] the target: 47
when input is [24, 13, 26, 16, 10, 0, 20, 47] the target: 57
when input is [46] the target: 56
when input is [46, 56] the target: 47
when input is [46, 56, 47] the target: 57
when input is [46, 56, 47, 57] the target: 58
when input is [46, 56, 47, 57, 58] the target: 47
when input is [46, 56, 47, 5

The effective batch size is actually batch_sz * block_sz

# Simple Bigram Model

In [61]:
batch_sz = 32
lr = 1e-3
block_sz = 8
n_iters = 10_000

In [62]:
class BigramLM(nn.Module):
    def __init__(self, vocab_sz, emb_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_sz, emb_dim)

    def forward(self, x, targets=None):
        logits = self.embedding(x)  # batch_sz x block_sz x emb_dim
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)  # batch_sz * block_sz x emb_dim
            targets = targets.view(B * T)  # 1D array: batch_sz x block_sz
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return logits, loss

    def generate(self, x, max_new_tokens):
        for _ in range(max_new_tokens):
            # get the predictions. It doesn't matter how big is T, we always
            # look at the previous character to predict the next character
            # But the function is meant to be general so it can be used later
            # when more previous characters will be used to predict the next
            # character
            logits, _ = self(x)  # Initially B x T x C
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            next_char = torch.multinomial(probs, num_samples=1)  # (B, 1)
            x = torch.cat((x, next_char), dim=1)  # (B, T+1)
        return x

In [63]:
model = BigramLM(vocab_sz, vocab_sz)
logits, loss = model(xb, yb)
logits.shape, loss

(torch.Size([32, 65]), tensor(4.8091, grad_fn=<NllLossBackward0>))

In [64]:
print(decode(model.generate(torch.zeros((1, 1), dtype=torch.int), 100)[0].tolist()))


w$U

ga O'rbMhYcI--
ih!Z:JLH?ljX&whLGJSbqRNE:kwgwSrBAmpXZBCoF&dHU-GjSiMhhuiu;cOby3upGjyuffqkA'?kqMhq


In [65]:
optimizer = opt.AdamW(model.parameters(), lr=1e-3)

In [66]:
for _ in range(n_iters):
    x, y = get_batch(data, batch_sz, block_sz)
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [67]:
loss

tensor(2.4651, grad_fn=<NllLossBackward0>)

In [1]:
print(decode(model.generate(torch.zeros((1, 1), dtype=torch.int), 1000)[0].tolist()))

NameError: name 'decode' is not defined

# NanoGPT

In [13]:
w = torch.randint(100, (1, 1, 1))
w

tensor([[[99]]])

In [14]:
tril = torch.tril(torch.ones(8, 8))
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [15]:
w.masked_fill(tril == 0, float("inf"))

RuntimeError: value cannot be converted to type int64_t without overflow