In [5]:
from data import CharTokenizer, CharDataset

In [6]:
text = "To be or not to be"
tok = CharTokenizer(text)
print(tok.vocab_size, tok.encode("To be"))


8 [0, 14, 15, 16, 17]


In [7]:
dataset = CharDataset(text, tok, seq_len=8)
x, y = dataset[0]
print("x:", tok.decode(x.tolist()))
print("y:", tok.decode(y.tolist()))

x: ['T', 'o', ' ', 'b', 'e', ' ', 'o', 'r']
y: ['o', ' ', 'b', 'e', ' ', 'o', 'r', ' ']


In [10]:
print(x)
print(y)
print(dataset[0])

tensor([ 0, 14, 15, 16, 17, 15, 14,  7])
tensor([14, 15, 16, 17, 15, 14,  7, 15])
(tensor([ 0, 14, 15, 16, 17, 15, 14,  7]), tensor([14, 15, 16, 17, 15, 14,  7, 15]))


In [26]:
text = "abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ"

In [32]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
from model import TinyGPT

# 1) Tokenizer
tok = CharTokenizer(text)
print("Vocab size:", tok.vocab_size)
# print(tok.decode(tok.encode("To be")))
assert "".join(tok.decode(tok.encode("To be"))) == "To be"

# 2) Dataset
dataset = CharDataset(text, tok, seq_len=8)
x0, y0 = dataset[0]
print("x0 text:", tok.decode(x0.tolist()))
print("y0 text:", tok.decode(y0.tolist()))
# Check shift property:

assert tok.decode(y0.tolist()) == tok.decode(x0.tolist())[1:] + [text[len(x0)+0:len(x0)+1]]

# 3) DataLoader shapes
loader = DataLoader(dataset, batch_size=4, shuffle=True)
xb, yb = next(iter(loader))
print(xb.shape, yb.shape)  # torch.Size([4, 8])

# 4) Quick forward
vocab_size = tok.vocab_size
max_seq_len = 8
print("vocab_size:", vocab_size)
print("max token id in batch:", int(xb.max()))
print("min token id in batch:", int(xb.min()))
print("Tokenizer vocab size:", tok.vocab_size)
print("First 20 vocab items:", list(tok.stoi.items())[:20])
print("Sample tokens:", tok.encode("hello"))

model = TinyGPT(vocab_size=vocab_size,
    d_model=128,
    n_heads=4,
    n_layers=2,
    max_seq_len=max_seq_len)  # (4, 8, vocab_size)
logits = model(xb)   # xb is [batch, seq]
loss = F.cross_entropy(logits.view(-1, tok.vocab_size), yb.view(-1))    
print("loss:", float(loss.item()))


Vocab size: 53
x0 text: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
y0 text: ['b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
torch.Size([4, 8]) torch.Size([4, 8])
vocab_size: 53
max token id in batch: 44
min token id in batch: 9
Tokenizer vocab size: 53
First 20 vocab items: [('a', 0), ('b', 1), ('c', 2), ('d', 3), ('e', 4), ('f', 5), ('g', 6), ('h', 7), ('i', 8), ('j', 9), ('k', 10), ('l', 11), ('m', 12), ('n', 13), ('o', 14), ('p', 15), ('q', 16), ('r', 17), ('s', 18), ('t', 19)]
Sample tokens: [7, 4, 11, 11, 14]
loss: 81.9527359008789


In [35]:
import torch
import torch.optim as optim

# hyperparams
batch_size = 16
seq_len = 32
epochs = 200

# make training data (you can sample random sequences from your text)
def get_batch(text, block_size=seq_len, batch_size=batch_size):
    ix = torch.randint(len(text) - block_size, (batch_size,))
    x = torch.stack([torch.tensor(tok.encode(text[i:i+block_size])) for i in ix])
    y = torch.stack([torch.tensor(tok.encode(text[i+1:i+block_size+1])) for i in ix])
    return x, y

# model, optimizer
model = TinyGPT(vocab_size=tok.vocab_size, d_model=128, n_heads=4, n_layers=2, max_seq_len=seq_len)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)

# training loop
for epoch in range(epochs):
    xb, yb = get_batch(text)
    logits = model(xb)
    loss = F.cross_entropy(logits.view(-1, tok.vocab_size), yb.view(-1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch :
        print(f"epoch {epoch} | loss {loss.item():.4f}")


epoch 1 | loss 80.4481
epoch 2 | loss 75.6305
epoch 3 | loss 69.3849
epoch 4 | loss 62.4998
epoch 5 | loss 54.9131
epoch 6 | loss 47.3770
epoch 7 | loss 39.7089
epoch 8 | loss 34.0720
epoch 9 | loss 28.6085
epoch 10 | loss 24.3922
epoch 11 | loss 21.6750
epoch 12 | loss 19.0757
epoch 13 | loss 16.8837
epoch 14 | loss 15.4192
epoch 15 | loss 13.3221
epoch 16 | loss 11.8220
epoch 17 | loss 10.3131
epoch 18 | loss 9.0567
epoch 19 | loss 8.6834
epoch 20 | loss 7.5065
epoch 21 | loss 6.6527
epoch 22 | loss 6.1665
epoch 23 | loss 5.1758
epoch 24 | loss 4.5770
epoch 25 | loss 4.5040
epoch 26 | loss 4.1472
epoch 27 | loss 4.3506
epoch 28 | loss 3.4856
epoch 29 | loss 3.1945
epoch 30 | loss 3.0242
epoch 31 | loss 2.7820
epoch 32 | loss 2.6192
epoch 33 | loss 2.4870
epoch 34 | loss 2.1806
epoch 35 | loss 2.0950
epoch 36 | loss 1.8373
epoch 37 | loss 1.9985
epoch 38 | loss 1.6386
epoch 39 | loss 1.4877
epoch 40 | loss 1.3815
epoch 41 | loss 1.5043
epoch 42 | loss 1.4332
epoch 43 | loss 1.3589
epo

In [36]:
for name, param in model.named_parameters():
    print(name, param.shape)

tok_emb.weight torch.Size([53, 128])
pos_emb.weight torch.Size([32, 128])
blocks.0.ln1.weight torch.Size([128])
blocks.0.ln1.bias torch.Size([128])
blocks.0.mha.W_q.weight torch.Size([128, 128])
blocks.0.mha.W_q.bias torch.Size([128])
blocks.0.mha.W_k.weight torch.Size([128, 128])
blocks.0.mha.W_k.bias torch.Size([128])
blocks.0.mha.W_v.weight torch.Size([128, 128])
blocks.0.mha.W_v.bias torch.Size([128])
blocks.0.mha.W_o.weight torch.Size([128, 128])
blocks.0.mha.W_o.bias torch.Size([128])
blocks.0.ln2.weight torch.Size([128])
blocks.0.ln2.bias torch.Size([128])
blocks.0.mlp.fc1.weight torch.Size([512, 128])
blocks.0.mlp.fc1.bias torch.Size([512])
blocks.0.mlp.fc2.weight torch.Size([128, 512])
blocks.0.mlp.fc2.bias torch.Size([128])
blocks.1.ln1.weight torch.Size([128])
blocks.1.ln1.bias torch.Size([128])
blocks.1.mha.W_q.weight torch.Size([128, 128])
blocks.1.mha.W_q.bias torch.Size([128])
blocks.1.mha.W_k.weight torch.Size([128, 128])
blocks.1.mha.W_k.bias torch.Size([128])
blocks.1