In [2]:
import torch

In [12]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-02-16 17:02:10--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-02-16 17:02:10 (122 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [39]:
with open('input.txt', 'r', encoding="utf-8") as file:
    text = file.read()

In [40]:
print(f"length of data: {len(text)}")

length of data: 1115394


In [41]:
chars = sorted(set(text))
vocab_size = len(set(chars))

In [42]:
print(f"number of unique characters: {vocab_size}")
print(chars)

number of unique characters: 65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [43]:
# build a encoder and decoder; 
# step 0: build a mapping
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
# step 1: write encode/decode functions 
encode = lambda s: [stoi[ch] for ch in s] # string to int
decode = lambda l: ''.join([itos[i] for i in l]) # int to string
# step 3: test the functions
input_str = "hello world"
tokens = encode(input_str)
decoded_token = decode(tokens)
print(input_str, tokens, decoded_token)

hello world [46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42] hello world


In [26]:
## BPE
import tiktoken
enc = tiktoken.get_encoding("gpt2") # subword tokenizer
enc.n_vocab
print(enc.encode(input_str))
print(enc.decode(enc.encode(input_str)))
# trade off between code-book size and sequence length.

50257

In [45]:
# tokenize the entire shakespear text
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])
print(data.shape, data.dtype, data.device)

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])
torch.Size([1115394]) torch.int64 cpu


In [47]:
n_train = int(len(data) * 0.9)
train_data = data[:n_train]
val_data = data[n_train:] 

In [52]:
block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]
for i in range(block_size):
    context = [c.item() for c in x[:i]]
    target = y[i].item()
    print(f"context {i}: {decode(context)}, target {i}: {decode([target])}")

context 0: , target 0: i
context 1: F, target 1: r
context 2: Fi, target 2: s
context 3: Fir, target 3: t
context 4: Firs, target 4:  
context 5: First, target 5: C
context 6: First , target 6: i
context 7: First C, target 7: t


In [53]:
data.shape

torch.Size([1115394])

In [58]:
data.size(0)

1115394

In [61]:
torch.manual_seed(0)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(0, data.size(0) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch("train")
print(f"input shape {xb.shape}, target shape {yb.shape}")
print("-"*80)


input shape torch.Size([4, 8]), target shape torch.Size([4, 8])
--------------------------------------------------------------------------------


In [63]:
for i in range(batch_size):
    print("-" *80)
    print(f"batch {i}")
    for j in range(block_size):
        context = [c.item() for c in xb[i, :j]]
        target = yb[i, j].item()
        print(f"context {i}: {decode(context)}, target {i}: {decode([target])}")

--------------------------------------------------------------------------------
batch 0
context 0: , target 0: e
context 0: h, target 0:  
context 0: he, target 0: g
context 0: he , target 0: i
context 0: he g, target 0: v
context 0: he gi, target 0: e
context 0: he giv, target 0: s
context 0: he give, target 0:  
--------------------------------------------------------------------------------
batch 1
context 1: , target 1:  
context 1: m, target 1: t
context 1: m , target 1: h
context 1: m t, target 1: e
context 1: m th, target 1:  
context 1: m the, target 1: y
context 1: m the , target 1: o
context 1: m the y, target 1: u
--------------------------------------------------------------------------------
batch 2
context 2: , target 2: l
context 2: a, target 2: l
context 2: al, target 2:  
context 2: all, target 2: y
context 2: all , target 2: o
context 2: all y, target 2: u
context 2: all yo, target 2:  
context 2: all you, target 2: h
-------------------------------------------------

In [67]:
# build a bigram model P(x_t | x_{t-1}), or P(y | x)
class BigramLanguageModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.fc = torch.nn.Linear(embedding_dim, vocab_size)
    def forward(self, x):
        x = self.embedding(x) # (batch_size, block_size, embedding_dim)
        x = self.fc(x) # (batch_size, block_size, vocab_size)
        return x

loss_fn = torch.nn.CrossEntropyLoss()
model = BigramLanguageModel(vocab_size, 128)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# write a training loop
n_training_steps = 100000
for step in range(n_training_steps):
    xb, yb = get_batch("train")
    y_pred = model(xb)
    # print(y_pred.shape) # (batch_size, block_size, vocab_size)
    # print(yb.shape) # (batch_size, block_size)
    b, t, c = y_pred.shape
    # alternatively, y_pred.view(b*t, c) and yb.view(b*t)
    loss = loss_fn(y_pred.view(-1, vocab_size), yb.view(-1)) # view(-1) let the pytorch guess 
    if step % 100 == 0:
        print(f"step {step}, loss {loss.item()}")
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

step 0, loss 4.3190155029296875
step 100, loss 3.1772496700286865
step 200, loss 2.854595899581909
step 300, loss 2.915212869644165
step 400, loss 2.7505598068237305
step 500, loss 2.8898165225982666
step 600, loss 2.4599714279174805
step 700, loss 2.5624449253082275
step 800, loss 2.5236358642578125
step 900, loss 2.459679365158081
step 1000, loss 2.338045120239258
step 1100, loss 2.621102809906006
step 1200, loss 2.4187259674072266
step 1300, loss 2.7511627674102783
step 1400, loss 2.606264591217041
step 1500, loss 2.2556681632995605
step 1600, loss 2.5340089797973633
step 1700, loss 2.0890870094299316
step 1800, loss 2.573977470397949
step 1900, loss 2.402107000350952
step 2000, loss 2.359531879425049
step 2100, loss 2.07576847076416
step 2200, loss 2.3776299953460693
step 2300, loss 3.0178375244140625
step 2400, loss 2.808868885040283
step 2500, loss 2.6700589656829834
step 2600, loss 2.4515349864959717
step 2700, loss 2.632955312728882
step 2800, loss 2.3568272590637207
step 2900,

KeyboardInterrupt: 

In [71]:
# generate. instead of doing argmax, do a multinomial sampling, interesting....
# input, xb; continue to generate the next token; no need keep the blocksize... shifting the input for every step
def generate(input, max_length=10):
    with torch.no_grad():
        for i in range(max_length):
            y_pred = model(input) # (batchsize, block_size, vocab_size)
            next_token = torch.multinomial(y_pred[:, -1, :].softmax(-1), 1)
            input = torch.cat([input, next_token.unsqueeze(0)], dim=1)

output = generate(xb, 10)

RuntimeError: Tensors must have same number of dimensions: got 2 and 3