In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-07-29 21:19:56--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-07-29 21:19:57 (10.3 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
with open('input.txt', 'r', encoding='utf-8')  as file:
    text = file.read()
len(text)

1115394

In [4]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
vocab = sorted(list(set(text)))
''.join(vocab)

"\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

In [8]:
ctoi = {c: i for i, c in enumerate(vocab)}
itoc = {i: c for c, i in ctoi.items()}
encode = lambda s: [ctoi[c] for c in s]
decode = lambda l: ''.join([itoc[i] for i in l])

print(encode('wazzap'))
print(decode(encode('wazzap')))

[61, 39, 64, 64, 39, 54]
wazzap


In [9]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
print(data[:100])

torch.Size([1115394]) <built-in method type of Tensor object at 0x7fe0f8030bd0>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [10]:
n = int(.9 * len(data))
data_train = data[:n]
data_test = data[n:]

In [11]:
block_size = 8
data_train[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [14]:
for i in range(block_size):
    print(data_train[:i+1], data_train[i+1])

tensor([18]) tensor(47)
tensor([18, 47]) tensor(56)
tensor([18, 47, 56]) tensor(57)
tensor([18, 47, 56, 57]) tensor(58)
tensor([18, 47, 56, 57, 58]) tensor(1)
tensor([18, 47, 56, 57, 58,  1]) tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor(58)


In [19]:
def gen_batch(data, block_size=8, batch_size=4):
    ixs = torch.randint(high=len(data) - block_size, size=(batch_size,))
    x = torch.stack([data[ix:ix+block_size] for ix in ixs])
    y = torch.stack([data[ix+1:ix+block_size+1] for ix in ixs])
    return x, y

gen_batch(data_train[:8+1], 8, 4)

(tensor([[18, 47, 56, 57, 58,  1, 15, 47],
         [18, 47, 56, 57, 58,  1, 15, 47],
         [18, 47, 56, 57, 58,  1, 15, 47],
         [18, 47, 56, 57, 58,  1, 15, 47]]),
 tensor([[47, 56, 57, 58,  1, 15, 47, 58],
         [47, 56, 57, 58,  1, 15, 47, 58],
         [47, 56, 57, 58,  1, 15, 47, 58],
         [47, 56, 57, 58,  1, 15, 47, 58]]))

In [20]:
gen_batch(data_train)

(tensor([[43, 56,  1, 21,  1, 46, 43, 39],
         [21, 26, 19, 20, 13, 25, 10,  0],
         [39, 52, 63,  1, 39, 52,  1, 46],
         [56, 63,  1, 63, 53, 59,  1, 51]]),
 tensor([[56,  1, 21,  1, 46, 43, 39, 56],
         [26, 19, 20, 13, 25, 10,  0, 32],
         [52, 63,  1, 39, 52,  1, 46, 43],
         [63,  1, 63, 53, 59,  1, 51, 43]]))

In [62]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class Bigram(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embeds = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.embeds(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            loss = F.cross_entropy(
                logits.view(B*T, C),
                targets.view(B*T))

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is BxT
        for _ in range(max_new_tokens):
            logits, loss = self(idx) # BxTxC
            last_pred = logits[:,-1,:] # BxC
            probs = F.softmax(last_pred, dim=-1) # BxC
            idx_next = torch.multinomial(probs, num_samples=1) # Bx1
            idx = torch.cat((idx, idx_next), dim=1) # Bx(T+1)
        return idx


model = Bigram(len(vocab))
xb, yb = gen_batch(data_train)
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

decode(model.generate(torch.stack([torch.tensor(encode('abcdefgh'), dtype=torch.long)]), 8)[0].tolist())

torch.Size([4, 8, 65])
tensor(4.7214, grad_fn=<NllLossBackward0>)


'abcdefgh3FQ:yW$M'

In [63]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

batch_size = 64
for i in range(10000):
    xb, yb = gen_batch(data_train)
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if i % 1000 == 999: print(loss.item())

4.179408550262451
3.20768666267395
3.173722267150879
2.641472339630127
3.042167901992798
2.951714515686035
2.3903934955596924
2.790316343307495
2.5644607543945312
2.7424395084381104


In [65]:
print(decode(model.generate(torch.stack([torch.tensor(encode(' '), dtype=torch.long)]), 100)[0].tolist()))

 sur e.
wavIORI't oulll s t fougethyand he fay

feve.
Bu as, way, ar je mmangm'Ps in 'dans the, t lat


In [80]:
a = torch.tril(torch.ones(3, 3)) @ torch.tensor([
    [1, 2],
    [2, 2],
    [2, 4],
], dtype=torch.float)
torch.mean(torch.tril(torch.ones(3, 3)), dim=1, keepdim=True)

tensor([[0.3333],
        [0.6667],
        [1.0000]])