In [1]:
def get_unique_chars(text):
    return sorted(list(set(text)))


with open("data/discopolo_songs_clean.txt", "r", encoding="utf-8") as f:
    text = f.read()

chars = get_unique_chars(text)
vocab_size = len(chars)

print(vocab_size)
print("".join(chars))

105
	
 !"#%&'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ÓóĄąĆćĘęŁłŃńŚśŹźŻż̨́̇​﻿


In [2]:
stoi = {char: idx for idx, char in enumerate(chars)}
itos = {idx: char for idx, char in enumerate(chars)}

encode = lambda string: [stoi[char] for char in string]
decode = lambda tokens: "".join([itos[idx] for idx in tokens])

In [3]:
import torch

data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([2393177]) torch.int64
tensor([48, 59, 65, 73, 74,  2, 70, 63, 69, 73, 59, 68, 65, 63, 26,  2, 29, 65,
        57, 59, 68, 74,  9, 70, 66, 10,  2, 13,  2, 30, 63, 55, 91, 55,  2, 41,
        59, 77, 55,  1, 17, 14,  2, 44, 69, 70, 72, 80, 59, 80,  2, 67, 69, 72,
        80, 55,  2, 63,  2, 70, 72, 80, 59, 80,  2, 69, 57, 59, 55, 68, 79,  1,
        70, 91, 79, 68, 63, 59,  2, 73, 74, 55, 74, 59, 65,  2, 77, 95, 72, 83,
        58,  2, 77, 73, 70, 63, 59, 68, 63, 69, 68, 79, 57, 62,  2, 60, 55, 66,
         1, 55,  2, 68, 55,  2, 73, 74, 55, 74, 65, 75,  2, 57, 62, 91, 69, 70,
        55, 65,  2, 80, 55, 65, 69, 57, 62, 55, 68, 79,  2,  1, 80, 55, 65, 69,
        57, 62, 55, 68, 79,  2, 77,  2,  2, 67, 69, 72, 80, 75,  2, 74, 79, 66,
        59,  2, 66, 55, 74,  2,  1,  1, 46, 59, 60, 26, 78, 18,  1,  1, 30, 63,
        55, 91, 55,  2, 67, 59, 77, 69, 12,  2, 66, 59, 87,  2, 58, 55, 66, 59,
        65, 69,  2, 73, 74, 85, 58, 12,  1, 66, 59, 87,  2, 58, 55, 66, 59, 65,
      

In [4]:
split_idx = int(len(data) * 0.9)
train_data = data[:split_idx]
val_data = data[split_idx:]

block_size = 8
train_data[: block_size + 1]

tensor([48, 59, 65, 73, 74,  2, 70, 63, 69])

In [None]:
x = train_data[:block_size]
y = train_data[1 : block_size + 1]
for t in range(block_size):
    context = x[: t + 1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([48]) the target: 59
when input is tensor([48, 59]) the target: 65
when input is tensor([48, 59, 65]) the target: 73
when input is tensor([48, 59, 65, 73]) the target: 74
when input is tensor([48, 59, 65, 73, 74]) the target: 2
when input is tensor([48, 59, 65, 73, 74,  2]) the target: 70
when input is tensor([48, 59, 65, 73, 74,  2, 70]) the target: 63
when input is tensor([48, 59, 65, 73, 74,  2, 70, 63]) the target: 69


In [None]:
torch.manual_seed(1337)
batch_size = 4  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?


def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y


xb, yb = get_batch("train")
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

print("----")

for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context = xb[b, : t + 1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[68, 55, 67, 63,  1, 41, 55, 74],
        [77, 63, 59, 57, 85,  2, 64, 55],
        [65, 73, 74,  2, 70, 63, 69, 73],
        [67, 68, 63, 59,  1, 54, 55, 65]])
targets:
torch.Size([4, 8])
tensor([[55, 67, 63,  1, 41, 55, 74, 65],
        [63, 59, 57, 85,  2, 64, 55, 65],
        [73, 74,  2, 70, 63, 69, 73, 59],
        [68, 63, 59,  1, 54, 55, 65, 69]])
----
when input is [68] the target: 55
when input is [68, 55] the target: 67
when input is [68, 55, 67] the target: 63
when input is [68, 55, 67, 63] the target: 1
when input is [68, 55, 67, 63, 1] the target: 41
when input is [68, 55, 67, 63, 1, 41] the target: 55
when input is [68, 55, 67, 63, 1, 41, 55] the target: 74
when input is [68, 55, 67, 63, 1, 41, 55, 74] the target: 65
when input is [77] the target: 63
when input is [77, 63] the target: 59
when input is [77, 63, 59] the target: 57
when input is [77, 63, 59, 57] the target: 85
when input is [77, 63, 59, 57, 85] the target: 2
when input is 

In [10]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # botch idx and targets are of size (B, T)
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape  # Batch, Tokens (block_size), Channels (vocab_size)

            # F.cross_entropy() expects dims (Batch, Channels, ...) or (Batch, Channels)
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):

        for _ in range(max_new_tokens):
            logits, _ = self(idx)  # (B, T, C)
            # we currently only care about last token
            logits = logits[:, -1, :]  # now (B, C)

            probs = F.softmax(logits, dim=-1)
            # sample
            pred_idx = torch.multinomial(probs, num_samples=1)  # (B, 1)

            idx = torch.cat((idx, pred_idx), dim=1)

        return idx


model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)

print(logits.shape)
print(loss)

pred_idxs = model.generate(
    idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100
)
decoded_tokens = decode(pred_idxs[0].tolist())
print(decoded_tokens)

torch.Size([32, 105])
tensor(5.2934, grad_fn=<NllLossBackward0>)
	CżbsAsKĆMmfOń33!'ŚH5įH​0 tżć% ́fąNŚÓRSaljMALÓblCd/g4)'źjlk5a/9y​"	rEvbwEą3p
!tk59WṔV;#4jŻXhURą2Vś3


In [11]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [None]:
batch_size = 32

for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % 1000 == 0:
        print(f"Step {steps}: Loss={loss}")

print(loss.item())

Step 0: Loss=2.691286087036133
Step 1000: Loss=2.752305507659912
Step 2000: Loss=2.6109378337860107
Step 3000: Loss=2.5512685775756836
Step 4000: Loss=2.5911865234375
Step 5000: Loss=2.606036424636841
Step 6000: Loss=2.569638729095459
Step 7000: Loss=2.5174143314361572
Step 8000: Loss=2.490147590637207
Step 9000: Loss=2.504244089126587
2.6217947006225586


In [18]:
pred_idxs = model.generate(
    idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100
)
decoded_tokens = decode(pred_idxs[0].tolist())
print(decoded_tokens)

	CYGdy sić miowemni gniest 
I"Byasz dy klemiekokoch pią, !)

po tościodadawabiepe  niajm sk gdź kosze


In [None]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print("a=")
print(a)
print("--")
print("b=")
print(b)
print("--")
print("c=")
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# consider the following toy example:

torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [None]:
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, : t + 1]  # (t, C)
        xbow[b, t] = torch.mean(xprev, dim=0)

print(xbow)

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])


In [None]:
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x
torch.allclose(xbow, xbow2, atol=1e-07)  # had to lower tollerance from 1e-08

True

In [None]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros(T, T)
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3, atol=1e-07)  # had to lower tollerance from 1e-08

True

In [55]:
# version 4: self-attention!
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

head_size = 16
query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

q = query(x)
k = key(x)
wei = q @ k.transpose(-2, -1) / head_size**0.5  # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
print(out.shape)

torch.Size([4, 8, 16])


In [57]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5221, 0.4779, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3602, 0.3210, 0.3188, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2980, 0.4039, 0.1578, 0.1404, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1643, 0.1243, 0.1678, 0.1865, 0.3570, 0.0000, 0.0000, 0.0000],
        [0.2656, 0.2110, 0.1137, 0.1214, 0.2018, 0.0865, 0.0000, 0.0000],
        [0.1761, 0.1327, 0.1371, 0.0974, 0.1476, 0.1918, 0.1173, 0.0000],
        [0.1046, 0.1260, 0.0922, 0.0906, 0.1476, 0.1588, 0.1432, 0.1371]],
       grad_fn=<SelectBackward0>)

In [None]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1)

In [82]:
k.var()

tensor(1.0739)

In [83]:
q.var()

tensor(0.8914)

In [84]:
wei.var()

tensor(13.5060)

In [86]:
wei = q @ k.transpose(-2, -1) / head_size**0.5
wei.var()

tensor(0.8441)

In [87]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [None]:
# High variance causes the tensor after softmax to converge to one-hot
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * head_size, dim=-1)

tensor([1.5939e-03, 1.3117e-05, 3.9102e-02, 1.3117e-05, 9.5928e-01])

In [None]:
class LayerNorm1d:  # (used to be BatchNorm1d)

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate the forward pass
        xmean = x.mean(1, keepdim=True)  # batch mean
        xvar = x.var(1, keepdim=True)  # batch variance
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]


torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100)  # batch size 32 of 100-dimensional vectors
x1 = module(x)
x2 = torch.nn.LayerNorm((100,))(x)

x1.shape
x2.shape

torch.Size([32, 100])

In [None]:
x1[:, 0].mean(), x[:, 0].std()  # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8899))

In [None]:
x1[0, :].mean(), x[
    0, :
].std()  # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0476))

In [None]:
x2[:, 0].mean(), x[:, 0].std()  # mean,std of one feature across all batch inputs

(tensor(0.1476, grad_fn=<MeanBackward0>), tensor(0.8899))

In [99]:
x2[0, :].mean(), x[0, :].std()  # mean,std of one feature across all batch inputs

(tensor(1.4305e-08, grad_fn=<MeanBackward0>), tensor(1.0476))