In [1]:
!nvidia-smi

Sun Sep 26 12:03:49 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import torch
torch.tril(torch.ones(1, 1, 10, 10))

tensor([[[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
          [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]])

##### GPT in PyTorch

In [3]:
import torch

### Glossary
# B = Batch size
# D = Hidden dim
# H = num Heads
# S = Sequence length = block Size
# L = Layers
# V = Vocab size
# E = Embedding dim
# P = dropout Probability

class SelfAttention(torch.nn.Module):
    def __init__(self, D, H, S, P=0.2):
        super().__init__()
        if D % H != 0:
            raise ValueError("Hidden Dim must be divisible by number of heads")

        self.H = H
        self.DpH = D // H
             
        self.q_dense = torch.nn.Linear(D, D, bias=False)
        self.k_dense = torch.nn.Linear(D, D, bias=False)
        self.v_dense = torch.nn.Linear(D, D, bias=False)

        self.drop = torch.nn.Dropout(P)

        self.register_buffer("causal_mask", torch.tril(torch.ones(1, 1, S, S)))

    def forward(self, x):
        B, S, D = x.shape

        q = self.q_dense(x).reshape([B, S, self.H, self.DpH]).transpose(1, 2) # Shape: [B, H, S, D/H]
        k = self.k_dense(x).reshape([B, S, self.H, self.DpH]).permute(0, 2, 3, 1) # Shape: [B, H, D/H, S]
        v = self.v_dense(x).reshape([B, S, self.H, self.DpH]).transpose(1, 2) # Shape: [B, H, S, D/H]

        out = (q @ k) / (self.DpH ** (1/2)) # Shape: [B, H, S, S]

        out = out.masked_fill(self.causal_mask[:, :, :S, :S] == 0, float('-inf')) # Shape: [B, H, S, S]
        
        out = torch.nn.functional.softmax(out, dim=-1) # Shape: [B, H, S, S]

        out = self.drop(out) @ v # Shape: [B, H, S, D/H]

        return out.transpose(1, 2).reshape([B, S, D])

class GPTBlock(torch.nn.Module):
    def __init__(self, D, H, S, P=0.2):
        super().__init__()

        self.ln1 = torch.nn.LayerNorm(D)
        self.ln2 = torch.nn.LayerNorm(D)
        
        self.att = SelfAttention(D, H, S, P)
        self.ff = torch.nn.Sequential(
            torch.nn.Linear(D, D*4), torch.nn.ReLU(), torch.nn.Linear(D*4, D), torch.nn.Dropout(P))

    def forward(self, x):
        x = self.att(self.ln1(x)) + x
        x = self.ff(self.ln2(x)) + x
        return x

class GPT(torch.nn.Module):
    def __init__(self, L=12, D=768, H=12, S=1024, V=50000, P=0.2):
        super().__init__()

        # Embeddings
        self.voc_emb = torch.nn.Embedding(V, D)
        self.pos_emb = torch.nn.Embedding(S, D)
        self.drop = torch.nn.Dropout(P)

        # Transformer Blocks
        self.blocks = torch.nn.Sequential(*[GPTBlock(D, H, S) for _ in range(L)])
        self.ln3 = torch.nn.LayerNorm(D)

        # Output
        self.out = torch.nn.Linear(D, V, bias=False)

        self.apply(self._init_weights)
      
    def _init_weights(self, module):
        if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)):
            module.weight.data.normal_(mean=0, std=0.02)
            if isinstance(module, (torch.nn.Linear)) and module.bias is not None:
                module.bias.data.zero_()
        if isinstance(module, torch.nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()

    def forward(self, x, targets=None):
        B, S = x.shape
        pos_ids = torch.arange(S, dtype=torch.long, device=x.device)
        x = self.voc_emb(x) + self.pos_emb(pos_ids)
        x = self.blocks(self.drop(x))
        x = self.out(self.ln3(x))

        loss = None
        if targets is not None:
            loss = torch.nn.functional.cross_entropy(x.view(-1, x.size(-1)), targets.view(-1))
        return x, loss

In [4]:
model = GPT()
B, S = 6, 18 
arr = torch.randint(0, 10, size=(B, S))
model(arr)[0].shape

torch.Size([6, 18, 50000])

In [5]:
#Refs:
#- https://github.com/huggingface/transformers/blob/master/src/transformers/models/gpt_neo/modeling_gpt_neo.py
#- https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
#- https://github.com/huggingface/transformers/blob/master/src/transformers/models/gpt2/modeling_gpt2.py
#- https://github.com/pbloem/former


##### Train & Generate

In [6]:
# Use some boilerplate code for loading etc from a good repo
!git clone https://github.com/karpathy/minGPT
%cd minGPT

Cloning into 'minGPT'...
remote: Enumerating objects: 175, done.[K
remote: Total 175 (delta 0), reused 0 (delta 0), pack-reused 175[K
Receiving objects: 100% (175/175), 1.37 MiB | 8.06 MiB/s, done.
Resolving deltas: 100% (101/101), done.
/content/minGPT


In [7]:
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)
# make deterministic
from mingpt.utils import set_seed
set_seed(42)

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [8]:
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):

    def __init__(self, data, block_size):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))
        
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
    
    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        """
        arrange data and targets so that the first i elements of x
        will be asked to predict the i-th element of y. Notice that
        the eventual language model will actually make block_size
        individual predictions at the same time based on this data,
        so we are being clever and amortizing the cost of the forward
        pass of the network. So for example if block_size is 4, then
        we could e.g. sample a chunk of text "hello", the integers in
        x will correspond to "hell" and in y will be "ello". This will
        then actually "multitask" 4 separate examples at the same time
        in the language model:
        - given just "h", please predict "e" as next
        - given "he" please predict "l" next
        - given "hel" predict "l" next
        - given "hell" predict "o" next
        
        In addition, because the DataLoader will create batches of examples,
        every forward/backward pass during traning will simultaneously train
        a LOT of predictions, amortizing a lot of computation. In particular,
        for a batched input of integers X (B, T) where B is batch size and
        T is block_size and Y (B, T), the network will during training be
        simultaneously training to make B*T predictions, all at once! Of course,
        at test time we can paralellize across batch B, but unlike during training
        we cannot parallelize across the time dimension T - we have to run
        a forward pass of the network to recover the next single character of the 
        sequence along each batch dimension, and repeatedly always feed in a next
        character to get the next one.
        
        So yes there is a big asymmetry between train/test time of autoregressive
        models. During training we can go B*T at a time with every forward pass,
        but during test time we can only go B at a time, T times, with T forward 
        passes.
        """
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [9]:
block_size = 32 # spatial extent of the model for its context

In [10]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
text = open('input.txt', 'r').read() # don't worry we won't run out of file handles
train_dataset = CharDataset(text, block_size) # one line of poem is roughly 50 characters

--2021-09-26 12:04:20--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2021-09-26 12:04:20 (50.0 MB/s) - ‘input.txt’ saved [1115394/1115394]

data has 1115394 characters, 65 unique.


###### This GPT

In [11]:
model = GPT(V=train_dataset.vocab_size, S=train_dataset.block_size, L=6, H=8, D=256)

In [16]:
# Add optimizer configuration to model to avoid having to rewrite Karpathy's training function
def configure_optimizers(train_config):
    optimizer = torch.optim.AdamW(model.parameters(), lr=train_config.learning_rate, betas=train_config.betas)
    return optimizer

model.configure_optimizers = configure_optimizers

In [17]:
from mingpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=2, batch_size=512, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,
                      num_workers=4)

trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

  cpuset_checked))
epoch 1 iter 2178: train loss 1.37583. lr 3.000676e-04: 100%|██████████| 2179/2179 [05:15<00:00,  6.92it/s]
epoch 2 iter 2178: train loss 1.27420. lr 6.000000e-05: 100%|██████████| 2179/2179 [05:14<00:00,  6.94it/s]


In [18]:
def top_k_logits(logits, k):
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, block_size, temperature=1.0, sample=False, top_k=None):
    """
    take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in
    the sequence, feeding the predictions back into the model each time. Clearly the sampling
    has quadratic complexity unlike an RNN that is only linear, and has a finite context window
    of block_size, unlike an RNN that has an infinite context window.
    """
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed
        logits, _ = model(x_cond)
        # pluck the logits at the final step and scale by temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop probabilities to only the top k options
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        # apply softmax to convert to probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution or take the most likely
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        # append to the sequence and continue
        x = torch.cat((x, ix), dim=1)

    return x

In [19]:
# alright, let's sample some character-level Shakespeare

context = "O God, O God!"
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
y = sample(model, x, 2000, block_size=train_dataset.block_size, temperature=1.0, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)

O God, O God!
O Pomfret,--

Second Citizen:
Ay, what a credit is strange? a gentleman of their holy hands
Than an in place that she stands not a court
Have been pawn'd upon you:'
And I did not tell us alone, treasons that
Is, here should welcome; yet, stir,
To sure the seas of the west, but shall be this sweet was to crown to him?'

SICINIUS:
That's the noble credit of his cap,
But they must lose upon your spirits,
And trust me not to me too?

CORIOLANUS:
It is a fortune come to a southward to half the patricians that you are. Thou hast made the power
Into bastards that bate thee.

MERCUTIO:
I know not, sir?

POMPEY:
Thou dost take thy life.

KING EDWARD IV:
To fear the city of our large: be gone.

KATHARINA:
It is the matter?

GRUMIO:
I am cowardly serve this shroud-favour!
O, the mions, a dozen sir.

GONZALO:
Woe wisely faction of such best. Tribunes' sail,
My short and sullen cease of the flame;
And what thou attainted to have an old part,
When she looks it not like a ten time: I ca

###### Karpathy's GPT

In [18]:
from mingpt.model import GPT, GPTConfig
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=6, n_head=8, n_embd=256)
model = GPT(mconf)

09/25/2021 14:53:30 - INFO - mingpt.model -   number of parameters: 4.780544e+06


In [19]:
from mingpt.trainer import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=1, batch_size=512, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,
                      num_workers=4)
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

  cpuset_checked))
epoch 1 iter 2178: train loss 1.31731. lr 3.000676e-04: 100%|██████████| 2179/2179 [08:17<00:00,  4.38it/s]


In [21]:
# alright, let's sample some character-level Shakespeare
from mingpt.utils import sample

context = "O God, O God!"
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)

O God, O God! alas, bid her senators and my hearts
Be hopeded on; for the sin of those faces.

COMINIUS:
Nay, sweet man; we have heard to be
dream, an end all of his brother's wooing,
And leave me a loss, to-morrow, be anon!

CAPULET:
If I did not be disposited; if he slept,
For thoughts any minute in the heavens
Had been a plant the faces them with och, would we disguise
The dhich state shall do on you for a minute ask.

GLOUCESTER:
The park of lance, my good lord!

GRUMIO:
So wife, what's the matter?

MENENIUS:
And then here are no more.

GREMIO:
How didst thou bring my troth, soon-ear'd, and whose sanctuary begins honour.
Think you for that, if you comfort.

LUCIO:
I have
a damned friend?

SAMPSON:
That you must not prepettic son: I'll be consult,
Her summiss of any cure will not conceive these senates and see
That thinks you attaind the field; though they
on our friautes part than you are: you
will make it the legs, to cold young
From the creature of your lady to this length.

CLIF

##### Micro

In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
text = open('input.txt', 'r').read() # don't worry we won't run out of file handles

--2021-09-26 12:23:35--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2021-09-26 12:23:35 (31.2 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
class CharDataset:

    def __init__(self, data, block_size):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))
        
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
    
    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
 
        x = dix[:-1]
        y = dix[1:]
        return x, y

block_size = 32 # spatial extent of the model for its context

In [9]:
import random

block_size = 12
train_dataset = CharDataset(text, block_size)
X, y = train_dataset[random.randint(0, len(train_dataset))]

data has 1115394 characters, 65 unique.


In [10]:
import math

class Value:
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0
        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out
    
    ### Added from a pr ###
    def exp(self):
        try:
            out = Value(math.exp(self.data), (self,), "e")
        except:
            # This happens when e.g. not using Layernorm
            print("Data too large: ", self.data)

        def _backward():
            self.grad += math.exp(self.data) * out.grad

        out._backward = _backward

        return out

    ### Added ###
    def log(self):

        out = Value(math.log(self.data), (self,), "ln")

        def _backward():
            self.grad += (1 / self.data) * out.grad
        
        out._backward = _backward
        
        return out


    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out

    def backward(self):

        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"

In [11]:
import random

class Module:

    def zero_grad(self):
        for p in self.parameters():
            p.grad = 0

    def parameters(self):
        return []

class Neuron(Module):

    def __init__(self, nin, nonlin=True):
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]
        self.b = Value(0)
        self.nonlin = nonlin

    def __call__(self, x):
        act = sum((wi*xi for wi,xi in zip(self.w, x)), self.b)
        return act.relu() if self.nonlin else act

    def parameters(self):
        return self.w + [self.b]

    def __repr__(self):
        return f"{'ReLU' if self.nonlin else 'Linear'}Neuron({len(self.w)})"

class Layer(Module):

    def __init__(self, nin, nout, **kwargs):
        self.neurons = [Neuron(nin, **kwargs) for _ in range(nout)]

    def __call__(self, x):
        out = [n(x) for n in self.neurons]
        return out[0] if len(out) == 1 else out

    def parameters(self):
        return [p for n in self.neurons for p in n.parameters()]

    def __repr__(self):
        return f"Layer of [{', '.join(str(n) for n in self.neurons)}]"

class MLP(Module):

    def __init__(self, nin, nouts):
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1], nonlin=i!=len(nouts)-1) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

    def __repr__(self):
        return f"MLP of [{', '.join(str(layer) for layer in self.layers)}]"

In [12]:
def layernorm(x, eps=1e-5):
    """Calculates simplified layernorm over rows of 2-dim array"""
    # https://stackoverflow.com/questions/59830168/layer-normalization-in-pytorch
    # https://github.com/geohot/tinygrad/blob/master/models/transformer.py

    means = [sum(row) / len(row) for row in x]

    # Calculate variance
    vars = []
    for i in range(len(x)):
        mean_diffs = []
        for j in range(len(x[i])):
            mean_diffs.append((x[i][j] - means[i]) ** 2)
        vars.append(sum(mean_diffs) / len(mean_diffs))

    for i in range(len(x)):
        for j in range(len(x[i])):
            x[i][j] = (x[i][j] - means[i]) / ((vars[i] + eps) ** (1/2))
    print(means)
    print(vars)
    return x

layernorm([[1,2], [1,2]])

[1.5, 1.5]
[0.25, 0.25]


[[-0.9999800005999799, 0.9999800005999799],
 [-0.9999800005999799, 0.9999800005999799]]

In [28]:
import random

def dot(v1, v2):
    """Vector dot product of two vectors with same length"""
    return sum([x*y for x,y in zip(v1, v2)])

def matmul(A, B):
    """
    Args:
      A: Matrix with shape (X, Y) = X rows of length Y
      B: Matrix with shape (Y, Z) = Y rows of length Z = Z cols of length Y

    Returns:
      Matrix with shape (X, Z)
    """
    # Invert B to be (Z, Y)
    B = list(zip(*B))
    out = []
    for row_a in A:
        new_row = []
        for col_b in B:
            # row of length Y & col of length Y
            assert len(row_a) == len(col_b)
            new_row.append(dot(row_a, col_b))
        out.append(new_row)
    return out

def layernorm(x, eps=1e-5):
    """Calculates simplified layernorm over rows of 2-dim array"""

    means = [sum(row) / len(row) for row in x]

    # Calculate variance
    vars = []
    for i in range(len(x)):
        mean_diffs = []
        for j in range(len(x[i])):
            mean_diffs.append((x[i][j] - means[i]) ** 2)
        vars.append(sum(mean_diffs) / len(mean_diffs))

    for i in range(len(x)):
        for j in range(len(x[i])):
            x[i][j] = (x[i][j] - means[i]) / ((vars[i] + eps) ** (1/2))

    return x


def softmax(x, dim=1):
    """Applies softmax to 2-dimensional array - This can be done better"""
    if dim == 0:
        x = list(zip(*x))
    
    out = []
    for i in range(0, len(x)):
        out.append([x[i][j].exp() for j in range(0, len(x[i]))])

    for i in range(0, len(x)):
        denom = sum(out[i])
        for j in range(len(out[i])):
            out[i][j] /= denom

    if dim == 0:
        return list(zip(*out))
    return out

class SelfAttention(Module):
    def __init__(self, D, S):
        super().__init__()

        # Micrograd doesn't give an option to turn off bias
        self.q_dense = Layer(D, D)
        self.k_dense = Layer(D, D)
        self.v_dense = Layer(D, D)

        # Micrograd has no dropout

    def __call__(self, x):
        
        # No batch size 
        S = len(x)
        D = len(x[0])

        # Apply the same linear weights for each seq length unit
        q = list(map(self.q_dense, x)) # Shape: [S, D]
        k = list(map(self.k_dense, x)) # Shape: [S, D]
        v = list(map(self.v_dense, x)) # Shape: [S, D]

        out = matmul(q, list(zip(*k))) # Shape: [S, S]

        # Masking of future values
        for i in range(0, S):
            for j in range(i+1, S):
                # Hide the ith row * jth column
                out[i][j].data = -9999 # using float('-inf') introduces NaNs

        # Apply softmax across columns of each row
        out = softmax(out, dim=1) # Shape: [S, S]

        out = matmul(out, v) # Shape: [S, D]

        return out

    def parameters(self):
        q_pars = [p for p in self.q_dense.parameters()]
        k_pars = [p for p in self.k_dense.parameters()]
        v_pars = [p for p in self.v_dense.parameters()]
        return q_pars + k_pars + v_pars


class GPTBlock(Module):
    def __init__(self, D, S):
        super().__init__()
        
        self.att = SelfAttention(D, S)
        
        # MLP without dropout
        self.ff = MLP(D, [D*4, D*4, D])

    def __call__(self, x):

        att_out = self.att(layernorm(x))

        # Add Residual
        for i in range(len(att_out)):
            for j in range(len(att_out[i])):
                att_out[i][j] += x[i][j]

        ff_out = list(map(self.ff, layernorm(att_out)))

        # Add Residual
        for i in range(len(ff_out)):
            for j in range(len(ff_out[i])):
                ff_out[i][j] += att_out[i][j]

        return ff_out

    def parameters(self):
        att_pars = self.att.parameters()
        ff_pars = self.ff.parameters()
        return att_pars + ff_pars


class GPT(Module):
    def __init__(self, L=2, D=8, S=12, V=26, P=0.2):
        super().__init__()

        # Embeddings
        self.voc_emb = [[Value(random.uniform(-1,1)) for _ in range(D)] for _ in range(V)]
        self.pos_emb = [[Value(random.uniform(-1,1)) for _ in range(D)] for _ in range(S)]

        # Transformer Blocks
        self.blocks = [GPTBlock(D, S) for _ in range(L)]

        # Output
        self.out = Layer(D, V)

    def __call__(self, x):
        """
          Args:
            x: List of length S
          Returns:
            y: List of length S
        """
        voc_emb = [self.voc_emb[idx] for idx in x] # Shape [S, D]
        pos_emb = [self.pos_emb[idx] for idx in range(len(x))] # Shape [S, D]

        x = [[sum(z) for z in zip(sub_v, sub_p)] for sub_v, sub_p in zip(voc_emb,pos_emb)]

        for block in self.blocks:
            x = block(x)

        return list(map(self.out, layernorm(x)))

    def parameters(self):
        emb_pars = [p for p_list in self.voc_emb for p in p_list] + [p for p_list in self.pos_emb for p in p_list]
        block_pars = [p for block in self.blocks for p in block.parameters()]
        out_pars = self.out.parameters()
        return block_pars + out_pars + emb_pars
      

In [29]:
model = GPT(V=train_dataset.vocab_size, S=train_dataset.block_size, L=4, D=8)

In [30]:
model(X)[:1][:1]

[[Value(data=2.4507031380821003, grad=0),
  Value(data=0, grad=0),
  Value(data=0, grad=0),
  Value(data=0.9412224612929179, grad=0),
  Value(data=1.5766944155716995, grad=0),
  Value(data=0, grad=0),
  Value(data=3.0164231674399127, grad=0),
  Value(data=0, grad=0),
  Value(data=0.3762352462049908, grad=0),
  Value(data=0.3816561060676874, grad=0),
  Value(data=0, grad=0),
  Value(data=1.0081037939290172, grad=0),
  Value(data=0, grad=0),
  Value(data=0.07226779134780155, grad=0),
  Value(data=0, grad=0),
  Value(data=0, grad=0),
  Value(data=1.5124330201432963, grad=0),
  Value(data=0.26858885207170785, grad=0),
  Value(data=0.9976278674812337, grad=0),
  Value(data=0, grad=0),
  Value(data=2.1927261355384986, grad=0),
  Value(data=0.025301942461674873, grad=0),
  Value(data=2.9928753642383756, grad=0),
  Value(data=0, grad=0),
  Value(data=0, grad=0),
  Value(data=0.7966334557368777, grad=0),
  Value(data=0, grad=0),
  Value(data=0, grad=0),
  Value(data=0, grad=0),
  Value(data=1.1

In [31]:
# loss function
# great ref: https://github.com/karpathy/micrograd/blob/master/demo.ipynb
def loss(batch_size=8):

    def cross_entropy(preds, targets, eps=1e-15):
        losses = []
        for i, tar in enumerate(targets):
            stab = preds[i][tar] + eps
            losses.append(stab.log())
        return sum(losses) * -1

    losses = []
    for _ in range(batch_size):
        X, y = dataset[random.randint(0, len(dataset))]

        # forward the model to get scores
        scores = model(X)

        loss = cross_entropy(scores, y)


        losses.append(loss)

    total_loss = sum(losses) * (1.0 / len(losses))
    return total_loss

total_loss = loss()
print(total_loss)

Value(data=243.22317495606308, grad=0)


In [32]:
# optimization
for k in range(100):
    
    # forward
    total_loss = loss()
    
    # backward
    model.zero_grad()
    total_loss.backward()
    
    # update (sgd)
    learning_rate = 1.0 - 0.9*k/100
    for p in model.parameters():
        p.data -= learning_rate * p.grad
    
    if k % 1 == 0:
        print(f"step {k} loss {total_loss.data}")

step 0 loss 260.388675723954
step 1 loss 179.66027812337185
step 2 loss 152.23192022407562
step 3 loss 148.35509570659246
step 4 loss 141.22000951388566
step 5 loss 138.42312501308777
step 6 loss 158.22589162778664
step 7 loss 154.41517576856418
step 8 loss 121.92370253488497
step 9 loss 143.33694976879545
step 10 loss 185.01080258767988
step 11 loss 195.92509034067305
step 12 loss 147.3328983898601
step 13 loss 135.0645504490134
step 14 loss 147.68345167692647
step 15 loss 134.84393229909762
step 16 loss 131.28019703527013
step 17 loss 113.74112237891173
step 18 loss 161.54996543036418
step 19 loss 151.45381082075147
step 20 loss 150.08157526552412
step 21 loss 181.2657241668948
step 22 loss 142.22579784912662
step 23 loss 157.43751061068892
step 24 loss 160.42373408671995
step 25 loss 126.60887299960415
step 26 loss 168.74995921866125
step 27 loss 126.20308947476586
step 28 loss 158.92740147056662
step 29 loss 141.1868595990307
step 30 loss 145.3663107159982
step 31 loss 140.25698596

In [36]:
# alright, let's sample some character-level Shakespeare

context = "O God, O"
x = [train_dataset.stoi[s] for s in context]


model.zero_grad()

out_seq = ""

for k in range(5):
    logits = model(x) # Shape [S, V]
    # Pick the final sequence one, i.e. whats predicted after the last char
    data = [x.data for x in logits[-1]]

    pred_idx = data.index(max(data))
    pred_char = train_dataset.itos[pred_idx]

    x.append(pred_idx)
    out_seq += pred_char

# Well maybe this was a bit overkill - there might be bugs in the model & the model is tiny..
print(context)
print(out_seq)

O God, O
eeeee
