**What we are building?**

Transformer based neural network from scratch using paper *Attention is all you need* which will be train on Shakespeare toy dataset to generate a sequence of **characters** given an input (sequence of characters).

### Download the dataset

In [1]:
from pathlib import Path
import requests

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
dataset_name = "tiny_shakespeare_data.txt"

dataset_folder = Path("data/")
dataset_folder.mkdir(parents=True, exist_ok=True)

dataset = dataset_folder / dataset_name

# check to see if dataset already exits and download
if not dataset.is_file():
    # download the data
    print(f"downloading the dataset!")
    data = requests.get(url=url)
    with open(dataset, "wb") as f:
        f.write(data.content)
    print(f"dataset downloaded!")
else:
    print(f"dataset already downloaded!")

print(dataset)

dataset already downloaded!
data/tiny_shakespeare_data.txt


In [2]:
# read data
with open(dataset, 'r') as f:
    text = f.read()

# total length
print(f"total characters : {len(text)}")


total characters : 1115394


Get all possible characters and vocab size

In [3]:
# possible characters the model can see and emit
chars = sorted(list(set(text)))
# number of characters
vocab_size = len(chars)

Encode Decode (Tokenizers: convert chars to string and vice versa)

In [4]:
cti = {c:i for i, c in enumerate(chars)}
itc = {i:c for i, c in enumerate(chars)}

encode = lambda x: [cti[i] for i in x]
decode = lambda lst: ''.join([itc[i] for i in lst])

Encode and convert entire dataset into tensor and perform train test split.

In [5]:
import torch
# tokenize entire dataset
data = torch.tensor(encode(text))
# train validate split
split_pos = int(len(data)*0.9) # 90% for training, 10% for validation
train_data = data[:split_pos]
val_data = data[split_pos:]

### Batch of Inputs and Targets

block size / context length = max length of input to the transformer (can't feed whole dataset, so we give small random chunks)

we have to generate batch of inputs

In [6]:
torch.manual_seed(1337)
block_size = 8 # input length, will generate 8 samples
batch_size = 4 # batch of 4 inputs

def get_batch(split):
    # get data
    data = train_data if split == 'train' else val_data
    # get batch
    random_offsets = torch.randint(low=0, high=len(data)-block_size, size=[batch_size])
    x = torch.stack([data[i: i+block_size] for i in random_offsets])
    y = torch.stack([data[i+1: i+1+block_size] for i in random_offsets])
    
    return x, y

xb, yb = get_batch('train')
print(f"input:\n{xb}\ntarget:\n{yb}")


input:
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
target:
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


### Bigram Model

In [7]:
from torch import nn
from torch.nn import functional as F

class BiGram(nn.Module):
    def __init__(self, vocab_size) -> None:
        super().__init__()
        # lookup table for character embedding vectors
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        logits = self.token_embedding_table(idx)
        if target is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            target = target.view(B*T)
            loss = F.cross_entropy(input=logits, target=target)
        
        return logits, loss
    
    def generate(self, idx, max_token):
        # generate max_token characters
        for i in range(max_token):
            logits, loss = self(idx) # (B, T, C)
            # pick the last character embedding of each 
            # sample prediction in the batch
            last_logits = logits[:,-1,:] # (B, C) -> (4, 65)
            # convert it to a prob distribution
            y_prob = F.softmax(input=last_logits, dim=1)
            # one sample from each prob distribution
            next_idx = torch.multinomial(input=y_prob, num_samples=1) # (B, 1)
            # concatenate the prediction with given input
            idx = torch.cat((idx, next_idx), dim=1) # (B, T+1)
        return idx



model = BiGram(vocab_size=vocab_size)
# logits, loss = model(xb, yb)
# print(decode(model.generate(torch.zeros((1,1), dtype=torch.long), 100)[0].tolist()))

### Training the Bigram Model

In [8]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

for epoch in range(10000):
    xb, yb = get_batch('train')
    logits, loss = model(xb ,yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(loss)

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


tensor(2.3741, grad_fn=<NllLossBackward0>)


In [9]:
print(decode(model.generate(torch.zeros((1,1), dtype=torch.long), 250)[0].tolist()))


PFW'd ABun wicay grear car myothed:
NEThe TI
Qat herthergnend.
IZAm;'sinde g wals slulobus whe

KIZ!k fer
Wod he k
MILy,

IEThoZELEN:


GheayLYk
WIIUCIINDerf canthinin: ans nd s

HRDXzQzLPUPore t soforn I's temrpeFFI thattersurave d myou ftr forgg o 


### Mathematical Trick in Self-attention

Previously, we were only looking at the last character to predict the sequence, but, now we will start considering the whole context in order to generate new character sequence.

To achieve the said objective, we will consider the weighted aggregation of the embedding vectors of all the previous character (context) until and including the character at time t, in order to predict the character at t+1. We will use matrix multiplication to achieve that. 

In [2]:
import torch
from torch import nn

# matrix to allow avg sum
# a = torch.tril(torch.ones(size=(4,4), dtype=int))
# a = a / a.sum(dim=1, keepdim=True)
# print(a)
# print(b)
# print(torch.matmul(a, b))

# technique two (efficient one)
B, T, C = (1, 8, 32)
x = torch.randn(B, T, C)
tril_ones = torch.tril(torch.ones(size=(T, T)))

# single head performing self attention 
HEAD_SIZE = 16 # head size (dimensin of key and query vectors for each token)
key_vectors = nn.Linear(C, HEAD_SIZE, bias=False) # (T, HdSz)
query_vectors = nn.Linear(C, HEAD_SIZE, bias=False) # (T, HdSz)
value_vectors = nn.Linear(C, HEAD_SIZE, bias=False) # (T, HdSz)

# generating key, query and value for x (input)
key = key_vectors(x) # (B, T, HdSz)
query = query_vectors(x) # (B, T, HdSz)
value = value_vectors(x) # (B, T, HdSz)

# wei is a matrix where each cell is dot product of a query vector & a key vector
# rows contain tokens' queries and columns are tokens' key vectors
#  |         a         |        b           |     c      ...
# a| query(a) . key(a) | query(a) . key(b)  |  query(a) . key(c) ...
# b| query(b) . key(a) | query(b) . key(b)  |  query(b) . key(c) ...
# c| query(c) . key(a) | query(b) . key(b)  |  query(c) . key(c) ...
# ... 
wei = query @ key.transpose(-2, -1) # (B, T, HdSz) @ (B, HdSz, T) --> (B, T, T)
# setting future tokens weights/affinities to -inf
wei = wei.masked_fill(tril_ones == 0, float('-inf'))
# apply softmax for smooth distribution
wei = torch.softmax(wei, dim=1)

output = wei @ value # (T, T) @ (B, T, HdSz) --> (B, T, HdSz)
print(output.shape)

print(wei)

# print(x)
# bow

torch.Size([1, 8, 16])
tensor([[[0.0400, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0929, 0.0087, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0143, 0.0770, 0.1096, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1761, 0.1120, 0.1765, 0.0314, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0932, 0.0522, 0.0975, 0.0126, 0.1079, 0.0000, 0.0000, 0.0000],
         [0.3150, 0.4760, 0.2602, 0.1279, 0.3768, 0.0285, 0.0000, 0.0000],
         [0.0756, 0.2285, 0.2359, 0.8185, 0.0831, 0.6369, 0.9271, 0.0000],
         [0.1929, 0.0456, 0.1203, 0.0097, 0.4322, 0.3347, 0.0729, 1.0000]]],
       grad_fn=<SoftmaxBackward0>)


8