# Transformer
This is the playground file that was used during development of a decoder-only, character-level transformer language model. The
actual model can be found in gpt.py.

In [2]:
# Read the entire training corpus into memory as a single string.
with open('shakespeare.txt', mode='r', encoding='utf-8') as corpus:
    text = corpus.read()

In [3]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"vocab: {chars}, vocab_size: {vocab_size}")

vocab: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'], vocab_size: 65


In [4]:
# Character-level tokenization, chosen for simplicity. A real model would
# normally use a sub-word tokenizer such as BPE.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))  # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of string *s*.

    Raises KeyError if *s* contains a character that is not in ``chars``.
    """
    return [stoi[c] for c in s]


def decode(s):
    """Return the string spelled by the iterable of integer ids *s*.

    Raises KeyError if an id is not a valid index into ``chars``.
    """
    return ''.join(itos[i] for i in s)

In [5]:
# Tokenize the input

import torch
import torch.nn as nn
from torch.nn import functional as F
# Encode the whole corpus once and hold it as a tensor of int64 token ids.
data = torch.tensor(
    encode(text),
    dtype=torch.long,
)
print(f"data.shape: {data.shape}")


data.shape: torch.Size([1115394])


In [6]:
# Hold out the final 10% of the tokens for validation; the first 90% is used
# for training. `n` is the index where the split happens.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [7]:
# Fix the RNG seed so sampled batches are reproducible, then set the
# hyper-parameters read by the batch sampler defined below.
torch.manual_seed(1337)
block_size = 8  # number of tokens in one training sequence
batch_size = 4  # number of sequences sampled per batch

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    ``split == 'train'`` draws from ``train_data``; any other value draws
    from ``val_data``. Returns ``(x, y)`` where each row of ``x`` is a
    window of ``block_size`` consecutive tokens and the matching row of
    ``y`` is the same window shifted one position to the right.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Upper bound is exclusive, so every target window still fits in `source`.
    last_start = len(source) - block_size
    starts = torch.randint(0, last_start, (batch_size,))

    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs, targets


In [8]:
# Causal averaging expressed as a matrix multiplication: row t of `wei`
# puts equal weight on positions 0..t and zero weight on later positions.
B, T, C = 4, 8, 2  # batch, time/tokens, channels
x = torch.randn(B, T, C)
wei = torch.tril(torch.ones(T, T))  # ones on and below the diagonal
wei /= wei.sum(dim=1, keepdim=True)  # each row now sums to 1
# wei is (T, T) and is broadcast over the batch: (T, T) @ (B, T, C) -> (B, T, C)
xbow = wei @ x

In [9]:
# Setup for softmax-weighted aggregation: a random input batch plus the
# bias-free linear projections of a single self-attention head.
B, T, C = 4, 8, 32  # batch, time/tokens, channels
x = torch.randn(B, T, C)

head_size = 16
# Created in the order key, query, value so weight initialization draws
# from the RNG in the same sequence as three separate constructor calls.
key, query, value = (nn.Linear(C, head_size, bias=False) for _ in range(3))


In [None]:
# Changing BatchNorm from MLP.ipynb to a LayerNorm.

class LayerNorm:
    """Minimal layer normalization over the last (feature) dimension.

    Every feature vector is shifted to zero mean and scaled to unit
    variance independently, then multiplied by ``gamma`` and offset by
    ``beta``.

    Args:
        dim: Size of the last dimension of the inputs.
        eps: Small constant added to the variance for numerical stability.
        momentum: Unused. Accepted only so the constructor signature stays
            compatible with existing callers.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        """Normalize *x* along its last dimension and return the result.

        The result is also stored on ``self.out``.
        """
        # Reduce over the last axis rather than axis 1: for 2-D (batch, dim)
        # input the two are the same axis, but for (batch, time, dim) input
        # axis 1 would be the time axis, not the features.
        xmean = x.mean(-1, keepdim=True)
        # NOTE: this is the Bessel-corrected (unbiased) variance, whereas
        # torch.nn.LayerNorm uses the biased estimator. Kept as-is so outputs
        # for existing 2-D callers do not change.
        xvar = x.var(-1, keepdim=True, unbiased=True)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        return self.out

    def parameters(self):
        """Return the trainable tensors ``[gamma, beta]``."""
        return [self.gamma, self.beta]
      