# MLP with Context + Embeddings

Here is a Summary of KP2 and KP3, which has been `torchified`. Note: Just Layers created, Need to bring model all together into a `Model` Class with `.forward()`

In [1]:
import torch
import torch.nn.functional as F

## Create data

In [2]:
names = open("names.txt", "r").read().splitlines()
names[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
### Build the vocab ###

#find all characters in our dataset
vocab = sorted(list(set(''.join(names))))
vocab.insert(0, ".")

#create vocab mappings
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}

In [4]:
context_len = 3
X, Y = [], []

counter_for_show = 0
names_for_show = 3
for name in names:
    #initalise context with "." characters (index 0)
    context = [0] * context_len
    
    if counter_for_show < names_for_show:
        print(name)
    
    for char in name + ".": # add end character to the name
        y = char_to_idx[char]
        X.append(context)
        Y.append(y)
        
        if counter_for_show < names_for_show:
            print(f'{"".join(idx_to_char[idx] for idx in context)} ---> {idx_to_char[y]} | {context} ---> {y}')

        #shift the context (like a rolling window)
        context = context[1:] + [y]
    
    if counter_for_show < names_for_show:
        print(end='\n')
    counter_for_show += 1

#store as tensors
X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... ---> e | [0, 0, 0] ---> 5
..e ---> m | [0, 0, 5] ---> 13
.em ---> m | [0, 5, 13] ---> 13
emm ---> a | [5, 13, 13] ---> 1
mma ---> . | [13, 13, 1] ---> 0

olivia
... ---> o | [0, 0, 0] ---> 15
..o ---> l | [0, 0, 15] ---> 12
.ol ---> i | [0, 15, 12] ---> 9
oli ---> v | [15, 12, 9] ---> 22
liv ---> i | [12, 9, 22] ---> 9
ivi ---> a | [9, 22, 9] ---> 1
via ---> . | [22, 9, 1] ---> 0

ava
... ---> a | [0, 0, 0] ---> 1
..a ---> v | [0, 0, 1] ---> 22
.av ---> a | [0, 1, 22] ---> 1
ava ---> . | [1, 22, 1] ---> 0



In [5]:
print(f"{X.shape} with dtype: {X.dtype}")
print(f"{Y.shape} with dtype: {Y.dtype}")

torch.Size([228146, 3]) with dtype: torch.int64
torch.Size([228146]) with dtype: torch.int64


Our Dataset consists of 228146 examples of context length 3. Each example represents the index into our vocab. Now lets split this into training, validation and test sets.

In [6]:
def create_dataset(names, context_len=3):
    """Function to create a dataset out of a list of names given to it

    Args:
        names (list)
        context_len (int, optional):Defaults to 3.

    Returns:
        tuple: X, Y
    """

    X, Y = [], []
    for name in names:
        #initalise context with "." characters (index 0)
        context = [0] * context_len
        
        for char in name + ".": # add end character to the name
            y = char_to_idx[char]
            X.append(context)
            Y.append(y)
        
            #shift the context (like a rolling window)
            context = context[1:] + [y]

    #store as tensors
    X = torch.tensor(X)
    Y = torch.tensor(Y)

    return X, Y

In [7]:
## Split simply by indexing
import random 
random.shuffle(names)
train_split = int(0.8*len(names)) #80% for train
val_split = int(0.9*len(names)) #10% for each of val and test

X_train, Y_train = create_dataset(names[:train_split])
X_val, Y_val = create_dataset(names[train_split:val_split])
X_test, Y_test = create_dataset(names[val_split:])

In [8]:
print(X_train.shape, Y_train.shape)
print(X_val.shape, Y_val.shape)
print(X_test.shape, Y_test.shape)

torch.Size([182532, 3]) torch.Size([182532])
torch.Size([22873, 3]) torch.Size([22873])
torch.Size([22741, 3]) torch.Size([22741])


## Linear Layer

In [None]:
class Linear:

    def __init__(self, in_feats, out_feats, bias=True):
        """
        Our initalisation. This function runs first and only once.
        Creates our Parameters (weights and bias) according to the shapes we define
        """
        self.weights = torch.randn((in_feats, out_feats)) / (in_feats)**0.5 # kaiming init (the gain we add later)
        # 1D Array. Broadcasting will add a first dimension => row vector. Then copied horizontally
        self.bias = torch.zeros((out_feats)) if bias else None

    def __call__(self, x):
        """
        Defines what the layer does.
        """
        self.out = x @ self.weights
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weights] + [] if self.bias is None else [self.bias]

In [10]:
ll = Linear(200, 100) #(200, 100)

In [11]:
ll(torch.randn((32, 200)))

tensor([[-0.1614, -0.9978,  1.5736,  ..., -0.0795, -0.5924,  0.9335],
        [ 0.8366, -0.8196,  1.7949,  ...,  0.3813,  0.2425,  0.9816],
        [-0.6354, -1.7995, -0.0213,  ...,  0.4595, -1.0686, -0.2287],
        ...,
        [-0.0298,  0.3662, -2.3881,  ..., -0.4450, -1.2235, -0.2654],
        [-1.3655,  0.4128, -0.8359,  ...,  0.1139, -1.1340, -0.8745],
        [-1.0898, -0.9293,  0.4817,  ...,  1.1814, -1.2138,  1.7310]])

In [12]:
ll.parameters()

[tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.])]

In [13]:
ll.out

tensor([[-0.1614, -0.9978,  1.5736,  ..., -0.0795, -0.5924,  0.9335],
        [ 0.8366, -0.8196,  1.7949,  ...,  0.3813,  0.2425,  0.9816],
        [-0.6354, -1.7995, -0.0213,  ...,  0.4595, -1.0686, -0.2287],
        ...,
        [-0.0298,  0.3662, -2.3881,  ..., -0.4450, -1.2235, -0.2654],
        [-1.3655,  0.4128, -0.8359,  ...,  0.1139, -1.1340, -0.8745],
        [-1.0898, -0.9293,  0.4817,  ...,  1.1814, -1.2138,  1.7310]])

## Batch Normalisation

In [14]:
class BatchNorm:
    
    def __init__(self, num_features, eps=1e-5, momentum=0.1):

        self.eps = eps
        self.momentum = momentum

        # -- Training Flag --
        # if what happens in the forward pass in training is different from the forward pass in inference, then we need to seperate the logic (using this flag!)
        self.training = True

        #  -- Two sets of parameters --
        # Learnable scale and shift parameters
        self.scale = torch.ones((1, num_features))
        self.shift = torch.zeros((1, num_features))
        # Buffers (Running mean and std, calculated iteratively, NOT Learned)
        self.running_mean = torch.tensor([0])
        self.running_std = torch.tensor([1])


    def __call__(self, x: torch.tensor):
        
        # -- Forward Pass --
        if self.training:
            # X has shape (batch_size, number_of_features)
            batch_mean = x.mean(dim=0, keepdim=True) # mean over the batch for each neuron, keep as a row vector; shape=(1, num_of_feats)
            batch_std = x.std(dim=0, keepdim=True)
        else: # running inference
            batch_mean = self.running_mean
            batch_std = self.running_std
        
        #save as anatrribute so we can loo at some statistics later - NOT pytorch standard
        self.out = self.scale * ((x - batch_mean) / (batch_std + self.eps)) + self.shift
        
        ## Update the Running mean and std
        if self.training:
            
            with torch.no_grad(): # telling pytorch we will never call .backwards() on these variables (prevent pytorch building out a computation graph)
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean # what it was before plus a little bit from this batch
                self.running_std = (1 - self.momentum) * self.running_std + self.momentum * batch_std
        
        return self.out
    
    def parameters(self):
        return [self.scale, self.shift]

In [15]:
bn = BatchNorm(num_features=200)

In [16]:
bn.parameters()

[tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1.]]),
 tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [17]:
class Tanh():
    
    #No Constructor needed as we have no parameters
    
    def __call__(self, x: torch.tensor):
        #Again creating a class variable so we can examine later but this isnt standard
        self.out = torch.tanh(x)
        return self.out
    
    def parameters(self):
        return []

In [18]:
t = Tanh()
print(t(torch.tensor(0.8)))

tensor(0.6640)


### Stacking layers

##### Long winded explaination of where the input into the first layer comes from
* Recall that an example in our dataset is a list of size `context_len`, representing the indicies of our vocab. e.g. `... ---> e | [0, 0, 0] ---> 5`. So an example has shape `(context_len, )` its a 1D array.
* Next we send this 1D array through our embedding matrix, which has shape `(vocab_len, emb_dim)`. So our input into the first layer of our network is `(1, context_len, emb_dim)`. This maes sense, we have three indices (the context len, e.g. [0, 0, 0]), each of which index's into our embedding matrix (e.g. C[0] will give (1, emb_dim) - but we have three of them, which is the context len, so we get C[[0, 0, 0]] is (3, emb_dim)).
* next we take a batch of examples, rather than just one, so we get (`(batch_dim, context_len, emb_dim)`) - we have say 32 examples of an example which is a list of three indicies, for each of these indicies we index (i.e. select a row of) a matrix which has emb_dim number of columns
* Finally, we view this Tensor as  `(batch_dim, context_len*emb_dim)` So each example (which is a list of 3 indices of our vocab) we flatten them into one long vector containing each of the indices embedding as one vector if C[0] is [2, 1] (i.e. emb_dim=2), then C[[0, 0, 0]] would be [[2, 1], [2, 1], [2, 1]] (shape: (3, 2)) which we now flatten to be [2, 1, 2, 1, 2, 1] (shape (6,) 1D array). We do this firstly because MLP expect 2D arrays (not 3D). secondly in a character-level language model we'd like to process the whole **context information all at once** (rather than seperately). Finally, by flattening, we allow the MLP to learn patterns across both the context and embedding dimensions **simultaneously** 

In [32]:
vocab_len=27
context_len=3
emb_dim=7
batch_dim=64

C = torch.randn((vocab_len, emb_dim))

## -- Pre Processing --
# Examples:                            (batch_dim, context_len)
# Embed Examples:                      (batch_dim, context_len, emb_dim)
# Flatten Examples:                    (batch_dim, context_len*emb_dim)
# 1st Layer...
layers = [
    Linear(in_feats=context_len*emb_dim, out_feats=200), # Shape of weight matrix to give an output: (batch_dim, num_neurons_1st_layer)
    BatchNorm(num_features=200),
    Tanh(),
    Linear(200, vocab_len)
]

# Apply the (5/3) part of the hamming!
for layer in layers:
    if isinstance(layer, Linear):
        layer.weights *= (5/3)

### What `in_feats` and `out_feats` represent
Its really important to understand what we mean by `in_feats` and `out_feats` in the Linear Layer. They define the shape of the weight matrix. They do **not** represent the shape of the output of the layer! The weigh matrix has shape: `(in_feats, out_feats)`. Throughout the neural network the `batch_dim` will remain the first dim of any output of a layer. Recall `(m, n)x(n, p)=(m, p)`. Then `n=in_feats` and `p=out_feats`( with `m=batch_dim`). Its what dimension the weight matrix should be. The reason theyre named as `in_feats` and `out_feats` is because if we consider the 2nd dimension (the columns) then second dim of the input into the layer is `in_feats` and the second dim of the output of tae layer is `out_feats` (usually the number of neurons in the layer). the first dim is always `batch_dim` remember.

In [33]:
parameters = [C] + [p for layer in layers for p in layer.parameters()]

# each parameter is a tensor which we want to learn ==> so requires gradient!
for p in parameters:
    p.requires_grad = True

In [34]:
### Training ###

max_steps = 100_000


for i in range(max_steps):
    
    #create a batch
    batch_idx = torch.randint(0, X_train.shape[0], size=(batch_dim,)) # 1D array of indices: (batch_dim,)
    batch = X_train[batch_idx] # (batch_dim, context_len)
    
    ## -------------- FORWARD PASS ---------------- ##
    #Embbed
    emb=C[batch] # (batch_dim, context_len, emb_dim)
    
    #Flatten
    x=emb.view(-1, context_len*emb_dim) # (batch_dim, context_len*emb_dim)
    
    for layer in layers:
        x = layer(x) #We are calling each layer on the output from the previous layer x
    
    loss = F.cross_entropy(x, Y_train[batch_idx])
    
    break
    ## ------------ BACKWARD PASS ------------------ ##
    
    # for layer in layers:
    #     layer.out.retain_grad() # In Pytorch these are thrown away by autograd after they are computed to save memory - but we need to keep them
    
    # for p in parameters:
    #     p.grad = None
    # loss.backward()
    
    # ## ----------- UPDATE -------------------------- ##
    # if i <= 100_000:
    #     lr=0.1
    # elif ((i>150_000) & (i<180_000)):
    #     lr=0.01
    # else:
    #     lr=0.001
    
    # for p in parameters:
    #     p.data += -lr * p.grad
    
    
    # ## Track progress ...
    # if i % 10_000 == 0:
    #     print(f"{loss.item():.4f}")

In [35]:
layers[-1].out.grad

  layers[-1].out.grad


In [36]:
### Training ###

max_steps = 250_000


for i in range(max_steps):
    
    #create a batch
    batch_idx = torch.randint(0, X_train.shape[0], size=(batch_dim,)) # 1D array of indices: (batch_dim,)
    batch = X_train[batch_idx] # (batch_dim, context_len)
    
    ## -------------- FORWARD PASS ---------------- ##
    #Embbed
    emb=C[batch] # (batch_dim, context_len, emb_dim)
    
    #Flatten
    x=emb.view(-1, context_len*emb_dim) # (batch_dim, context_len*emb_dim)
    
    for layer in layers:
        x = layer(x) #We are calling each layer on the output from the previous layer x
    
    loss = F.cross_entropy(x, Y_train[batch_idx])
    
    # ------------ BACKWARD PASS ------------------ ##
    
    # In Pytorch Non-leaf nodes in the computational graph are thrown away by autograd after they are computed and used in the backwards pass
    # This is to save memory. But we need to keep them
    for layer in layers:
        layer.out.retain_grad() 
    
    # remove the old gradients from the previous backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    ## ----------- UPDATE -------------------------- ##
    if i <= 100_000:
        lr=0.1
    elif ((i>150_000) & (i<180_000)):
        lr=0.01
    else:
        lr=0.001
    
    for p in parameters:
        p.data += -lr * p.grad
    
    
    ## Track progress ...
    if i % 10_000 == 0:
        print(f"{loss.item():.4f}")

3.9036
2.4064
2.6666
2.5221
2.6317
2.1978
2.3304
2.4800
2.5238
2.4132
2.6700
2.1960
2.4214
2.5304
2.2663
2.2792
2.2229
2.7139
2.3502
2.3517
2.2220
2.1882
2.3404
2.1929
2.2518


In [37]:
@torch.no_grad()
def loss_on_split(split: str):
    x, y = {
        "train": (X_train, Y_train),
        "test": (X_test, Y_test),
        "val": (X_val, Y_val)
    }[split]
    
    
    #run forward pass and get loss
    emb = C[x]
    h = emb.view(-1, context_len*emb_dim) 
    
    for layer in layers:
        if isinstance(layer, BatchNorm):
            layer.training = False
        h = layer(h)
    
    loss = F.cross_entropy(h, y)
    print(f"{split} loss: {loss.item():.3f}")

In [38]:
loss_on_split("train")
loss_on_split("test")

train loss: 2.391
test loss: 2.390


In [39]:
generated_chars = []
NUMBER_OF_NAMES = 10

for _ in range(NUMBER_OF_NAMES):
    #start with ...
    context = [0] * 3
    
    while True:
        #send it through the forward pass
        emb = C[context] #(1_example, context_len, emb_dim) = (1, 3, 10)
        h = emb.view(-1, context_len*emb_dim) 
    
        for layer in layers:
            # So batch norm uses the running means for inference (rather than over the batch)
            if isinstance(layer, BatchNorm):
                layer.training = False
            h = layer(h)
        #now we create the prob dist 
        prob_dist = F.softmax(h, dim=1) #logits is a row vector, so calculate over the row (i.e. across dim 1 which the columns across the columns gives over the row)

        #now we sample from the dist (a multinomal will do that for us)
        idx = torch.multinomial(prob_dist, 1).item()
        generated_chars.append(idx_to_char[idx])

        if idx == 0:
            break

        context = context[1:] + [idx]

    print("".join(generated_chars))
    #reset
    generated_chars=[]

menl.
madi.
zallianaylan.
ano.
alanrerish.
merynniaton.
cesa.
aagielenare.
stcyu.
eryaaowyla.
