In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt 


# Shakespeare dataset
Now we will move to a real world task, next-character prediction with the tiny-shakespeare dataset

In [2]:
with open("datasets/tinyShakespeare.txt", "r") as f:
    load = f.read()
print(load[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [3]:
texttoint = {elm: n for n, elm in enumerate(set(load))}
inttotext = {n:elm for n, elm in enumerate(set(load))}
dataset = [texttoint[c] for c in load]

sequence_length = 1
vocab_size = len(texttoint)
inputs = torch.tensor(dataset[:-1])
labels = torch.tensor(dataset[1:])

# Perceptron
This complex dataset requires us to use something more elaborate, we will build on logistic regression to make an architecture that can deal with this kind of data 

In [4]:
# Parameters
m = torch.randn(vocab_size, vocab_size).requires_grad_()
q = torch.randn(vocab_size).requires_grad_()

# Training hyperparameters
epochs = 10
lr = 1e-2

# Training loop
for epoch in range(epochs):
    pred = inputs @ m + q
    loss = F.cross_entropy(pred, labels)
    loss.backward()

    with torch.no_grad():
        m -= lr * m.grad
        q -= lr * q.grad
        m.grad = None
        q.grad = None
    
    if epoch % (epochs//10) == 0:
        print(loss.item())

5.200290679931641
5.199605941772461
5.198920249938965
5.198235511779785
5.197550296783447
5.196865081787109
5.196181297302246
5.195497035980225
5.194812774658203
5.194129943847656


This is prohibitively slow, we cant train on this... Turns out that doing a forward and backward pass on the complete dataset is slow, let's see if we can make more approximate step by using a subset of the dataset

In [5]:
m = torch.randn(vocab_size, vocab_size).requires_grad_()
q = torch.randn(vocab_size).requires_grad_()

epochs = 10000  # training for 1000x more epochs 
lr = 1e-2
batch_size = 1024  # taking a batch of 1024 samples at a time

for epoch in range(epochs):
    indexes = torch.randint(0, len(inputs), (batch_size,))  # generating random indexes

    pred = F.one_hot(inputs[indexes].long(), vocab_size).float() @ m + q
    loss = F.cross_entropy(pred, labels[indexes])
    loss.backward()

    with torch.no_grad():
        m -= lr * m.grad
        q -= lr * q.grad
        m.grad = None
        q.grad = None
    
    if epoch % (epochs//10) == 0:
        print(loss.item())

4.801705360412598
4.462411403656006
4.155315399169922
3.9009997844696045
3.6736018657684326
3.6835973262786865
3.6150882244110107
3.6090965270996094
3.506439447402954
3.3794827461242676


This is cool, we can achieve a pretty decent loss, but some problems are way harder and can't be tackled with a solution this simple... <br>
Remember what we did earlier? We expanded our logistic regressor to make a Perceptron, now we can try expanding the perceptron in the forward direction <br>
This architecture should resemble a network of biological neurons, this is called Multi Layer Perceptron

# MLP



In [6]:
hidden_size = 128

w1 = torch.randn(vocab_size, hidden_size, requires_grad=True)
b1 = torch.randn(hidden_size, requires_grad=True)
w2 = torch.randn(hidden_size, vocab_size, requires_grad=True)
b2 = torch.randn(vocab_size, requires_grad=True)


epochs = 10000
lr = 1e-2
batch_size = 1024
lossi = []

for epoch in range(epochs):
    indexes = torch.randint(0, len(inputs), (batch_size,))
    
    X = F.one_hot(inputs[indexes].long(), vocab_size).float()
    h1 = F.gelu(X @ w1 + b1)
    pred = h1 @ w2 + b2
    loss = F.cross_entropy(pred, labels[indexes])
    lossi.append(loss.item())
    loss.backward()

    with torch.no_grad():
        for p in [w1, b1, w2, b2]:
            p -= lr * p.grad
            p.grad = None

    if epoch % (epochs//10) == 0:
        print(loss.item())

25.357297897338867
6.297913074493408
4.232690811157227
3.684781551361084
3.6616287231445312
3.2838339805603027
3.1329193115234375
3.067996025085449
3.1219892501831055
2.83640456199646


Expanding the architecture definetly made the model more effective but as we can see the starting loss is extremely high. <br>
Let's figure that out why by printing some of the values from each layer 

In [7]:
# todo convert to mean abs

print("MEAN ABS VALUES AT THE END OF A TRAINING RUN")
print("Input: ", F.one_hot(inputs[indexes].long(), vocab_size).float()[0][:5].numpy()) # input
print("Hidden layer: ", F.gelu(F.one_hot(inputs[indexes].long(), vocab_size).float()@w1+b1)[0][:5].detach().numpy()) # hidden layer
print("Output layer: ", (F.gelu(F.one_hot(inputs[indexes].long(), vocab_size).float()@w1+b1)@w2+b2)[0][:5].detach().numpy()) # output layer
print("Predictions: ", F.softmax(pred[0], dim=0)[:5].detach().numpy()) # predictions

w1 = torch.randn(vocab_size, hidden_size, requires_grad=True)
b1 = torch.randn(hidden_size, requires_grad=True)
w2 = torch.randn(hidden_size, vocab_size, requires_grad=True)
b2 = torch.randn(vocab_size, requires_grad=True)
X = F.one_hot(inputs[indexes].long(), vocab_size).float()
h1 = F.gelu(X @ w1 + b1)
pred = h1 @ w2 + b2

print("\nMEAN ABS STARTING VALUES")
print("Input: ", F.one_hot(inputs[indexes].long(), vocab_size).float()[0][:5].numpy()) # input
print("Hidden layer: ", F.gelu(F.one_hot(inputs[indexes].long(), vocab_size).float()@w1+b1)[0][:5].detach().numpy()) # hidden layer
print("Output layer: ", (F.gelu(F.one_hot(inputs[indexes].long(), vocab_size).float()@w1+b1)@w2+b2)[0][:5].detach().numpy()) # output layer
print("Predictions: ", F.softmax(pred[0], dim=0)[:5].detach().numpy()) # predictions


VALUES AT THE END OF A TRAINING RUN
Input:  [0. 0. 0. 0. 0.]
Hidden layer:  [-0.0958685  -0.00248625 -0.06264118 -0.06842674 -0.05632487]
Output layer:  [-0.21774988  2.7743325  -1.9149207   3.3216686   0.7236721 ]
Predictions:  [0.00061575 0.01227302 0.00011268 0.02122208 0.00157674]

STARTING VALUES
Input:  [0. 0. 0. 0. 0.]
Hidden layer:  [ 7.5217730e-01 -6.5267354e-02  9.6309316e-01 -8.0141245e-04
 -5.4420985e-02]
Output layer:  [ 2.5306673  7.3293037  4.9385695 24.713625  18.680634 ]
Predictions:  [8.7072460e-11 1.0565784e-08 9.6742936e-10 3.7481460e-01 8.9892122e-04]


# Initialization

By inspecting closely the values at the start of the training run, we clearly see that they are way too high (1-3 Orders of magnitude higher than the end of the run) <br>

Now, let's explore out options: 
- We could obviously just ignore this problem and let the optimization deal with it, but that idea doesn't scale well with big models
- We could simply reduce the starting values of weights and biases, and it's going to be better, but it's not the most elegant solution. Or even better we could normalize the starting values
- We could normalize the values in each layer (suitable for deep networks) such that mean is zero and standard deviation is one

Let's try out normalization of the weights

In [8]:
hidden_size = 128

w1 = torch.randn(vocab_size, hidden_size, requires_grad=True)
b1 = torch.randn(hidden_size, requires_grad=True)
w2 = torch.randn(hidden_size, vocab_size, requires_grad=True)
b2 = torch.randn(vocab_size, requires_grad=True)

with torch.no_grad():
    for p in [w1, b1, w2, b2]:
        p /= torch.norm(p, dim=0)

epochs = 10000
lr = 1e-2
batch_size = 1024
lossi = []

for epoch in range(epochs):
    indexes = torch.randint(0, len(inputs), (batch_size,))
    
    X = F.one_hot(inputs[indexes].long(), vocab_size).float()
    h1 = F.gelu(X @ w1 + b1)
    pred = h1 @ w2 + b2
    loss = F.cross_entropy(pred, labels[indexes])
    lossi.append(loss.item())
    loss.backward()

    with torch.no_grad():
        for p in [w1, b1, w2, b2]:
            p -= lr * p.grad
            p.grad = None

    if epoch % (epochs//10) == 0:
        print(loss.item())

4.171732425689697
3.6347289085388184
3.336347818374634
3.2786505222320557
3.130432367324829
3.1381890773773193
3.124907970428467
2.9968366622924805
2.9734909534454346
2.897439479827881
