In [1]:
# E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2
# E02: I was not careful with the initialization of the network in this video. (1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? (2) Can you tune the initialization to get a starting loss that is much more similar to (1)?
# E03: Read the Bengio et al 2003 paper (link above), implement and try any idea from the paper. Did it work?

# E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2

In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the names dataset, one entry per line; the context manager closes the file handle
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
# Character vocabulary: letters are numbered from 1 in sorted order, '.' is given index 0
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
print(stoi)
# Reverse lookup: integer index -> character
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [6]:
# Spot-check the mapping: prints the integer index assigned to 'a'
print(stoi['a'])

1


In [7]:
# Build data set

block_size = 3 # context length
def build_dataset(words, context_length=None):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.', and a window of `context_length`
    character indices slides over it. The window starts as all zeros, so the
    first character of every word is predicted from padding only.

    Args:
        words: iterable of strings; every character (and '.') must be a key
            of the global `stoi` mapping.
        context_length: number of preceding characters per example. When
            omitted, the global `block_size` is read at call time.

    Returns:
        (X, Y): X is an int64 tensor of shape (num_examples, context_length)
        holding the contexts; Y is an int64 tensor of shape (num_examples,)
        holding the index of the character that follows each context.

    Raises:
        KeyError: if a character is missing from `stoi`.
    """
    if context_length is None:
        context_length = block_size

    contexts, targets = [], []
    for w in words:
        context = [0] * context_length
        for ch in w + '.':
            ix = stoi[ch]
            contexts.append(context) # e.g. [s, c, a] -> [m]
            targets.append(ix)
            # sliding window: drop the oldest index, append the newest
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the tensors well-formed for an empty word
    # list, where type inference would yield a float tensor of shape (0,).
    X = torch.tensor(contexts, dtype=torch.long).view(len(contexts), context_length)
    Y = torch.tensor(targets, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

import random

# Seed Python's RNG, then shuffle the word list in place before splitting
random.seed(42)
random.shuffle(words)

# Split boundaries by word count: 80% training, 10% dev/validation, 10% test
n1, n2 = (int(frac*len(words)) for frac in (0.8, 0.9))

Xtr, Ytr = build_dataset(words[:n1])      # training split
Xdev, Ydev = build_dataset(words[n1:n2])  # dev/validation split
Xte, Yte = build_dataset(words[n2:])      # test split

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [40]:
# Model parameters
dim = 10 # embedding dimensionality per character
n_hidden = 200 # neurons in the hidden layer (free choice)
vocab_size = len(stoi) # number of distinct characters, including '.'
g = torch.Generator().manual_seed(2147483647) # fixed seed so the initial values are reproducible
# NOTE: the order of the randn calls below determines the values each tensor receives from g
C = torch.randn((vocab_size, dim), generator=g) # embedding table: one dim-sized row per character
W1 = torch.randn((dim*block_size, n_hidden), generator=g) # hidden layer: concatenated context embeddings -> n_hidden
b1 = torch.randn(n_hidden, generator=g) # hidden layer bias
W2 = torch.randn((n_hidden, vocab_size), generator=g) # output layer: n_hidden -> one logit per character
b2 = torch.randn(vocab_size, generator=g) # output layer bias
parameters = [C, W1, b1, W2, b2]
sum(p.nelement() for p in parameters) # number of parameters in total

11897

In [42]:
# Turn on gradient tracking for every parameter tensor
for p in parameters:
    p.requires_grad_(True)

In [44]:
# Candidate learning rates: 1000 exponents evenly spaced over [-3, 0],
# so the rates grow geometrically from 10^-3 up to 10^0 = 1
lre = torch.linspace(start=-3, end=0, steps=1000)
lrs = 10**lre
# Bookkeeping lists, all starting empty; lri is presumably meant for the learning rates tried in a sweep
lri, lossi, stepi = [], [], []

In [62]:
# complete training loop
max_steps = 200000 # total number of gradient steps
batch_size = 64 # examples per minibatch
for i in range(max_steps):
    # minibatch construct: batch_size random row indices into the training set,
    # drawn from the seeded generator g so the sequence of batches is reproducible
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

    # forward pass
    emb = C[Xtr[ix]] # embedding lookup
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # flatten each context's embeddings, then hidden layer
    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Ytr[ix]) # softmax and negative log likelihood

    # backward pass
    for p in parameters:
        p.grad = None # clear to prevent accumulation
    loss.backward() # compute gradient of the loss with respect to all parameters

    # update: learning rate 0.1 for the first half of the run, 0.01 afterwards
    lr = 0.1 if i < max_steps // 2 else 0.01
    with torch.no_grad(): # keep the in-place parameter updates out of the autograd graph
        for p in parameters:
            p -= lr * p.grad # gradient descent step

    stepi.append(i)
    lossi.append(loss.log10().item()) # track log10 of the minibatch loss

KeyboardInterrupt: 

In [None]:
# Different ways to optimize: TODO
# change number of neurons
# dimensionality of embedding lookup table
# number of characters feeding in context
# how long running
# learning rate
# how it decays
# batchsize

# plt.plot(stepi, lossi)
# plt.show()

In [None]:

# Loss over the whole training split; no_grad avoids building an autograd graph for evaluation
with torch.no_grad():
    emb = C[Xtr] # (N, context length, embedding dim), N = number of training examples
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (N, hidden units)
    logits = h @ W2 + b2 # (N, number of characters)
    loss = F.cross_entropy(logits, Ytr)
loss

In [None]:

# Loss over the whole dev/validation split; no_grad avoids building an autograd graph for evaluation
with torch.no_grad():
    emb = C[Xdev] # (N, context length, embedding dim), N = number of dev examples
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (N, hidden units)
    logits = h @ W2 + b2 # (N, number of characters)
    loss = F.cross_entropy(logits, Ydev)
loss

# E02: I was not careful with the initialization of the network in this video. (1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? (2) Can you tune the initialization to get a starting loss that is much more similar to (1)?

# E03: Read the Bengio et al 2003 paper (link above), implement and try any idea from the paper. Did it work?