In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt #for making figures
%matplotlib inline

In [2]:
#read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
len(words)

32033

In [4]:
#build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)

In [5]:
vocab_size

27

In [6]:
#build the dataset
block_size = 3 #context length: how many characters do we take to predict the next one?

def build_dataset(words):
    X, Y = [], []

    for w in words[:]:
        #print(f'Word: {w}')
        context = [0] * block_size 
        ##print(''.join(str(context)))
        for ch in w + '.':
            ix = stoi[ch]
            #print(f'ix: {ix}')
            X.append(context)
            Y.append(ix)
            ##print('Appended context into X and ix into Y')
            context = context[1:] + [ix] #crop and append
            #print(''.join(str(context)))

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    #print(X)
    return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [7]:
#Boilerplate from the batchnorm version above this comment

#Utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt, t): 
    ex = torch.all(dt == t.grad).item() #Check for exact equality
    app = torch.allclose(dt, t.grad) #Check for approximate equality (allclose() uses epsilons for comparison)
    maxdiff = (dt - t.grad).abs().max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [8]:
#MLP revisited
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 64 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_embd),  generator=g)
#Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size) ** 0.5) #Kaiming init: Gain for tanh / sqrt of fan-in number of neurons
b1 = torch.randn(n_hidden, generator=g) * 0.1 #using b1 just for fun, to check if we're still calculating the right gradient for this even though its useless

#Layer 2
W2 = torch.randn((n_hidden, vocab_size),  generator=g) * 0.1
b2 = torch.randn(vocab_size, generator=g) * 0.1

#Batchnorm parameters
bngain = torch.ones((1, n_hidden)) *0.1 + 1.0
bnbias = torch.zeros((1, n_hidden))*0.1 

#Parameters are being initialized in with smaller values because sometimes initializing with
#all zeroes can mask an incorrect implementation of the backward pass

#bnmean_running = torch.zeros((1, n_hidden))
#bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad = True

4137


In [9]:
batch_size = 32
n = batch_size #a shorter variable for convenience
#construct a minibatch
ix = torch.randint(0, Xtr.shape[0], (batch_size, ), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix] #batch X,Y

In [10]:
#forward pass, "chunckated" into smaller steps that possible to backward pass one at a time

emb = C[Xb] #embed the characters into vectors
embcat = emb.view(emb.shape[0], -1) #concatenate the vectors

#Linear Layer 1
hprebn = embcat @ W1 + b1 #hidden layer pre-batch-norm 

#BatchNorm Layer
bnmeani = 1/n*hprebn.sum(0, keepdim=True)
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True) #note: Bessel's correction (dividing by n-1, not n) see https://www.statisticshowto.com/bessels-correction/
bnvar_inv = (bnvar+1e-5)**-0.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias #hidden layer pre-activation

#Non-linearity
h = torch.tanh(hpreact) #hidden layer

#Linear Layer 2
logits = h @ W2 + b2

#Cross Entropy Loss (same as F.cross_entropy(logits, Yb))
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True)
counts_sum_inv = counts_sum**-1 #if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact
probs = counts * counts_sum_inv
logprobs = probs.log() #Batch of inputs and the probability they output a specific letter. 32 x 27
loss = -logprobs[range(n), Yb].mean() #Extract the loss calculated for the corresponding letter of Yb and calculate the mean across all the extracted values
#print(f'logprobs: {logprobs}')
#print(f'logprobs[range(n), ]: {logprobs[range(n), ]}')
#print(f'Yb: {Yb}')
#print(f'logprobs[range(n), Yb]: {logprobs[range(n), Yb]}')

#PyTorch backward pass
for p in parameters:
    p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv,
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
          bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
          embcat, emb]:
    t.retain_grad()
loss.backward()
loss



tensor(3.3482, grad_fn=<NegBackward0>)

In [11]:
#For dlogprobs where dloss/dlogprobs is the derivative of loss
#with respect to ALL elements in logprobs
#therefore, dlogprobs will have the same shape as logprobs (32, 7)

#Consider that the loss = (a + b + c)/3 where logprobs comprises elements a, b, c
#dloss/da is then = (1 + b + c)/3
#It follows then that dloss/dlogprobs = 1/3, or more generally 1/n

#Since we also do a selection for logprobs [range(n), Yb] 
#we only want elements that contributed to the loss to have a gradient
#To do this, we can just create a new tensor with the same as logprobs
#initialised to all zeroes, and then set the same indices to our gradient

#For dcounts_sum_inv
#The local derivative is the same as if we have f(x,y) = xy, therefore df/dx = y
#However, we need to take into account the shapes of counts and counts_sum_inv,
#(32,27) and 32,1 respectively.
#What this means is that we need to backprop through and differentiate the hidden
#PyTorch broadcast during matrix multiplication 
#Consider the example:
#c = a @ b
#a[3x3] * b[3,1] ---> Multiplication would then look like bc b1 is a column vector
#a11*b1, a12*b1, a13*b1
#a21*b2, a22*b2, a33*b2
#a31*b3, a32*b3, a33*b3
#The operation that needs backprop is the many multiplications of an a[] column with a b vector
#This can be handled by summing all the gradients of dc/db and we do so horizontally

#For dlogit_maxes
#Similar to dcount_sum_inv, need to consider the shape
#But otherwise it's just the sum of local derivative
#Remember that you can pull the sum out of the equation and then use after performing all inner operations

#For dlogits
#1st branch is just dnorm_logits
#2nd branch is similar to dlogprobs where we only want the values that contributed to the loss
#torch.max() also gives the indices of where the max values are
#So we can approach it like dlogprobs 

#For dh
#Do the working on simple matrices d = a x b and break it down to the dot products, it is genius
#and it turns out it's the local derivative (a matrix) needs to TRANSPOSED before multiplication
#Or see 48:00 of the vid
#For dW2, it's similar, just compare the shapes and do the appropriate transposition

#For dbnvar
#It's just a standard differentiation, you can set x = bnvar + 1e-5 so then you 
#just differentiate x**-0.5. ezypzy

#Bessel's Correction
#See https://math.oxford.emory.edu/site/math117/besselCorrection/
#By using the full sample size as the divisor for the variance in a BATCH (not the full population),
#You underestimate the population variance.
#A intuitive way of understanding this is use calculate the variance for a sample size of 1.
#This bias can be corrected by n-1 as the divisor. (Bessel's Correction)

#Sums and broadcasts
#They are a duality, sum in the forward pass will require a broadcast in the backpass (e.g. torch.ones_like * local derivative)
#broadcast in the forward pass will require a sum in the backpass

#For demb
#Views are just a reinterpretation of a tensor
#So just reinterpret it back to the original shape for the derivative

#For dC
#The hardest one. Requires shape analysis and understanding that we're looking for embeddings in C that contributed to the loss.
#C is a lookup of letters, so in dC we're looking to "activate" the letters that contributed to the loss.
#To start we create a tensor like C initialised with all zeroes.
#Then, we iterate through the batch Xb, to get the index (numerical equivalent) of the letters in the sliding window
#Then for each index within dC, we activate the (10d) row of dC with a (10d) row of gradients from demb

In [12]:
dlogits.shape, h.shape, b2.shape

NameError: name 'dlogits' is not defined

In [None]:
embcat.shape, emb.shape, C.shape

In [17]:
# Exercise 1: backprop through the whole thing manually, 
# backpropagating through exactly all of the variables 
# as they are defined in the forward pass above, one by one

dlogprobs = torch.zeros_like(logprobs)
dlogprobs[range(n), Yb] = -1.0/n

dprobs = 1.0/probs*dlogprobs

dcounts_sum_inv = (counts  * dprobs).sum(1, keepdim=True)

dcounts_sum = (-counts_sum**-2) * dcounts_sum_inv

dcounts = torch.ones_like(counts) * dcounts_sum + counts_sum_inv * dprobs

dnorm_logits = norm_logits.exp() * dcounts

dlogit_maxes = (- dnorm_logits).sum(1, keepdim=True)

#Alternative way of implementing dlogit's branch 2 like with dlogprobs
#dlogits_b2 = torch.zeros_like(logits)
#dlogits_b2[range(n), logits.max(1).indices] = 1.0 

#Specifically does not use keepdims=True so that it's collapsed into a 1 x n tensor
dlogits_b2 = F.one_hot(logits.max(1).indices, num_classes=logits.shape[1])

dlogits = dnorm_logits.clone() + dlogits_b2  * dlogit_maxes

dh = dlogits @ torch.transpose(W2, 0, 1)

#Shortform for transpose is h.T
dW2 = torch.transpose(h, 0, 1) @ dlogits

db2 = dlogits.sum(0)

dhpreact = (1.0 - h**2) * dh

dbngain = (dhpreact * bnraw).sum(0, keepdim=True)

dbnbias = dhpreact.sum(0, keepdims=True)

dbnraw = dhpreact * bngain

dbnvar_inv = (dbnraw * bndiff).sum(0, keepdim=True) 

dbnvar = -0.5 * (bnvar + 1e-5)**-1.5 * dbnvar_inv

# emb = C[Xb] #embed the characters into vectors
# embcat = emb.view(emb.shape[0], -1) #concatenate the vectors

# #Linear Layer 1
# hprebn = embcat @ W1 + b1 #hidden layer pre-batch-norm 

dbndiff2 = 1.0/(n-1)*torch.ones_like(bndiff2) * dbnvar

dbndiff = bnvar_inv * dbnraw + 2*bndiff*dbndiff2

dbnmeani = -dbndiff.sum(0, keepdim=True)

dhprebn = dbndiff.clone() + 1.0/n*torch.ones_like(hprebn) * dbnmeani #Clone for safety

dembcat = dhprebn @ W1.T

dW1 = embcat.T @ dhprebn 

db1 = dhprebn.sum(0)

demb = dembcat.view(emb.shape)

dC = torch.zeros_like(C)
for k in range(Xb.shape[0]):
    for j in range(Xb.shape[1]):
        ix = Xb[k,j]
        dC[ix] += demb[k,j]
        
##Other solutions found in the comments but very hard to understand
#1. dC.index_add_(0, Xb.view(-1), demb.view(-1, 10)) 
#2. X_e = F.one_hot(Xb, num_classes = 27).float() # Convert the selection operation into a selection matrix (emb = C[Xb] <-> X_e @ C)
#   dC = (X_e.permute(0,2,1) @ demb).sum(0) # Differentiate like any other matrix operation (dC = X_e.T @ demb; indices to track the batch dimensions)
#3. Xe = F.one_hot(Xb.flatten(), num_classes=27).float().permute(1, 0)
#4. dC = Xe @ demb.view((-1, demb.shape[2]))
cmp('logprobs', dlogprobs, logprobs)
cmp('probs', dprobs, probs)
cmp('counts_sum_inv', dcounts_sum_inv, counts_sum_inv)
cmp('counts_sum', dcounts_sum, counts_sum)
cmp('counts', dcounts, counts)
cmp('norm_logits', dnorm_logits, norm_logits)
cmp('logit_maxes', dlogit_maxes, logit_maxes)
cmp('logits', dlogits, logits)
cmp('h', dh, h)
cmp('W2', dW2, W2)
cmp('b2', db2, b2)
cmp('hpreact', dhpreact, hpreact)
cmp('bngain', dbngain, bngain)
cmp('bnbias', dbnbias, bnbias)
cmp('bnraw', dbnraw, bnraw)
cmp('bnvar_inv', dbnvar_inv, bnvar_inv)
cmp('bnvar', dbnvar, bnvar)
cmp('bndiff2', dbndiff2, bndiff2)
cmp('bndiff', dbndiff, bndiff)
cmp('bnmeani', dbnmeani, bnmeani)
cmp('hprebn', dhprebn, hprebn)
cmp('embcat', dembcat, embcat)
cmp('W1', dW1, W1)
cmp('b1', db1, b1)
cmp('emb', demb, emb)
cmp('C', dC, C)

logprobs        | exact: True  | approximate: True  | maxdiff: 0.0
probs           | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum_inv  | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum      | exact: True  | approximate: True  | maxdiff: 0.0
counts          | exact: True  | approximate: True  | maxdiff: 0.0
norm_logits     | exact: True  | approximate: True  | maxdiff: 0.0
logit_maxes     | exact: True  | approximate: True  | maxdiff: 0.0
logits          | exact: True  | approximate: True  | maxdiff: 0.0
h               | exact: True  | approximate: True  | maxdiff: 0.0
W2              | exact: True  | approximate: True  | maxdiff: 0.0
b2              | exact: True  | approximate: True  | maxdiff: 0.0
hpreact         | exact: True  | approximate: True  | maxdiff: 0.0
bngain          | exact: True  | approximate: True  | maxdiff: 0.0
bnbias          | exact: True  | approximate: True  | maxdiff: 0.0
bnraw           | exact: True  | approximate: True  | maxdiff:

In [None]:
#emb = C[Xb] #embed the characters into vectors
print(emb.shape, C.shape, Xb.shape)
print(Xb[:5])
print(emb)

In [None]:
# Exercise 2: backprop through cross_entropy but all in one go
# to complete this challenge look at the mathematical expression of the loss,
# take the derivative, simplify the expression, and just write it out

# forward pass

# before:
# logit_maxes = logits.max(1, keepdim=True).values
# norm_logits = logits - logit_maxes # subtract max for numerical stability
# counts = norm_logits.exp()
# counts_sum = counts.sum(1, keepdims=True)
# counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
# probs = counts * counts_sum_inv
# logprobs = probs.log()
# loss = -logprobs[range(n), Yb].mean()

# now:
loss_fast = F.cross_entropy(logits, Yb)
print(loss_fast.item(), 'diff:', (loss_fast - loss).item())

In [14]:
# backward pass

#When the ith index of the logits is not equal to the label, the gradient for the ith logit is equal to the probability
#When the ith index of the logits is equal to the label, the  gradient for the ith logit is equal to the probability - 1
#Softmax along each example (a row) of logits
#Where the ith index is equal to the label, subtract one from the logit
#The loss for a batch is average loss so the loss of the batch logits needs to be divided by n

# -----------------
# YOUR CODE HERE :)
dlogits = F.softmax(logits, 1)
dlogits[range(n), Yb] -= 1
dlogits /= n
# -----------------

cmp('logits', dlogits, logits) # I can only get approximate to be true, my maxdiff is 6e-9

logits          | exact: False | approximate: True  | maxdiff: 6.28642737865448e-09


In [15]:
# Exercise 3: backprop through batchnorm but all in one go
# to complete this challenge look at the mathematical expression of the output of batchnorm,
# take the derivative w.r.t. its input, simplify the expression, and just write it out
# BatchNorm paper: https://arxiv.org/abs/1502.03167

# forward pass

# before:
# bnmeani = 1/n*hprebn.sum(0, keepdim=True)
# bndiff = hprebn - bnmeani
# bndiff2 = bndiff**2
# bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n)
# bnvar_inv = (bnvar + 1e-5)**-0.5
# bnraw = bndiff * bnvar_inv
# hpreact = bngain * bnraw + bnbias

# now:
hpreact_fast = bngain * (hprebn - hprebn.mean(0, keepdim=True)) / torch.sqrt(hprebn.var(0, keepdim=True, unbiased=True) + 1e-5) + bnbias
print('max diff:', (hpreact_fast - hpreact).abs().max())

max diff: tensor(4.7684e-07, grad_fn=<MaxBackward1>)


In [None]:
#Re: Summing gradients:
#When differentiating a vector function (i.e. dxi/dvar) you need to sum all the gradients
#Therefore dL/dvar isn't just (dL/dxi)*(dxi/dvar), it's sum through all i for (dL/dxi)*(dxi/dvar)
#Another way of understanding is like when you sum up all the sources of the gradient if your variable shows up in multiple functions
#e.g. dL/dx = (dL/dy) * (dy/dx)[y = x^2] + (dL/dz) * (dz/dx)[z = x^3]
#Since there are mutiple values of x, you have multiple sources of the gradient.
#Also, consider that if you didn't sum up the multiple sources and the gradient had x in the function, which value of x would you use?
#This helps us find the exception to the rule as well.
#Since xi_hat is just a normalised version of xi, we have 1:1 relationship between the two (as opposed to many:many), and so we don't have to sum the gradients
#for dxi_hat/dxi

In [18]:
# backward pass

# before we had:
# dbnraw = bngain * dhpreact
# dbndiff = bnvar_inv * dbnraw
# dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)
# dbnvar = (-0.5*(bnvar + 1e-5)**-1.5) * dbnvar_inv
# dbndiff2 = (1.0/(n-1))*torch.ones_like(bndiff2) * dbnvar
# dbndiff += (2*bndiff) * dbndiff2
# dhprebn = dbndiff.clone()
# dbnmeani = (-dbndiff).sum(0)
# dhprebn += 1.0/n * (torch.ones_like(hprebn) * dbnmeani)

# calculate dhprebn given dhpreact (i.e. backprop through the batchnorm)
# (you'll also need to use some of the variables from the forward pass up above)

# -----------------
# YOUR CODE HERE :)
dhprebn = bngain*bnvar_inv/n * (n*dhpreact - dhpreact.sum(0) - n/(n-1) * bnraw * (bnraw*dhpreact).sum(0))
# -----------------

cmp('hprebn', dhprebn, hprebn) # I can only get approximate to be true, my maxdiff is 9e-10

hprebn          | exact: False | approximate: True  | maxdiff: 9.313225746154785e-10


In [21]:
# Exercise 4: putting it all together!
# Train the MLP neural net with your own backward pass

# init
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C  = torch.randn((vocab_size, n_embd),            generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden,                        generator=g) * 0.1
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1
b2 = torch.randn(vocab_size,                      generator=g) * 0.1
# BatchNorm parameters
bngain = torch.randn((1, n_hidden))*0.1 + 1.0
bnbias = torch.randn((1, n_hidden))*0.1

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

# same optimization as last time
max_steps = 200000
batch_size = 32
n = batch_size # convenience
lossi = []

# use this context manager for efficiency once your backward pass is written (TODO)
with torch.no_grad():
    
    # kick off optimization
    for i in range(max_steps):
    
      # minibatch construct
      ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
      Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
    
      # forward pass
      emb = C[Xb] # embed the characters into vectors
      embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
      # Linear layer
      hprebn = embcat @ W1 + b1 # hidden layer pre-activation
      # BatchNorm layer
      # -------------------------------------------------------------
      bnmean = hprebn.mean(0, keepdim=True)
      bnvar = hprebn.var(0, keepdim=True, unbiased=True)
      bnvar_inv = (bnvar + 1e-5)**-0.5
      bnraw = (hprebn - bnmean) * bnvar_inv
      hpreact = bngain * bnraw + bnbias
      # -------------------------------------------------------------
      # Non-linearity
      h = torch.tanh(hpreact) # hidden layer
      logits = h @ W2 + b2 # output layer
      loss = F.cross_entropy(logits, Yb) # loss function
    
      # backward pass
      for p in parameters:
        p.grad = None
      #loss.backward() # use this for correctness comparisons, delete it later!
    
      # manual backprop! #swole_doge_meme
      # -----------------
    
      #Logits backprop
      dlogits = F.softmax(logits, 1)
      dlogits[range(n), Yb] -= 1
      dlogits /= n
    
      #Second layer backprop
      dh = dlogits @ torch.transpose(W2, 0, 1)
      dW2 = torch.transpose(h, 0, 1) @ dlogits
      db2 = dlogits.sum(0)
    
      #Tanh
      dhpreact = (1.0 - h**2) * dh
    
      #Batchnorm backprop
      dbngain = (dhpreact * bnraw).sum(0, keepdim=True)
      dbnbias = dhpreact.sum(0, keepdims=True)
      dhprebn = bngain*bnvar_inv/n * (n*dhpreact - dhpreact.sum(0) - n/(n-1) * bnraw * (bnraw*dhpreact).sum(0))
    
      #First layer backprop
      dembcat = dhprebn @ W1.T
      dW1 = embcat.T @ dhprebn 
      db1 = dhprebn.sum(0)
    
      #Embedding
      demb = dembcat.view(emb.shape) 
      dC = torch.zeros_like(C)
      for k in range(Xb.shape[0]):
        for j in range(Xb.shape[1]):
            ix = Xb[k,j]
            dC[ix] += demb[k,j]
    
      #Assemble grads
      grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias]
      # -----------------
    
      # update
      lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
      for p, grad in zip(parameters, grads):
        #p.data += -lr * p.grad # old way of cheems doge (using PyTorch grad from .backward())
        p.data += -lr * grad # new way of swole doge TODO: enable
    
      # track stats
      if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
      lossi.append(loss.log10().item())
    
    #  if i >= 100: # TODO: delete early breaking when you're ready to train the full net
    #    break

12297
      0/ 200000: 3.7643
  10000/ 200000: 2.1733
  20000/ 200000: 2.4059
  30000/ 200000: 2.4332
  40000/ 200000: 2.0174
  50000/ 200000: 2.3131
  60000/ 200000: 2.3475
  70000/ 200000: 2.0757
  80000/ 200000: 2.3081
  90000/ 200000: 2.1669
 100000/ 200000: 1.9726
 110000/ 200000: 2.2734
 120000/ 200000: 1.9720
 130000/ 200000: 2.4597
 140000/ 200000: 2.2755
 150000/ 200000: 2.1626
 160000/ 200000: 1.9549
 170000/ 200000: 1.8071
 180000/ 200000: 1.9860
 190000/ 200000: 1.8411


In [20]:
# useful for checking your gradients
for p,g in zip(parameters, grads):
    cmp(str(tuple(p.shape)), g, p)

(27, 10)        | exact: False | approximate: True  | maxdiff: 1.1175870895385742e-08
(30, 200)       | exact: False | approximate: True  | maxdiff: 9.313225746154785e-09
(200,)          | exact: False | approximate: True  | maxdiff: 6.51925802230835e-09
(200, 27)       | exact: False | approximate: True  | maxdiff: 1.4901161193847656e-08
(27,)           | exact: False | approximate: True  | maxdiff: 3.725290298461914e-09
(1, 200)        | exact: False | approximate: True  | maxdiff: 1.862645149230957e-09
(1, 200)        | exact: False | approximate: True  | maxdiff: 3.725290298461914e-09


In [22]:
#calibrate the batch normalisation at the end of training
#by finding the mean and std.dev over the entire training set
#therefore eliminating the need for training using batches
#The mean and std.dev are also locked at specific values doing this i.e. no updating
#This is optional if the mean and std.dev of the training set are going to be estimated during
#batch training

with torch.no_grad():
    #pass the training set through
    emb = C[Xtr]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1 + b1
    # measure the mean and std.dev over the entire training set
    bnmean = hpreact.mean(0, keepdim=True)
    bnvar = hpreact.var(0, keepdim=True, unbiased=True)
    

In [23]:
#evaluate train and validation loss
@torch.no_grad()
def split_loss(split):
    x, y = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte)
    }[split]
    emb = C[x] # (N, block_size, n_embd)
    embcat = emb.view(emb.shape[0], -1) #concat into (N, block_size * n_embd)
    hpreact = embcat @ W1 + b1
    hpreact = bngain * (hpreact - bnmean) * (bnvar+1e-5)**-0.5 + bnbias 
    h = torch.tanh(hpreact) # (N, n_hidden)
    logits = h @ W2 + b2 #(N, vocab_size)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')

train 2.0720479488372803
val 2.109550952911377


In [24]:
#sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):

    out = []
    context = [0] * block_size #initialise with all ellipsis
    while True:
        #forward pass
        emb = C[torch.tensor([context])] # (1, block_size, n_embd)
        embcat = emb.view(emb.shape[0], -1) #concat into (N, block_size * n_embd)
        hpreact = embcat @ W1 + b1
        hpreact = bngain * (hpreact - bnmean) * (bnvar+1e-5)**-0.5 + bnbias 
        h = torch.tanh(hpreact) # (N, n_hidden)
        logits = h @ W2 + b2 #(N, vocab_size)

        #sample from the distribution
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        #shift the context window and track the samples
        context = context[1:] + [ix]
        out.append(ix)
        #if we sample the special '.', token, break
        if ix == 0:
            break
    print(''.join(itos[i] for i in out)) #decode and print the generated word

mona.
mayah.
see.
mad.
rylla.
emmasiendra.
gradelyn.
elin.
shi.
jen.
eden.
sana.
arleigh.
malaia.
noshubergihira.
sten.
joselle.
joseus.
kuba.
geder.
