In [2]:
import torch 
from torch.nn import functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
import requests
from pathlib import Path

if Path('names.txt').exists():
    words = open('names.txt', 'r').read().splitlines()
else:
    req = requests.get(r'https://raw.githubusercontent.com/karpathy/makemore/master/names.txt')
    with open('names.txt', 'wb') as f:
        f.write()
    words = open('names.txt','r').read().splitlines()

In [5]:
print(len(words))
print(max(len(w) for w in words))
print(words[:8])

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [6]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [7]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [70]:
#Utility function which we will use later when copamring Manual Gradients to Pytorch Gradients
def cmp(s, dt, t):
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt - t.grad).abs().max().item()
    print(f"{s:15s} | exact: {str(ex):5s} | approximate {str(app):5s} | maxdiff: {maxdiff}")


In [18]:
!jt -l

Available Themes: 
   chesterish
   grade3
   gruvboxd
   gruvboxl
   monokai
   oceans16
   onedork
   solarizedd
   solarizedl


In [22]:
from jupyterthemes import jtplot
jtplot.style(theme='onedork')

In [32]:
n_embd = 10
n_hidden = 64

g = torch.Generator().manual_seed(42)
C = torch.randn((vocab_size, n_embd), generator = g)

W1 = torch.randn((n_embd * block_size, n_hidden), generator= g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden, generator = g) * 0.1  # using b1 just for fun, it's useless because of BN

W2 = torch.randn((n_hidden, vocab_size), generator = g) * 0.1
b2 = torch.randn(vocab_size, generator = g) * 0.1

#BatchNorm Parameters
bngain = torch.randn((1, n_hidden), generator = g)*0.1 + 1.0
bnbias = torch.randn((1, n_hidden), generator = g) * 0.1

# Note: I am initializating many of these parameters in non-standard ways
# because sometimes initializating with e.g. all zeros could mask an incorrect
# implementation of the backward pass.

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad = True




4137


In [34]:
batch_size = 32
n = batch_size #a shorter variable for convinence
#constrcut a minibatch
ix = torch.randint(0, Xtr.shape[0],(batch_size,), generator = g)
Xb, Yb = Xtr[ix], Ytr[ix]


In [53]:
#Forward pass,k "chunckated into smaller steps that are possible to backward one at a time

emb = C[Xb] # embed the characters into the vector
embcat = emb.view(emb.shape[0], -1) #Concatenate the vectors

#liner layer 1 
hprebn = embcat @ W1 + b1 #hidden layer pre activation
#Batch Norm Layer
bnmeani= 1/n*hprebn.sum(0,keepdim = True) #(hprebn.sum(0,keepdim = True)/n)
bndiff = hprebn - bnmean
bndiff2 = bndiff **2
bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim= True) #note : Bessel's Correction (dividing by n-1 , not n)
bnvar_inv = (bnvar + 1e-5)**0.5
bnraw = bndiff - bnvar_inv
hpreact = bngain * bnraw +bnbias

#Non Linearity
h = torch.tanh(hpreact) #hidden layer

#linear layer 2
logits = h @ W2 + b2 #Output Layer

#cross entropy loss (same as F.cross_Entropy loss)
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes #subtract the max for numerical stability refer the previous notebooks 
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims = True)
counts_sum_inv = counts_sum ** -1  # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean()

#pytorch backward pass
for p in parameters:
    p.grad = None

for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, # afaik there is no cleaner way
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
         bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
         embcat, emb]:
    t.retain_grad()
loss.backward()
loss



tensor(3.4786, grad_fn=<NegBackward0>)

In [54]:
# Exercise 1: backprop through the whole thing manually, 
# backpropagating through exactly all of the variables 
# as they are defined in the forward pass above, one by one

In [92]:
# cmp('logprobs', dlogprobs, logprobs)
print('logprobs shape',logprobs.shape)
#calcualte the dlogprobs 
#example loss = -(a + b + c) /3 where 3 is the n 
#dloss/da = -1/3 where b and c are constatnts and will become zero so the derivative loss for n logits can be written as -1/n
print(f"Calculating the gradients for ech parameters or steps which will be used in back propagation\n")
print("the gradients that need to be updated for a step are calculated by finding the derivative from the succeeding step and\
\nwill be stored in the parameter.grad \n ")
print(f"since the most part of the logprobs which has the shape of {logprobs.shape} is gonna be zero\
     \nBecause only the logprobs[range(n), Yb] of shape {logprobs[range(n), Yb].shape} will be taken into consideration\n \
     and loss of other logits or logprobs will be derviatively - d/dx zero")

print("\n-----Implementing the derivative-----")
dlogprobs = torch.zeros_like(logprobs)
#1.0/n is the equivalent of dloss/dlogprobs
dlogprobs[range(n), Yb] = -1.0/n
cmp('logprobs', dlogprobs, logprobs)
print("\n")

# cmp('probs', dprobs, probs)
# now find the gradient of probs which will the derivative from the logprob step
print(f"derivative of probs.log() is equivalent to log(x) which is 1/x and need to implement chain rule\
      \nthat is wee need to find the derivative of probs which is dlogprobs")
print("Compare the gradients probs.grad[range(n), Yb] and dprobs[range(n), Yb]")
dprobs = (1.0/probs) * dlogprobs
cmp('probs',dprobs,probs)
print("\n")

logprobs shape torch.Size([32, 27])
Calculating the gradients for ech parameters or steps which will be used in back propagation

the gradients that need to be updated for a step are calculated by finding the derivative from the succeeding step and
will be stored in the parameter.grad 
 
since the most part of the logprobs which has the shape of torch.Size([32, 27]) is gonna be zero     
Because only the logprobs[range(n), Yb] of shape torch.Size([32]) will be taken into consideration
      and loss of other logits or logprobs will be derviatively - d/dx zero

-----Implementing the derivative-----
logprobs        | exact: True  | approximate True  | maxdiff: 0.0


derivative of probs.log() is equivalent to log(x) which is 1/x and need to implement chain rule      
that is wee need to find the derivative of probs which is dlogprobs
Compare the gradients probs.grad[range(n), Yb] and dprobs[range(n), Yb]
probs           | exact: True  | approximate True  | maxdiff: 0.0




In [80]:
probs.grad[range(n), Yb]

tensor([-0.9789, -0.4115, -1.9931, -2.0392, -0.6188, -0.3112, -0.9464, -3.1652,
        -1.6925, -0.5919, -1.1059, -1.2846, -0.8153, -3.5235, -1.0352, -2.2519,
        -0.4642, -4.3114, -1.9340, -2.4587, -0.5418, -0.2978, -3.1657, -1.4804,
        -0.4597, -2.0392, -1.2553, -0.1862, -0.6654, -0.2453, -0.6218, -1.0468])

In [82]:
dprobs[range(n), Yb]

tensor([-0.9789, -0.4115, -1.9931, -2.0392, -0.6188, -0.3112, -0.9464, -3.1652,
        -1.6925, -0.5919, -1.1059, -1.2846, -0.8153, -3.5235, -1.0352, -2.2519,
        -0.4642, -4.3114, -1.9340, -2.4587, -0.5418, -0.2978, -3.1657, -1.4804,
        -0.4597, -2.0392, -1.2553, -0.1862, -0.6654, -0.2453, -0.6218, -1.0468],
       grad_fn=<IndexBackward0>)

In [91]:
counts_sum_inv.grad

tensor([[-0.2599],
        [-0.3763],
        [-0.2715],
        [-0.1683],
        [-0.2736],
        [-0.3112],
        [-0.3070],
        [-0.3149],
        [-0.3927],
        [-0.4288],
        [-0.2620],
        [-0.2978],
        [-0.2364],
        [-0.1683],
        [-0.2216],
        [-0.2254],
        [-0.3198],
        [-0.1823],
        [-0.3176],
        [-0.2977],
        [-0.1683],
        [-0.2978],
        [-0.3288],
        [-0.1683],
        [-0.3726],
        [-0.1683],
        [-0.2816],
        [-0.1862],
        [-0.2729],
        [-0.2453],
        [-0.3128],
        [-0.2908]])