In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#read all words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
#build vocabulaty of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.']=0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [6]:
#build the dataset
block_size = 3 #context length: how many characters to look at to predict the next one?

def build_dataset(words):
    X,Y = [],[]
    for w in words: #all words
        
        context = [0]*block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix] #crop the first element and apend the new one

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X,Y

import random
random.seed(42)
random.shuffle(words)
n1  = int(0.8*len(words))
n2 = int(0.9*len(words))
Xtr, Ytr = build_dataset(words[:n1])            #80%
Xdev, Ydev = build_dataset(words[n1:n2])        #10%
Xte, Yte = build_dataset(words[n2:])            #10%

torch.Size([182580, 3]) torch.Size([182580])
torch.Size([22767, 3]) torch.Size([22767])
torch.Size([22799, 3]) torch.Size([22799])


In [7]:
#MLP Revisited
n_embd = 10 #dimentionality of the character embedding vectors
n_hidden = 200 #the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) #for reproducibility
C = torch.randn((vocab_size,n_embd), generator=g) #embedding layer

W1 = torch.randn((block_size*n_embd,200), generator=g) #first layer
b1 = torch.randn(n_hidden, generator=g) #biases for first layer

W2 = torch.randn((n_hidden,vocab_size), generator=g) #second layer
b2 = torch.randn(vocab_size, generator=g) #biases for second layer

parameters = [C, W1, b1, W2, b2] #all the parameters

for p in parameters:
    p.requires_grad = True #enabling gradients for all parameters

print(sum(p.nelement() for p in parameters)) #total no. of parameters

11897


In [9]:
stepi = []
lossi = []
max_steps = 20000
batch_size = 32

for i in range(max_steps):
    
    #minibatch construct
    ix = torch.randint(0,Xtr.shape[0],(batch_size,), generator=g) #randomly sample 32 numbers from 0 to 228146
    Xb, Yb = Xtr[ix], Ytr[ix] #batch X,Y

    #Forward pass
    emb = C[Xb] #embedding layer #First pick only ix indexes from X, and then based on X[ix], pick only X[ix] indexes from C #32x3x2
    embcat = emb.view(emb.shape[0], -1) #concatenate the input vectors
    hpreact = embcat@W1 +b1 #hidden layer pre-activation
    h = torch.tanh(hpreact) #first layer 
    logits = h@W2 + b2 #output of W2 layer 
    loss = F.cross_entropy(logits, Yb) #ix also indexes from Y, so that we can calculate loss for only those 32 examples
    # print(loss.item())
    
    #bakcward pass
    for p in parameters:
        p.grad = None #setting gradients to 0 before each backward pass
    loss.backward()

    #update
    lr = 0.1 if i<100000 else 0.01 #step lr decay
    for p in parameters:
        p.data += -lr*p.grad #updating the parameters

    #trackstats
    if i%1000 == 0: #print evey 1000th step
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    stepi.append(i)

print('Training Loss:', loss.item())

      0/  20000: 2.3085
   1000/  20000: 1.7711
   2000/  20000: 2.1199
   3000/  20000: 1.9246
   4000/  20000: 2.5164
   5000/  20000: 2.2617
   6000/  20000: 2.4543
   7000/  20000: 2.3356
   8000/  20000: 2.6346
   9000/  20000: 2.1007
  10000/  20000: 2.3525
  11000/  20000: 2.2331
  12000/  20000: 2.2699
  13000/  20000: 2.0758
  14000/  20000: 2.0495
  15000/  20000: 2.3625
  16000/  20000: 2.1662
  17000/  20000: 2.3300
  18000/  20000: 2.5478
  19000/  20000: 2.1320
Training Loss: 2.741368055343628


In [None]:
plt.plot(lossi)

In [None]:
@torch.no_grad() #this decorator disables gradient tracking
                 #whatever happens in this function below, it instructs pytorch
                 #to never require gradients for these operations since pytorch usually keeps track of gradients in anticipation of a backward pass
                 #but yaha koi backward pass nahi hai!
                 #for efficiency
def split_loss(split):
    x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
    }[split]
    emb = C[x] #embedding layer #First pick only ix indexes from X, and then based on X[ix], pick only X[ix] indexes from C #32x3x2
    embcat = emb.view(emb.shape[0], -1) #concatenate the input vectors
    hpreact = embcat@W1 +b1 #hidden layer pre-activation
    h = torch.tanh(hpreact) #first layer 
    logits = h@W2 + b2 #output of W2 layer 
    loss = F.cross_entropy(logits, Yb)

split_loss('train')
split_loss('val')

In [10]:
#Sample form the model
g = torch.Generator().manual_seed(2147483647+10)
for _ in range(20):
    out = []
    context = [0] * block_size #initialise with all '...'
    while True:
        emb = C[torch.tensor([context])] #(1,block_size,d): 1st dim was the size of training set, but here we're generating only 1 example,
        h = torch.tanh(emb.view(1,-1)@W1+b1)
        logits = h@W2 +b2
        probs = F.softmax(logits, dim=1)
        #sample from the distribution
        ix = torch.multinomial(probs, num_samples=1, generator = g).item()
        # shift the context window and track the samples
        context = context[1:]+[ix]
        out.append(ix)
        # if we sample the special '.' token, break
        if ix == 0:
            break

    print(''.join(itos[i] for i in out)) # decode and print the generated word


moha.
imyah.
see.
mad.
rylla.
ethantengrlee.
adejedieliigh.
porea.
eden.
sanana.
sephetormara.
noshabergaviana.
tin.
joselynn.
nikolanobemander.
yaralyehs.
lia.
myskolanihan.
sakyansun.
zakelijunett.


In [None]:
#Deeper look at the code
#Initial loss, at 0th iteration is very high, idially it must be low
#and we can have an idea of the loss to expect at the beginnning
#by thinking that at the start, the probab distrib of all of the characters 
#must be a uniform distribution, since we have no reason to think about why 
#some characters must be higher than others
