In [1]:
words = open('names.txt', 'r').read().splitlines()

In [4]:
#Define the character set 
word = sorted(list(set(''.join(words))))

#Using the word which is a sorted list, let's map each character to a number
stoi = {s:i+1 for i,s in enumerate(word)}
stoi['.'] = 0

#Now the vice versa that is mapping a integer to the letter
itos = {i:s for s, i in stoi.items() }
#itos

In [5]:
#Dataset creation
import torch
block_size = 3 #This is the context length, it specifies the number of words/characters you are giving to the model

def build_dataset():
    X = []
    Y = []

    for w in words:
        content = [0] * block_size
        for ch in w + '.':
            inx = stoi[ch]
            X.append(content)
            Y.append(inx)
            content = content[1:] + [inx]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X,Y

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
import random
random.seed(42)
random.shuffle(words)
X, Y = build_dataset()
Xtr, Ytr = X[:n1], Y[:n1]
Xdev, Ydev = X[n1: n2], Y[n1: n2]
Xval, Yval = X[n2:], Y[n2:]

print(Xtr.shape)
print(Ytr.shape)
    

torch.Size([25626, 3])
torch.Size([25626])


In [6]:
#C here is the look up table (also called the embeddings table)
#Each row of C has the embeddings required for a character
C = torch.rand((27, 2))
#Embed each character in X from C
emb = C[Xtr]
#Let's check the shape of emb
emb.shape
#emb is a 3d vector. Let's convert it into 2d of 27 x 6 where each row is the embedding of the 3 characters in X
it = emb.view(emb.shape[0], 6)
print(it)

tensor([[0.1264, 0.8536, 0.1264, 0.8536, 0.1264, 0.8536],
        [0.1264, 0.8536, 0.1264, 0.8536, 0.9506, 0.6642],
        [0.1264, 0.8536, 0.9506, 0.6642, 0.4870, 0.6013],
        ...,
        [0.1264, 0.8536, 0.1264, 0.8536, 0.1264, 0.8536],
        [0.1264, 0.8536, 0.1264, 0.8536, 0.9506, 0.6642],
        [0.1264, 0.8536, 0.9506, 0.6642, 0.6848, 0.3773]])


In [7]:
#Now the number of training examples x 6 inputs are given as a weighted sum to the hidden layer with 100 neurons
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

#The weighted sums are activated by tanh
h = torch.tanh(it @ W1 + b1)
h.shape

torch.Size([25626, 100])

In [8]:
#The parameters of the hidden layer to the output layer
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

#The output of the last layer
y = h @ W2 + b2

In [10]:
#using the y we got from the neural network and the actual y we will calculate the loss
loss = torch.nn.functional.cross_entropy(y, Ytr)
loss

tensor(14.1075)

In [11]:
#Let's try to segment the data to minibatches
m = torch.randint(0, X.shape[0], (32,))
m

tensor([ 54187, 178316, 140304,   7763, 151416, 199851, 211720,  41983, 131684,
        129710, 183766, 185746,  95338,  56675,  43583, 222632,  35070,  95947,
          5739,  41217, 136666,  30955,    974,  11952,  90100,  91723,   8711,
         19589,  20827, 199652,  89079,  11618])

In [27]:
#On combining the whole code, we need to do a forward pass, find the loss, do a backward pass, update the values and repeat the steps again
#Until the loss does not minimise
parameters = [C, W1, W2, b1, b2]
for p in parameters:
    p.requires_grad = True
    
for _ in range(100):

    #Minibatch of 32
    inx = torch.randint(0, Xtr.shape[0], (32,))
    
    #forward pass
    emb = C[Xtr[inx]]
    h= torch.tanh(emb.view(inx.shape[0], 6) @ W1 + b1)
    y = h @ W2 + b2

    #Calculate the loss
    loss = torch.nn.functional.cross_entropy(y, Ytr[inx])
    #print(loss.item())

    #Back pass
    for p in parameters:
        p.grad = None
    loss.backward()

    #Update my parameters
    for p in parameters:
        p.data += -0.01 * p.grad

print(loss.item())

2.1535000801086426


In [29]:
#Evaluate on dev data
emb = C[Xdev]
h= torch.tanh(emb.view(Xdev.shape[0], 6) @ W1 + b1)
y = h @ W2 + b2
loss = torch.nn.functional.cross_entropy(y, Ydev)
loss.item()

2.4081273078918457