In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the dataset: names.txt holds one name per line.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
# Preview the first few names.
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
# Total number of names read from names.txt.
len(words)

32033

In [12]:
# Build the character vocabulary and the two lookup tables between
# characters and integer indices.
chars = sorted(set(''.join(words)))  # every distinct character, in sorted order
# Characters are numbered from 1 so that index 0 stays free for the '.' marker.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse table: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [20]:
# Assemble the training examples: each input row is the indices of the previous
# `block_size` characters, and each target is the index of the character that follows.
block_size = 3  # context length: how many characters are used to predict the next one
X, Y = [], []

# Only the first five words are used here, and every example is printed.
for w in words[:5]:
    print(w)
    # Each word starts from an all-zero context (index 0 maps to '.').
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[idx] for idx in context), '---->', itos[ix])
        # Sliding window: drop the oldest index and append the newest one.
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [21]:
# Check the dataset tensors: X has one row of block_size context indices per
# example, and Y has one target index per example.
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [22]:
# Initialise the MLP parameters from a fixed-seed generator so that every run
# draws the same starting values. The draw order (C, W1, b1, W2, b2) matters:
# all tensors consume numbers from the same generator.
g = torch.Generator()
g.manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)    # embedding table: a 2-D vector for each of the 27 characters
W1 = torch.randn(6, 100, generator=g)  # hidden layer weights; 6 = 3 context characters x 2 embedding dims
b1 = torch.randn(100, generator=g)     # hidden layer bias
W2 = torch.randn(100, 27, generator=g) # output layer weights: one logit per character
b2 = torch.randn(27, generator=g)      # output layer bias
parameters = [C, W1, W2, b1, b2]
# Total number of scalar values across all parameter tensors.
sum(p.numel() for p in parameters)

3481

In [31]:
# Turn on gradient tracking for every parameter tensor so that
# loss.backward() fills in their .grad fields.
for p in parameters:
    p.requires_grad_(True)

In [35]:
# Gradient descent on the whole dataset X, Y at every step.
n_steps = 1000  # number of parameter updates
lr = 0.1        # step size applied to each gradient

for i in range(n_steps):
    # Forward pass: look up the embedding of every context character, flatten
    # each example's embeddings into a single row, then apply the tanh hidden
    # layer followed by the output layer.
    emb = C[X]
    # The flattened width is taken from W1 rather than hard-coded, so this line
    # stays correct if the context length or the embedding size changes.
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2

    # Cross-entropy is the same quantity as the manual computation
    #   counts = logits.exp()
    #   probs = counts / counts.sum(1, keepdims=True)
    #   loss = -probs[torch.arange(C[X].shape[0]), Y].log().mean()
    # but it is much more efficient to use the PyTorch function.
    loss = F.cross_entropy(logits, Y)

    # Backward pass: clear the old gradients, then backpropagate.
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update the parameters in place. torch.no_grad() keeps the update out of
    # the autograd graph without going through the discouraged `.data` attribute.
    with torch.no_grad():
        for p in parameters:
            p += -lr * p.grad
# Loss computed in the final iteration (before that iteration's update).
print(loss.item())

0.2534908354282379
