In [10]:
import random

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F


In [11]:
# Read the names corpus (one name per line). The `with` block closes the file
# handle as soon as the contents are read instead of leaving it open.
with open('names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [12]:
# Total number of names read from names.txt
len(words)

32033

In [13]:
# Build the character vocabulary plus the two lookup tables:
#   stoi: character -> integer index (letters start at 1)
#   itos: integer index -> character
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0  # index 0 is reserved for the '.' token
itos = dict(zip(stoi.values(), stoi.keys()))
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [14]:
#build dataset
def build_dataset(words, block_size=3, char_to_ix=None):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is extended with the terminator '.' and scanned left to right.
    For every character, the indices of the ``block_size`` characters before
    it form one row of ``X`` and the character's own index is the matching
    entry of ``Y``. Positions before the start of a word are filled with 0.

    Args:
        words: iterable of strings to encode.
        block_size: context length, i.e. how many previous characters are used
            to predict the next one. Defaults to 3, the value that was
            previously hard-coded in the function body.
        char_to_ix: mapping from character to integer index; it must contain
            '.'. Defaults to the notebook-level ``stoi`` table.

    Returns:
        A tuple ``(X, Y)`` of int64 tensors. ``X`` has shape
        ``(N, block_size)`` and ``Y`` has shape ``(N,)``, where ``N`` is the
        total number of characters, counting one '.' per word.

    Raises:
        ValueError: if ``block_size`` is smaller than 1.
        KeyError: if a character is missing from the mapping.
    """
    if block_size < 1:
        raise ValueError("block_size must be at least 1")
    if char_to_ix is None:
        char_to_ix = stoi

    contexts, targets = [], []
    for w in words:
        context = [0] * block_size  # start every word with an all-zero context
        for ch in w + '.':
            ix = char_to_ix[ch]
            contexts.append(context)
            targets.append(ix)
            context = context[1:] + [ix]  # slide the window: drop oldest, append newest

    # The explicit dtype and shape keep an empty input well-formed:
    # (0, block_size) int64 rather than a float tensor of shape (0,).
    X = torch.tensor(contexts, dtype=torch.long).view(len(contexts), block_size)
    Y = torch.tensor(targets, dtype=torch.long)
    return X, Y

# Deterministic 80/10/10 train/validation/test split.
# A private RNG seeded with 42 shuffles a *copy* of `words`, so re-running this
# cell always produces the same split. Shuffling `words` in place would
# reshuffle the already-shuffled list on every re-run.
shuffled_words = list(words)
random.Random(42).shuffle(shuffled_words)
n1 = int(0.8 * len(shuffled_words))  # end of the training slice
n2 = int(0.9 * len(shuffled_words))  # end of the validation slice

Xtr, Ytr = build_dataset(shuffled_words[:n1])
Xval, Yval = build_dataset(shuffled_words[n1:n2])
Xte, Yte = build_dataset(shuffled_words[n2:])

 


<h3>Implementing the embedding lookup table</h3>

In [15]:
# Shapes of the training inputs (one row of context indices per example) and targets
Xtr.shape,Ytr.shape


(torch.Size([182625, 3]), torch.Size([182625]))

In [16]:
# Seed PyTorch's global RNG so the randomly initialised parameters (and the
# mini-batch sampling further down) are reproducible on Restart & Run All.
torch.manual_seed(42)
# Embedding table: one 2-dimensional vector for each of the 27 characters
# (26 letters plus the '.' token).
C = torch.randn(27, 2)


In [17]:
emb = C[Xtr] # look up the embedding row for every context index in Xtr
emb.shape

torch.Size([182625, 3, 2])

In [18]:
# Hidden-layer parameters: 6 inputs (3 context characters x 2 embedding dims)
# feeding 100 hidden units.
w1 = torch.randn(6,100)
b1 = torch.randn(100)

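<p>We have to flatten <code>emb</code> (with <code>view</code> or by concatenating its slices) before the matrix multiply. <code>emb</code> has shape (182625, 3, 2) while <code>w1</code> has shape (6, 100), so <code>emb @ w1 + b1</code> fails with a shape mismatch until the last two dimensions of <code>emb</code> are merged into a single dimension of size 6.</p>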

In [19]:
# Alternative way to flatten `emb`, kept for reference (left commented out):
# torch.concat(torch.unbind(emb,1),1).shape

# NOTE: the bare string below is an expression, not a comment, so the notebook
# echoes it as this cell's output.
""" can use this cell or below cell : 

    concat method is less efficient because it create new memory
"""

' can use this cell or below cell : \n\n    concat method is less efficient because it create new memory\n'

In [20]:
# Flatten each example's 3 x 2 embeddings into a single row of 6 values
(emb.view(-1,6)).shape

torch.Size([182625, 6])

<p>Let's do the math!</p>

In [21]:
# Hidden-layer pre-activation. view(-1, 6) lets PyTorch infer the number of rows,
# so the same line works for any number of examples instead of a hard-coded count.
h = emb.view(-1,6) @ w1 + b1
h.shape

torch.Size([182625, 100])

In [22]:
# Hidden-layer activations. The pre-activation is recomputed from `emb` here
# rather than applying tanh to `h` in place, so re-running this cell cannot
# squash the values a second time.
h = torch.tanh(emb.view(-1, 6) @ w1 + b1)
h

tensor([[ 0.0275, -0.0299, -0.4851,  ..., -0.8488,  0.9407,  0.5003],
        [-0.3448,  0.0695,  0.5760,  ...,  0.6662,  0.8238,  0.5124],
        [-0.9879, -0.9922, -0.9793,  ..., -0.9693,  0.9983,  0.9964],
        ...,
        [-0.7631,  0.9913, -0.8861,  ..., -0.5243, -0.8523, -1.0000],
        [-0.0869,  0.8355, -0.0922,  ..., -0.9981,  0.9980,  0.9096],
        [-0.2037,  0.9803,  0.1589,  ..., -0.3580, -0.0530, -0.9966]])

In [23]:
# Output-layer parameters: 100 hidden units -> 27 logits (one per character)
w2 = torch.randn(100,27)
b2 = torch.randn(27)



In [24]:
# Output-layer scores: one row of 27 unnormalised logits per example
logits = h @ w2 + b2
logits.shape

torch.Size([182625, 27])

In [25]:
# Manual cross-entropy, kept commented out for reference
# (F.cross_entropy is what this notebook actually uses for the loss).
# NOTE(review): `torch.arange(32)` and `Y` below look left over from a
# 32-example version; with the current data they would presumably be
# `torch.arange(Ytr.shape[0])` and `Ytr` -- confirm before re-enabling.
# counts = logits.exp()
# prob = counts/counts.sum(1,keepdim=True)
# loss = -prob[torch.arange(32),Y].log().mean() # negative log-likelihood
# loss

<h3>Cross Entropy</h3>

In [28]:
# Cross-entropy of the logits against the training targets
en_loss = F.cross_entropy(logits,Ytr)

In [29]:
# Gather every trainable tensor in one list and report the total parameter count
param = [C, w1, w2, b1, b2]
sum(map(torch.numel, param))

3481

In [30]:
# Turn on gradient tracking for each parameter so backward() fills in .grad
for p in param:
    p.requires_grad_(True)

<h3>Forward pass and backward pass</h3>

In [31]:
# Mini-batch training: each step estimates the gradient from a small random
# batch instead of the whole training set, which speeds up training.
batch_size = 32
lr = 0.1  # SGD step size
for _ in range(10):
    # sample a random mini-batch of row indices
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # forward pass
    emb = C[Xtr[ix]]
    # flatten everything after the batch dimension rather than hard-coding 6
    h = torch.tanh(emb.view(emb.shape[0], -1) @ w1 + b1)
    logits = h @ w2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])
    print(loss.item())

    # backward pass: clear stale gradients, then backpropagate
    for p in param:
        p.grad = None
    loss.backward()

    # SGD update under no_grad, the supported replacement for writing to `.data`
    with torch.no_grad():
        for p in param:
            p += -lr * p.grad
        


17.489194869995117
17.653717041015625
16.3277530670166
13.510794639587402
12.50063419342041
15.514881134033203
11.034486770629883
12.129260063171387
14.726625442504883
10.385401725769043


In [32]:
# Full-batch training: every step uses the entire training set.
lr = 0.1  # SGD step size
for _ in range(10):
    # forward pass over all training examples
    emb = C[Xtr]
    # flatten everything after the batch dimension rather than hard-coding 6
    h = torch.tanh(emb.view(emb.shape[0], -1) @ w1 + b1)
    logits = h @ w2 + b2
    loss = F.cross_entropy(logits, Ytr)
    print(loss.item())

    # backward pass: clear stale gradients, then backpropagate
    for p in param:
        p.grad = None
    loss.backward()

    # SGD update under no_grad, the supported replacement for writing to `.data`
    with torch.no_grad():
        for p in param:
            p += -lr * p.grad

11.759575843811035
11.084226608276367
10.67184066772461
10.291000366210938
9.931710243225098
9.590688705444336
9.266900062561035
8.96006965637207
8.67010498046875
8.396855354309082


In [33]:
# Largest logit in each row together with its column index (the predicted character index)
logits.max(1)

torch.return_types.max(
values=tensor([ 3.8702,  6.6806, 12.1353,  ..., 12.2024,  5.1972,  7.4417],
       grad_fn=<MaxBackward0>),
indices=tensor([ 5,  5,  4,  ..., 12, 18,  0]))

In [34]:
# Test-set targets.
# NOTE(review): `logits` above were computed from Xtr, so its argmax indices line up
# with Ytr, not Yte -- confirm which comparison is intended.
Yte

tensor([13, 21, 19,  ..., 14,  9,  0])