In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
# Load the dataset and split it on whitespace into a list of words.
# The context manager closes the file handle as soon as the read is done
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().split()

In [3]:
# Vocabulary: every distinct character that occurs in the words, sorted.
chars = sorted(set(''.join(words)))
# character -> integer index; the characters found in the data start at 1 ...
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
# ... because index 0 is reserved for the '.' marker.
stoi['.'] = 0
# integer index -> character (inverse of stoi)
itos = {idx: ch for ch, idx in stoi.items()}

In [4]:
# Build the dataset: each example pairs a window of `block_size` character
# indices (the context) with the index of the character that follows it.

block_size = 3  # context length: how many characters we use to predict the next one
X, Y = [], []

for word in words[:5]:
    print(word)

    context = [0] * block_size  # start with a window made only of '.' (index 0)
    for ch in word + '.':       # the trailing '.' marks the end of the word
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)

        print(''.join(itos[i] for i in context), '-->', itos[ix])
        context = context[1:] + [ix]  # slide the window: drop the oldest index, append the new one

X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... --> e
..e --> m
.em --> m
emm --> a
mma --> .
olivia
... --> o
..o --> l
.ol --> i
oli --> v
liv --> i
ivi --> a
via --> .
ava
... --> a
..a --> v
.av --> a
ava --> .
isabella
... --> i
..i --> s
.is --> a
isa --> b
sab --> e
abe --> l
bel --> l
ell --> a
lla --> .
sophia
... --> s
..s --> o
.so --> p
sop --> h
oph --> i
phi --> a
hia --> .


In [5]:
X.shape, Y.shape # our dataset: one row of context indices in X for every target index in Y

(torch.Size([32, 3]), torch.Size([32]))

In [6]:
# NOTE: a cell only echoes its last expression, so the bare `X` line
# displays nothing; the output of this cell is Y alone.
X # the context indices
Y # the correct character index that comes after

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [7]:
# Seed the global RNG so that C, and every torch.randn call executed after it,
# produces the same values on each Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Embedding table: one 2-dimensional vector for each of the 27 characters.
C = torch.randn((27, 2))

In [8]:
# Look up the embedding of character 5 by multiplying its one-hot vector with C.
# F.one_hot returns an integer tensor, so without .float() the product with the
# float matrix C fails with an int/float dtype error.

F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([-2.0986,  1.0436])

In [9]:
C[5] # same result as the one-hot product above: the one-hot vector simply selects row 5 of C

tensor([-2.0986,  1.0436])

In [10]:
# Indexing also works with a whole tensor of indices at once
emb = C[X] # every index in X is replaced by its 2-dimensional embedding vector
emb.shape

torch.Size([32, 3, 2])

In [11]:
# Hidden layer: 6 inputs (3 context characters x 2 embedding dims) -> 100 neurons
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [12]:
# First layer, explicit version. emb is 32*3*2 while W1 expects 6 inputs, so
# `emb @ W1 + b1` cannot be applied directly: concatenate the embeddings of the
# three context positions into a 32*6 matrix first.
torch.cat((emb[:, 0], emb[:, 1], emb[:, 2]), dim=1) @ W1 + b1

tensor([[-0.7182,  0.7298, -1.8483,  ...,  1.0975,  0.3763,  0.9531],
        [-4.1967,  2.0517, -0.9259,  ..., -1.0026,  1.8886,  3.3526],
        [ 5.1707,  2.2442,  0.3344,  ..., -6.0207,  0.8492, -0.0119],
        ...,
        [ 5.1149,  3.5183, -3.2203,  ..., -1.1636, -1.3646, -1.6377],
        [ 0.0416, -1.4230,  0.2466,  ...,  5.8275,  0.5455,  0.4878],
        [-6.9327,  1.6369, -4.0974,  ...,  2.6320, -0.6855,  0.6517]])

In [13]:
# torch.unbind splits emb along dim 1 into a tuple of 3 tensors, so the
# concatenation no longer hard-codes the context positions
torch.cat(torch.unbind(emb, dim=1), dim=1).shape

torch.Size([32, 6])

In [14]:
# 18 consecutive integers, used to try out view() in the next cell
a = torch.arange(0, 18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [15]:
# lay the same 18 values out as a 3x2x3 tensor
a.view(3, 2, 3) 

tensor([[[ 0,  1,  2],
         [ 3,  4,  5]],

        [[ 6,  7,  8],
         [ 9, 10, 11]],

        [[12, 13, 14],
         [15, 16, 17]]])

In [16]:
# Simplify emb @ W1 + b1: view() flattens each example's 3x2 embeddings into a
# row of 6 values, replacing the explicit concatenation
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)


Final Layer

In [17]:
# Output layer: 100 hidden neurons -> 27 scores, one per character
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [18]:
# second layer: turn the hidden activations into one score (logit) per character
logits= h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [19]:
# Softmax over each row of logits, written out by hand.
# The row maximum is subtracted before exponentiating: the resulting
# probabilities are mathematically unchanged, but exp() can no longer overflow
# to inf (which would turn the division below into nan).
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob = counts / counts.sum(1, keepdim=True)

In [20]:
prob.shape # one row per example; each row was normalized above to sum to 1

torch.Size([32, 27])

In [21]:
# Index into prob to get, for every example i, the probability assigned to the
# correct letter (row i, column Y[i]). The row index is derived from Y instead
# of a hard-coded 32 so the cell keeps working when the dataset size changes.
prob[torch.arange(Y.shape[0]), Y]

tensor([9.9221e-01, 2.5629e-03, 9.9995e-01, 9.7233e-08, 9.9007e-01, 9.6560e-11,
        4.0746e-12, 1.8092e-08, 2.0229e-06, 9.2677e-11, 1.3608e-11, 5.7356e-18,
        2.2242e-10, 3.9777e-03, 2.1640e-08, 2.1020e-05, 1.7209e-07, 5.7062e-11,
        2.7947e-01, 2.8300e-07, 2.0690e-09, 3.9926e-05, 1.5137e-13, 6.8173e-08,
        1.5217e-13, 4.9039e-12, 6.0956e-16, 2.9748e-16, 4.4758e-12, 1.3319e-16,
        7.5456e-09, 3.2048e-18])

In [22]:
# Average negative log-likelihood of the correct letters.
# The row index is derived from Y instead of a hard-coded 32 so the cell keeps
# working when the dataset size changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(19.5632)

In [23]:
# It is easier to use F.cross_entropy, which gives the same loss as the manual
# computation above.
# It subtracts the maximum value so that exp() is well behaved (good numerical stability),
# and it is more efficient in both the forward and the backward pass.
F.cross_entropy(logits, Y)

tensor(19.5632)

# Clean up

In [26]:
# every trainable tensor: the embedding table plus the weights and biases of both layers
parameters = [C, W1, b1, W2, b2]

In [29]:
# Hyperparameters of the full-batch gradient-descent loop.
learning_rate = 1e-2
num_steps = 1000

# Turn on gradient tracking for every trainable tensor.
for p in parameters:
    p.requires_grad = True

for _ in range(num_steps):
    # forward pass
    emb = C[X]
    # Flatten each example's embeddings into one row; the width is inferred
    # from emb rather than hard-coded to 6.
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # first layer 32*100
    logits = h @ W2 + b2 # second layer 32*27
    loss = F.cross_entropy(logits, Y)
    print(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None  # clear the previous step's gradients before backpropagating
    loss.backward()

    # update: plain gradient descent. torch.no_grad() lets the parameters be
    # modified in place without recording the update in the autograd graph,
    # which avoids going through the discouraged `.data` attribute.
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

6.63040828704834
6.569538593292236
6.508933067321777
6.448582172393799
6.388481140136719
6.328622341156006
6.269002437591553
6.209619522094727
6.150469779968262
6.091554641723633
6.03287410736084
5.974429130554199
5.916221618652344
5.858257293701172
5.800537109375
5.743066310882568
5.685851097106934
5.628894329071045
5.572202682495117
5.515783786773682
5.459641456604004
5.403782367706299
5.348214626312256
5.292943477630615
5.237977504730225
5.183321952819824
5.128985404968262
5.07497501373291
5.021297454833984
4.96796178817749
4.914973735809326
4.862344264984131
4.810079097747803
4.7581868171691895
4.706674098968506
4.6555495262146
4.604820251464844
4.554492950439453
4.504574775695801
4.4550700187683105
4.405986785888672
4.357327461242676
4.309096813201904
4.261298179626465
4.213934898376465
4.167008399963379
4.120518207550049
4.074466705322266
4.028851509094238
3.9836716651916504
3.9389257431030273
3.894611358642578
3.8507235050201416
3.8072609901428223
3.764219045639038
3.72159099578

Why is the loss so low? -> Overfitting! The model only has to fit 32 training examples.

In [30]:
sum(p.nelement() for p in parameters) # total number of scalar parameters: 3481 of them for only 32 training examples

3481

Go to MLP_full_data