In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#read all the words
words = open('names.txt', 'r').read().splitlines()

In [3]:
len(words)

32033

In [5]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [122]:
# build the dataset

block_size = 3 #context length
X, Y = [], []
for w in words:
    # print(w)
    context = [0] * block_size #starts as block_size #'.'s. (Since itos[0] is . and we use the integer representation of the character)
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # print(''.join(itos[i] for i in context), '---->', itos[ix])
        context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)

In [123]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [77]:
C = torch.randn((27,2)) #27 possible characters, each with 2-dimensional embedding. C is our lookup table of vector embedding for vocabulary

In [12]:
C[5]

tensor([-1.8775, -0.5638])

In [78]:
F.one_hot(torch.tensor(5), num_classes= 27).float() @ C

tensor([-1.5116,  2.3201])

In [79]:
emb = C[X]

In [80]:
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [81]:
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1)

tensor([[ 0.2012, -0.0892,  0.2012, -0.0892,  0.2012, -0.0892],
        [ 0.2012, -0.0892,  0.2012, -0.0892, -1.5116,  2.3201],
        [ 0.2012, -0.0892, -1.5116,  2.3201,  1.8382,  0.2528],
        [-1.5116,  2.3201,  1.8382,  0.2528,  1.8382,  0.2528],
        [ 1.8382,  0.2528,  1.8382,  0.2528, -0.8278,  0.0196],
        [ 0.2012, -0.0892,  0.2012, -0.0892,  0.2012, -0.0892],
        [ 0.2012, -0.0892,  0.2012, -0.0892,  0.7286,  1.2661],
        [ 0.2012, -0.0892,  0.7286,  1.2661, -0.2472,  0.9470],
        [ 0.7286,  1.2661, -0.2472,  0.9470,  0.4170, -0.1270],
        [-0.2472,  0.9470,  0.4170, -0.1270,  0.1409, -0.1828],
        [ 0.4170, -0.1270,  0.1409, -0.1828,  0.4170, -0.1270],
        [ 0.1409, -0.1828,  0.4170, -0.1270, -0.8278,  0.0196],
        [ 0.2012, -0.0892,  0.2012, -0.0892,  0.2012, -0.0892],
        [ 0.2012, -0.0892,  0.2012, -0.0892, -0.8278,  0.0196],
        [ 0.2012, -0.0892, -0.8278,  0.0196,  0.1409, -0.1828],
        [-0.8278,  0.0196,  0.1409, -0.1

In [26]:
torch.cat(torch.unbind(emb, 1), 1) #dynamic but inefficient (creates a new tensor meaning more memory)

tensor([[ 1.0731,  1.2645,  1.0731,  1.2645,  1.0731,  1.2645],
        [ 1.0731,  1.2645,  1.0731,  1.2645, -1.8775, -0.5638],
        [ 1.0731,  1.2645, -1.8775, -0.5638,  0.0778, -0.6400],
        [-1.8775, -0.5638,  0.0778, -0.6400,  0.0778, -0.6400],
        [ 0.0778, -0.6400,  0.0778, -0.6400,  0.7460,  1.4331],
        [ 1.0731,  1.2645,  1.0731,  1.2645,  1.0731,  1.2645],
        [ 1.0731,  1.2645,  1.0731,  1.2645,  0.1652, -0.9800],
        [ 1.0731,  1.2645,  0.1652, -0.9800,  0.0865,  1.1077],
        [ 0.1652, -0.9800,  0.0865,  1.1077, -0.1222, -0.3618],
        [ 0.0865,  1.1077, -0.1222, -0.3618, -0.5252,  0.5518],
        [-0.1222, -0.3618, -0.5252,  0.5518, -0.1222, -0.3618],
        [-0.5252,  0.5518, -0.1222, -0.3618,  0.7460,  1.4331],
        [ 1.0731,  1.2645,  1.0731,  1.2645,  1.0731,  1.2645],
        [ 1.0731,  1.2645,  1.0731,  1.2645,  0.7460,  1.4331],
        [ 1.0731,  1.2645,  0.7460,  1.4331, -0.5252,  0.5518],
        [ 0.7460,  1.4331, -0.5252,  0.5

In [82]:
emb.view(32, 6) #BEST 
"""
NOTE ON TORCH.TENSOR INTERNALS:
Tensors have an internal sotrage which is all of the data in memory as a one dimensional vector. Whenever we manipulate the tensor with a function
like view(), nothing in the tensors storage changes, just some of the attirbutes of the tensor change which changes how the values in storage are interpreted
or should be represented. No memory or values are changed or moved, only some attributes which dictate the values' representation/interpretation.
That's why tensor.view() is great because no memory is being changed.
"""

"\nNOTE ON TORCH.TENSOR INTERNALS:\nTensors have an internal sotrage which is all of the data in memory as a one dimensional vector. Whenever we manipulate the tensor with a function\nlike view(), nothing in the tensors storage changes, just some of the attirbutes of the tensor change which changes how the values in storage are interpreted\nor should be represented. No memory or values are changed or moved, only some attributes which dictate the values' representation/interpretation.\nThat's why tensor.view() is great because no memory is being changed.\n"

In [91]:
h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1) # first layer

In [92]:
h

tensor([[-0.8648, -0.8292, -0.0760,  ..., -0.9338,  0.9306,  0.6233],
        [ 0.4545, -0.9312,  0.2247,  ..., -0.9998, -0.8601, -0.5025],
        [-1.0000,  0.9998,  1.0000,  ..., -0.9999, -0.9325,  0.9612],
        ...,
        [-0.9572,  0.9993,  0.8370,  ..., -0.9968, -0.9918,  0.9913],
        [ 0.9622,  0.2132,  1.0000,  ...,  0.2152,  0.7130, -0.9966],
        [ 0.1329, -1.0000, -0.7810,  ..., -0.9998,  0.9750,  0.7462]])

In [93]:
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [94]:
logits = h @ W2 + b2 #second linear layer

In [95]:
logits.shape

torch.Size([32, 27])

In [96]:
counts = logits.exp()

In [97]:
prob = counts / counts.sum(1, keepdims=True)
prob

tensor([[4.7008e-09, 8.8714e-06, 3.3168e-01, 1.2584e-05, 2.3167e-09, 1.2796e-10,
         8.3215e-11, 6.4294e-07, 5.1951e-08, 3.3789e-06, 1.5189e-03, 6.8262e-05,
         3.7613e-07, 6.2532e-13, 1.5927e-10, 8.2239e-06, 1.8323e-08, 1.4269e-08,
         2.6168e-05, 8.2330e-07, 6.5794e-01, 1.8119e-11, 4.5998e-09, 2.0341e-05,
         5.4432e-06, 8.7054e-03, 6.0415e-12],
        [9.3150e-01, 2.4325e-06, 1.0237e-09, 1.7496e-06, 6.6780e-09, 6.4393e-08,
         5.7660e-08, 6.3291e-08, 2.0872e-02, 2.4175e-03, 6.2874e-04, 8.8956e-05,
         8.9498e-07, 2.4573e-08, 3.1046e-13, 1.2666e-05, 9.7728e-11, 2.0570e-06,
         3.1907e-02, 6.4657e-09, 1.2503e-02, 1.6720e-08, 5.3784e-05, 6.9911e-06,
         3.7002e-09, 1.9793e-06, 1.6624e-06],
        [7.0282e-11, 1.8660e-11, 7.7693e-04, 2.1158e-04, 1.2086e-09, 2.1356e-12,
         1.8347e-05, 2.8229e-01, 3.1247e-06, 2.9106e-08, 1.4384e-09, 3.9568e-05,
         4.3158e-04, 1.9857e-14, 7.1619e-01, 1.3114e-06, 1.8246e-09, 1.2738e-07,
         2.9759e-

In [98]:
prob.shape

torch.Size([32, 27])

In [51]:
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [45]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [99]:
prob[torch.arange(32),Y]

tensor([1.2796e-10, 2.4573e-08, 1.9857e-14, 6.2196e-10, 8.5623e-13, 8.2239e-06,
        1.1673e-07, 7.1976e-03, 2.3370e-09, 1.4973e-07, 2.1407e-06, 1.3090e-08,
        8.8714e-06, 2.6627e-10, 5.4176e-06, 1.7271e-05, 3.3789e-06, 9.8017e-08,
        2.0169e-06, 6.5201e-09, 2.8685e-12, 1.1812e-08, 2.2188e-10, 9.8056e-06,
        2.4963e-05, 8.2330e-07, 1.3454e-08, 1.2808e-08, 2.1678e-03, 1.0823e-03,
        1.0947e-04, 1.4606e-09])

In [101]:
loss = -prob[torch.arange(32), Y].log().mean()
loss

tensor(16.2385)

In [104]:
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [354]:
#All together now
CONTEXT_LENGTH = 3
EMB_VECTOR_DIMS = 15
EMBEDDING_DIMS = EMB_VECTOR_DIMS * CONTEXT_LENGTH
C = torch.randn((27,EMB_VECTOR_DIMS))
W1 = torch.randn((EMBEDDING_DIMS,200))
b1 = torch.randn(200)
W2 = torch.randn((200,27))
b2 = torch.randn(27)
parameters = [C, W1, b1, W2, b2]

In [191]:
sum(p.nelement() for p in parameters) # total # of parameters

11897

In [355]:
for p in parameters:
    p.requires_grad = True

In [357]:
#NOTE: The dimensions (32 specifically) were with only 5 training examples. Wherever you see 32, it's actually just the # of training examples.
for _ in range(20000):

    #mini-batch of size 32
    ix = torch.randint(0, Xtr.shape[0], (32, )) #returns the indexes of 32 samples from our train set
    
    #forward pass
    emb = C[Xtr[ix]] # (32, 3, 2)
    h = torch.tanh(emb.view(emb.shape[0], EMBEDDING_DIMS) @ W1 + b1) # (32, 100), first layer
    logits = h @ W2 + b2 # (32, 27), output/second layer
    # counts = logits.exp()
    # prob = counts / counts.sum(1, keepdims= True)
    # loss = -prob[torch.arange(32), Y].log().mean()
    loss = F.cross_entropy(logits, Ytr[ix]) # this is better than the three lines above
    # print(loss.item())
    
    #backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    #update
    for p in parameters:
        p.data += -0.01 * p.grad

print(loss.item())

1.9292185306549072


In [244]:
#training, val/dev split, test split
#.8, .1, .1
# NOTE ON SPLITS: Validation split is for hyperparameter tuning. Test split should only be evaluated on a few times to avoid the model beginning
#to overfit the test set as well.

# build the dataset
def build_dataset(words):
    block_size = 3 #context length
    X, Y = [], []
    for w in words:
        # print(w)
        context = [0] * block_size #starts as block_size #'.'s. (Since itos[0] is . and we use the integer representation of the character)
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # print(''.join(itos[i] for i in context), '---->', itos[ix])
            context = context[1:] + [ix]
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(42)
random.shuffle(words)

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182691, 3]) torch.Size([182691])
torch.Size([22793, 3]) torch.Size([22793])
torch.Size([22662, 3]) torch.Size([22662])


In [177]:
len(words)

32033

In [358]:
#Evaluation
emb = C[Xdev]
h = torch.tanh(emb.view(emb.shape[0], EMBEDDING_DIMS) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.1579, grad_fn=<NllLossBackward0>)

In [185]:
#NOTE ON TRAINING AND EVALUATION
"""
If we are underfitting, we will have a very similar loss on the training as we do on the eval and test sets. This suggests that our model is very 
small and we will increase performance by increasing the size.
Large nets take longer to converge.
Batches not only decrease training time, but also can lead to better gradients as stochastic GD leads to a lot of noise in training as we're calculating
gradients at every piece of training data.
"""


'\n\n'

In [200]:
torch.randint(0, X.shape[0], (32, ))

tensor([156061, 163476, 205784,  89185, 191318,  51782,  61629, 174337, 167960,
         80483,  26513, 160966, 149353, 109253, 105360, 135393, 169759,  68677,
         27572, 218261, 220674, 156892,  73078,  61898, 129056,  54335, 224361,
         48185,   6119, 186061, 160520,  90039])