We currently have a model that can deal with one character of previous context. But the precition was not good enough.

* More context? way too many rows => expensive to calculate
* MLP

In [1]:
import torch
import torch.nn.functional as F

# Data

In [2]:
names = open("names.txt").read().splitlines()
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
vocab = sorted(set(''.join(names)))
vocab = ["."] + vocab
len(vocab)

27

In [4]:
stoi = {s:i for i, s in enumerate(vocab)}
itos = {i:s for s, i in stoi.items()}

In [5]:
block_size = 5 # context length
xs, ys = [], []

for name in names[:5]:
    context = [0] * block_size
    print(name)
    for ch in name+".":
        idx = stoi[ch]
        xs.append(context)
        ys.append(idx)
        print(f"{''.join([itos[i] for i in context])} -------> {itos[idx]}")
        context = context[1:] + [idx]
xs, ys = torch.tensor(xs), torch.tensor(ys)

emma
..... -------> e
....e -------> m
...em -------> m
..emm -------> a
.emma -------> .
olivia
..... -------> o
....o -------> l
...ol -------> i
..oli -------> v
.oliv -------> i
olivi -------> a
livia -------> .
ava
..... -------> a
....a -------> v
...av -------> a
..ava -------> .
isabella
..... -------> i
....i -------> s
...is -------> a
..isa -------> b
.isab -------> e
isabe -------> l
sabel -------> l
abell -------> a
bella -------> .
sophia
..... -------> s
....s -------> o
...so -------> p
..sop -------> h
.soph -------> i
sophi -------> a
ophia -------> .


In [6]:
xs.shape, xs.dtype, ys.shape, ys.dtype

(torch.Size([32, 5]), torch.int64, torch.Size([32]), torch.int64)

# Embedding

Our vocab size is 27 and we want to map them into 2-D space. In original paper from Bengio, they have 17,000 words and then map them into 30-D space.

In [7]:
C = torch.randn([27, 2])
C

tensor([[-0.4131, -0.0551],
        [-0.1361, -0.2084],
        [ 0.5974,  0.4307],
        [ 0.4001,  0.7031],
        [-0.0385, -0.8782],
        [ 1.6016,  1.0163],
        [-0.1608, -0.5976],
        [ 0.3574, -0.1388],
        [ 0.8116, -1.0007],
        [ 0.5179,  0.2517],
        [-0.3108,  0.2666],
        [ 0.3383, -2.5546],
        [-0.7341, -0.6211],
        [-0.4486, -0.3803],
        [-1.6594, -2.8194],
        [ 0.4599,  0.6645],
        [-2.1260, -0.3983],
        [-0.5086,  0.6481],
        [-0.1754, -0.6186],
        [ 0.0508,  0.6391],
        [-0.2584,  0.7166],
        [ 2.4814, -0.5229],
        [-2.1862,  0.3440],
        [-0.7775,  0.8753],
        [-0.3211,  0.8242],
        [ 0.1115,  0.6870],
        [ 0.1554,  0.3896]])

In [8]:
xenc = F.one_hot(torch.tensor(5), num_classes=27).float()
# which is equal to C[5]
# 5 th row will be plucked out.
C[5], C[5] == xenc @ C

(tensor([1.6016, 1.0163]), tensor([True, True]))

In [9]:
xs

tensor([[ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  5],
        [ 0,  0,  0,  5, 13],
        [ 0,  0,  5, 13, 13],
        [ 0,  5, 13, 13,  1],
        [ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0, 15],
        [ 0,  0,  0, 15, 12],
        [ 0,  0, 15, 12,  9],
        [ 0, 15, 12,  9, 22],
        [15, 12,  9, 22,  9],
        [12,  9, 22,  9,  1],
        [ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  1],
        [ 0,  0,  0,  1, 22],
        [ 0,  0,  1, 22,  1],
        [ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  9],
        [ 0,  0,  0,  9, 19],
        [ 0,  0,  9, 19,  1],
        [ 0,  9, 19,  1,  2],
        [ 9, 19,  1,  2,  5],
        [19,  1,  2,  5, 12],
        [ 1,  2,  5, 12, 12],
        [ 2,  5, 12, 12,  1],
        [ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0, 19],
        [ 0,  0,  0, 19, 15],
        [ 0,  0, 19, 15, 16],
        [ 0, 19, 15, 16,  8],
        [19, 15, 16,  8,  9],
        [15, 16,  8,  9,  1]])

In [10]:
C[xs].shape, C[xs][1, 4], C[xs][1, 4] == C[5] 

(torch.Size([32, 5, 2]), tensor([1.6016, 1.0163]), tensor([True, True]))

In [11]:
emb = C[xs]
emb.shape

torch.Size([32, 5, 2])

# Hidden Layer

In [12]:
W = torch.randn([10, 100])
emb @ W

RuntimeError: mat1 and mat2 shapes cannot be multiplied (160x2 and 10x100)

In [13]:
# Method 1: Extract and concat
concated_1 = torch.concat((emb[:, 0, :], emb[:, 1, :], emb[:, 2, :], emb[:, 3, :], emb[:, 4, :]), dim=1)
(concated_1 @ W).shape

torch.Size([32, 100])

In [14]:
# Method 2: Unbind
unbind_emb = torch.unbind(emb, dim=1)
concated_2 = torch.concat(unbind_emb, dim=1)
(concated_2 @ W).shape

torch.Size([32, 100])

In [15]:
# Method 3: View
viewed_emb = emb.view(32, -1)
concated_1 == concated_2, concated_1 == viewed_emb

(tensor([[True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
      

In [16]:
b = torch.rand(100)
hidden = emb.view(32, -1) @ W + b # b is broadcasted. emb: [32, 100] b: [1, 100] -> [32, 100]
hiddden_out = torch.tanh(hidden)
hiddden_out.shape

torch.Size([32, 100])

# Output Layer

In [35]:
W2 = torch.randn([100, 27])
b2 = torch.randn(27)
logits = hiddden_out @ W2 + b2

Previously how we did

In [36]:
logits = hiddden_out @ W2 + b2
counts = logits.exp()
probs = counts / counts.sum(dim=1, keepdim=True)
probs.shape, probs[0].sum()

(torch.Size([32, 27]), tensor(1.0000))

In [37]:
output = probs[torch.arange(32), ys] # Output prob at correct position, ideally all of them should be 1
output.shape, output

(torch.Size([32]),
 tensor([2.0493e-07, 1.2585e-07, 2.8754e-12, 1.0543e-05, 2.2118e-06, 4.0500e-07,
         9.7637e-07, 2.2095e-09, 1.1765e-07, 3.1710e-11, 1.1918e-03, 1.0476e-08,
         1.6969e-06, 6.2076e-05, 4.3031e-08, 4.8551e-06, 1.7727e-07, 2.2477e-07,
         4.5843e-08, 1.6284e-07, 7.0345e-06, 3.1205e-09, 1.4766e-09, 4.1119e-07,
         6.2726e-08, 1.9470e-06, 5.7198e-10, 2.1182e-02, 4.6109e-08, 1.1464e-02,
         1.5935e-13, 3.3557e-11]))

In [38]:
loss = -output.log().mean()
loss

tensor(15.8728)

More efficient in torch

In [45]:
test_x = torch.tensor([-5, 5, 100])
how_torch_did_test_x = test_x - 100
test_x.exp(), how_torch_did_test_x.exp()

(tensor([6.7379e-03, 1.4841e+02,        inf]),
 tensor([0.0000e+00, 5.5211e-42, 1.0000e+00]))

In [39]:
loss_torch = F.cross_entropy(logits, ys)
loss_torch

tensor(15.8728)

# Put above together

In [47]:
xs.shape, ys.shape

(torch.Size([32, 5]), torch.Size([32]))

In [48]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn([27, 2], generator=g) # 32x5x27 * 27* 2 => 32x5x2
W1 = torch.randn([10, 100], generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn([100, 27], generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [51]:
sum(p.nelement() for p in parameters)

3881

In [52]:
# Fordward Pass
emb = C[xs]
h = torch.tanh(emb.view(32, -1) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, ys)