We currently have a model that can deal with one character of previous context, but its predictions were not good enough.

* More context? way too many rows => expensive to calculate
* MLP

In [2]:
import torch
import torch.nn.functional as F

# Data

In [3]:
# Load the training names, one per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open, and the
# explicit encoding keeps decoding independent of the platform default.
with open("names.txt", encoding="utf-8") as names_file:
    names = names_file.read().splitlines()
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [4]:
# Character vocabulary: "." first (index 0), followed by every distinct
# character that occurs in the names, in sorted order.
vocab = [".", *sorted(set("".join(names)))]
len(vocab)

27

In [5]:
# Lookup tables between characters and integer ids:
# stoi maps character -> position in vocab, itos is its inverse (id -> character).
stoi = dict(zip(vocab, range(len(vocab))))
itos = dict(zip(stoi.values(), stoi.keys()))

In [6]:
block_size = 5  # context length: how many previous characters predict the next one
xs, ys = [], []

# Build (context, target) pairs from the first five names only, printing each
# pair so the sliding window is easy to follow.
for name in names[:5]:
    print(name)
    # Every name starts from a context made entirely of index 0 (the "." token).
    window = [0] * block_size
    for ch in f"{name}.":
        target = stoi[ch]
        xs.append(window)
        ys.append(target)
        shown = "".join(itos[i] for i in window)
        print(f"{shown} -------> {itos[target]}")
        # Slide the window: drop the oldest index, append the one just predicted.
        # A new list is built each time, so rows already stored in xs are not mutated.
        window = [*window[1:], target]
xs, ys = torch.tensor(xs), torch.tensor(ys)

emma
..... -------> e
....e -------> m
...em -------> m
..emm -------> a
.emma -------> .
olivia
..... -------> o
....o -------> l
...ol -------> i
..oli -------> v
.oliv -------> i
olivi -------> a
livia -------> .
ava
..... -------> a
....a -------> v
...av -------> a
..ava -------> .
isabella
..... -------> i
....i -------> s
...is -------> a
..isa -------> b
.isab -------> e
isabe -------> l
sabel -------> l
abell -------> a
bella -------> .
sophia
..... -------> s
....s -------> o
...so -------> p
..sop -------> h
.soph -------> i
sophi -------> a
ophia -------> .


In [7]:
# Inspect the dataset tensors: sizes and dtypes of the inputs (xs) and targets (ys).
xs.size(), xs.dtype, ys.size(), ys.dtype

(torch.Size([32, 5]), torch.int64, torch.Size([32]), torch.int64)

# Embedding

Our vocabulary has 27 characters, and we want to map each of them into a 2-D space. In the original paper by Bengio et al., the vocabulary has 17,000 words, which are mapped into a 30-D space.

In [8]:
# Embedding lookup table: one randomly initialised 2-D row vector for each of
# the 27 entries in the vocabulary.
C = torch.randn(27, 2)
C

tensor([[-0.9003,  0.4254],
        [ 0.1871, -0.2595],
        [-0.2032,  0.4650],
        [-0.3530,  0.5689],
        [ 1.0710,  0.5460],
        [ 1.6349, -0.1869],
        [-0.5411,  1.2125],
        [-0.5219, -0.7689],
        [-0.5040, -0.4964],
        [-1.2047, -1.7238],
        [ 1.8328,  0.2773],
        [ 0.8992, -0.0759],
        [-0.6523,  0.7593],
        [-1.3640,  0.4190],
        [ 1.4653,  1.9644],
        [-0.5914, -0.7811],
        [-0.4783, -0.3846],
        [ 0.6647,  0.7656],
        [-0.2210,  1.1042],
        [ 1.3462, -0.2974],
        [-0.5912, -0.1251],
        [-0.1386, -0.1115],
        [-0.5900,  1.0797],
        [-1.5561, -1.1057],
        [-1.2170,  0.4879],
        [ 0.8231,  0.0020],
        [-1.2715,  0.3157]])

In [9]:
# Multiplying a one-hot row vector by C selects a single row of C, so the
# product below is the same as indexing C[5] directly: the row at index 5
# is plucked out.
xenc = F.one_hot(torch.tensor(5), num_classes=27).to(torch.float32)
C[5], xenc @ C == C[5]

(tensor([ 1.6349, -0.1869]), tensor([True, True]))

In [10]:
# Display the integer-encoded contexts: one row per training example, one
# column per context position (index 0 is the "." token).
xs

tensor([[ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  5],
        [ 0,  0,  0,  5, 13],
        [ 0,  0,  5, 13, 13],
        [ 0,  5, 13, 13,  1],
        [ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0, 15],
        [ 0,  0,  0, 15, 12],
        [ 0,  0, 15, 12,  9],
        [ 0, 15, 12,  9, 22],
        [15, 12,  9, 22,  9],
        [12,  9, 22,  9,  1],
        [ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  1],
        [ 0,  0,  0,  1, 22],
        [ 0,  0,  1, 22,  1],
        [ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  9],
        [ 0,  0,  0,  9, 19],
        [ 0,  0,  9, 19,  1],
        [ 0,  9, 19,  1,  2],
        [ 9, 19,  1,  2,  5],
        [19,  1,  2,  5, 12],
        [ 1,  2,  5, 12, 12],
        [ 2,  5, 12, 12,  1],
        [ 0,  0,  0,  0,  0],
        [ 0,  0,  0,  0, 19],
        [ 0,  0,  0, 19, 15],
        [ 0,  0, 19, 15, 16],
        [ 0, 19, 15, 16,  8],
        [19, 15, 16,  8,  9],
        [15, 16,  8,  9,  1]])

In [11]:
# Indexing C with the whole index tensor looks up an embedding for every entry
# of xs at once; the result at [1, 4] is the embedding row for index xs[1, 4].
looked_up = C[xs]
looked_up.shape, looked_up[1, 4], looked_up[1, 4] == C[5]

(torch.Size([32, 5, 2]), tensor([ 1.6349, -0.1869]), tensor([True, True]))

In [14]:
# Embed every context position of every example with a single indexing operation.
emb = C[xs]
emb.size()

torch.Size([32, 5, 2])

# Hidden Layer

In [16]:
# Hidden-layer weights: 10 inputs (5 context positions x 2 embedding dims)
# feeding 100 hidden units.
W = torch.randn([10, 100])
# This multiplication fails as written: emb's last dimension is 2, not 10, so
# it cannot be matrix-multiplied with W's 10 rows. The context embeddings have
# to be flattened into a single dimension of size 10 first.
emb @ W

RuntimeError: mat1 and mat2 shapes cannot be multiplied (160x2 and 10x100)

In [31]:
# Method 1: Extract and concat
# Slice out each of the 5 context positions and join the slices side by side
# along the feature dimension.
concated_1 = torch.cat([emb[:, pos] for pos in range(5)], dim=1)
(concated_1 @ W).size()

torch.Size([32, 100])

In [38]:
# Method 2: Unbind
# unbind removes dim 1 and returns a tuple with one slice per context position,
# which is then concatenated along the feature dimension.
unbind_emb = emb.unbind(dim=1)
concated_2 = torch.cat(unbind_emb, dim=1)
(concated_2 @ W).size()

torch.Size([32, 100])

In [44]:
# Method 3: View
# Flatten the trailing (position, embedding) dimensions into one. The row count
# is read from emb itself rather than hard-coded as 32, so a different number
# of examples cannot be silently reshaped into the wrong number of rows.
viewed_emb = emb.view(emb.shape[0], -1)
# All three methods should produce element-wise identical tensors.
concated_1 == concated_2, concated_1 == viewed_emb

(tensor([[True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True, True],
      

In [49]:
# Hidden layer: flatten each example's context embeddings, apply the affine
# map (W, b) and squash the result with tanh.
# NOTE(review): b is drawn with torch.rand (uniform on [0, 1)) while W uses
# torch.randn - confirm that this difference is intended.
b = torch.rand(100)
# The row count comes from emb rather than a hard-coded 32, so this keeps
# working when the number of examples changes.
# b is broadcast: (flattened emb @ W) is [N, 100] and b is [100], treated as
# [1, 100] -> [N, 100].
hidden = emb.view(emb.shape[0], -1) @ W + b
hiddden_out = torch.tanh(hidden)
hiddden_out.shape

torch.Size([32, 100])