If we increase the context size (from 1 character to 2, for example), the model we built is going to get overly complicated. \
\
This notebook focuses on implementing a multi-layer perceptron character-level language model. \
\
**Number of features = number of dimensions** \
\
Each word (or, in this notebook, each character) starts out as a randomly initialized vector; these embedding vectors are then tuned using backpropagation.

In [23]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [24]:
# read every line of names.txt into a list of strings (one entry per line);
# the with-block closes the file as soon as reading is done instead of leaving the handle open
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

In [25]:
# number of lines read from names.txt
len(words)

32033

In [26]:
# vocabulary: every distinct character that appears in the words, in sorted order
chars = sorted(set(''.join(words)))
# character -> index; real characters are numbered from 1 so that 0 stays free for '.'
stoi = {ch: pos for pos, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# index -> character, the inverse lookup of stoi
itos = {pos: ch for ch, pos in stoi.items()}

In [27]:
# build the dataset

block_size = 3  # context length: the number of characters we take to predict the next one
X, Y = [], []  # X collects the context windows (lists of indices), Y the index of the character that follows each window
for w in words[:5]:  # only the first five words, so the printed examples stay readable
    print(w)
    # start from an all-padding context (index 0 is '.'); sized by block_size so changing it above takes effect here
    context = [0] * block_size
    for ch in w + '.':  # the appended '.' makes the end of the word a prediction target too
        idx = stoi[ch]
        X.append(context)
        Y.append(idx)
        print(''.join(itos[i] for i in context), '--->', itos[idx])
        context = context[1:] + [idx]  # slide the window: drop the oldest index, append the newest

# turn the Python lists into tensors: X has one row per example, Y one label per example
X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [28]:
# the dataset we built has 32 examples with 3 integers each, and 32 integer labels
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [29]:
# inspect the inputs: one row of character indices (the context window) per example
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [30]:
# inspect the labels: the index of the character that follows each context row of X
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [31]:
# Embedding lookup table.
# Bengio et al. embed a vocabulary of 17000 words into a 30-dimensional space;
# our vocabulary is only 27 characters, so a 2-dimensional space is enough here.
# Row i of C is the (randomly initialized) embedding vector of character index i.
C = torch.randn(27, 2)

In [32]:
# indexing with a list of integers picks out several rows at once:
# rows 5, 6, 7 of C are the embedding vectors of 'e', 'f', 'g' under the stoi mapping
C[[5, 6, 7]]

tensor([[-0.8140,  0.3182],
        [ 0.5108, -0.7072],
        [-0.2109, -0.9830]])

In [33]:
# we can embed all of the integers in X at once by indexing C with the whole tensor:
# every index in X is replaced by its row of C, so the result gains a trailing dimension of size 2
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

### The hidden layer

In [34]:
# hidden-layer parameters
# each example feeds in three 2-d embeddings, i.e. 3 * 2 = 6 inputs per neuron
W1 = torch.randn(6, 100)
# one bias per neuron; the layer width is a free choice, 100 here
b1 = torch.randn(100)

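`emb @ W1 + b1` does not work directly because the shapes don't match: `emb` is (32, 3, 2) while `W1` is (6, 100). We need to concatenate the three embeddings of each example into a single row of 6 numbers, or, more efficiently, "view" the same tensor with a different shape, which needs no extra memory (constant space complexity).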

**(32, 3, 2) => (32, 6)**

In [45]:
# hidden layer: view emb as rows of 6 values (the three 2-d embeddings of each example side by side),
# apply the affine map, then squash with tanh so every activation lies between -1 and 1
h = torch.tanh(torch.matmul(emb.view(-1, 6), W1) + b1)

In [46]:
# one row of 100 hidden activations per example
h.shape

torch.Size([32, 100])

### The output layer

In [47]:
# output-layer parameters
# maps the 100 hidden activations to 27 scores, one per character
W2 = torch.randn(100, 27)
# one bias per output score
b2 = torch.randn(27)

In [48]:
# output layer: an affine map of the hidden activations, giving one raw score per character for every example
logits = torch.matmul(h, W2) + b2

In [49]:
# one row of 27 scores per example
logits.shape

torch.Size([32, 27])