In [210]:
# Load the dataset: one list entry per line of the file, trailing newline kept

with open('names.txt') as f:
    data = list(f)

print(f'Total data: {len(data)}')
print(data[:10])

Total data: 32033
['emma\n', 'olivia\n', 'ava\n', 'isabella\n', 'sophia\n', 'charlotte\n', 'mia\n', 'amelia\n', 'harper\n', 'evelyn\n']


In [211]:
# Preprocess the data: trim surrounding whitespace and lowercase every name
import numpy as np

data = np.array([line.strip().lower() for line in data])

# Shuffle in place with a fixed seed so the order is the same on every fresh run
np.random.seed(45)
np.random.shuffle(data)

print(data[:10])

['jaiceion' 'avari' 'shahbaz' 'mehnaz' 'alexzandra' 'harlem' 'naelle'
 'cayson' 'rosalea' 'jaselle']


In [212]:
# Create the tokenizer: the vocabulary is every distinct character found in the names
vocabulary = sorted(set(''.join(data)))
# The special token is placed first in the index -> string table
S_TOK = '.'
ixtos = [S_TOK, *vocabulary]
print(ixtos)
print(len(ixtos))
# Inverse table: string -> index
stoix = dict(zip(ixtos, range(len(ixtos))))

['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
27


In [213]:
# Tokenize the data: replace every character of every name with its integer id
tokenized = [list(map(stoix.__getitem__, name)) for name in data]

# Decode the first name back to characters to verify the round trip
list(map(ixtos.__getitem__, tokenized[0]))

['j', 'a', 'i', 'c', 'e', 'i', 'o', 'n']

In [214]:
# Build the dataset with a chunk of characters as a context and the next character as label
context = 4 # Characters of context

def build_dataset(data: list, context_size=None, pad_token=None):
    """Turn tokenized names into (context window, next token) training pairs.

    Each name is left-padded with ``context_size`` copies of the special token
    and terminated with one more. Every run of ``context_size`` consecutive
    tokens then becomes one row of ``X`` and the token that follows it becomes
    the matching entry of ``y``.

    Args:
        data: iterable of names, each a sequence of integer token ids.
        context_size: number of tokens per context window. Defaults to the
            module-level ``context``.
        pad_token: token id used for both the left padding and the end marker.
            Defaults to ``stoix[S_TOK]``.

    Returns:
        ``(X, y)`` as int64 NumPy arrays of shape ``(N, context_size)`` and
        ``(N,)``; ``N`` is 0 when ``data`` is empty.

    Raises:
        ValueError: if the context size is smaller than 1.
    """
    ctx = context if context_size is None else context_size
    pad = stoix[S_TOK] if pad_token is None else pad_token
    if ctx < 1:
        raise ValueError(f'context size must be >= 1, got {ctx}')

    X, y = [], []
    for name in data:
        # Pad word and add final special token. After padding the sequence
        # always holds at least ctx + 1 tokens, so even an empty name yields
        # one example and no length check is needed.
        padded = [pad] * ctx + list(name) + [pad]
        for start in range(len(padded) - ctx):
            X.append(padded[start:start + ctx])
            y.append(padded[start + ctx])

    # Explicit dtype and reshape keep the result well-formed for empty input
    # and give integer ids regardless of the platform's default int width.
    return np.array(X, dtype=np.int64).reshape(-1, ctx), np.array(y, dtype=np.int64)

# Build the examples for the whole corpus and show that inputs and labels have the same count
X, y = build_dataset(tokenized)
X.shape[0], y.shape[0]

(228146, 228146)

In [221]:
import torch
# Split the data into training (80%), validation (10%) and test (the remainder)
train_samples = int(len(X)*0.8)
valid_samples = int(len(X)*0.1)

# Contiguous, non-overlapping index ranges for the three splits
train_part = slice(0, train_samples)
valid_part = slice(train_samples, train_samples + valid_samples)
test_part = slice(train_samples + valid_samples, None)

# Convert each NumPy slice into a tensor
X_train, y_train = torch.tensor(X[train_part]), torch.tensor(y[train_part])
X_valid, y_valid = torch.tensor(X[valid_part]), torch.tensor(y[valid_part])
X_test, y_test = torch.tensor(X[test_part]), torch.tensor(y[test_part])

len(X_train), len(X_valid), len(X_test)

(182516, 22814, 22816)

In [222]:
# Sanity check: print the first 30 training contexts next to their labels.
# The labels are read from y_train so both columns come from the same split.
for d, k in zip(X_train[:30].tolist(), y_train[:30].tolist()):
    print([ixtos[i] for i in d], ixtos[k])

['.', '.', '.', '.'] j
['.', '.', '.', 'j'] a
['.', '.', 'j', 'a'] i
['.', 'j', 'a', 'i'] c
['j', 'a', 'i', 'c'] e
['a', 'i', 'c', 'e'] i
['i', 'c', 'e', 'i'] o
['c', 'e', 'i', 'o'] n
['e', 'i', 'o', 'n'] .
['.', '.', '.', '.'] a
['.', '.', '.', 'a'] v
['.', '.', 'a', 'v'] a
['.', 'a', 'v', 'a'] r
['a', 'v', 'a', 'r'] i
['v', 'a', 'r', 'i'] .
['.', '.', '.', '.'] s
['.', '.', '.', 's'] h
['.', '.', 's', 'h'] a
['.', 's', 'h', 'a'] h
['s', 'h', 'a', 'h'] b
['h', 'a', 'h', 'b'] a
['a', 'h', 'b', 'a'] z
['h', 'b', 'a', 'z'] .
['.', '.', '.', '.'] m
['.', '.', '.', 'm'] e
['.', '.', 'm', 'e'] h
['.', 'm', 'e', 'h'] n
['m', 'e', 'h', 'n'] a
['e', 'h', 'n', 'a'] z
['h', 'n', 'a', 'z'] .


## Build the model

In [233]:
import torch

# Dedicated generator with a fixed seed so the initial weights are reproducible
g = torch.Generator().manual_seed(2147483647)

emb_size = 4            # width of each token embedding
vocab_size = len(stoix)
hidden_size = 100       # units in the hidden dense layer


def _randn_param(*shape):
    """Draw a standard-normal tensor of the given shape from ``g`` with gradients enabled."""
    return torch.randn(shape, generator=g, requires_grad=True)


# Embeddings table: one row per token
E = _randn_param(vocab_size, emb_size)

# Dense layer: takes the concatenated embeddings of the whole context window
W1 = _randn_param(context * emb_size, hidden_size)
b1 = _randn_param(1, hidden_size)

# Final dense layer: one output score per token
W2 = _randn_param(hidden_size, vocab_size)
b2 = _randn_param(1, vocab_size)

# Model
# emb_layer = E[X_train].view(-1, context * emb_size) # embed each token and concatenate the context tokens
# print(emb_layer.shape)
# dense_layer = ((emb_layer @ W1) + b1).relu()
# print(dense_layer.shape)
# dense_layer = (dense_layer @ W2) + b2
# print(dense_layer.shape)
# logits = dense_layer.exp()
# norm = logits.sum(1, keepdim=True)
# print(norm.shape)
# probs = logits / norm
# print(probs.shape)

## Training

In [234]:
# Every tensor updated during training, gathered in one list
parameters = [E, W1, b1, W2, b2]

# Total number of trainable scalars
sum(map(torch.numel, parameters))

4535

In [236]:
import torch.nn.functional as F

lr_drop_epoch = 80              # epochs trained with lrs[0] before switching to lrs[1]
epochs = lr_drop_epoch + 100
lrs = [0.1, 0.01]
batch_size = 64


def mlp_logits(tokens):
    """Return the model's unnormalised scores for a batch of context windows.

    ``tokens`` is indexed into the embedding table ``E``; the embeddings of one
    window are concatenated, passed through the ReLU dense layer (``W1``,
    ``b1``) and then through the final dense layer (``W2``, ``b2``).
    """
    emb_layer = E[tokens].view(-1, context * emb_size) # embed each token and concatenate the context tokens
    hidden = ((emb_layer @ W1) + b1).relu()
    return (hidden @ W2) + b2


for i in range(epochs):
    train_loss = []
    # Two-step schedule: larger learning rate first, smaller one afterwards
    lr = lrs[0] if i < lr_drop_epoch else lrs[1]

    for b in range(0, len(X_train), batch_size):
        X_batch = X_train[b:b+batch_size]
        y_batch = y_train[b:b+batch_size]

        # Loss (cross_entropy applies the softmax normalisation itself)
        loss = F.cross_entropy(mlp_logits(X_batch), y_batch)
        train_loss.append(loss.item())

        # Reset gradients, then backpropagate
        for p in parameters:
            p.grad = None
        loss.backward()

        # Plain SGD step; no_grad keeps the in-place update out of the autograd graph
        with torch.no_grad():
            for p in parameters:
                p -= lr * p.grad

    # Validation loss on the full validation split, without tracking gradients
    with torch.no_grad():
        valid_loss = F.cross_entropy(mlp_logits(X_valid), y_valid)

    print(f"Epoch {i} - Training loss: {sum(train_loss)/len(train_loss):.5f} - Validation loss: {valid_loss:.5f}")
    
    

Epoch 0 - Training loss: 2.43256 - Validation loss: 2.44857
Epoch 1 - Training loss: 2.39487 - Validation loss: 2.41644
Epoch 2 - Training loss: 2.37408 - Validation loss: 2.39407
Epoch 3 - Training loss: 2.35881 - Validation loss: 2.37261
Epoch 4 - Training loss: 2.34604 - Validation loss: 2.35661
Epoch 5 - Training loss: 2.33505 - Validation loss: 2.34553
Epoch 6 - Training loss: 2.32555 - Validation loss: 2.33547
Epoch 7 - Training loss: 2.31689 - Validation loss: 2.32867
Epoch 8 - Training loss: 2.30944 - Validation loss: 2.31785
Epoch 9 - Training loss: 2.30302 - Validation loss: 2.31489
Epoch 10 - Training loss: 2.29728 - Validation loss: 2.30782
Epoch 11 - Training loss: 2.29187 - Validation loss: 2.30418
Epoch 12 - Training loss: 2.28703 - Validation loss: 2.29947
Epoch 13 - Training loss: 2.28271 - Validation loss: 2.29394
Epoch 14 - Training loss: 2.27870 - Validation loss: 2.28917
Epoch 15 - Training loss: 2.27486 - Validation loss: 2.28541
Epoch 16 - Training loss: 2.27165 

In [237]:
# Score the trained model on the test split; gradients are not needed here
with torch.no_grad():
    test_emb = E[X_test].view(-1, context * emb_size) # concatenate the embeddings of each context window
    test_hidden = torch.relu(test_emb @ W1 + b1)
    test_logits = test_hidden @ W2 + b2
    test_loss = F.cross_entropy(test_logits, y_test)

test_loss

tensor(2.1681)

## Inference

In [286]:
names_to_generate = 10


def generate_name(strip_special=False):
    """Sample one name from the trained model, one token at a time.

    Generation starts from a window of ``context`` special tokens. At every
    step the last ``context`` tokens are fed through the model, the next token
    is drawn from the resulting distribution, and it is appended to the
    sequence. Sampling stops as soon as the special token is drawn.

    Args:
        strip_special: when True, drop the leading padding tokens and the
            final end marker from the result. The default (False) returns the
            whole sequence, padding and end marker included.

    Returns:
        The generated sequence decoded to a string.
    """
    special = stoix[S_TOK]
    inp = torch.tensor([[special] * context])

    # Sampling needs no gradients, so skip building the autograd graph
    with torch.no_grad():
        while True:
            # Use the last `context` tokens (not a hard-coded 4) as the window
            emb_layer = E[inp[:, -context:]].view(-1, context * emb_size) # embed each token and concatenate the context tokens
            hidden = ((emb_layer @ W1) + b1).relu()
            logits = (hidden @ W2) + b2
            # softmax normalises per row and avoids overflow from a raw exp()
            prob = torch.softmax(logits, dim=1)
            next_token = torch.multinomial(prob, 1)
            inp = torch.cat((inp, next_token), 1)
            if next_token.item() == special:
                break

    tokens = inp[0].tolist()
    if strip_special:
        tokens = tokens[context:-1]
    return ''.join(ixtos[t] for t in tokens)


for i in range(names_to_generate):
    print(generate_name())

....alewy.
....roirynn.
....jaigh.
....zohy.
....nekenabrikyn.
....jai.
....shil.
....crecar.
....leor.
....avisa.
