In [1]:
# classic PyTorch modules we'll need

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load the dataset: a plain-text file holding one name per line.

with open('names.txt', mode = 'r', encoding = 'utf-8') as names_file:
    raw_text = names_file.read()

names = raw_text.splitlines()

# Peek at the first few entries as a sanity check.
names[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [4]:
# Build the character vocabulary and the lookup tables between characters and integer ids.
# '.' is the boundary token that marks both the start and the end of a name; it always gets id 0.

# Remove '.' from the data characters before prepending it: if it ever appeared in a name it
# would otherwise be listed twice, making vocab_len one too large and moving ctoi['.'] away
# from id 0 (the id that sampling treats as "end of name").
vocab = ['.'] + sorted(set(''.join(names)) - {'.'})
vocab_len = len(vocab)
itoc = dict(enumerate(vocab))            # id -> character
ctoi = {c: i for i, c in itoc.items()}   # character -> id

In [7]:
# Build the bigram dataset: every adjacent pair of characters in a '.'-padded name
# contributes one (context, target) example.

context_ids = []  # id of the first character of each bigram (model input)
target_ids = []   # id of the second character of each bigram (expected output)

for name in names:

    # Encode the padded name once; consecutive ids then form the bigrams.
    encoded = [ctoi[ch] for ch in '.' + name + '.']

    context_ids.extend(encoded[:-1])
    target_ids.extend(encoded[1:])

x = torch.tensor(context_ids)
y = torch.tensor(target_ids)

In [36]:
# Model setup: a single weight matrix (no bias) that turns a one-hot character into logits.

# One output unit per vocabulary entry.
hidden_neurons = vocab_len

# Seeded generator so the initial weights are reproducible.
g = torch.Generator()
g.manual_seed(1869)

# One-hot encode the context characters; cast to float so they can be multiplied with W.
inputs = F.one_hot(x, num_classes = vocab_len).float()

W = torch.randn(vocab_len, hidden_neurons, generator = g, requires_grad = True)

In [37]:
# Train W with full-batch gradient descent on the mean negative log-likelihood of the
# observed next characters, plus an L2 penalty on the weights.

tr_iter = 500               # number of gradient-descent steps
num_samples = len(inputs)
lre = 50                    # learning rate
reg_p = 0.01                # weight of the L2 penalty

# Row indices used to pick each sample's target probability; loop-invariant, so built once.
sample_idx = torch.arange(num_samples)

for i in range(tr_iter):

    # Forward pass: softmax over the logits, written out step by step.
    logits = inputs @ W
    counts = logits.exp()
    probs = counts / counts.sum(dim = -1, keepdim = True)
    loss = -probs[sample_idx, y].log().mean() + reg_p * (W**2).mean()

    # Report the loss every 10th step.
    if i % 10 == 0:
        print(loss.item())

    # Backward pass: drop the previous gradient, then backpropagate the new loss.
    W.grad = None
    loss.backward()

    # Update the weights in place under no_grad rather than through W.data:
    # writes via .data are invisible to autograd's in-place checks, while no_grad
    # performs the same arithmetic without recording the update in the graph.
    with torch.no_grad():
        W -= lre * W.grad


3.8659188747406006
2.676997423171997
2.5712695121765137
2.5346262454986572
2.5170304775238037
2.507004976272583
2.5005927085876465
2.4961941242218018
2.4930431842803955
2.490718364715576
2.488964080810547
2.4876129627227783
2.4865529537200928
2.485705614089966
2.4850172996520996
2.484450340270996
2.483976125717163
2.4835760593414307
2.483234405517578
2.482940673828125
2.4826862812042236
2.48246431350708
2.482269525527954
2.482097864151001
2.4819464683532715
2.4818115234375
2.481691837310791
2.4815850257873535
2.4814891815185547
2.481403112411499
2.48132586479187
2.4812557697296143
2.4811933040618896
2.481135845184326
2.4810843467712402
2.4810373783111572
2.480994701385498
2.480955123901367
2.480919361114502
2.480886697769165
2.4808566570281982
2.4808287620544434
2.4808032512664795
2.4807796478271484
2.48075795173645
2.4807376861572266
2.4807186126708984
2.480701446533203
2.480684995651245
2.480670213699341


In [48]:
def generate(c, num_chars = 10):
    """Sample a name from the bigram model and print it.

    Starting from the character `c`, repeatedly samples the next character from the
    distribution the model assigns to the current one, stopping when the '.' token
    (id 0) is drawn or after `num_chars` characters have been sampled.

    Args:
        c: Starting character. Must be a key of `ctoi`; an unknown character
            raises KeyError.
        num_chars: Maximum number of characters to sample after `c`.

    Returns:
        None. The generated string (including `c`) is printed.

    Reads the notebook-level `W`, `ctoi`, `itoc` and generator `g`; every sample
    advances the state of `g`.
    """

    idx = ctoi[c]
    out = [c]

    # Sampling is inference only. W has requires_grad=True, so without no_grad every
    # step would record an autograd graph that is never used.
    with torch.no_grad():
        for _ in range(num_chars):
            # Multiplying a one-hot vector by W just selects row `idx`, so index it directly.
            logits = W[idx]
            counts = logits.exp()
            probs = counts / counts.sum(dim = -1, keepdim = True)

            idx = torch.multinomial(probs, num_samples = 1, generator = g).item()
            out.append(itoc[idx])

            # id 0 is the '.' token, which marks the end of a name.
            if idx == 0:
                break

    print(''.join(out))
    

In [61]:
# Sample a single name that starts with 'b'.
generate(c = 'b')

beyle.


In [67]:
# Sample one name for every possible starting character, including the '.' boundary token.
for start_char in vocab:
    generate(c = start_char)

.ae.
aansorarysa
bassin.
chi.
den.
elrae.
fuwaynaryk.
gens.
hiprileofac
idhereylie.
jbeeelle.
kahnahrsa.
lynnalitadr
mtiklyacysi
naryl.
onn.
peita.
quezelllyae
racaramerif
shie.
then.
uda.
varuwbadze.
wbarayeossh
x.
ysha.
zanaleliera
