In [None]:
# Load the dataset: every line of names.txt becomes one entry of `words`.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [None]:
# Count-based bigram approach: estimate next-character probabilities from bigram frequencies

In [None]:
# Tally how many times each adjacent pair of tokens occurs, using '<S>' and
# '<E>' as explicit start / end markers around every word.
b = {}
for w in words:
    tokens = ['<S>', *w, '<E>']
    # zip of the sequence with itself shifted by one yields the (current, next) tuples
    for bigram in zip(tokens, tokens[1:]):
        b[bigram] = b.get(bigram, 0) + 1

In [None]:
import torch

In [None]:
# 27x27 integer table of bigram counts, initially all zero
# (indexed as N[current char index, next char index] by the counting loop).
N = torch.zeros(27, 27, dtype=torch.int32)

In [None]:
# Vocabulary: every distinct character that appears in the dataset, sorted.
chars = sorted(set(''.join(words)))
# Character -> index, numbered from 1 so that index 0 stays free for the
# '.' boundary marker.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}
itos

In [None]:
# Fill the count table: N[i, j] is the number of times character j directly
# follows character i, with '.' marking both the start and the end of a word.
# Reset the table first so that re-running this cell does not add the counts
# on top of the ones from a previous run.
N.zero_()
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [None]:
# Convert the counts to floats and divide each row by its total, so every row
# of P sums to 1 (row i = distribution over the character following i).
P = N.to(torch.float32)
P = P / P.sum(dim=1, keepdim=True)

In [None]:
# Sample five names from the bigram table, using a seeded generator so the
# draws are repeatable.
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    out = []
    ix = 0  # index 0 is the '.' boundary token: every name starts from it
    while True:
        # Draw the next character index from the row of P for the current one.
        ix = torch.multinomial(
            P[ix], num_samples=1, replacement=True, generator=g
        ).item()
        out.append(itos[ix])
        if ix == 0:  # boundary token drawn again: the name is finished
            break

    print(''.join(out))
    

In [None]:
# Evaluate the model on its own training data: add up log P(next | current)
# over every bigram, then report the total, its negation, and the per-bigram
# average of the negative log-likelihood.
log_likelihood = 0.0
n = 0
for w in words:
    # '.' + w against w + '.' pairs each character with its successor,
    # including the leading and trailing boundary marker.
    for ch1, ch2 in zip('.' + w, w + '.'):
        log_likelihood += torch.log(P[stoi[ch1], stoi[ch2]])
        n += 1

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

In [None]:
# Neural network bigram approach

# Build the training pairs (current character index -> next character index)
# from the first word only.
xs, ys = [], []
for w in words[:1]:
    # Pair each character with its successor, padding with '.' on both ends.
    for ch1, ch2 in zip('.' + w, w + '.'):
        xs.append(stoi[ch1])
        ys.append(stoi[ch2])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [None]:
import torch.nn.functional as F
import matplotlib.pyplot as plt

# One-hot encode the input indices into rows of width 27 and cast them to
# float32 so they can be matrix-multiplied with float weights.
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)

In [None]:
# Summary

# Full training set: one (input index, target index) pair per bigram,
# taken from every word in the dataset.
xs, ys = [], []
for w in words:
    padded = f'.{w}.'
    xs.extend(stoi[ch] for ch in padded[:-1])  # each character that has a successor
    ys.extend(stoi[ch] for ch in padded[1:])   # the successor of each of those
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.numel()  # number of training examples

# Initialise the network: a single 27x27 weight matrix drawn from a seeded
# generator, tracked by autograd.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# Gradient descent on the average negative log-likelihood of the targets.
# The one-hot inputs do not change between steps, so they are encoded once
# up front instead of being rebuilt on every iteration.
xenc = F.one_hot(xs, num_classes=27).float()
for i in range(10):
    # forward pass
    logits = xenc @ W  # log-counts
    counts = logits.exp()  # Equivalent to the previous N array, "counts" array.
    prob = counts / counts.sum(1, keepdim=True)
    # The two lines above together implement the softmax function.
    loss = -prob[torch.arange(num), ys].log().mean()
    print(loss.item())

    # backward pass
    W.grad = None  # clear the gradient left over from the previous step
    loss.backward()

    # update: step against the gradient; done under no_grad so the in-place
    # change to W is not recorded by autograd (preferred over touching W.data)
    with torch.no_grad():
        W += -10 * W.grad
    