In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt


In [2]:
words = open('names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s: i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)


In [33]:
block_size = 3
def build_dataset(words):
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

# split the dataset
import random 
random.seed(42)
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
Xtr, Ytr = build_dataset(words[:n1])
Xval, Yval = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])



In [57]:
n_embd = 10
n_hidden = 200

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_embd), generator=g)
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/(n_embd * n_hidden)**0.5 ##* 0.01 # we scale by 0.01 to make it smaller and therefore not kill the tanh activation at the start, because if the inputs are too large, tanh will saturate and gradients will be very small
# b1 = torch.randn(n_hidden, generator=g) * 0.01
# we dont need bias when using batch norm because batch norm has its own bias term
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01 # we scale by 0.01 to make the initial logits small, so that softmax is evenly distributed at the start
b2 = torch.randn(vocab_size, generator=g) * 0

bngain = torch.ones((1, n_hidden))
bnbias = torch.zeros((1, n_hidden))
bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, W2, b2 , bngain, bnbias]

for p in parameters:
    p.requires_grad = True


In [24]:
hpreact.shape

torch.Size([32, 200])

In [26]:
hpreact.mean(0,keepdim=True).shape

torch.Size([1, 200])

In [58]:
max_steps = 200000
batch_size = 32
lossi = []

for step in range(max_steps):

    # MINIBATCH SAMPLE
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]

    # FORWARD PASS
    emb = C[Xb]
    embcat = emb.view(emb.shape[0], -1)

    # Linear layer
    hpreact = embcat @ W1 #+ b1

    #Batch Normalization Layer
    # We do batch normalization on the hidden layer pre-activation, because otherwise tanh can saturate and kill gradients
    # we use the trainable parameters bngain and bnbias to allow the network to learn the optimal scale and shift after normalization
    bmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bmeani)/bnstdi + bnbias

    with torch.no_grad():
        # update running mean and std
        bnmean_running = 0.9 * bnmean_running + 0.1 * bmeani
        bnstd_running = 0.9 * bnstd_running + 0.1 * bnstdi


    # Non-linearity layer / activation function
    h = torch.tanh(hpreact)

    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yb)
    
    # BACKWARD PASS
    for p in parameters:
        p.grad = None
    loss.backward()

    # SGD step
    for p in parameters:
        p.data += -0.1 * p.grad

    lossi.append(loss.item())

    if step % 10000 == 0:
        print(f"{step:7d}/{max_steps:7d}: {loss.item():.4f}")

      0/ 200000: 3.3111
  10000/ 200000: 2.5608
  20000/ 200000: 1.8233
  30000/ 200000: 2.3316
  40000/ 200000: 2.2627
  50000/ 200000: 2.0380
  60000/ 200000: 1.8556
  70000/ 200000: 1.9904
  80000/ 200000: 2.3875
  90000/ 200000: 2.2780
 100000/ 200000: 1.9153
 110000/ 200000: 2.0341
 120000/ 200000: 2.0200
 130000/ 200000: 2.1637
 140000/ 200000: 1.7200
 150000/ 200000: 1.9482
 160000/ 200000: 2.2946
 170000/ 200000: 2.4649
 180000/ 200000: 2.2373
 190000/ 200000: 2.1730


In [54]:
# calibrate batch norm statistics by doing a forward pass through the training set
# but we are now doing this in the training loop
with torch.no_grad():
    #pass the entire training set
    emb = C[Xtr]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1
    # compute the mean and std over the entire training set
    bnmean_running = hpreact.mean(0, keepdim=True)
    bnstd_running = hpreact.std(0, keepdim=True)


In [59]:
@torch.no_grad()
def split_loss(split):
    x,y = {'train': (Xtr, Ytr), 'val': (Xval, Yval), 'test': (Xte, Yte)}[split]
    emb = C[x]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1
    #hpreact = bngain * (hpreact - hpreact.mean(0, keepdim=True))/hpreact.std(0, keepdim=True) + bnbias
    hpreact = bngain * (hpreact - bnmean_running)/bnstd_running + bnbias
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    print(f"{split} loss: {loss.item():.4f}")
split_loss('train')
split_loss('val')

train loss: 2.1162
val loss: 2.1822


In [60]:
# sample from the model
g = torch.Generator().manual_seed(2147483647)
for _ in range(20):
    out = []
    context = [0] * block_size
    while True:
        emb = C[torch.tensor([context])]
        embcat = emb.view(1, -1)
        hpreact = embcat @ W1  #+ b1
        hpreact = bngain * (hpreact - bnmean_running)/bnstd_running + bnbias
        h = torch.tanh(hpreact)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break
    print(''.join(itos[i] for i in out))
    

cexi.
karalynn.
ima.
kayden.
mariyanainella.
kamandr.
samiyah.
jaxsi.
gotti.
mckiellah.
jayla.
daria.
emilyssaly.
tiaviyah.
fobstihilinghviah.
asu.
jadri.
antil.
gyan.
ivan.


In [12]:
torch.tensor(1/27.0).log()

tensor(-3.2958)