## Going to train this NN
```mermaid
graph TD
    IN[Input context 3 chars]

    subgraph Embedding_Layer_C
        EMB[Embedding C 27x2 to 3x2]
    end

    subgraph Flatten
        FLAT[Flatten to vector size 6]
    end

    subgraph Hidden_Layer
        H[Hidden layer 300 neurons]
    end

    subgraph Output_Layer
        LOGITS[Output logits size 27]
    end

    subgraph Softmax
        PROBS[Softmax to probabilities]
    end

    subgraph Prediction
        PRED[Pick next character]
    end

    IN --> EMB
    EMB --> FLAT
    FLAT --> H
    H --> LOGITS
    LOGITS --> PROBS
    PROBS --> PRED
```



#### Example flow

```mermaid

graph TD
    A0["Start: context of 3 chars<br/>example: ., ., e"]
    A1["Embed each char using C<br/>C shape 27 x 2 (one row per char incl. '.')"]
    A2["Stacked embeddings<br/>shape 3 x 2"]
    A3["Flatten<br/>shape 6<br/>order: c1_d1 c1_d2 c2_d1 c2_d2 c3_d1 c3_d2"]
    A4["Hidden layer<br/>Linear 6→300 then tanh"]
    A5["Output layer<br/>Linear 300→27 logits"]
    A6["Softmax<br/>probabilities over 27 chars"]
    A7["Pick next char<br/>(sample or argmax)"]

    A0 --> A1 --> A2 --> A3 --> A4 --> A5 --> A6 --> A7
```



In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt


In [3]:
# Read the dataset: one name per line. The context manager closes the file
# handle as soon as the contents have been read instead of leaking it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]  # show first 8 words

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
# How many names were loaded from the dataset?
len(words)

32033

In [7]:
# Vocabulary: every distinct character that occurs in any name, sorted so the
# index assignment is deterministic.
chars = sorted(set(''.join(words)))
# char -> int lookup; the letters are numbered from 1 upward ...
stoi = dict(zip(chars, range(1, len(chars) + 1)))
# ... because index 0 is reserved for the special '.' boundary character.
stoi['.'] = 0
# int -> char lookup, the inverse of stoi
itos = {index: symbol for symbol, index in stoi.items()}

In [8]:
# Context length: how many preceding characters are used to predict the next
# one (the window size of the model).
block_size = 3

# Build (context, target) pairs for every character of every word.
# For "emma" this produces:
#   [., ., .] -> 'e'
#   [., ., e] -> 'm'
#   [., e, m] -> 'm'
#   [e, m, m] -> 'a'
#   [m, m, a] -> '.'
inputs, targets = [], []
for name in words:
    # Encode the word plus its terminating '.' as integers.
    encoded = [stoi[ch] for ch in name + '.']
    # Left-pad with block_size boundary tokens (index 0) so that the window
    # ending just before position `pos` is simply a slice of `padded`.
    padded = [0] * block_size + encoded
    for pos, target in enumerate(encoded):
        inputs.append(padded[pos:pos + block_size])  # the block_size chars before the target
        targets.append(target)  # the character to predict

X = torch.tensor(inputs)
Y = torch.tensor(targets)



In [11]:
# X is (228146, 3): one row per training example, holding the 3 context
# characters used to predict the matching entry of Y.
# Y is (228146,): the target character for each example.
print(X.shape, Y.shape, sep='\n')

torch.Size([228146, 3])
torch.Size([228146])


In [13]:
# Character embedding matrix: 27 characters, each represented by a
# 2-dimensional vector, so each character (via its integer id) is mapped to a
# point in 2D space.
C = torch.randn(27, 2)
C[5]  # the embedding of character 5

tensor([-0.6328, -0.1950])

In [18]:
# One-hot encode character index 5 over the 27-symbol vocabulary, converted to
# float so it can be multiplied with the embedding matrix.
char_ix = torch.tensor(5)
F.one_hot(char_ix, num_classes=27).float()

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [19]:
# Get the embedding for character 5 by multiplying its one-hot vector with the
# embedding matrix C. The result equals C[5]: only the row of C that lines up
# with the single 1 in the one-hot vector contributes to the product, so the
# multiplication effectively selects that row.
# Shapes: (27,) @ (27, 2) -> (2,)
#
# So the first layer of the network could equally be fed a 27-dimensional
# one-hot vector to obtain the embedding of any character.
one_hot_row = F.one_hot(torch.tensor(5), num_classes=27).float()
one_hot_row @ C

tensor([-0.6328, -0.1950])

In [38]:
# Look up the embedding of every character of every context in one go by
# indexing the table with the whole integer tensor X.
emb = C[X]
# (228146, 3, 2): 228146 example contexts, 3 characters per context, and a
# 2-dimensional embedding vector per character.
emb.shape

torch.Size([228146, 3, 2])

In [95]:
# First (hidden) layer parameters. The input dimension is 6 (3 context
# characters * 2 embedding dimensions each) and the output dimension is 300.
W1 = torch.randn(6, 300)  # weights, (6, 300)
b1 = torch.randn(300)  # bias, one value per hidden unit

In [40]:
# Deliberate demonstration of a shape mismatch: emb is (228146, 3, 2) while W1
# expects 6 input features, so the shapes are not aligned for matrix
# multiplication. The error is caught and displayed so that the notebook can
# still be run from top to bottom.
try:
    emb @ W1 + b1
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (684438x2 and 6x100)

In [41]:
# Flatten the (3, 2) embedding of each example into a (6,) vector by taking the
# embeddings of the 3 context characters separately and concatenating them
# along dim 1. One example goes from
#   [[c1_dim1, c1_dim2], [c2_dim1, c2_dim2], [c3_dim1, c3_dim2]]
# to
#   [c1_dim1, c1_dim2, c2_dim1, c2_dim2, c3_dim1, c3_dim2]
#
# Spelling out each position by hand is ugly if we want to scale to larger
# context sizes.
first, second, third = emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]
torch.cat((first, second, third), dim=1)

tensor([[ 0.9260,  0.4435,  0.9260,  0.4435,  0.9260,  0.4435],
        [ 0.9260,  0.4435,  0.9260,  0.4435, -0.6328, -0.1950],
        [ 0.9260,  0.4435, -0.6328, -0.1950, -0.0846,  0.2794],
        ...,
        [ 1.3785,  0.5128,  1.3785,  0.5128,  0.5580, -0.7022],
        [ 1.3785,  0.5128,  0.5580, -0.7022,  1.3785,  0.5128],
        [ 0.5580, -0.7022,  1.3785,  0.5128,  1.2880,  0.1033]])

In [42]:
# torch.unbind(emb, 1) splits emb along dim 1, which is equivalent to the list
# [emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]] regardless of the context length.
context_slices = torch.unbind(emb, 1)

# Concatenating those slices is a more scalable way to do the same flattening.
torch.cat(context_slices, dim=1)

tensor([[ 0.9260,  0.4435,  0.9260,  0.4435,  0.9260,  0.4435],
        [ 0.9260,  0.4435,  0.9260,  0.4435, -0.6328, -0.1950],
        [ 0.9260,  0.4435, -0.6328, -0.1950, -0.0846,  0.2794],
        ...,
        [ 1.3785,  0.5128,  1.3785,  0.5128,  0.5580, -0.7022],
        [ 1.3785,  0.5128,  0.5580, -0.7022,  1.3785,  0.5128],
        [ 0.5580, -0.7022,  1.3785,  0.5128,  1.2880,  0.1033]])

In [59]:
# An even simpler way is to reshape: keep one row per example and fold the
# (3, 2) embeddings of each example into 6 columns.
emb.reshape(len(emb), 6)


tensor([[ 0.9260,  0.4435,  0.9260,  0.4435,  0.9260,  0.4435],
        [ 0.9260,  0.4435,  0.9260,  0.4435, -0.6328, -0.1950],
        [ 0.9260,  0.4435, -0.6328, -0.1950, -0.0846,  0.2794],
        ...,
        [ 1.3785,  0.5128,  1.3785,  0.5128,  0.5580, -0.7022],
        [ 1.3785,  0.5128,  0.5580, -0.7022,  1.3785,  0.5128],
        [ 0.5580, -0.7022,  1.3785,  0.5128,  1.2880,  0.1033]])

In [96]:
# Hidden layer: flatten the embeddings, apply the affine map, squash with tanh.
# With the flattened input this works: (228146, 6) @ (6, 300) -> (228146, 300)
flat = emb.reshape(len(emb), 6)
h = torch.tanh(flat @ W1 + b1)
h.shape  # (228146, 300)

torch.Size([228146, 300])

In [97]:
# Second (output) layer parameters. The input dimension is 300 (the hidden
# units) and the output dimension is 27 (the number of characters).
W2 = torch.randn(300, 27)  # weights, (300, 27)
b2 = torch.randn(27)  # bias, one value per output character


In [98]:
logits = h @ W2 + b2  # (228146, 300) @ (300, 27) -> (228146, 27)
# Softmax by hand to get probabilities. Each row's maximum is subtracted before
# exponentiating: the common factor cancels in the ratio below, so the
# probabilities are unchanged, but exp() can no longer overflow to inf (which
# would turn the probabilities into nan) when a logit is large.
counts = (logits - logits.max(1, keepdim=True).values).exp()  # (228146, 27)
probs = counts / counts.sum(1, keepdim=True)  # (228146, 27), each row sums to 1
probs.shape

torch.Size([228146, 27])

In [99]:
# Average negative log-likelihood of the correct next characters.
# probs is (228146, 27) and Y is (228146,).
# torch.arange(len(Y)) gives the row indices 0..228145 into probs, and Y holds
# the target character index of each example, used as the column index.
rows = torch.arange(len(Y))
# Indexing with (rows, Y) extracts, for every example, the probability the
# model assigned to the correct target character.
target_probs = probs[rows, Y]
# Take the log of those probabilities, average over all examples, and negate
# to obtain the mean negative log-likelihood.
loss = -target_probs.log().mean()

loss

tensor(23.6600, grad_fn=<NegBackward0>)

In [100]:
# Turn on gradient tracking for every trainable tensor of the model.
for p in (C, W1, b1, W2, b2):
    p.requires_grad_(True)

In [94]:
# Full-batch gradient descent: every step uses all examples in X / Y.
lr = 0.1  # learning rate; constant, so it is set once outside the loop
parameters = [C, W1, b1, W2, b2]
for _ in range(1000):
    # Forward pass
    emb = C[X]  # (228146, 3, 2)
    h = torch.tanh(emb.reshape(len(emb), 6) @ W1 + b1)  # (228146, 300)
    logits = h @ W2 + b2  # (228146, 27)
    loss = F.cross_entropy(logits, Y)  # same loss as the manual softmax + NLL, computed in a more numerically stable way
    print(loss.item())

    # Backward pass
    for p in parameters:
        p.grad = None  # discard the gradients of the previous step
    loss.backward()  # compute gradients

    # Update: plain gradient descent step. The in-place update runs under
    # no_grad so autograd does not record it, instead of reaching around
    # autograd through `.data`.
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad




21.274520874023438
14.87217903137207
14.08586311340332
10.791343688964844
9.548832893371582
11.837512016296387


KeyboardInterrupt: 

In [122]:
# Minibatch training, should do train/val/test but skipping for now
batch_size = 32
lr = 0.2  # learning rate; constant, so it is set once outside the loop
parameters = [C, W1, b1, W2, b2]
for _ in range(100000):
    # Mini-batch sample
    ix = torch.randint(0, X.shape[0], (batch_size,))  # random example indices for the batch
    Xb, Yb = X[ix], Y[ix]  # minibatch of inputs and targets

    # Forward pass
    emb = C[Xb]  # (32, 3, 2)
    h = torch.tanh(emb.reshape(len(emb), 6) @ W1 + b1)  # (32, 300)
    logits = h @ W2 + b2  # (32, 27)
    loss = F.cross_entropy(logits, Yb)  # cross-entropy loss for the minibatch

    # Backward pass
    for p in parameters:
        p.grad = None  # discard the gradients of the previous step
    loss.backward()  # compute gradients

    # Update: plain gradient descent step. The in-place update runs under
    # no_grad so autograd does not record it, instead of reaching around
    # autograd through `.data`.
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad





In [123]:
# Evaluate the loss on the full dataset (not a minibatch). This is a pure
# evaluation, so gradient tracking is switched off to avoid building an
# autograd graph over all 228146 examples.
with torch.no_grad():
    emb = C[X]  # (228146, 3, 2)
    h = torch.tanh(emb.reshape(len(emb), 6) @ W1 + b1)  # (228146, 300)
    logits = h @ W2 + b2  # (228146, 27)
    loss = F.cross_entropy(logits, Y)  # cross-entropy loss over the whole dataset
print(loss.item())

2.383111000061035


In [124]:
# sampling from the model
for _ in range(20):  # generate 20 names
    out = []
    context = [0] * block_size  # start with 'end of word' characters
    while True:
        emb = C[torch.tensor([context])]  # (1, 3, 2)
        h = torch.tanh(emb.reshape(1,6) @ W1 + b1)  # (1, 100)
        logits = h @ W2 + b2  # (1, 27)
        probs = F.softmax(logits, dim=1)  # (1, 27)
        ix = torch.multinomial(probs, num_samples=1).item()  # sample from the distribution
        if ix == 0:  # 'end of word' character
            break
        out.append(itos[ix])
        context = context[1:] + [ix]  # slide the context window
    print(''.join(out))

lux
niyah
jel
joa
lorah
khustyn
lei
ele
caratvin
juf
kyla
jouleigus
xhrus
joh
lolyn
reyah
aton
ilei
ubry
kyna
