In [1]:
import torch

In [29]:
# Load the dataset: one name per line, surrounding whitespace/newlines stripped.
# The context manager closes the file handle deterministically instead of
# leaving an open handle behind for the garbage collector.
with open('./names.txt', 'r') as names_file:
    names = [line.strip() for line in names_file]
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [41]:
# Build the character <-> index lookup tables used to encode the names.
chars = set()
for name in names:
    chars.update(name)

# '.' is reserved as the start/stop marker, so it must not also take part in
# the sorted enumeration of ordinary characters.
chars.discard('.')

# Ordinary characters get consecutive ids in sorted order.
ctoi = {c: i for i, c in enumerate(sorted(chars))}
itoc = {i: c for c, i in ctoi.items()}

# The marker takes the next free id. Deriving it from the vocabulary (instead
# of hard-coding 26) keeps the ids contiguous for any alphabet; for the 26
# lowercase letters it still evaluates to 26.
end_ix = len(ctoi)
ctoi['.'] = end_ix
itoc[end_ix] = '.'


In [48]:
# The vocabulary is every character that owns an index (the '.' marker included),
# listed in the insertion order of ctoi.
vocab = [ch for ch in ctoi]
vocab_size = len(ctoi)
vocab_size

27

Trigram model: given the two preceding tokens, predict the next token.
Tokens are individual characters (a character-level model).

In [120]:
# Build (context, target) training pairs from the first two names only,
# printing every trigram so the encoding can be checked by eye.
X, y = [], []
for name in names[:2]:
    # '..' pads the start so the first real character has a two-character
    # context; the trailing '.' is the target that ends the name.
    padded = '..' + name + '.'
    for start in range(len(padded) - 2):
        c1, c2, c3 = padded[start:start + 3]
        print(c1, c2, c3)
        # the second context character is shifted by vocab_size so that each
        # position addresses its own block of input slots
        X.append([ctoi[c1], vocab_size + ctoi[c2]])
        y.append(ctoi[c3])

. . e
. e m
e m m
m m a
m a .
. . o
. o l
o l i
l i v
i v i
v i a
i a .


In [167]:
# Encode every two-character context as one row of width 2*vocab_size:
# the slot of the first character gets 0.5 and the slot of the second
# character (already offset by vocab_size in X) gets 1.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# following cells refer to it.
# reshape(-1, 2) keeps the index tensor two-dimensional even when X is empty.
context_ix = torch.tensor(X, dtype=torch.long).reshape(-1, 2)
row_ix = torch.arange(context_ix.shape[0])

input = torch.zeros((context_ix.shape[0], 2*vocab_size))
# Vectorized indexing fills all rows at once instead of looping in Python.
input[row_ix, context_ix[:, 0]] = 0.5
input[row_ix, context_ix[:, 1]] = 1

torch.Size([12, 27])

In [206]:
# Not required here: a disabled train/dev/test split (80% train, the remainder
# halved into dev and test). It is wrapped in a string literal so none of it
# executes; the cell merely echoes the text back.
# NOTE(review): the snippet refers to `F`, which no visible cell imports -
# presumably torch.nn.functional; confirm before re-enabling. It also leaves
# xtrain as a plain list while xdev/xtest are converted to tensors.
"""total = len(y)
train_size = int(total*0.8)
dev_size = int(0.5*(total - train_size))
# test_size = total - train_size - dev_size # remaining
xtrain, xdev, xtest = X[:train_size], torch.tensor(X[train_size:train_size+dev_size]), torch.tensor(X[train_size+dev_size:])
ytrain, ydev, ytest = F.one_hot(torch.tensor(y[:train_size]), vocab_size), F.one_hot(torch.tensor(y[train_size:train_size+dev_size]), vocab_size), F.one_hot(torch.tensor(y[train_size+dev_size:]), vocab_size)"""

'total = len(y)\ntrain_size = int(total*0.8)\ndev_size = int(0.5*(total - train_size))\n# test_size = total - train_size - dev_size # remaining\nxtrain, xdev, xtest = X[:train_size], torch.tensor(X[train_size:train_size+dev_size]), torch.tensor(X[train_size+dev_size:])\nytrain, ydev, ytest = F.one_hot(torch.tensor(y[:train_size]), vocab_size), F.one_hot(torch.tensor(y[train_size:train_size+dev_size]), vocab_size), F.one_hot(torch.tensor(y[train_size+dev_size:]), vocab_size)'

In [161]:
# Network definition: the input encodes the indices of the two context
# characters, there is no hidden layer, and a single output layer produces
# vocab_size logits.
# A single 1 x vocab_size vector with both indices switched on would be
# symmetric for (m, n) and (n, m), i.e. it would lose the character order.
# To give the input a sense of position, the weight matrix therefore has
# 2*vocab_size rows: one block of rows per context position.
g = torch.Generator()
g.manual_seed(2147483647)
W1 = torch.randn(2 * vocab_size, vocab_size, generator=g)

In [175]:
# forward pass: a softmax over the logits, written out step by step
logits = torch.matmul(input, W1)
counts = torch.exp(logits)  # exponentiate so that every entry is positive
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals  # normalise each row so that it sums to 1
probs.shape

torch.Size([12, 27])

- `probs.shape` is (12, 27): 12 examples, each with 27 probabilities, one for every character in the vocabulary.
- The loss is the negative log likelihood (NLL): the negative log of the probability the model assigns to the true next character `y[i]`, given the two-character context (bigram) in `X[i]`.
- Why negative log likelihood? Check out how perplexity and entropy relate to it (perplexity is the exponential of the average NLL).
- Minimizing the negative log likelihood is the same as maximizing the log-likelihood, which in turn means maximizing the probability of the correct next character given the input (a bigram here).
- The higher the model's confidence in the correct next character, the better the language model.

In [176]:
# NLL of the first example: minus the log of the probability of its true target
loss_one_example = -torch.log(probs[0, y[0]])
# Loss over the whole mini-batch: pick, for every row, the probability of that
# row's true target and average the negative logs.
example_ix = torch.arange(len(input))
target_probs = probs[example_ix, y]
loss = -target_probs.log().mean()
loss

tensor(3.6864)

- The backward pass needs gradients, but torch creates tensors with `requires_grad=False` by default.
- Hence we re-create the weights with `requires_grad=True` and redo everything from the start.

In [179]:
# Same seed and shape as before, but this time autograd tracks the weights.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(2 * vocab_size, vocab_size, generator=g, requires_grad=True)

In [199]:
# forward pass
logits = input @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdim=True)
loss = -probs[torch.arange(len(input)), y].log().mean()

In [200]:
# display the current loss; the grad_fn in its repr shows that it is attached
# to the autograd graph
loss

tensor(3.6399, grad_fn=<NegBackward0>)

In [197]:
# backward pass
# Clear any previously stored gradient first: backward() accumulates into
# W.grad over time (recall the += in micrograd). Assigning None drops the old
# gradient instead of filling it with zeros.
W.grad = None
# Compute the gradients starting from the loss all the way back to the weights
# of the first layer; in micrograd we implemented a toposort to do this.
loss.backward()

In [189]:
# the gradient has the same shape as W: one entry per weight
W.grad.shape

torch.Size([54, 27])

In [198]:
# update the weights: one step of plain gradient descent
learning_rate = 0.1
# Apply the in-place step under no_grad() so autograd does not record it.
# This replaces the legacy `.data` attribute, which modifies the tensor behind
# autograd's back (no version tracking) and can silently hide mistakes.
with torch.no_grad():
    W -= learning_rate * W.grad

In [201]:
# compile the forward pass, backward pass and update into a loop of iterations
num_iterations = 1000
for i in range(num_iterations):
    # forward pass: softmax over the logits, then the mean NLL of the targets
    logits = input @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    loss = -probs[torch.arange(len(input)), y].log().mean()
    print(f"Loss after {i} iterations: {loss}")

    # backward pass: drop the old gradient (backward() accumulates), then backprop
    W.grad = None
    loss.backward()

    # gradient-descent step; done under no_grad() rather than through the
    # legacy `.data` attribute so autograd's bookkeeping stays consistent
    with torch.no_grad():
        W -= learning_rate * W.grad


Loss after 0 iterations: 3.639911651611328
Loss after 1 iterations: 3.6283226013183594
Loss after 2 iterations: 3.6167495250701904
Loss after 3 iterations: 3.605191469192505
Loss after 4 iterations: 3.59364914894104
Loss after 5 iterations: 3.5821235179901123
Loss after 6 iterations: 3.570613145828247
Loss after 7 iterations: 3.559119939804077
Loss after 8 iterations: 3.5476415157318115
Loss after 9 iterations: 3.536179780960083
Loss after 10 iterations: 3.524733781814575
Loss after 11 iterations: 3.5133044719696045
Loss after 12 iterations: 3.501891851425171
Loss after 13 iterations: 3.4904944896698
Loss after 14 iterations: 3.479114532470703
Loss after 15 iterations: 3.467749834060669
Loss after 16 iterations: 3.4564027786254883
Loss after 17 iterations: 3.4450721740722656
Loss after 18 iterations: 3.4337587356567383
Loss after 19 iterations: 3.422461748123169
Loss after 20 iterations: 3.4111814498901367
Loss after 21 iterations: 3.3999178409576416
Loss after 22 iterations: 3.3886716

- So far we have trained the NN on only a small number of examples (12).
- Let's now train on the entire dataset.

In [204]:
# Build (context, target) training pairs from every name in the dataset.
X, y = [], []
for name in names:
    # '..' pads the start so the first real character has a two-character
    # context; the trailing '.' is the target that ends the name.
    padded = '..' + name + '.'
    for start in range(len(padded) - 2):
        c1, c2, c3 = padded[start:start + 3]
        # the second context character is shifted by vocab_size so that each
        # position addresses its own block of input slots
        X.append([ctoi[c1], vocab_size + ctoi[c2]])
        y.append(ctoi[c3])

In [235]:
# Encode every two-character context as one row of width 2*vocab_size:
# the slot of the first character gets 0.5 and the slot of the second
# character (already offset by vocab_size in X) gets 1.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# following cells refer to it.
# reshape(-1, 2) keeps the index tensor two-dimensional even when X is empty.
context_ix = torch.tensor(X, dtype=torch.long).reshape(-1, 2)
row_ix = torch.arange(context_ix.shape[0])

input = torch.zeros((context_ix.shape[0], 2*vocab_size))
# Vectorized indexing fills all rows at once; a Python loop with two tensor
# writes per example is needlessly slow on the full dataset.
input[row_ix, context_ix[:, 0]] = 0.5
input[row_ix, context_ix[:, 1]] = 1

In [208]:
# hyperparameters
learning_rate = 10
num_iterations = 1000

# weights: freshly seeded so that the run is reproducible, tracked by autograd
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(2 * vocab_size, vocab_size, generator=g, requires_grad=True)

In [264]:
# compile the forward pass, backward pass and update into a loop of iterations
# (these two assignments override the values set when the weights were initialized)
num_iterations = 100
learning_rate = 20
for i in range(num_iterations):
    # forward pass: softmax over the logits, then the mean NLL of the targets
    logits = input @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    loss = -probs[torch.arange(len(input)), y].log().mean()
    print(loss.item())

    # backward pass: drop the old gradient (backward() accumulates), then backprop
    W.grad = None
    loss.backward()

    # gradient-descent step; done under no_grad() rather than through the
    # legacy `.data` attribute so autograd's bookkeeping stays consistent
    with torch.no_grad():
        W -= learning_rate * W.grad

2.3369383811950684
2.3369383811950684
2.3369383811950684
2.3369381427764893
2.3369381427764893
2.336937665939331
2.336937427520752
2.336937189102173
2.3369369506835938
2.3369369506835938
2.3369367122650146
2.3369364738464355
2.3369364738464355
2.3369359970092773
2.3369359970092773
2.3369359970092773
2.336935520172119
2.33693528175354
2.33693528175354
2.33693528175354
2.336935043334961
2.336935043334961
2.3369345664978027
2.3369345664978027
2.3369340896606445
2.3369338512420654
2.3369338512420654
2.3369336128234863
2.3369333744049072
2.3369333744049072
2.336933135986328
2.336932897567749
2.336932897567749
2.336932897567749
2.3369321823120117
2.3369321823120117
2.3369319438934326
2.3369317054748535
2.3369314670562744
2.3369314670562744
2.3369312286376953
2.3369312286376953
2.3369312286376953
2.336930513381958
2.336930513381958
2.336930274963379
2.3369300365448
2.3369300365448
2.3369300365448
2.3369295597076416
2.3369295597076416
2.3369295597076416
2.3369293212890625
2.3369290828704834
2.

In [265]:
# re-create the generator with the same seed as before, so that sampling from
# the neural network is reproducible
g = torch.Generator()
g.manual_seed(2147483647)


ka.
kene.
madri.
ava.
gri.
ber.
rayino.
urite.
tah.
rahemikely.


In [272]:
# let's generate 10 names from scratch
# Index of '.', which serves both as the start padding and as the stop token.
# It is looked up from the vocabulary instead of hard-coding 26.
end_ix = ctoi['.']

# Sampling is pure inference: under no_grad() autograd does not build a graph
# for every step, which saves the memory that tracking W would otherwise cost.
with torch.no_grad():
    for sample_idx in range(10):
        out = []
        # initialize the input bigram as '..', indicating the start of the name
        ix1 = end_ix  # second-to-last character of the context
        ix2 = end_ix  # last character of the context (the most recently generated one)
        while True:
            # encode the context exactly like the training inputs:
            # 0.5 in the first block, 1 in the second (offset by vocab_size)
            test_input = torch.zeros((1, 2*vocab_size))
            test_input[0, ix1] = 0.5
            test_input[0, vocab_size + ix2] = 1

            # forward pass: softmax over the logits
            logits = test_input @ W
            counts = logits.exp()
            probs = counts / counts.sum(1, keepdim=True)

            # Sample one character index from the predicted distribution.
            # .item() is required: without it ix3 is a tensor and the
            # itoc lookup below fails.
            ix3 = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            out.append(itoc[ix3])

            # stop once '.' is generated, as it is our stop token
            if ix3 == end_ix:
                break

            # slide the context window: the newly generated character becomes
            # the last context character for the next iteration
            ix1, ix2 = ix2, ix3
        print(''.join(out))

caleeliynaysuveryel.
catain.
kamikadem.
key.
krim.
elaiman.
naplentaege.
ny.
cangtoyn.
are.
