In [4]:
# E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?
# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?
# E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?
# E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?
# E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?
# E06: meta-exercise! Think of a fun/interesting exercise and complete it.

In [6]:
import torch

In [204]:
# Load the dataset: one name per line. The context manager guarantees the
# file handle is closed as soon as the contents have been read.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [229]:
# A trigram needs one more axis than a bigram: N[first, second, third] holds
# how often `third` follows the pair (first, second).
N = torch.zeros(27, 27, 27, dtype=torch.int32)
# Every distinct character that occurs in the dataset, in sorted order.
chars = sorted(set(''.join(words)))
# character -> index, starting at 1 (e.g. a : 1) ...
stoi = dict(zip(chars, range(1, len(chars) + 1)))
# ... because index 0 is reserved for the '.' start/end marker.
stoi['.'] = 0
# index -> character (e.g. 1 : a), the inverse of stoi.
itos = {index: symbol for symbol, index in stoi.items()}

In [266]:
# COUNTING METHOD APPROACH TO TRIGRAM
# Accumulators initialised here but not used in this cell; kept because a
# later evaluation cell may read them -- TODO confirm.
log_likehood = 0.0
n = 0
# N is created in an earlier cell and incremented in place below. Reset it
# first so re-running this cell does not double-count every trigram (which
# would also change the relative weight of the +1 smoothing).
N.zero_()
for w in words:
    # '.' marks both the start and the end of a name.
    # NOTE(review): with a single leading '.', the first letter of each name
    # only ever appears as context and is never a prediction target; pad with
    # two '.' if the model should also predict the first letter.
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        # Count one occurrence of ch3 following the pair (ch1, ch2).
        N[stoi[ch1], stoi[ch2], stoi[ch3]] += 1

# Add-one smoothing: no trigram gets probability zero, so its negative
# log-likelihood can never be infinite.
P = (N + 1).float()
# Normalise over the last axis (the predicted character) so that every
# P[first, second] row is a probability distribution.
P /= P.sum(2, keepdims=True)

In [267]:
# # Bi-gram model: each row is conditioned on the single previous character

# [ 
#   a: [a => 0.4, b => 0.6] # P("a" | previous character "a") = 0.4
#   b: [a => 0.3, b => 0.7]
# ] 

# # Trigram model: each innermost row is conditioned on the two previous characters

# [ 
#   a: [
#     a: [a => 0.4, b => 0.6] # P("a" | previous two characters "a", "a") = 0.4
#     b: [a => 0.3, b => 0.7]
#   ]
#   b: [
#     a: [a => 0.8, b => 0.2]
#   ]
# ] 

SyntaxError: invalid syntax (3552410657.py, line 4)

In [None]:
# Check that normalization was successful: for every (first, second) context
# the probabilities over the third character must sum to 1. Checking all rows
# catches problems that a single spot check could miss.
row_sums = P.sum(2)
assert torch.allclose(row_sums, torch.ones_like(row_sums)), "P is not normalized over the last axis"
# Spot check displayed as the cell output: the row for context (1, 1).
P[1, 1].sum()

In [None]:
# Fixed seed so the sampled names are reproducible.
g = torch.Generator().manual_seed(2147483647)
# Draw 10 samples from the trigram probability table.
for _ in range(10):
  # Starting context: the two characters every sample begins with (edit to taste).
  out = ['a', 'b']
  ix = None
  # Keep sampling until the end marker (index 0) has been drawn and appended.
  while ix != 0:
    prev2, prev1 = out[-2:]
    # Distribution over the next character given the last two characters.
    p = P[stoi[prev2], stoi[prev1]]
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
  print(''.join(out))
        

In [None]:
import torch.nn.functional as F
# Network input: each index in xs becomes a 27-wide one-hot float row.
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
# Multiplying by W yields one logit (log-count) per output character.
logits = torch.matmul(xenc, W)

# SOFTMAX
# Exponentiate the logits into positive pseudo-counts (the analogue of N) ...
counts = torch.exp(logits)
# ... then normalise along dim 1 to get next-character probabilities.
probs = counts / counts.sum(dim=1, keepdim=True)

print(probs.shape)

In [172]:
# Inspect the encoded input's shape. A bare expression in the middle of a cell
# is not displayed, so print it explicitly.
print(xenc.shape)
# NOTE(review): W is (27, 27), so this layer consumes a single 27-wide one-hot
# per example. A trigram input made of two concatenated one-hots (54 values)
# would need a (54, 27) weight matrix -- confirm which one is intended.
# 27 output neurons, one per character.
# Dedicated seeded generator so the initial weights are reproducible after a
# kernel restart, without disturbing the generator `g` used for sampling.
w_init_gen = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=w_init_gen, requires_grad=True)
xenc @ W

tensor([[ 0.1154, -0.9587,  1.6701,  ..., -1.5305, -1.2729, -1.6326],
        [-1.3154,  1.1859, -0.5984,  ...,  0.2010, -0.7432, -0.0135],
        [-1.3154,  1.1859, -0.5984,  ...,  0.2010, -0.7432, -0.0135],
        ...,
        [ 1.8039, -1.0420,  0.0166,  ...,  0.6736,  0.3726, -0.8610],
        [ 1.8039, -1.0420,  0.0166,  ...,  0.6736,  0.3726, -0.8610],
        [-0.4264, -0.8964, -0.8476,  ...,  2.0451, -0.7935,  0.5973]],
       grad_fn=<MmBackward0>)

In [182]:
# forward pass
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
logits = xenc @ W # predict log-counts
counts = logits.exp() # counts, equivalent to N
probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
# Vectorized negative log-likelihood: for each example i, pick the probability
# assigned to its true next character ys[i], then average the negative logs.
# The row index must span the number of examples, not the vocabulary size, so
# it is derived from ys instead of being hard-coded to 27.
# assumes ys is a 1-D index tensor with one label per row of probs -- TODO confirm
loss = -probs[torch.arange(ys.shape[0]), ys].log().mean()

In [190]:
# Show the current loss as a plain Python float.
print(float(loss))

4.269371509552002


In [192]:
# backward pass
# Drop any gradient left on W so this backward call does not add on top of it.
W.grad = None # reset the gradient (None means "no gradient yet")
# Backpropagate from the loss. The graph's saved intermediates are freed by
# this call, so running this cell again without first re-running the forward
# pass raises "Trying to backward through the graph a second time".
loss.backward()

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [188]:
# Gradient-descent step: move W against its gradient with learning rate 0.1.
# Done in place under torch.no_grad() rather than through W.data, so the step
# is not recorded in the graph but autograd still sees that W was modified.
# Requires W.grad to be populated, i.e. the backward pass must have run first.
with torch.no_grad():
    W -= 0.1 * W.grad