In [4]:
# E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?
# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?
# E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?
# E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?
# E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?
# E06: meta-exercise! Think of a fun/interesting exercise and complete it.

# E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [6]:
import torch

In [204]:
words = open('names.txt', 'r').read().splitlines()

In [229]:
# Given tri-gram add another dimension
N = torch.zeros((27, 27, 27), dtype=torch.int32)
chars = sorted(list(set(''.join(words))))
# Character number: a : 1
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
# Number character: 1 : a
itos = {i:s for s,i in stoi.items()}

In [266]:
# COUNTING METHOD APPROACH TO TRIGRAM
log_likehood = 0.0
n = 0
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        # Index which character and add to distribution model
        N[ix1, ix2, ix3] += 1
        
# Ensure no zeros in generation, therefore no infinity unlikely
# Model smoothing by adding 1 to all values.
P = (N+1).float()
 # .sum(2) because we want the third level of the array to be normalized to get the probabilities
P /= P.sum(2, keepdims=True)

In [272]:
# # Bi-gram model

# [ 
#   a: [a => 0.4, b => 0.6] # We had a probability of 0.4 to get an "a" after an "a"
#   b: [a => 0.3, b => 0.7]
# ] 

# # Trigram model

# [ 
#   a: [
#     a: [a => 0.4, b => 0.6] # We had a probability of 0.4 to get an "a" after an "a" after an "a"
#     b: [a => 0.3, b => 0.7]
#   ]
#   b: [
#     a: [a => 0.8, b => 0.2]
#   ]
# ] 

In [274]:
# Check normalization was successfull
# Sum of values in third level sums to 1 (normalized)
P[1, 1].sum()

tensor(1.0000)

In [276]:
g = torch.Generator().manual_seed(2147483647)
# Sample for probabilities
for i in range(10):
  # Change depending on what we want
  out = ['a', 'b']
  while True:
    p = P[stoi[out[-2]], stoi[out[-1]]] # Plug the last two chars into our probabilities table
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
      break
  print(''.join(out))
        

abdunide.
abilyasid.
aburelay.
abelin.
ab.
abdi.
abritoper.
abrayel.
abetannaaryanileniassibiainewin.
abressiyanayla.


In [284]:
# Neural network time
# create data
# input, target
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append((ix1, ix2)) # Need to keep two chars for input, so array of tuple
        ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

print("examples", ys.shape[0])

g = torch.Generator().manual_seed(2147483647)
W =  torch.randn((27*2, 27), generator=g, requires_grad=True) 
# You just need 27x2, the first 27 will activate for the first input, the second 27 for the second one
# Double the inputs, double the neurons, that makes sense! 

examples 196113


In [294]:
import torch.nn.functional as F
# Sampling size and gradient descent.softmax classifier to find loss function
for k in range(1000):
    xenc = F.one_hot(xs, num_classes=27).float() # does ont hot on both inputs, so get two times 27 tensors, does neuron
    # .view() is used to change the format of tensors. xenc is currently [examplesCount, 2, 27]
    # and we want [examplesCount, 54] (to match the shape of W [54, 27])
    logits = xenc.view(-1, 27*2) @ W # -1 means we let torch define how much we'll get in the first dimension to stay compatible, in our case it should not change
    
    # SOFTMAX CLASSIFIER
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    
    loss = -probs[torch.arange(ys.shape[0]), ys].log().mean() # ys.shape[0] is the number of examples (allows us to avoid using a count variable)
    
    W.grad = None
    loss.backward()
    
    W.data += -3 * W.grad
    
print(loss.item())


2.283792018890381


# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [308]:
import random
import math

random.shuffle(words)
c = len(words)

trainSet = words[:math.floor(c * 0.8)]
devSet = words[math.floor(c * 0.8): math.floor(c * 0.9)]
testSet = words[math.floor(c * 0.9):]


In [326]:
# TRAIN SET

xs, ys = [], []
lossi = []
for w in trainSet:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append((ix1, ix2)) # Need to keep two chars for input, so array of tuple
        ys.append(ix3)
    
xs = torch.tensor(xs)
ys = torch.tensor(ys)

print("examples", ys.shape[0])

g = torch.Generator().manual_seed(2147483647)
W =  torch.randn((27*2, 27), generator=g, requires_grad=True) 

# Sampling size and gradient descent.softmax classifier to find loss function
for k in range(1000):
    xenc = F.one_hot(xs, num_classes=27).float() # does ont hot on both inputs, so get two times 27 tensors, does neuron
    # .view() is used to change the format of tensors. xenc is currently [examplesCount, 2, 27]
    # and we want [examplesCount, 54] (to match the shape of W [54, 27])
    logits = xenc.view(-1, 27*2) @ W # -1 means we let torch define how much we'll get in the first dimension to stay compatible, in our case it should not change
    
    # SOFTMAX CLASSIFIER
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    
    loss = -probs[torch.arange(ys.shape[0]), ys].log().mean() # ys.shape[0] is the number of examples (allows us to avoid using a count variable)
    
    W.grad = None
    loss.backward()
    
    W.data += -3 * W.grad
    
print(loss.item())


examples 156870
2.2842559814453125


In [330]:
# Let's check the loss for the testSet && devSet ? 
# TEST SET
# No training is done here, just comparing loss function against training set
lossi = []
for w in testSet:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
      
    xenc = F.one_hot(torch.tensor((ix1, ix2)), num_classes=27).float()
    logits = xenc.view(-1, 27*2) @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    lossi.append(-probs[0, ix3].log())
print(torch.tensor(lossi).mean()) 

# DEV SET
lossi = []
for w in devSet:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
      
    xenc = F.one_hot(torch.tensor((ix1, ix2)), num_classes=27).float()
    logits = xenc.view(-1, 27*2) @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    lossi.append(-probs[0, ix3].log())
print(torch.tensor(lossi).mean())

# The losses are very similar, which means that the model is not overfitting.

tensor(2.2846)
tensor(2.2820)


# E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [363]:
# Smoothing
smoothness = [0, 0.01, 0.02, 0.05, 0.1, 0.25, 0.5, 1.0]

for i, smoothness in enumerate(smoothness):
    W = torch.randn((27*2, 27), generator=g, requires_grad=True) 
    for k in range(100):
        xenc = F.one_hot(xs, num_classes=27).float()
        logits = xenc.view(-1, 27*2) @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)
        # standard negative log-likelihood loss
        # By squaring W, the penalty grows faster as weights increase in magnitude.
        # This discourages the optimizer from increasing weights excessively, 
        # acting as a form of L2 regularization to promote smoother, more generalizable models.
        loss = -probs[torch.arange(ys.shape[0]), ys].log().mean() + smoothness*(W**2).mean()
        
        W.grad = None
        loss.backward()
        
        W.data += -3 * W.grad

print(f"Smoothness: {smoothness} => loss train set: {loss.item()}")
    
lossi = []
for w in devSet:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
      
    xenc = F.one_hot(torch.tensor((ix1, ix2)), num_classes=27).float()
    logits = xenc.view(-1, 27*2) @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    lossi.append(-probs[0, ix3].log())
print(f"Smoothness: {smoothness} => loss dev set: {torch.tensor(lossi).mean()}")
print()

Smoothness: 1.0 => loss train set: 2.965202569961548
Smoothness: 1.0 => loss dev set: 2.570209264755249



# E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [365]:
# Unsure how this works
xenc_pre = torch.zeros(*xs.shape, 27)
xenc_pre[torch.arange(xs.shape[0]), 0, xs[:,0]] = 1
xenc_pre[torch.arange(xs.shape[0]), 1, xs[:,1]] = 1

# E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [346]:
xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc.view(-1, 27*2) @ W

counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(ys.shape[0]), ys].log().mean() + 0.01*(W**2).mean()

loss, F.cross_entropy(logits, ys) + 0.01*(W**2).mean()

# https://jamesmccaffrey.wordpress.com/2013/11/05/why-you-should-use-cross-entropy-error-instead-of-classification-error-or-mean-squared-error-for-neural-network-classifier-training/

(tensor(2.2926, grad_fn=<AddBackward0>),
 tensor(2.2926, grad_fn=<AddBackward0>))