# Setup and Imports

In [1]:
import torch
import torch.nn.functional as F
import random

In [2]:
words = open('names.txt').read().splitlines()

In [3]:
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i for i, s in enumerate(chars, 1)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}

# E01:

train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

### Counting Model

In [5]:
# creating the trigram counting model

N = torch.zeros(size=(27, 27, 27))

for word in words:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1, ix2, ix3 = stoi[ch1], stoi[ch2], stoi[ch3]
        N[ix1, ix2, ix3] += 1

In [6]:
# sampling from the model

# first, sample the second letter as a bigram model (because you only have the special dot token as input)


P = N / N.sum(dim=2, keepdim=True)

for i in range(10):
    out = []
    ix1 = 0
    p = N[ix1].float()
    p = p.sum(axis=0) / p.sum() # look for the probability of the second letter

    ix2 = torch.multinomial(p, 1, replacement=True).item()

    out.append(itos[ix2])

# all letters from now on will depend on the previous two characters (trigram)
    
    while True:
        p = P[ix1, ix2]
        ix1 = ix2
        ix2 = torch.multinomial(p, 1, replacement=True).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break

    print(''.join(out))



aylah.
on.
el.
ox.
os.
abia.
ven.
yah.
ehann.
emoe.


In [7]:
# calculating the negative log likliehood

nll = 0
n = 0
for word in words:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1, ix2, ix3 = stoi[ch1], stoi[ch2], stoi[ch3]
        nll += -torch.log(P[ix1, ix2, ix3])
        n += 1
        
print(nll/n)

tensor(2.0620)


### Neural Network

In [8]:
# preparing the dataset

xs, ys = [], []
for word in words:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1, ix2, ix3 = stoi[ch1], stoi[ch2], stoi[ch3]
        xs.append([ix1, ix2])
        ys.append(ix3)
        
xs, ys = torch.tensor(xs), torch.tensor(ys)

In [9]:
W = torch.randn(27*2, 27, requires_grad=True)

for k in range(200):
    xenc = F.one_hot(xs, num_classes=27).reshape(-1, 27*2).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(axis=1, keepdims=True)
    loss = -probs[torch.arange(xs.shape[0]), ys].log().mean()
#     print(loss.item())
    
    W.grad = None
    loss.backward()
    
    W.data += -50*W.grad
    
print(loss.item())


2.248483657836914


In [10]:
# sampling from the model

# first, sample the second letter as a bigram model (because you only have the special dot token as input)



for i in range(10):
    out = []
    
    ix1 = None
    ix2 = 0
    
    xenc = torch.concat([F.one_hot(torch.tensor([ix2]), num_classes=27), torch.zeros(1,27)], axis=1).float()
    logits = xenc @ W
    counts = logits.exp()
    p = counts / counts.sum(axis=1, keepdims=True)
    
    ix1 = ix2
    ix2 = torch.multinomial(p, 1, replacement=True).item()

    out.append(itos[ix2])

# all letters from now on will depend on the previous two characters (trigram)
    
    while True:
        xenc = torch.concat([F.one_hot(torch.tensor([ix1]), num_classes=27),
                             F.one_hot(torch.tensor([ix2]), num_classes=27)], axis=1).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(axis=1, keepdims=True)
        
        ix1 = ix2
        ix2 = torch.multinomial(p, 1, replacement=True).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break

    print(''.join(out))



len.
imkelderi.
iar.
ie.
ah.
azelen.
ufzi.
myn.
wa.
ola.


# E02: 
split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [11]:
# preparing the dataset

xs, ys = [], []
for word in words:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1, ix2, ix3 = stoi[ch1], stoi[ch2], stoi[ch3]
        xs.append([ix1, ix2])
        ys.append(ix3)
        
xs, ys = torch.tensor(xs), torch.tensor(ys)

In [12]:
train_split, dev_split = int(0.8*len(xs)), int(0.9*len(xs))
print(train_split, dev_split)

156890 176501


Shuffling the dataset first

In [13]:
idxs = list(range(xs.shape[0]))
random.shuffle(idxs)

xs, ys = xs[idxs], ys[idxs]

In [14]:
xs_train, ys_train = xs[:train_split], ys[:train_split]

In [15]:
xs_dev, ys_dev = xs[train_split:dev_split], ys[train_split:dev_split]

In [16]:
xs_test, ys_test = xs[dev_split:], ys[dev_split:]

In [17]:
W = torch.randn(27*2, 27, requires_grad=True)

for k in range(200):
    xenc = F.one_hot(xs_train, num_classes=27).reshape(-1, 27*2).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(axis=1, keepdims=True)
    train_loss = -probs[torch.arange(xs_train.shape[0]), ys_train].log().mean()
#     print(f'*train set loss = {loss.item()}')
    
    W.grad = None
    train_loss.backward()
    
    W.data += -50*W.grad
    
    
    ## eval on dev set
    with torch.no_grad():
        xenc = F.one_hot(xs_dev, num_classes=27).reshape(-1, 27*2).float()
        logits = xenc @ W
        counts = logits.exp()
        probs = counts / counts.sum(axis=1, keepdims=True)
        dev_loss = -probs[torch.arange(xs_dev.shape[0]), ys_dev].log().mean()
#         print(f'**dev set loss = {loss.item()}')
    
print(f'*train set loss = {train_loss.item()}')
print(f'**dev set loss = {dev_loss.item()}')


*train set loss = 2.2469754219055176
**dev set loss = 2.254650831222534


In [18]:
# eval on test set

xenc = F.one_hot(xs_test, num_classes=27).reshape(-1, 27*2).float()
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(axis=1, keepdims=True)
loss = -probs[torch.arange(xs_test.shape[0]), ys_test].log().mean()
print(f'**test set loss = {loss.item()}')

**test set loss = 2.257509231567383


# E03: 
use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [19]:
W = torch.randn(27*2, 27, requires_grad=True)

for alpha in [0.0001, 0.001, 0.01, 0.1]:
    print(f'Testing with alpha={alpha}')
    for k in range(200):
        xenc = F.one_hot(xs_train, num_classes=27).reshape(-1, 27*2).float()
        logits = xenc @ W
        counts = logits.exp()
        probs = counts / counts.sum(axis=1, keepdims=True)
        train_loss = -probs[torch.arange(xs_train.shape[0]), ys_train].log().mean() + alpha * (W**2).mean()
#         print(f'*train set loss = {loss.item()}')

        W.grad = None
        train_loss.backward()

        W.data += -50*W.grad


        ## eval on dev set
        with torch.no_grad():
            xenc = F.one_hot(xs_dev, num_classes=27).reshape(-1, 27*2).float()
            logits = xenc @ W
            counts = logits.exp()
            probs = counts / counts.sum(axis=1, keepdims=True)
            dev_loss = -probs[torch.arange(xs_dev.shape[0]), ys_dev].log().mean()
    print(f'**final dev set loss = {dev_loss.item()}')

Testing with alpha=0.0001
**final dev set loss = 2.2562365531921387
Testing with alpha=0.001
**final dev set loss = 2.2491519451141357
Testing with alpha=0.01
**final dev set loss = 2.248021364212036
Testing with alpha=0.1
**final dev set loss = 2.2681424617767334


alpha of 0.01 seems to be the best choice

In [20]:
xenc = F.one_hot(xs_test, num_classes=27).reshape(-1, 27*2).float()
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(axis=1, keepdims=True)
loss = -probs[torch.arange(xs_test.shape[0]), ys_test].log().mean()
print(f'**test set loss = {loss.item()}')

**test set loss = 2.2707808017730713


# E04:
we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [21]:
W = torch.randn(27*2, 27, requires_grad=True)

xs_train_offset = xs_train.clone()
xs_dev_offset = xs_dev.clone()

xs_train_offset[:, 1] = xs_train[:, 1] + 27
xs_dev_offset[:, 1] = xs_dev[:, 1] + 27

for k in range(200):
    logits = W[xs_train_offset].sum(axis=1)
    counts = logits.exp()
    probs = counts / counts.sum(axis=1, keepdims=True)
    train_loss = -probs[torch.arange(xs_train.shape[0]), ys_train].log().mean() + 0.01 * (W**2).mean()
#     print(f'*train set loss = {loss.item()}')

    W.grad = None
    train_loss.backward()

    W.data += -50*W.grad


    ## eval on dev set
    with torch.no_grad():
        logits = W[xs_dev_offset].sum(axis=1)
        counts = logits.exp()
        probs = counts / counts.sum(axis=1, keepdims=True)
        dev_loss = -probs[torch.arange(xs_dev.shape[0]), ys_dev].log().mean()
#         print(f'**dev set loss = {loss.item()}')


print(f'*train set loss = {train_loss.item()}')
print(f'**dev set loss = {dev_loss.item()}')

*train set loss = 2.2586700916290283
**dev set loss = 2.2561118602752686


# E05:
look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [22]:
W = torch.randn(27*2, 27, requires_grad=True)

for k in range(200):
    xenc = F.one_hot(xs_train, num_classes=27).reshape(-1, 27*2).float()
    logits = xenc @ W
#     counts = logits.exp()
#     probs = counts / counts.sum(axis=1, keepdims=True)
    train_loss = F.cross_entropy(logits, ys_train)
#     print(f'*train set loss = {loss.item()}')
    
    W.grad = None
    train_loss.backward()
    
    W.data += -50*W.grad
    
    
    ## eval on dev set
    xenc = F.one_hot(xs_dev, num_classes=27).reshape(-1, 27*2).float()
    logits = xenc @ W
#     counts = logits.exp()
#     probs = counts / counts.sum(axis=1, keepdims=True)
    dev_loss = F.cross_entropy(logits, ys_dev)
#     print(f'**dev set loss = {loss.item()}')
    
print(f'*train set loss = {train_loss.item()}')
print(f'**dev set loss = {dev_loss.item()}')

*train set loss = 2.247150182723999
**dev set loss = 2.2552051544189453


# E06: 
meta-exercise! Think of a fun/interesting exercise and complete it.


In [23]:
# combining input words through addition instead of concat


import torch.nn.functional as F

W = torch.randn(27, 27, requires_grad=True)

for k in range(200):
    xenc = F.one_hot(xs, num_classes=27).sum(dim=1).float()
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(axis=1, keepdims=True)
    loss = -probs[torch.arange(xs.shape[0]), ys].log().mean()
#     print(loss.item())
    
    W.grad = None
    loss.backward()
    
    W.data += -50*W.grad
    
print(loss.item())

2.5141541957855225


In [24]:
# sampling from the model

# first, sample the second letter as a bigram model (because you only have the special dot token as input)



for i in range(10):
    out = []
    
    ix1 = None
    ix2 = 0
    
    xenc = torch.concat([F.one_hot(torch.tensor([ix2]), num_classes=27)], axis=1).float()
    logits = xenc @ W
    counts = logits.exp()
    p = counts / counts.sum(axis=1, keepdims=True)
    
    ix1 = ix2
    ix2 = torch.multinomial(p, 1, replacement=True).item()

    out.append(itos[ix2])

# all letters from now on will depend on the previous two characters (trigram)
    
    while True:
        xenc = (F.one_hot(torch.tensor([ix1]), num_classes=27) + F.one_hot(torch.tensor([ix2]), num_classes=27)).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(axis=1, keepdims=True)
        
        ix1 = ix2
        ix2 = torch.multinomial(p, 1, replacement=True).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break

    print(''.join(out))



aveys.
etykaeleidegahnansaimalbi.
aanienndgiamiahreyanvo.
quetzekyniaslahnanemkariahnesdelnannaaih.
anfarlieigente.
vyilnasfiaadveobleevnaynkeppenantlasgexlbaryf.
ydielnaderniealinecaenoinetniaslanianlioadne.
aihalivlinalryonniaskaylaotelnarnanemriyncise.
hal.
halyadreadermeymric.
