In [2]:
import torch
import torch.nn.functional as F

In [3]:
words = open('train.txt', 'r').read().splitlines()
test_words = open('test.txt', 'r').read().splitlines()
dev_words = open('dev.txt', 'r').read().splitlines()

# Bigram Model

In [5]:
chars = sorted(list(set(''.join(words))))   # alphabetically sorted list of unique set of letters (26)
stoi = {s:i+1 for i, s in enumerate(chars)}  # creates a dict with mapping of index to each letter. eg {'a': 0, 'b':1 ...}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}

In [41]:
# creating the dataset
xs, ys = [], []   # inputs, targets

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

number of examples:  182539


In [44]:
# gradient descent
xenc = F.one_hot(xs, num_classes=27).float()  # one hot encoding input to network

for k in range(100):

    # forward pass
    logits = xenc @ W  # predict log-counts
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    loss = -probs[torch.arange(num), ys].log().mean() 
    print(loss.item())

    # backward pass
    W.grad = None  # set gradient to zero
    loss.backward()

    # update
    W.data += -50 * W.grad

2.462986707687378
2.462939500808716
2.462892770767212
2.4628467559814453
2.462801218032837
2.4627561569213867
2.4627113342285156
2.462667226791382
2.462623357772827
2.4625802040100098
2.4625372886657715
2.4624950885772705
2.4624531269073486
2.462411642074585
2.4623706340789795
2.462329626083374
2.462289333343506
2.462249517440796
2.462209701538086
2.4621708393096924
2.462131977081299
2.4620938301086426
2.4620556831359863
2.4620180130004883
2.4619805812835693
2.4619438648223877
2.461907148361206
2.4618709087371826
2.4618351459503174
2.4617996215820312
2.461764335632324
2.4617292881011963
2.4616947174072266
2.461660623550415
2.4616267681121826
2.4615931510925293
2.461559534072876
2.46152663230896
2.461493730545044
2.4614615440368652
2.4614293575286865
2.461397409439087
2.4613656997680664
2.461334705352783
2.461303472518921
2.4612724781036377
2.4612419605255127
2.461211919784546
2.461181640625
2.4611523151397705
2.461122751235962
2.4610936641693115
2.461064577102661
2.46103572845459
2.461

In [47]:
# dev set 

xs, ys = [], []   # inputs, targets

for w in test_words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the network
g = torch.Generator().manual_seed(2147483647)
# W = torch.randn((27, 27), generator=g, requires_grad=True)

xenc = F.one_hot(xs, num_classes=27).float()  # one hot encoding input to network

number of examples:  22729


In [48]:
logits = xenc @ W  # predict log-counts
counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()  # second term is called L2 regularization
print(loss.item())

2.4755358695983887


# Trigram Model

In [34]:
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

xenc = F.one_hot(xs).float()
xenc = xenc.view(-1, 54)

number of examples:  156913


In [36]:
for k in range(100):
    # forward pass
    logits = xenc @ W  # This will result in shape (196113, 27*27)    
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    
    # Calculate loss
    loss = -probs[torch.arange(num), ys].log().mean()
    print(loss.item())

    # backward pass
    W.grad = None  # set gradient to zero
    loss.backward()
    
    # update
    W.data += -50 * W.grad

2.263188123703003
2.2628917694091797
2.262601375579834
2.2623162269592285
2.2620365619659424
2.2617621421813965
2.2614927291870117
2.261228322982788
2.2609689235687256
2.260714054107666
2.2604639530181885
2.2602181434631348
2.259977102279663
2.259740114212036
2.259506940841675
2.2592782974243164
2.2590532302856445
2.2588322162628174
2.2586147785186768
2.2584009170532227
2.258190870285034
2.2579843997955322
2.2577810287475586
2.2575809955596924
2.2573843002319336
2.257190465927124
2.257000207901001
2.256812810897827
2.2566277980804443
2.256446361541748
2.256267547607422
2.256091356277466
2.255918264389038
2.2557473182678223
2.2555792331695557
2.25541353225708
2.2552504539489746
2.255089521408081
2.2549309730529785
2.254775285720825
2.2546215057373047
2.254469633102417
2.2543203830718994
2.2541730403900146
2.254027843475342
2.253884792327881
2.2537436485290527
2.2536046504974365
2.253467082977295
2.2533316612243652
2.2531981468200684
2.2530665397644043
2.252936363220215
2.252808332443237

In [39]:
# dev set 

xs, ys = [], []   # inputs, targets

for w in test_words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

# initialize the network
g = torch.Generator().manual_seed(2147483647)

xenc = F.one_hot(xs).float()
xenc = xenc.view(-1, 54)

number of examples:  19525


In [40]:
logits = xenc @ W  # predict log-counts
counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
loss = -probs[torch.arange(num), ys].log().mean() # + 0.1*(W**2).mean()  # second term is called L2 regularization
print(loss.item())

2.246502637863159


# Regularization

In [77]:
# lets train the model


xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

xenc = F.one_hot(xs).float()
xenc = xenc.view(-1, 54)

number of examples:  156913


In [81]:
for k in range(100):
    # forward pass
    logits = xenc @ W  # This will result in shape (196113, 27*27)    
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    
    # Calculate loss
    loss = -probs[torch.arange(num), ys].log().mean() + 0.001*(W**2).mean() # this smoothens out the weights
    print(loss.item())

    # backward pass
    W.grad = None  # set gradient to zero
    loss.backward()
    
    # update
    W.data += -50 * W.grad

2.244788885116577
2.2447590827941895
2.244729995727539
2.2447006702423096
2.2446718215942383
2.244642972946167
2.244614839553833
2.244586706161499
2.244558334350586
2.24453067779541
2.2445027828216553
2.2444753646850586
2.244448184967041
2.2444212436676025
2.244394302368164
2.244367837905884
2.2443411350250244
2.2443149089813232
2.244288921356201
2.244263172149658
2.2442374229431152
2.2442119121551514
2.2441866397857666
2.244161605834961
2.244136333465576
2.2441117763519287
2.244086980819702
2.244062900543213
2.2440385818481445
2.244014263153076
2.243990659713745
2.243966579437256
2.243943214416504
2.243919849395752
2.243896722793579
2.2438735961914062
2.2438504695892334
2.2438278198242188
2.243805408477783
2.2437827587127686
2.243760347366333
2.2437379360198975
2.243716239929199
2.243694543838501
2.2436728477478027
2.2436511516571045
2.2436296939849854
2.2436084747314453
2.2435874938964844
2.2435667514801025
2.2435457706451416
2.2435250282287598
2.243504524230957
2.2434842586517334
2.

In [None]:
# our loss is much higher in this case

In [82]:
# dev set 

xs, ys = [], []   # inputs, targets

for w in test_words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

# initialize the network
g = torch.Generator().manual_seed(2147483647)

xenc = F.one_hot(xs).float()
xenc = xenc.view(-1, 54)

number of examples:  19525


In [83]:
logits = xenc @ W  # predict log-counts
counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
loss = -probs[torch.arange(num), ys].log().mean() # + 0.1*(W**2).mean()  # second term is called L2 regularization
print(loss.item())

2.239325761795044


# Removing our use of F.one_hot

In [91]:
words = open('names.txt', 'r').read().splitlines()

In [92]:
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

# xenc = F.one_hot(xs).float()
# xenc = xenc.view(-1, 54)

number of examples:  196113


In [98]:
# W[xs[0]].shape
W[5]

tensor([ 4.7236e-01,  1.4830e+00,  3.1748e-01,  1.0588e+00,  2.3982e+00,
         4.6827e-01, -6.5650e-01,  6.1662e-01, -6.2197e-01,  5.1007e-01,
         1.3563e+00,  2.3445e-01, -4.5585e-01, -1.3132e-03, -5.1161e-01,
         5.5570e-01,  4.7458e-01, -1.3867e+00,  1.6229e+00,  1.7197e-01,
         9.8846e-01,  5.0657e-01,  1.0198e+00, -1.9062e+00, -4.2753e-01,
        -2.1259e+00,  9.6041e-01], grad_fn=<SelectBackward0>)

In [100]:
for k in range(100):
    # forward pass
    logits = W[xs[:, 0]] + W[xs[:, 1] + 27]  # This helps us make the operation more efficient by skipping over the step of having to one_hot encode
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    
    # Calculate loss
    loss = -probs[torch.arange(num), ys].log().mean()
    print(loss.item())

    # backward pass
    W.grad = None  # set gradient to zero
    loss.backward()
    
    # update
    W.data += -50 * W.grad

2.263436794281006
2.2631397247314453
2.262847900390625
2.2625620365142822
2.2622814178466797
2.2620060443878174
2.2617361545562744
2.2614707946777344
2.2612104415893555
2.2609548568725586
2.2607038021087646
2.2604572772979736
2.2602150440216064
2.259977102279663
2.2597432136535645
2.2595133781433105
2.2592875957489014
2.259065628051758
2.25884747505188
2.2586326599121094
2.2584214210510254
2.258213996887207
2.258009672164917
2.2578089237213135
2.2576115131378174
2.2574169635772705
2.257225751876831
2.2570371627807617
2.2568519115448
2.256669282913208
2.2564897537231445
2.256312608718872
2.256138563156128
2.255966901779175
2.255798101425171
2.255631685256958
2.255467653274536
2.2553060054779053
2.2551469802856445
2.2549901008605957
2.254835605621338
2.254683256149292
2.254533052444458
2.254384994506836
2.2542388439178467
2.2540953159332275
2.253953456878662
2.2538132667541504
2.2536752223968506
2.2535393238067627
2.2534048557281494
2.253272771835327
2.2531421184539795
2.2530131340026855

# Using F.cross_entropy

In [102]:
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']   # a single '.' character to indicate start and end of a word
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]  # input
        ix2 = stoi[ch2]  # input
        ix3 = stoi[ch3]  # output
        trigram = (ch1, ch2, ch3)
        xs.append([ix1, ix2])  # here we're adding the integer denoting the letter into the array, not the letter itself. because you can't do math on characters ofc
        ys.append(ix3)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.shape[0]
print('number of examples: ', num)

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27*2, 27), generator=g, requires_grad=True)

# xenc = F.one_hot(xs).float()
# xenc = xenc.view(-1, 54)

number of examples:  196113


In [103]:
for k in range(100):
    # forward pass
    logits = W[xs[:, 0]] + W[xs[:, 1] + 27]  # This helps us make the operation more efficient by skipping over the step of having to one_hot encode
    counts = logits.exp()  # equivalent to our initial N matrix containing the original frequencies
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    
    # Calculate loss using cross_entropy()
    # older method -> loss = -probs[torch.arange(num), ys].log().mean()
    loss = F.cross_entropy(logits, ys)
    print(loss.item())

    # backward pass
    W.grad = None  # set gradient to zero
    loss.backward()
    
    # update
    W.data += -50 * W.grad

4.186270713806152
3.3573663234710693
3.042149543762207
2.8714542388916016
2.7671947479248047
2.694681167602539
2.6390926837921143
2.5949814319610596
2.559002637863159
2.5292224884033203
2.5042338371276855
2.483072519302368
2.464961528778076
2.4493143558502197
2.435654401779175
2.423619031906128
2.4129199981689453
2.4033381938934326
2.394700765609741
2.386871337890625
2.379739999771118
2.3732173442840576
2.3672289848327637
2.3617119789123535
2.3566133975982666
2.3518881797790527
2.34749698638916
2.343407154083252
2.3395884037017822
2.3360161781311035
2.332667350769043
2.3295230865478516
2.3265650272369385
2.3237786293029785
2.3211495876312256
2.3186655044555664
2.3163156509399414
2.314089059829712
2.3119773864746094
2.3099722862243652
2.3080666065216064
2.3062520027160645
2.3045237064361572
2.3028757572174072
2.301301956176758
2.2997987270355225
2.298360586166382
2.2969841957092285
2.2956655025482178
2.294400691986084
2.293187141418457
2.292020797729492
2.290900230407715
2.2898223400115