In [1]:
# Character-level bigram model: given the current character as context,
# predict the character that comes next.
# Load the dataset as a list of names, one per line (splitlines drops the
# newline characters). The `with` block closes the file handle as soon as the
# contents have been read instead of leaving it open.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

len(words)

32033

In [2]:
# Length of the shortest name in the dataset.
min(map(len, words))

2

In [3]:
# Length of the longest name in the dataset.
max(map(len, words))

15

In [4]:
# Build a dictionary of bigram counts: for every pair of adjacent characters
# in the dataset, record how many times that pair occurs. These counts are the
# statistics used to guess which character tends to follow a given character.

b = {}
for name in words:
    # Wrap each name in start/end markers so that its first and last
    # characters also take part in a bigram.
    tokens = ['<S>', *name, '<E>']
    # zip(tokens, tokens[1:]) walks the sequence as consecutive
    # (current, next) tuples, which serve directly as dictionary keys.
    for bigram in zip(tokens, tokens[1:]):
        # dict.get supplies 0 the first time a pair is seen.
        b[bigram] = b.get(bigram, 0) + 1

In [5]:
# `b` now holds the bigram statistics for the whole dataset: how often each
# adjacent character pair occurs.
# List the (bigram, count) entries from most to least frequent. Only the last
# expression of a cell is displayed, so this is the single output of the cell.
# Python's sort is stable (also with reverse=True), so pairs with equal counts
# keep their insertion order.
sorted(b.items(), key=lambda kv: kv[1], reverse=True)

[(('n', '<E>'), 6763),
 (('a', '<E>'), 6640),
 (('a', 'n'), 5438),
 (('<S>', 'a'), 4410),
 (('e', '<E>'), 3983),
 (('a', 'r'), 3264),
 (('e', 'l'), 3248),
 (('r', 'i'), 3033),
 (('n', 'a'), 2977),
 (('<S>', 'k'), 2963),
 (('l', 'e'), 2921),
 (('e', 'n'), 2675),
 (('l', 'a'), 2623),
 (('m', 'a'), 2590),
 (('<S>', 'm'), 2538),
 (('a', 'l'), 2528),
 (('i', '<E>'), 2489),
 (('l', 'i'), 2480),
 (('i', 'a'), 2445),
 (('<S>', 'j'), 2422),
 (('o', 'n'), 2411),
 (('h', '<E>'), 2409),
 (('r', 'a'), 2356),
 (('a', 'h'), 2332),
 (('h', 'a'), 2244),
 (('y', 'a'), 2143),
 (('i', 'n'), 2126),
 (('<S>', 's'), 2055),
 (('a', 'y'), 2050),
 (('y', '<E>'), 2007),
 (('e', 'r'), 1958),
 (('n', 'n'), 1906),
 (('y', 'n'), 1826),
 (('k', 'a'), 1731),
 (('n', 'i'), 1725),
 (('r', 'e'), 1697),
 (('<S>', 'd'), 1690),
 (('i', 'e'), 1653),
 (('a', 'i'), 1650),
 (('<S>', 'r'), 1639),
 (('a', 'm'), 1634),
 (('l', 'y'), 1588),
 (('<S>', 'l'), 1572),
 (('<S>', 'c'), 1542),
 (('<S>', 'e'), 1531),
 (('j', 'a'), 1473),
 (

In [6]:
# Let's store the bigram counts in a 2-dimensional tensor,
# using torch.

import torch 
import numpy as np
import torch.nn.functional as F

In [7]:
# Vocabulary: every distinct character that appears in the names, sorted.
chars = sorted(set(''.join(words)))
# Character -> integer index. Characters from the data take 1..len(chars);
# index 0 is reserved for the special '.' token.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Integer index -> character (the inverse mapping of stoi).
itos = {idx: ch for ch, idx in stoi.items()}

In [8]:
# FULL SUMMARY: build the training set and initialize the model parameters.

# Vocabulary size: number of entries in stoi.
dsize = len(stoi)

# N[row, col] counts how often token `col` follows token `row`; xs / ys collect
# the same transitions as (input index, target index) pairs.
N = torch.zeros((dsize, dsize), dtype=torch.int32)
xs, ys = [], []
for name in words:
    # '.' is placed on both sides, marking the start and the end of the name.
    tokens = ['.', *name, '.']
    for cur, nxt in zip(tokens, tokens[1:]):
        row, col = stoi[cur], stoi[nxt]
        xs.append(row)
        ys.append(col)
        N[row, col] += 1

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print(f'number of examples: {num}')


# Initialize the 'network': a single dsize x dsize weight matrix drawn from a
# seeded generator, so every run starts from the same weights.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((dsize, dsize), generator=g, requires_grad=True)

number of examples: 228146


In [9]:
# Turn the bigram counts into next-character probabilities.
# Adding 1 to every count (add-one smoothing) keeps any transition from having
# probability zero; dividing each row by its own sum makes every row sum to 1.
# keepdim=True keeps the sums as a column so the division broadcasts per row.
P = (N + 1).float()
P /= P.sum(dim=1, keepdim=True)
P


tensor([[3.1192e-05, 1.3759e-01, 4.0767e-02, 4.8129e-02, 5.2745e-02, 4.7785e-02,
         1.3038e-02, 2.0898e-02, 2.7293e-02, 1.8465e-02, 7.5577e-02, 9.2452e-02,
         4.9064e-02, 7.9195e-02, 3.5777e-02, 1.2321e-02, 1.6095e-02, 2.9008e-03,
         5.1154e-02, 6.4130e-02, 4.0830e-02, 2.4641e-03, 1.1759e-02, 9.6070e-03,
         4.2109e-03, 1.6719e-02, 2.9008e-02],
        [1.9583e-01, 1.6425e-02, 1.5983e-02, 1.3889e-02, 3.0756e-02, 2.0435e-02,
         3.9809e-03, 4.9835e-03, 6.8796e-02, 4.8685e-02, 5.1899e-03, 1.6779e-02,
         7.4575e-02, 4.8213e-02, 1.6039e-01, 1.8872e-03, 2.4475e-03, 1.7988e-03,
         9.6279e-02, 3.2997e-02, 2.0288e-02, 1.1264e-02, 2.4623e-02, 4.7771e-03,
         5.3963e-03, 6.0480e-02, 1.2857e-02],
        [4.3039e-02, 1.2051e-01, 1.4596e-02, 7.4850e-04, 2.4701e-02, 2.4551e-01,
         3.7425e-04, 3.7425e-04, 1.5719e-02, 8.1587e-02, 7.4850e-04, 3.7425e-04,
         3.8922e-02, 3.7425e-04, 1.8713e-03, 3.9671e-02, 3.7425e-04, 3.7425e-04,
         3.1549e-

In [10]:
# Gradient descent on the single-layer bigram 'network'.

num_steps = 100       # number of full-batch updates
learning_rate = 50    # step size of the weight update
reg_strength = 0.01   # weight of the L2 penalty on W

# The inputs never change between steps, so build the one-hot encoding and the
# row index once instead of on every iteration. one_hot returns integers, so
# remember to cast to float before the matrix multiply.
xenc = F.one_hot(xs, num_classes=dsize).float()  # input to the network
example_idx = torch.arange(num)

for k in range(num_steps):
    # forward pass
    logits = xenc @ W  # predicted log-counts
    # the next two lines are the softmax
    counts = logits.exp()  # counts, equivalent to N
    # keepdim=True so each row is divided by its own sum when broadcasting
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    # Average negative log-likelihood of the correct next character, plus a
    # regularization term: the penalty grows with the magnitude of the weights
    # and is zero when W is 0, so it acts like a force pulling W toward 0.
    loss = -probs[example_idx, ys].log().mean() + reg_strength * (W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None  # clear the gradient left over from the previous step
    loss.backward()

    # update the weights; no_grad keeps the in-place step out of the autograd
    # graph without going through W.data
    with torch.no_grad():
        W -= learning_rate * W.grad

3.768618583679199
3.3788068294525146
3.161090850830078
3.0271859169006348
2.9344842433929443
2.867231607437134
2.8166542053222656
2.777146577835083
2.745253801345825
2.7188305854797363
2.696505308151245
2.6773719787597656
2.6608052253723145
2.6463515758514404
2.633665084838867
2.622471570968628
2.6125476360321045
2.6037068367004395
2.595794916152954
2.5886807441711426
2.5822560787200928
2.576429843902588
2.5711236000061035
2.566272735595703
2.5618226528167725
2.5577261447906494
2.5539441108703613
2.550442695617676
2.5471930503845215
2.5441699028015137
2.5413522720336914
2.538722038269043
2.536262035369873
2.5339579582214355
2.531797409057617
2.529768228530884
2.527860164642334
2.5260636806488037
2.5243704319000244
2.522773265838623
2.52126407623291
2.519836664199829
2.5184857845306396
2.5172054767608643
2.515990734100342
2.5148372650146484
2.5137407779693604
2.512697696685791
2.511704921722412
2.5107579231262207
2.509855031967163
2.5089924335479736
2.5081679821014404
2.507380485534668


In [83]:
# Sample names from the trained 'neural net' model.

g = torch.Generator().manual_seed(2147483647)

# Sampling only reads W, so gradient tracking is switched off for these
# forward passes.
with torch.no_grad():
    for i in range(5):
        out = []
        ix = 0  # start from index 0, the '.' token
        while True:
            # The count-based model would use p = P[ix] here; the network
            # instead computes the same kind of probability row from W.
            # num_classes uses dsize, matching the encoding used in training.
            xenc = F.one_hot(torch.tensor([ix]), num_classes=dsize).float()
            logits = xenc @ W  # predict log counts
            counts = logits.exp()  # counts, equivalent to N
            p = counts / counts.sum(1, keepdim=True)

            # Draw the next token index from the predicted distribution.
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:  # '.' was sampled: the name is finished
                break
        print(''.join(out))
         

junide.
janasah.
p.
cfay.
a.


In [144]:
# g = torch.Generator().manual_seed(23412342342334)
# m = torch.rand(2, generator=g)
# m


In [145]:
# k = torch.multinomial(m, num_samples=20, replacement=True, generator=g)
# k

In [146]:
# F.one_hot(k, num_classes=4).float()
# torch.tensor([2, 3])