### Bigram character level language modeling

This project was an exploration of how language models are trained at a really simple level. 

Large language models are trained to predict the next most likely token. This model is a basic precursor of the transformer architecture used to train LLMs. I used it to predict the next most likely letter given a previous letter to create name-like words following the tutorial by Andrej Karpathy on Youtube [here](https://www.youtube.com/watch?v=PaCmpygFfXo) . The dataset used is the large dataset of names.


In [54]:
name_r = open("names.txt").read() # Import names dataset

In [55]:
[i if i != '\n'else "." for i in list(name_r)] # Peak at letters...

['e',
 'm',
 'm',
 'a',
 '.',
 'o',
 'l',
 'i',
 'v',
 'i',
 'a',
 '.',
 'a',
 'v',
 'a',
 '.',
 'i',
 's',
 'a',
 'b',
 'e',
 'l',
 'l',
 'a',
 '.',
 's',
 'o',
 'p',
 'h',
 'i',
 'a',
 '.',
 'c',
 'h',
 'a',
 'r',
 'l',
 'o',
 't',
 't',
 'e',
 '.',
 'm',
 'i',
 'a',
 '.',
 'a',
 'm',
 'e',
 'l',
 'i',
 'a',
 '.',
 'h',
 'a',
 'r',
 'p',
 'e',
 'r',
 '.',
 'e',
 'v',
 'e',
 'l',
 'y',
 'n',
 '.',
 'a',
 'b',
 'i',
 'g',
 'a',
 'i',
 'l',
 '.',
 'e',
 'm',
 'i',
 'l',
 'y',
 '.',
 'e',
 'l',
 'i',
 'z',
 'a',
 'b',
 'e',
 't',
 'h',
 '.',
 'm',
 'i',
 'l',
 'a',
 '.',
 'e',
 'l',
 'l',
 'a',
 '.',
 'a',
 'v',
 'e',
 'r',
 'y',
 '.',
 's',
 'o',
 'f',
 'i',
 'a',
 '.',
 'c',
 'a',
 'm',
 'i',
 'l',
 'a',
 '.',
 'a',
 'r',
 'i',
 'a',
 '.',
 's',
 'c',
 'a',
 'r',
 'l',
 'e',
 't',
 't',
 '.',
 'v',
 'i',
 'c',
 't',
 'o',
 'r',
 'i',
 'a',
 '.',
 'm',
 'a',
 'd',
 'i',
 's',
 'o',
 'n',
 '.',
 'l',
 'u',
 'n',
 'a',
 '.',
 'g',
 'r',
 'a',
 'c',
 'e',
 '.',
 'c',
 'h',
 'l',
 'o',
 'e'

In [56]:
# Create a list of each of these names
names = str.split(name_r, "\n")

In [57]:
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [29]:
# What we are trying to build is a model that will use the Bigram model to predict the next likely character after a given character

In [30]:
# let's count how many times a given two bigram happen within our name dataset
b = {}
for char in names:
    for i, j in zip("$" + char, char + "#" ):  # $ is used to express start character or end character
        if (i,j) in b: 
            b[(i,j)]+= 1
        else: b[(i,j)] = 1


In [31]:
# let's count how many times a given two bigram happen within our name dataset
b = {}
for char in names:
    for i, j in zip("$" + char, char + "#" ):  # $ is used to express start character or end character
        b[i,j] = b.get((i,j), 0) + 1 


In [32]:
# Order the dictionary
b1 =  {}
for i in sorted(b, key = b.get, reverse = True):  
    b1[i] = b.get(i) 

In [33]:
b

{('$', 'e'): 1531,
 ('e', 'm'): 769,
 ('m', 'm'): 168,
 ('m', 'a'): 2590,
 ('a', '#'): 6640,
 ('$', 'o'): 394,
 ('o', 'l'): 619,
 ('l', 'i'): 2480,
 ('i', 'v'): 269,
 ('v', 'i'): 911,
 ('i', 'a'): 2445,
 ('$', 'a'): 4410,
 ('a', 'v'): 834,
 ('v', 'a'): 642,
 ('$', 'i'): 591,
 ('i', 's'): 1316,
 ('s', 'a'): 1201,
 ('a', 'b'): 541,
 ('b', 'e'): 655,
 ('e', 'l'): 3248,
 ('l', 'l'): 1345,
 ('l', 'a'): 2623,
 ('$', 's'): 2055,
 ('s', 'o'): 531,
 ('o', 'p'): 95,
 ('p', 'h'): 204,
 ('h', 'i'): 729,
 ('$', 'c'): 1542,
 ('c', 'h'): 664,
 ('h', 'a'): 2244,
 ('a', 'r'): 3264,
 ('r', 'l'): 413,
 ('l', 'o'): 692,
 ('o', 't'): 118,
 ('t', 't'): 374,
 ('t', 'e'): 716,
 ('e', '#'): 3983,
 ('$', 'm'): 2538,
 ('m', 'i'): 1256,
 ('a', 'm'): 1634,
 ('m', 'e'): 818,
 ('$', 'h'): 874,
 ('r', 'p'): 14,
 ('p', 'e'): 197,
 ('e', 'r'): 1958,
 ('r', '#'): 1377,
 ('e', 'v'): 463,
 ('v', 'e'): 568,
 ('l', 'y'): 1588,
 ('y', 'n'): 1826,
 ('n', '#'): 6763,
 ('b', 'i'): 217,
 ('i', 'g'): 428,
 ('g', 'a'): 330,
 ('a',

In [34]:
# Instead of keeping these in a dictionary, we'll store the info in a 2d array
# The first row will have have the characters that are first and the second row will have
# characters happening second. At the intersection of each two letters we'll have the number of occurences

In [44]:
import torch #library 

In [45]:
char = ""
for i in names:
    for ch in i:
        char = char + i
char= sorted(list(set(char)))

In [46]:
# Correspondence between letters and numbers
stoi = {}
for count, i in enumerate(char , start = 1):
    stoi[i] = count

stoi["."] = 0


In [47]:
# Reverse correspondence from numbers to letters
itox = {x: list(stoi.items())[x-1][0] for x in range(1,27)}
itox[0] = "."

In [48]:
list(stoi.items())[0]

('a', 1)

In [49]:
# let's add the combination number (bigrams) in the tensor array
n_ar = torch.zeros((27, 27), dtype = torch.int32)

for char in names:
    for i, j in zip("." + char, char + "." ):  # $ is used to express start character or end character
        i1 = stoi[i]
        i2 = stoi[j]
        n_ar[i1, i2] += 1

In [50]:
Sum_rows = n_ar.sum(1, keepdim = True) # get the sum (of columns) accross rows
P = 1 + n_ar.float()
P /= Sum_rows # Obtain the conditional probability distribution for each row

In [51]:
P

tensor([[3.1218e-05, 1.3770e-01, 4.0802e-02, 4.8169e-02, 5.2789e-02, 4.7826e-02,
         1.3049e-02, 2.0916e-02, 2.7316e-02, 1.8481e-02, 7.5641e-02, 9.2530e-02,
         4.9106e-02, 7.9262e-02, 3.5807e-02, 1.2331e-02, 1.6108e-02, 2.9033e-03,
         5.1197e-02, 6.4184e-02, 4.0864e-02, 2.4662e-03, 1.1769e-02, 9.6151e-03,
         4.2144e-03, 1.6733e-02, 2.9033e-02],
        [1.9599e-01, 1.6438e-02, 1.5995e-02, 1.3900e-02, 3.0781e-02, 2.0452e-02,
         3.9841e-03, 4.9875e-03, 6.8851e-02, 4.8724e-02, 5.1940e-03, 1.6792e-02,
         7.4635e-02, 4.8251e-02, 1.6051e-01, 1.8887e-03, 2.4495e-03, 1.8002e-03,
         9.6355e-02, 3.3023e-02, 2.0304e-02, 1.1273e-02, 2.4642e-02, 4.7809e-03,
         5.4006e-03, 6.0528e-02, 1.2867e-02],
        [4.3478e-02, 1.2174e-01, 1.4745e-02, 7.5614e-04, 2.4953e-02, 2.4802e-01,
         3.7807e-04, 3.7807e-04, 1.5879e-02, 8.2420e-02, 7.5614e-04, 3.7807e-04,
         3.9319e-02, 3.7807e-04, 1.8904e-03, 4.0076e-02, 3.7807e-04, 3.7807e-04,
         3.1871e-

In [20]:
g = torch.Generator().manual_seed(21483645)
for j in range(8):
    output  = []
    ch_i = 0 # character under its number form
    while True:
        p = P[ch_i] # This is the conditional probability on chx, ie the probability distribution of potential follow ups given chx is the preceding character
        ch_i = torch.multinomial(p, num_samples = 1, replacement =True, generator = g).item()
        output.append(itox[ch_i])
        if ch_i == 0 :
            print("".join(output))
            break


chilemaniehen.
jaselyatt.
abremin.
modaweroboslyourina.
gus.
zusienamaynonisanercl.
kren.
ahaeiblienin.


#### Evaluate the quality of the model

In [52]:
P

tensor([[3.1218e-05, 1.3770e-01, 4.0802e-02, 4.8169e-02, 5.2789e-02, 4.7826e-02,
         1.3049e-02, 2.0916e-02, 2.7316e-02, 1.8481e-02, 7.5641e-02, 9.2530e-02,
         4.9106e-02, 7.9262e-02, 3.5807e-02, 1.2331e-02, 1.6108e-02, 2.9033e-03,
         5.1197e-02, 6.4184e-02, 4.0864e-02, 2.4662e-03, 1.1769e-02, 9.6151e-03,
         4.2144e-03, 1.6733e-02, 2.9033e-02],
        [1.9599e-01, 1.6438e-02, 1.5995e-02, 1.3900e-02, 3.0781e-02, 2.0452e-02,
         3.9841e-03, 4.9875e-03, 6.8851e-02, 4.8724e-02, 5.1940e-03, 1.6792e-02,
         7.4635e-02, 4.8251e-02, 1.6051e-01, 1.8887e-03, 2.4495e-03, 1.8002e-03,
         9.6355e-02, 3.3023e-02, 2.0304e-02, 1.1273e-02, 2.4642e-02, 4.7809e-03,
         5.4006e-03, 6.0528e-02, 1.2867e-02],
        [4.3478e-02, 1.2174e-01, 1.4745e-02, 7.5614e-04, 2.4953e-02, 2.4802e-01,
         3.7807e-04, 3.7807e-04, 1.5879e-02, 8.2420e-02, 7.5614e-04, 3.7807e-04,
         3.9319e-02, 3.7807e-04, 1.8904e-03, 4.0076e-02, 3.7807e-04, 3.7807e-04,
         3.1871e-

In [53]:
# Let's compute the likelihood of the names we observed given the parameter we extracted from that same dataset under the
# assumption that the probability distribution of a given character given another character follows a multinomial distribution of parameter vector estimated
# from the data.
loglikelihood = 0
for char in names:
    for i, j in zip("." + char, char + "." ):  # $ is used to express start character or end character
        i1 = stoi[i]
        i2 = stoi[j]
        loglikelihood += torch.log(P[i1, i2])
        
print(f'{loglikelihood=}')    

loglikelihood=tensor(-559322.6875)


In [58]:
# Build a neural network
# First get the inputs (xs) and the labels (ys) from our data
xs = []
ys = []
for char in names:
    for i, j in zip("." + char, char + "." ):  # $ is used to express start character or end character
        i1 = stoi[i]
        i2 = stoi[j]
        xs.append(i1)
        ys.append(i2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [59]:
ys[:6]

tensor([ 5, 13, 13,  1,  0, 15])

In [60]:
# Since the values in xs and ys are simply indexes of letters, they are nominal. That means that the number doesn't mean anything on
# its own. 

In [61]:
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes = 27).float()
yenc = F.one_hot(ys, num_classes = 27).float()

In [62]:
# Initialize a the weights in the neural network
g = g = torch.Generator().manual_seed(21483645)

W = torch.randn((27,27), generator = g, requires_grad = True)


In [63]:
# Gradient descent
for i in range(100):
    # Forward pass
    logits = xenc @ W  # We assume that the dot product will give log odds / log counts
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims = True)  # This is called a softmax function activation
    loss = -probs[torch.arange(len(ys)), ys].log().mean()  
    print(f'{loss.item()=}')
    
    # Backward pass
    W.grad = None # set the gradient to Zero
    loss.backward()
    
    # Update the weigths with the weights
    W.data +=  - 50 * W.grad

loss.item()=3.7502005100250244
loss.item()=3.3693087100982666
loss.item()=3.1381993293762207
loss.item()=2.985227346420288
loss.item()=2.8890380859375
loss.item()=2.8225796222686768
loss.item()=2.7731411457061768
loss.item()=2.7355403900146484
loss.item()=2.706230640411377
loss.item()=2.6827292442321777
loss.item()=2.6633713245391846
loss.item()=2.647071599960327
loss.item()=2.633114814758301
loss.item()=2.621004819869995
loss.item()=2.6103827953338623
loss.item()=2.600978136062622
loss.item()=2.5925815105438232
loss.item()=2.5850305557250977
loss.item()=2.5781960487365723
loss.item()=2.571977376937866
loss.item()=2.5662930011749268
loss.item()=2.56107759475708
loss.item()=2.556277275085449
loss.item()=2.5518462657928467
loss.item()=2.547745943069458
loss.item()=2.54394268989563
loss.item()=2.5404067039489746
loss.item()=2.5371124744415283
loss.item()=2.5340373516082764
loss.item()=2.531160354614258
loss.item()=2.5284647941589355
loss.item()=2.5259335041046143
loss.item()=2.52355289459

In [65]:
loss

tensor(2.4731, grad_fn=<NegBackward0>)

In [69]:
g = torch.Generator().manual_seed(21483645)
for j in range(10):
    output  = []
    ch_i = 0 # character under its number form
    while True:
        xenc = F.one_hot(torch.tensor([ch_i]), num_classes = 27).float()
        logits = xenc @ W  # We assume that the dot product will give log odds / log counts
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims = True) 
        ch_i = torch.multinomial(probs, num_samples = 1, replacement =True, generator = g).item()
        output.append(itox[ch_i])
        if ch_i == 0 :
            print("".join(output))
            break

chilemaniehen.
jnselyatt.
abremin.
monawarobeslyourina.
gus.
zusienamwitonisaneril.
kren.
ahaeiblienin.
arolanzyiani.
cetonacrenah.
