# Part-2: The Neural Network Approach

In [1]:
# Load the dataset: one name per line.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [2]:
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> index lookup. Characters are numbered from 1 so that index 0
# stays free for '.', the marker used for the start and end of a name.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# Index -> character lookup (the inverse of stoi).
itos = {index: char for char, index in stoi.items()}

In [3]:
# Build the (x, y) training pairs from the first five names.
# Each name is wrapped in '.' boundary markers; every pair of adjacent
# characters then yields one example: x receives the index of the current
# character and y the index of the character that follows it.
x = []
y = []
for w in words[:5]:
    chs = ['.', *w, '.']
    for bigram in zip(chs, chs[1:]):
        ch1, ch2 = bigram
        idx1, idx2 = stoi[ch1], stoi[ch2]

        # Show the bigram and the input -> output mapping it contributes
        print(f"{bigram} = {idx1}, {idx2}")
        print(f"If input is: '{bigram[0]}' ({idx1}) | output will be: '{bigram[1]}' ({idx2})")

        x.append(idx1)
        y.append(idx2)

print()
print("Therefore, this list has been created: ", end = " ")
print(f"x = {x}, y = {y}")

('.', 'e') = 0, 5
If input is: '.' (0) | output will be: 'e' (5)
('e', 'm') = 5, 13
If input is: 'e' (5) | output will be: 'm' (13)
('m', 'm') = 13, 13
If input is: 'm' (13) | output will be: 'm' (13)
('m', 'a') = 13, 1
If input is: 'm' (13) | output will be: 'a' (1)
('a', '.') = 1, 0
If input is: 'a' (1) | output will be: '.' (0)
('.', 'o') = 0, 15
If input is: '.' (0) | output will be: 'o' (15)
('o', 'l') = 15, 12
If input is: 'o' (15) | output will be: 'l' (12)
('l', 'i') = 12, 9
If input is: 'l' (12) | output will be: 'i' (9)
('i', 'v') = 9, 22
If input is: 'i' (9) | output will be: 'v' (22)
('v', 'i') = 22, 9
If input is: 'v' (22) | output will be: 'i' (9)
('i', 'a') = 9, 1
If input is: 'i' (9) | output will be: 'a' (1)
('a', '.') = 1, 0
If input is: 'a' (1) | output will be: '.' (0)
('.', 'a') = 0, 1
If input is: '.' (0) | output will be: 'a' (1)
('a', 'v') = 1, 22
If input is: 'a' (1) | output will be: 'v' (22)
('v', 'a') = 22, 1
If input is: 'v' (22) | output will be: 'a' (1)
(

### One-hot encoding:
- We convert index-based data into one-hot vectors, where a single position corresponding to the index is activated (set to 1), while all other positions remain 0.
- e.g., with 6 classes, the indices [4, 5, 2] become [[0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]
- i.e., the index 5 on its own becomes [0, 0, 0, 0, 0, 1]

### Conclusion so far

In [4]:
from torch.nn import functional as f
import torch
# `x` is rebound from a Python list to a tensor here. torch.as_tensor converts
# the list on the first run and passes an existing tensor through unchanged,
# so re-running this cell does not copy-construct from a tensor (which makes
# torch.tensor emit a UserWarning).
x = torch.as_tensor(x)
# One-hot encodings (input to the model): one row per example, 27 columns,
# with a single 1.0 at the column given by the character index.
xenc = f.one_hot(x, num_classes=27).float()

In [5]:
# Randomly initialised 27x27 weight matrix, seeded so the numbers are reproducible.
g = torch.Generator().manual_seed(42)
weight = torch.randn((27,27), generator=g)

# Softmax, written out step by step; it converts the model's raw output into
# prediction probabilities:
#   1. logits - raw output of the model (may be negative or greater than 1)
#   2. count  - exponentiated logits (everything becomes positive without losing its ordering)
#   3. prob   - each row divided by its own sum, so every row adds up to 1
logits = xenc @ weight
count = logits.exp()
row_totals = count.sum(dim=1, keepdim=True)
prob = count / row_totals

# Predicted next-character distribution for the second training example
print(prob[1])

tensor([0.0396, 0.0698, 0.0227, 0.0037, 0.0562, 0.0153, 0.0626, 0.0112, 0.0421,
        0.0084, 0.0581, 0.1125, 0.0951, 0.0292, 0.0181, 0.0107, 0.0247, 0.0316,
        0.0586, 0.0140, 0.1109, 0.0019, 0.0147, 0.0067, 0.0502, 0.0033, 0.0280])


In [6]:
# Walk through the first five bigrams and show, for each one, how much
# probability the untrained model puts on the correct next character and
# what loss (negative log likelihood) that corresponds to.
negative_log_likelihood = torch.zeros((5))  # one loss value per inspected example
n = 0
for i in range(5):
    # Characters behind the input / target indices of this example
    x_char = itos[x[i].item()]
    y_char = itos[y[i]]

    # Probability the model gives to the true next character, and its log
    p = prob[i, y[i]]
    logp = p.log()
    negative_log_likelihood[i] = -logp
    n += 1

    print(f"Bigram example {i+1}: '{x_char + y_char}' (indexes ({x[i]}, {y[i]}))")
    print(f"Input to the neural network = {x[i]}")
    print(f"Output of the neural network (Probabilities): {prob[i]}")
    print(f"Actual label of the Neural Network: {y[i]}")
    print(f"Probability assigned by the Neural Network to the correct label: {p.item():.4f}")
    print(f"Log likelihood = {logp.item()}")
    print(f"Negative Log likelihood (Loss) = {-logp.item()}")
    print("-"*15)

print("=" * 30)
# Mean loss over the inspected examples
avg_nlls = negative_log_likelihood.sum() / n
print(f"Average Negative Log likelihood = {avg_nlls}")


Bigram example 1: '.e' (indexes (0, 5))
Input to the neural network = 0
Output of the neural network (Probabilities): tensor([0.1230, 0.0793, 0.0441, 0.0022, 0.0353, 0.0052, 0.0172, 0.0036, 0.0084,
        0.0932, 0.0121, 0.0044, 0.0087, 0.0102, 0.0083, 0.0384, 0.0926, 0.0153,
        0.0109, 0.0278, 0.0084, 0.0527, 0.0399, 0.0962, 0.0644, 0.0655, 0.0330])
Actual label of the Neural Network: 5
Probability assigned by the Neural Network to the correct label: 0.0052
Log likelihood = -5.2567949295043945
Negative Log likelihood (Loss) = 5.2567949295043945
---------------
Bigram example 2: 'em' (indexes (5, 13))
Input to the neural network = 5
Output of the neural network (Probabilities): tensor([0.0396, 0.0698, 0.0227, 0.0037, 0.0562, 0.0153, 0.0626, 0.0112, 0.0421,
        0.0084, 0.0581, 0.1125, 0.0951, 0.0292, 0.0181, 0.0107, 0.0247, 0.0316,
        0.0586, 0.0140, 0.1109, 0.0019, 0.0147, 0.0067, 0.0502, 0.0033, 0.0280])
Actual label of the Neural Network: 13
Probability assigned by the

In [7]:
# Same bigram extraction as before, now over every name in the corpus
# (without the per-example printing).
x = []
y = []
for w in words:
    chs = ['.', *w, '.']
    for ch1, ch2 in zip(chs, chs[1:]):
        bigram = (ch1, ch2)
        idx1, idx2 = stoi[ch1], stoi[ch2]
        x.append(idx1)
        y.append(idx2)

# Preview of the first ten input / target indices
print()
print(f"x = {x[:10]}, y = {y[:10]}")

# Tensor versions of the two index lists
x, y = torch.tensor(x), torch.tensor(y)


x = [0, 5, 13, 13, 1, 0, 15, 12, 9, 22], y = [5, 13, 13, 1, 0, 15, 12, 9, 22, 9]


## Optimization of Model's Learnable Parameter (Weight)

In [9]:
# The model's only learnable parameter: a 27x27 weight matrix drawn from a
# seeded generator, with gradient tracking switched on so it can be trained.
g = torch.Generator().manual_seed(42)
weight = torch.randn(27, 27, generator=g, requires_grad=True)

In [10]:
# Training our Neural Network
import torch.nn.functional as F

# Gradient-descent settings, defined once instead of on every iteration.
NUM_STEPS = 500   # number of full-batch gradient-descent steps
LOG_EVERY = 100   # print the loss every this many steps
lr = 50           # learning rate

# One-hot encoded inputs for the whole training set
xenc = F.one_hot(x, num_classes = 27).float()
# Row index of every example; paired with `y` it picks out the probability
# the model assigned to each correct next character. Loop-invariant.
example_rows = torch.arange(len(y))

for i in range(NUM_STEPS):
    # Forward pass: softmax over the logits, then mean negative log likelihood
    logits = xenc @ weight
    count = torch.exp(logits)
    probs = count / count.sum(dim = 1, keepdim = True) # Probabilities for next character assigned by the Model
    loss = -probs[example_rows, y].log().mean()

    if i % LOG_EVERY == 0:
        print(f"Loss at epoch {i}: {loss}")

    # Backward pass (reset the gradient first so it does not accumulate)
    weight.grad = None
    loss.backward()

    # Update the Model's Parameter.
    # The in-place step runs under no_grad so autograd does not record it;
    # this replaces writing through `weight.data`, which bypasses autograd's
    # safety checks.
    with torch.no_grad():
        weight -= lr * weight.grad

# For storing it into a dict.
# Detached so the stored value does not keep the whole autograd graph alive.
neural_network_loss = loss.detach()

Loss at epoch 0: 3.696647882461548
Loss at epoch 100: 2.4729576110839844
Loss at epoch 200: 2.4625625610351562
Loss at epoch 300: 2.459355592727661
Loss at epoch 400: 2.4578323364257812


## Predictions of the Model

In [11]:
import torch.nn.functional as F

# Seeded generator so the sampled names are reproducible
g = torch.Generator().manual_seed(42)

# Sampling is inference only: `weight` has requires_grad=True, so without
# no_grad every step would build an autograd graph that is never used.
with torch.no_grad():
    for _ in range(5):
        models_output = []
        idx = 0  # every name starts from the '.' boundary token
        while True:
            # Local names are used here (not `xenc` / `logits`) so that the
            # training-set tensors of the same name are not overwritten.
            idx_enc = F.one_hot(torch.tensor([idx]), num_classes=27).float()
            next_logits = idx_enc @ weight
            exp_logits = torch.exp(next_logits)
            y_preds = exp_logits / exp_logits.sum(dim = 1, keepdim = True)

            # Draw the next character index from the predicted distribution
            idx = torch.multinomial(y_preds, num_samples=1, replacement=True, generator=g).item()
            models_output.append(itos[idx])

            # Index 0 is '.', which marks the end of the name
            if idx == 0:
                break
        print(''.join(models_output))

ya.
syahavilin.
dleekahmangonya.
tryahe.
chen.


### Conclusion:

1. Statistical Approach → Using co-occurrence counts + probability distributions.
2. Neural Network Approach → Using trainable parameters + optimization via backpropagation.