In [1]:
# In this notebook, you learn:
#
# 1) How to train a simple neural network to generate names?
# 2) How to encode characters into one-hot encoding?
# 3) What is the similarity between rule-based name generator and neural network-based name generator?
#
# Resources:
# 1) ADD RESOURCES TO UNDERSTAND NEURAL NETWORKS
# 2) https://github.com/MB1151/mimic_micro_autograd
#       -- To understand how backpropagation works in detail.
#       -- This is my other repository where I have implemented autograd from scratch.
#       -- Not everything is needed from this repository, but it is a good resource to understand how backpropagation works.
# 3) makemore_part1/building_makemore_step_by_step/step_3_model_quality.ipynb
#       -- To understand how to calculate loss and use it for training.

In [2]:
import string
import torch
import torch.nn.functional as F

from torch import Tensor

In [3]:
DATA_PATH = "../Data/names.txt"
BOUND_CHARACTER = "."

In [None]:
with open(DATA_PATH, "r") as f:
    names = [name.strip() for name in f.readlines()]

names[:10]

['albonsha',
 'beenapreethi',
 'thushniha',
 'aakaksha',
 'dumeethran',
 'luhit',
 'valam',
 'harinyai',
 'sakthikaa',
 'kaveetha']

In [5]:
# Create character to integer mapping.
char_to_int = {char: idx + 1 for idx, char in enumerate(string.ascii_lowercase)}
char_to_int[BOUND_CHARACTER] = 0
print(char_to_int)
print("-" * 100)
int_to_char = {idx: char for char, idx in char_to_int.items()}
print(int_to_char)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
----------------------------------------------------------------------------------------------------
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [6]:
# In our name-generator neural network, we will use one-hot encoding to represent characters.
# One-hot encoding is a way to represent characters in a number format so that the neural network can understand them.
#
# In our text, we have 27 characters (26 alphabets + 1 dot). As you can see, we have assigned a unique number to each 
# character in the char_to_int dictionary. However, this is not the best way to represent characters in a neural 
# network. This is because the neural network may think that the characters with higher numbers are more important
# than the characters with lower numbers. This is not true. The numbers assigned to the characters are just for
# representation purposes. They don't have any meaning. 
# For example, the character 'a' is represented as 1 and the character 'z' is represented as 26. But, this doesn't 
# mean that 'z' is more important than 'a'. So, we need to represent characters in a way that the neural network can 
# understand that all characters are equally important. This is where one-hot encoding comes into play.
#
# In one-hot encoding, we will represent each character as a 27-dimensional vector where all elements are zero except 
# the element corresponding to the character index which is 1.
# For example, the character 'a' will be represented as [0, 1, 0, 0, ..., 0] where the second element is 1 and all
# other elements are zero. Similarly, the character 'z' will be represented as [0, 0, 0, ..., 1] where the last 
# element is 1 and all other elements are zero.

In [7]:
# Let's create a very simple neural network for the name generation. We will improve this model later on. For now, 
# the idea is to build a neural network which is very similar to the rule-based model that we built in 
# 'building_makemore_step_by_step/step_2_rule_based_name_generator.ipynb'
#
# The neural network will take a single character as input and predict a single character as output. As explained 
# above, every character is represented as a 27-dimensional vector. So, our network should take an input that has
# 27 features. 
# We want the neural network to output a 27-dimensional vector, where each value in the vector corresponds to the
# probability of the prediction to be the character corresponding to that particular position. 
# For now, we will only have 1 layer in the neural network.
#
# So, the architecture is as follows:
# Number of layers = 1
# Input = [27, 1] -- Ignoring batching in this calculation.
# Number of neurons = Number of outputs = 27
#
# We will also not have bias in our model.

### DATA PREPARATION

In [8]:
# Let's create the input and output as required by the neural network. We want to use backward to run the gradient
# descent algorithm and train the model. Hence, we need to perform all the inputs, targets in the form of tensors
# and the computations between tensors.
input_list = []
target_list = []
for name in names:
    name = BOUND_CHARACTER + name + BOUND_CHARACTER
    for first_char, second_char in zip(name, name[1:]):
        first_char_idx = char_to_int[first_char]
        second_char_idx = char_to_int[second_char]
        input_list.append(first_char_idx)
        target_list.append(second_char_idx)

print(f"shape of input_list: {len(input_list)}")
print(f"input_list[:10]: {input_list[:10]}")
print(f"target_list[:10]: {target_list[:10]}")

# Let's create tensors out of these lists to use with neural networks.
inputs = torch.tensor(data=input_list, dtype=torch.int64)
targets = torch.tensor(data=target_list, dtype=torch.int64)

shape of input_list: 545276
input_list[:10]: [0, 1, 12, 2, 15, 14, 19, 8, 1, 0]
target_list[:10]: [1, 12, 2, 15, 14, 19, 8, 1, 0, 2]


In [9]:
# Now, let's generate the one-hot encoding for the inputs.
# We cast the one-hot vectors into float since we need to pass them through the neural network which
# expects float inputs.
encoded_inputs = F.one_hot(inputs, num_classes=len(char_to_int)).float()
print(f"shape of encoded_inputs: {encoded_inputs.shape}")
print(f"encoded_inputs[:3]: {encoded_inputs[:3]}")

shape of encoded_inputs: torch.Size([545276, 27])
encoded_inputs[:3]: tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])


### MODEL CREATION

In [10]:
# Let's use a seed to keep our outputs consistent across multiple runs of the notebook.
SEED = 1234

In [11]:
# We need to set the generator everytime this code is run.
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# We have 27 inputs and 27 outputs. So, the weights should be of shape (27, 27).
# We will initialize the weights randomly.
# The 'requires_grad=True' argument is used to tell PyTorch that we want to compute the gradients of the weights
# during the backward pass -- This will be explained below in detail.
weights = torch.randn(size=(27, 27), dtype=torch.float32, requires_grad=True)
print(weights.shape)
print(weights)

torch.Size([27, 27])
tensor([[-0.1117, -0.4966,  0.1631, -0.8817,  0.0539,  0.6684, -0.0597, -0.4675,
         -0.2153,  0.8840, -0.7584, -0.3689, -0.3424, -1.4020,  0.3206, -1.0219,
          0.7988, -0.0923, -0.7049, -1.6024,  0.2891,  0.4899, -0.3853, -0.7120,
         -0.1706, -1.4594,  0.2207],
        [ 0.2463, -1.3248,  0.6970, -0.6631,  1.2158, -1.4949,  0.8810, -1.1786,
         -0.9340, -0.5675, -0.2772, -2.1834,  0.3668,  0.9380,  0.0078, -0.3139,
         -1.1567,  1.8409, -1.0174,  1.2192,  0.1601,  1.5985, -0.0469, -1.5270,
         -2.0143, -1.5173,  0.3877],
        [-1.1849,  0.6897,  1.3232,  1.8169,  0.6808,  0.7244,  0.0323, -1.6593,
         -1.8773,  0.7372,  0.9257,  0.9247,  0.1825, -0.0737,  0.3147, -1.0369,
          0.2100,  0.6144,  0.0628, -0.3297, -1.7970,  0.8728,  0.7670, -0.1138,
         -0.9428,  0.7540,  0.1407],
        [-0.6937, -0.6159, -0.7295,  0.4308,  0.2862, -0.2481,  0.2040,  0.8519,
         -1.4102, -0.1071, -0.8018,  0.2771,  2.5599, -1.6

### SAMPLE RUN

In [12]:
# Let's do a sample run of the neural network to understand how it works.
# Assume the input is 'b'. The one-hot encoding of 'b' is [0, 1, 0, 0, ..., 0].
sample_input = torch.zeros(size=(1,27), dtype=torch.float32)
sample_input[0][2] = 1
print(sample_input.shape)
print(sample_input)

torch.Size([1, 27])
tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])


In [13]:
# Just to understand, this is multiplication of a row vector with a matrix. So, the output will be a row vector.
# The output row vector will be the 2nd row of the weights matrix since the input has 1 only in the 2nd position.
# 
# This is equivalent to the rule-based model where we use the pre-computed probabilities of the current character
# to predict the next character. In the rule based model, if the current character is 'b', we simply take the row
# 2nd row from the 'char_probs' tensor and use it to predict the next character. If you observe keenly, we are 
# doing the same thing here. Here, the current character is 'b' and we simply take the 2nd row of the weights. Our
# model setup and one-hot encoding is just making this neural network equivalent to the rule-based model.
sample_output = sample_input @ weights
print(sample_output.shape)
print(sample_output)

torch.Size([1, 27])
tensor([[-1.1849,  0.6897,  1.3232,  1.8169,  0.6808,  0.7244,  0.0323, -1.6593,
         -1.8773,  0.7372,  0.9257,  0.9247,  0.1825, -0.0737,  0.3147, -1.0369,
          0.2100,  0.6144,  0.0628, -0.3297, -1.7970,  0.8728,  0.7670, -0.1138,
         -0.9428,  0.7540,  0.1407]], grad_fn=<MmBackward0>)


### SOFTMAX

Softmax function is used to convert a list of values into probabilities.

Let's say we have a list of values $[x_{0}, x_{1}, x_{2}, ..., x_{n - 1}]$. Now, we need to convert this <br>
into a list of probabilities. 

$$input\_list = [x_{0}, x_{1}, x_{2}, ..., x_{n - 1}]$$
$$\implies logits = [e^{x_{0}}, e^{x_{1}}, e^{x_{2}}, ..., e^{x_{n - 1}}]$$

Applying exponentiation brings the value range to $(0, \infty)$. Now, we normalize the $logits$ to obtain probabilities.

First, let's calculate the sum of logits to normalize.

$$logit\_sum = \sum_{i=0}^{n-1} e^{x_{i}}$$

$$\implies probabilities = [\frac{e^{x_{0}}}{logit\_sum}, \frac{e^{x_{1}}}{logit\_sum}, \frac{e^{x_{2}}}{logit\_sum}, ..., \frac{e^{x_{n-1}}}{logit\_sum}]$$

This whole process to obtain probabilities from a list of values is called softmax.

In [14]:
# The 'sample_output' is a row vector containing some values. We need to convert these values into probabilities. 
# As we know, the probabilities are always between 0 and 1. So, we need to limit the values of the output row 
# vector between 0 and 1. We can achieve this by using the softmax function.
sample_run_logits = sample_output.exp()
print(sample_run_logits.shape)
print(sample_run_logits)

torch.Size([1, 27])
tensor([[0.3058, 1.9931, 3.7554, 6.1530, 1.9754, 2.0634, 1.0328, 0.1903, 0.1530,
         2.0902, 2.5237, 2.5211, 1.2003, 0.9290, 1.3698, 0.3546, 1.2337, 1.8486,
         1.0648, 0.7192, 0.1658, 2.3936, 2.1532, 0.8925, 0.3895, 2.1255, 1.1511]],
       grad_fn=<ExpBackward0>)


In [15]:
# Normalize the logits to get probabilities.
sample_run_probs = sample_run_logits / sample_run_logits.sum(dim=1, keepdim=True)
print(sample_run_probs.shape)
print(sample_run_probs)

torch.Size([1, 27])
tensor([[0.0072, 0.0466, 0.0878, 0.1439, 0.0462, 0.0483, 0.0242, 0.0045, 0.0036,
         0.0489, 0.0590, 0.0590, 0.0281, 0.0217, 0.0320, 0.0083, 0.0289, 0.0432,
         0.0249, 0.0168, 0.0039, 0.0560, 0.0504, 0.0209, 0.0091, 0.0497, 0.0269]],
       grad_fn=<DivBackward0>)


In [16]:
# Let's verify that the sum of the probabilities is 1.
print(sample_run_probs.sum(dim=1, keepdim=True).item())

1.0


In [17]:
# Let's say the target is 'c' which is represented as integer 3. So, we extract the probability of the next 
# character being 'c' from the 'sample_run_probs'.
sample_run_target_prob = sample_run_probs[0][3]
print(sample_run_target_prob.item())

0.14393503963947296


In [18]:
# Now, let's calculate the loss using negative log-likelihood as we explained in the previous notebook ('step_3_model_quality.ipynb').
sample_run_loss = -torch.log(sample_run_target_prob)
print(sample_run_loss.item())

1.938393235206604


In [19]:
# The last step is to update the weights using the gradients. Let's do this in the actual training loop below.

### MODEL CREATION CONTINUED

In [20]:
# This cell just combines all the code explained in SAMPLE RUN SECTION above.
model_output = encoded_inputs @ weights
logits = model_output.exp()
probs = logits / logits.sum(dim=1, keepdim=True)

In [21]:
# Let's verify that the sum of the probabilities is 1 for a few of the rows.
print(probs.sum(dim=1, keepdim=True)[:5])

tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000]], grad_fn=<SliceBackward0>)


In [22]:
# Each row of the 'probs' tensor contains the probabilities for the next character for the corresponding input.
# probs[0][0] = The probability of the next character being '.' for the zeroth input.
# probs[0][1] = The probability of the next character being 'a' for the zeroth input.
# probs[0][2] = The probability of the next character being 'b' for the zeroth input.
# probs[2][3] = The probability of the next character being 'c' for the second input.
# ...
# In general, 
# probs[i][j] = The probability of the next character being 'int_to_char[j]' for the ith input.
print(probs[:5])
# Targets tensor contains the indices of the target characters or the true next characters for each input.
print(targets[:5])

tensor([[0.0357, 0.0243, 0.0470, 0.0165, 0.0421, 0.0779, 0.0376, 0.0250, 0.0322,
         0.0967, 0.0187, 0.0276, 0.0284, 0.0098, 0.0550, 0.0144, 0.0888, 0.0364,
         0.0197, 0.0080, 0.0533, 0.0652, 0.0272, 0.0196, 0.0337, 0.0093, 0.0498],
        [0.0342, 0.0071, 0.0536, 0.0138, 0.0901, 0.0060, 0.0645, 0.0082, 0.0105,
         0.0151, 0.0202, 0.0030, 0.0386, 0.0683, 0.0269, 0.0195, 0.0084, 0.1684,
         0.0097, 0.0904, 0.0314, 0.1321, 0.0255, 0.0058, 0.0036, 0.0059, 0.0394],
        [0.0045, 0.0075, 0.0035, 0.0510, 0.0718, 0.0130, 0.0093, 0.0160, 0.0132,
         0.0180, 0.0177, 0.0323, 0.0116, 0.0022, 0.0374, 0.0200, 0.0435, 0.0059,
         0.0593, 0.0522, 0.0014, 0.0995, 0.0304, 0.0911, 0.1873, 0.0208, 0.0797],
        [0.0072, 0.0466, 0.0878, 0.1439, 0.0462, 0.0483, 0.0242, 0.0045, 0.0036,
         0.0489, 0.0590, 0.0590, 0.0281, 0.0217, 0.0320, 0.0083, 0.0289, 0.0432,
         0.0249, 0.0168, 0.0039, 0.0560, 0.0504, 0.0209, 0.0091, 0.0497, 0.0269],
        [0.0064, 0.0111,

In [23]:
# Let's extract the probabilities assigned by the model to the target characters for each input.
target_probs = probs[range(len(targets)), targets]
# Let's understand what the printed values mean.
# In the above cell, the targets for the first 5 inputs are [1, 12, 2, 15, 14].
#
# Let's consider the first target which is 1. This means that the target character (or the true output) is 'a'.
# For the zeroth input, the model assigned a probability of probs[0][1] to the character 'a'. So, the target_probs[0]
# should be equal to probs[0][1] = 0.0243 which is correct.
# 
# Now, let's consider the second target which is 12. This means that the target character (or the true output) is 'k'.
# For the second input, the model assigned a probability of probs[1][12] to the character 'k'. So, the target_probs[1]
# should be equal to probs[1][12] = 0.0386 which is correct.
#
# Similarly, we can verify the rest of the values.
print(target_probs[:5])

tensor([0.0243, 0.0386, 0.0035, 0.0083, 0.0465], grad_fn=<SliceBackward0>)


In [24]:
# Let's calculate the loss for the entire dataset using negative log-likelihood as explained in the previous 
# notebook (building_makemore_step_by_step/step_3_model_quality.ipynb).
# We calculate the mean of the negative log-likelihoods for all the inputs.
loss = -torch.log(target_probs).mean()
# The loss is going to be a single positive number. Lower the loss, better the model.
print(loss.item())

3.79728102684021


In [25]:
# The last step is to update the weights using the gradients. First, let's check what the gradients are.
# It should be None since we haven't calculated the gradients yet. Let's verify.
print(weights.grad)

None


In [26]:
# Now, let's calculate the gradients. This can be done by calling the backward() function on the loss tensor.
loss.backward()
# Now, let's check the gradients. The gradients should be non-zero since we have calculated them.
print(weights.grad)

tensor([[ 3.9656e-03, -1.0170e-02,  1.3612e-03, -4.4705e-04, -1.3427e-03,
          7.2652e-03,  3.8438e-03, -4.1995e-04,  3.5887e-04,  8.9341e-03,
         -2.9020e-03, -5.7328e-03,  5.0600e-04, -7.0073e-03, -9.7385e-04,
          1.1834e-03,  2.9324e-03,  4.0049e-03, -4.5081e-03, -1.0690e-02,
          1.3846e-04,  5.9499e-03, -5.0418e-03,  1.9740e-03,  3.7370e-03,
         -2.1661e-03,  5.2468e-03],
        [-2.6616e-02, -6.9503e-03,  7.5360e-03,  2.0742e-03,  1.3182e-02,
          2.4826e-04,  1.2590e-02, -1.6344e-03, -1.2588e-03, -1.1006e-03,
         -1.6279e-04, -5.4089e-03, -1.0817e-03,  5.1614e-03, -4.4274e-02,
          3.7938e-03, -3.6045e-04,  3.3272e-02, -1.9928e-02,  7.1790e-03,
         -4.2244e-03,  2.5588e-02, -2.1162e-03,  7.9215e-04,  5.6764e-04,
         -4.1262e-03,  7.2589e-03],
        [-1.2791e-04, -3.2168e-03,  8.6052e-04,  1.5944e-03,  4.7394e-04,
         -2.0195e-04,  2.6794e-04,  4.5694e-05, -2.6837e-03, -1.3577e-03,
          6.3454e-04,  6.5036e-04,  2.83

In [27]:
# Just a point to remember - Apparently, there is a difference when you use weights.data and weights in the below
# equation. weights.data is the actual tensor and weights is a wrapper around the tensor. Apparently, if you use
# weights, this changes something in the computation graph and gradient calculation is impacted -- I didn't really
# understand the details. But, it is better to use weights.data to update the weights.
#
# Now, let's just update the weights using the gradients. We will use a learning rate of 0.1 (randomly chosen).
weights.data += 0.1 * weights.grad
# Let's check the updated weights.
print(weights)

tensor([[-0.1113, -0.4976,  0.1632, -0.8817,  0.0538,  0.6691, -0.0593, -0.4675,
         -0.2152,  0.8849, -0.7587, -0.3694, -0.3423, -1.4027,  0.3206, -1.0217,
          0.7991, -0.0919, -0.7054, -1.6034,  0.2891,  0.4905, -0.3858, -0.7118,
         -0.1702, -1.4597,  0.2212],
        [ 0.2436, -1.3255,  0.6977, -0.6628,  1.2171, -1.4949,  0.8822, -1.1788,
         -0.9341, -0.5677, -0.2772, -2.1839,  0.3667,  0.9385,  0.0034, -0.3135,
         -1.1567,  1.8442, -1.0194,  1.2200,  0.1597,  1.6011, -0.0471, -1.5269,
         -2.0142, -1.5177,  0.3885],
        [-1.1849,  0.6894,  1.3233,  1.8171,  0.6808,  0.7243,  0.0323, -1.6593,
         -1.8776,  0.7371,  0.9258,  0.9248,  0.1826, -0.0737,  0.3147, -1.0369,
          0.2101,  0.6145,  0.0628, -0.3297, -1.7970,  0.8728,  0.7670, -0.1138,
         -0.9428,  0.7541,  0.1408],
        [-0.6937, -0.6159, -0.7295,  0.4308,  0.2862, -0.2481,  0.2040,  0.8519,
         -1.4106, -0.1071, -0.8018,  0.2771,  2.5601, -1.6952,  0.1885,  0.7388

In [28]:
# Now, let's compute the loss again. It should be lesser than what we have computed above since we did
# one loop of optimization using gradient descent.
model_output_iter2 = encoded_inputs @ weights
logits_iter2 = model_output_iter2.exp()
probs_iter2 = logits_iter2 / logits_iter2.sum(dim=1, keepdim=True)
target_probs_iter2 = probs_iter2[range(len(targets)), targets]
loss_iter2 = -torch.log(target_probs_iter2).mean()
# Note that the loss before was '3.9736' and the loss now is '3.9685'. Looking at the tiny change, I 
# think we can use slightly higher learning rate.
print(loss_iter2)

tensor(3.7987, grad_fn=<NegBackward0>)


### PUTTING TRAINING LOOP TOGETHER

In [29]:
# All the above steps need to be repeated multiple times to train the model. This is called a training loop.
# Let's put all the above steps in a function and run the training loop and observe how the loss changes over time.
def training_loop(inputs: Tensor, targets: Tensor, weights: Tensor, num_loops: int, learning_rate: float):
    for iteration in range(num_loops):
        # Forward Propagation
        model_output = inputs @ weights
        logits = model_output.exp()
        probs = logits / logits.sum(dim=1, keepdim=True)
        target_probs = probs[torch.arange(start=0, end=len(targets)), targets]
        loss = -torch.log(target_probs).mean()
        print(f"Loss after iteration {iteration} is {loss.item()}")
        # Back Propagation
        # Always, zero the weights from the previous loop. Other it will update the gradients instead of over-writing.
        weights.grad = None
        loss.backward()
        weights.data += -learning_rate * weights.grad

In [30]:
# Note that the loss we obtained here is very close to the loss we obtained in the rule-based model. This is because
# the neural network we built is equivalent to the rule-based model as explained above. Ofcourse, this model seems
# slightly better than the rule-based model.
# Rule-based model loss: 2.483
# Neural network model loss: 2.241
training_loop(inputs=encoded_inputs, targets=targets, weights=weights, num_loops=300, learning_rate=10.0)

Loss after iteration 0 is 3.798703670501709
Loss after iteration 1 is 3.661177158355713
Loss after iteration 2 is 3.541262626647949
Loss after iteration 3 is 3.4366979598999023


Loss after iteration 4 is 3.3456366062164307
Loss after iteration 5 is 3.2659871578216553
Loss after iteration 6 is 3.195650339126587
Loss after iteration 7 is 3.1329286098480225
Loss after iteration 8 is 3.0766279697418213
Loss after iteration 9 is 3.025914430618286
Loss after iteration 10 is 2.9801418781280518
Loss after iteration 11 is 2.9387569427490234
Loss after iteration 12 is 2.90126371383667
Loss after iteration 13 is 2.867218255996704
Loss after iteration 14 is 2.8362231254577637
Loss after iteration 15 is 2.80792498588562
Loss after iteration 16 is 2.7820098400115967
Loss after iteration 17 is 2.7582008838653564
Loss after iteration 18 is 2.7362561225891113
Loss after iteration 19 is 2.715965509414673
Loss after iteration 20 is 2.6971476078033447
Loss after iteration 21 is 2.6796483993530273
Loss after iteration 22 is 2.6633341312408447
Loss after iteration 23 is 2.6480886936187744
Loss after iteration 24 is 2.6338131427764893
Loss after iteration 25 is 2.62041974067688
Loss

### GENERATE NAMES

In [31]:
# We have seen how to train a simple neural network to generate names. Now, let's see how to generate names using this
# trained model. The idea is to take a character as input, pass it through the neural network, get the probabilities
def generate_names(weights: Tensor, char_to_int: dict, int_to_char: dict, num_names: int):
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

    for _ in range(num_names):
        name = BOUND_CHARACTER
        while True:
            prev_char_idx = char_to_int[name[-1]]
            input_tensor = torch.zeros(size=(1, 27), dtype=torch.float32)
            input_tensor[0][prev_char_idx] = 1.0
            output_tensor = input_tensor @ weights
            logits = output_tensor.exp()
            output_probs = logits / logits.sum(dim=1, keepdim=True)
            output_char = int_to_char[torch.multinomial(output_probs, num_samples=1).item()]
            if output_char == BOUND_CHARACTER:
                break
            name += output_char
        print(name[1:])

In [32]:
# Note the names are also very similar to the names generated by the rule-based model.
generate_names(weights=weights, char_to_int=char_to_int, int_to_char=int_to_char, num_names=20)

k
velivadanilfithivarar
yaakumain
theuthinove
gasubhumath
thewarumady
inthinurath
arakuja
thidufukokanusharina
ven
vanuksith
ntheveg
satithesila
jahagenoraja
asanthran
ati
ueleva
nishina
h
hthavathaila
