In [None]:
# In this notebook, you learn:
#
# 1) How to create training, validation and test data in the format required for the neural network model?
#
# Resources:
# 1) https://youtu.be/TCH_1BHY58I?si=m65oyEMLQ5OXBjjA
#       -- Andrej Karpathy's video on makemore which is used to build the model.
# 2) https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf
#       -- Bengio et al. (2003) paper on A Neural Probabilistic Language Model
#       -- This paper is the basis for the model we are building. Watch the video first and then read the paper.
#
#
# The data used in part 2 is the same as part 1. The data is a list of Indian names. The data is available in the file
# `makemore_part1/Data/names.text`. Please go through `makemore_part1/buildding_makemore_step_by_step/step_1_data_exploration.ipynb` 
# to have a better idea of the data.

<img src="../Data/Images/makemore_2_model.png" alt="name_prediction_neural_network" width="500" height="400">

In [3]:
# The above image shows the model we are going to build. Some of the key points to note are:
#
# 1) We won't be considering the skip connections from input layer to the output layer.
# 2) We will be using character embeddings as input instead of word embeddings.
# 3) The input-output pairs are created from the names in the dataset.
#
# Let me briefly explain the model:
# The model takes in a sequence of characters as input and predicts the next character in the sequence. We will train the 
# model using the names in the dataset and later use the model to generate new names.
# 
# Model Architecture:
# The model consists of 3 layers:
# 1) Input Layer: The input layer is just the embeddings of the last `n` characters in the name.
# 2) Hidden Layer: This layer is a feedforward neural network connected to the input layer. The hidden layer has `h` units.
#                  This layer uses a `tanh` activation function.
# 3) Output Layer: The output of the hidden layer is connected to the output layer. The output layer has `d` units. The 
#                  output layer uses a `softmax` activation function.
#
# Input-Output representation:
#
# Input:
# As described above, we will use characters as input and predict the next character as output. 
# Out data contains 27 characters (26 alphabets + 1 BOUND_CHARACTER). We will use a `d` dimensional embedding to represent
# each character. The embeddings are learned during training along with the weights of the model.  
# The input to the model is the last `n` characters in a name and the target is the next character in the name. The 
# embeddings of the last `n` characters are concatenated and passed through a feedforward neural network. So, we don't 
# really use the order of the characters in the name to train this model.
#
# Output:
# Each neuron in the output layer represents a character. The output of the model is a probability distribution over the
# characters.

In [4]:
import string
import torch

In [None]:
# Path to the dataset used to train the model.
CLEANED_DATASET_PATH = '../../makemore_part1/Data/names.txt'
# Represents the end character  of a name.
BOUND_CHARACTER = '.'
# Number of characters used as input to the model.
BLOCK_SIZE = 3

In [6]:
with open(CLEANED_DATASET_PATH, 'r') as f:
    names = [name.strip() for name in f.readlines()]

names[:10]

['albonsha',
 'beenapreethi',
 'thushniha',
 'aakaksha',
 'dumeethran',
 'luhit',
 'valam',
 'harinyai',
 'sakthikaa',
 'kaveetha']

In [None]:
# This was already explained in makemore_part1 in detail.
#
# Create a mapping from characters to indices. 
char_to_idx = {char: idx + 1 for idx, char in enumerate(string.ascii_lowercase)}
char_to_idx[BOUND_CHARACTER] = 0
print(char_to_idx)
print("-" * 100)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
----------------------------------------------------------------------------------------------------
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [29]:
# Let's create input-output pairs from the names.
#  
# Holds the input pairs. List of lists -- Each inner list contains the last `BLOCK_SIZE` number of characters in a name.
inputs = []
# Holds the targets to be predicted -- Represents the next character after the block in a name.
targets = []

# This is based on a rolling window method. We start with window of BLOCK_SIZE characters and then remove the first
# character and append the current character to create a new window.
for name in names:
    # If name is 'virat', then bounded_name is 'virat.'
    bounded_name = name + BOUND_CHARACTER
    # This is the block containing the last 3 characters. At the start it is '...'
    current_block = [0] * BLOCK_SIZE
    for current_char in bounded_name:
        target_char_idx = char_to_idx[current_char]
        # Add current block to the inputs.
        inputs.append(current_block)
        targets.append(target_char_idx)
        # Removed the first character from the block.
        current_block = current_block[1:]
        # Append the current character to create a block for the next input-target pair.
        current_block.append(target_char_idx)

In [None]:
# Let's try to understand the output here.
# The first name is 'albonsha'. The input-target pairs created out of this name should be:
#
# ... : a  --  [0,0,0]    : 1
# ..a : l  --  [0,0,1]    : 12
# .al : b  --  [0,1,12]   : 2
# alb : o  --  [1,12,2]   : 15
# lbo : n  --  [12,2,15]  : 14
# bon : s  --  [2,15,14]  : 19
# ons : h  --  [15,14,19] : 8
# nsh : a  --  [14,19,8]  : 1
# sha : .  --  [19,8,1]   : 0
#
# The above is exactly what you see in the inputs and targets output below.
print(f"First 10 names in the dataset: {names[:10]}")
print("-" * 100)
print(f"First 10 inputs created from the names: {inputs[:10]}")
print("-" * 100)
print(f"First 10 targets created for the corresponding inputs: {targets[:10]}")

First 10 names in the dataset: ['albonsha', 'beenapreethi', 'thushniha', 'aakaksha', 'dumeethran', 'luhit', 'valam', 'harinyai', 'sakthikaa', 'kaveetha']
----------------------------------------------------------------------------------------------------
First 10 inputs created from the names: [[0, 0, 0], [0, 0, 1], [0, 1, 12], [1, 12, 2], [12, 2, 15], [2, 15, 14], [15, 14, 19], [14, 19, 8], [19, 8, 1], [0, 0, 0]]
----------------------------------------------------------------------------------------------------
First 10 targets created for the corresponding inputs: [1, 12, 2, 15, 14, 19, 8, 1, 0, 2]


In [32]:
# Both the inputs and targets should be of equal length.
print(f"Number of inputs: {len(inputs)}")
print(f"Number of targets: {len(targets)}")

Number of inputs: 545276
Number of targets: 545276


In [None]:
# We need to convert the inputs and targets to tensors which is what Pytorch primarily deals with.
input_tensors = torch.tensor(data=inputs, dtype=torch.float32)
target_tensors = torch.tensor(data=targets, dtype=torch.float32)

In [None]:
input_tensors[:10]

tensor([[ 0.,  0.,  0.],
        [ 0.,  0.,  1.],
        [ 0.,  1., 12.],
        [ 1., 12.,  2.],
        [12.,  2., 15.],
        [ 2., 15., 14.],
        [15., 14., 19.],
        [14., 19.,  8.],
        [19.,  8.,  1.],
        [ 0.,  0.,  0.]])

In [None]:
target_tensors[:10]

tensor([ 1., 12.,  2., 15., 14., 19.,  8.,  1.,  0.,  2.])

## Train, Validation, and Test Datasets

In [36]:
# The usual standard is that we split the data into train dataset, validation dataset, and test dataset.
# The usual split is:
# Train dataset      : 80% of the total data
# Validation dataset : 10% of the total data
# Test dataset       : 10% of the total data
# 
# Train dataset is used to train the model.
# Validation dataset is used to validate the trained model and identify the best performing hyper parameters.
# Test dataset is used to calculate the performance of the model -- This should be used once and only once. 

In [None]:
total_data_size = len(inputs)
total_data_size

545276

In [44]:
train_size = int(0.8 * total_data_size)
print(f"Number of examples in the train dataset: {train_size}")
print("-" * 100)
validation_size = int(0.1 * total_data_size)
print(f"Number of examples in the validation dataset: {validation_size}")
print("-" * 100)
test_size = total_data_size - train_size - validation_size
print(f"Number of examples in the test dataset: {test_size}")

Number of examples in the train dataset: 436220
----------------------------------------------------------------------------------------------------
Number of examples in the validation dataset: 54527
----------------------------------------------------------------------------------------------------
Number of examples in the test dataset: 54529


In [46]:
train_end = train_size
validation_end = train_size + validation_size

In [48]:
# It is easier to create the split before creating the tensors. 
X_train, Y_train = inputs[:train_end], targets[:train_end]
X_valid, Y_valid = inputs[train_end:validation_end], targets[train_end:validation_end]
X_test, Y_test = inputs[validation_end:], targets[validation_end]

In [50]:
print(f"Train set size: {X_train.shape[0]}")
print(f"Validation set size: {X_valid.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Train set size: 436220
Validation set size: 54527
Test set size: 54529


In [51]:
X_train[:10], Y_train[:10]

(tensor([[ 0.,  0.,  0.],
         [ 0.,  0.,  1.],
         [ 0.,  1., 12.],
         [ 1., 12.,  2.],
         [12.,  2., 15.],
         [ 2., 15., 14.],
         [15., 14., 19.],
         [14., 19.,  8.],
         [19.,  8.,  1.],
         [ 0.,  0.,  0.]]),
 tensor([ 1., 12.,  2., 15., 14., 19.,  8.,  1.,  0.,  2.]))