# How can machines read and understand texts?
- The vocabulary consists of a list of all unique tokens that the model should recognize.
Why?
- Tokenization involves converting each token in the text into a unique integer identifier, or index, using a dictionary called a vocabulary.
- Dealing with Variable-Length Inputs: 
- Padding
- Padding involves supplementing shorter sequences with extra tokens (typically designated by the index 0) to match the length of the longest sequence in the batch.



In [2]:
class Tokenizer:
    """
    Character-level tokenizer over a fixed vocabulary.

    In index order the vocabulary is: the '<pad>' token, the digits 0-9,
    the lowercase letters a-z, '.' and ' '.
    """

    def __init__(self):
        # Forward (symbol -> index) and backward (index -> symbol) lookups.
        self.dictionary = {}
        self.reverse_dictionary = {}

        # Registration order fixes the indices, so '<pad>' comes first and
        # receives index 0, followed by digits, letters and punctuation.
        digits = [str(number) for number in range(10)]
        letters = [chr(ord('a') + offset) for offset in range(26)]
        for symbol in ['<pad>', *digits, *letters, '.', ' ']:
            self.__add_to_dict(symbol)

    def get_vocabulary(self):
        """
        Returns the symbol -> index dictionary (the live object, not a copy).
        """
        return self.dictionary

    def __add_to_dict(self, character):
        """
        Registers `character` under the next free index unless it is already known.
        """
        if character in self.dictionary:
            return
        index = len(self.dictionary)
        self.dictionary[character] = index
        self.reverse_dictionary[index] = character

    def tokenize(self, text):
        """
        Maps every character of `text` to its index. A character that is not
        in the vocabulary raises KeyError.
        """
        lookup = self.dictionary
        return [lookup[symbol] for symbol in text]

    def character_to_token(self, character):
        """
        Returns the index of a single vocabulary entry (KeyError if unknown).
        """
        return self.dictionary[character]

    def token_to_character(self, token):
        """
        Returns the vocabulary entry stored under index `token` (KeyError if unknown).
        """
        return self.reverse_dictionary[token]

    def size(self):
        """
        Returns the number of entries in the vocabulary.
        """
        return len(self.dictionary)

In [3]:
# Build a tokenizer and show both sides of its vocabulary mapping:
# first the integer indices, then the symbols they belong to.
tokenize = Tokenizer()
vocabulary = tokenize.get_vocabulary()
print(
    "Vocabulary - values: {0}".format(vocabulary.values()),
    "Vocabulary - Keys: {0}".format(vocabulary.keys()),
    sep="\n",
)

Vocabulary - values: dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38])
Vocabulary - Keys: dict_keys(['<pad>', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', ' '])


In [4]:
# Training corpus: thirteen short lowercase sentences glued into a single
# string, with '. ' between consecutive sentences (no trailing period).
training_data = '. '.join((
    'cats rule the world',
    'dogs are the best',
    'elephants have long trunks',
    'monkeys like bananas',
    'pandas eat bamboo',
    'tigers are dangerous',
    'zebras have stripes',
    'lions are the kings of the savannah',
    'giraffes have long necks',
    'hippos are big and scary',
    'rhinos have horns',
    'penguins live in the arctic',
    'polar bears are white',
))

In [8]:
training_data

'cats rule the world. dogs are the best. elephants have long trunks. monkeys like bananas. pandas eat bamboo. tigers are dangerous. zebras have stripes. lions are the kings of the savannah. giraffes have long necks. hippos are big and scary. rhinos have horns. penguins live in the arctic. polar bears are white'

In [5]:
# Tokenize the training data: build a fresh Tokenizer and map every character
# of training_data to its integer index in the tokenizer's vocabulary.
# Tokenizer.tokenize raises KeyError for any character outside that vocabulary.
tokenizer = Tokenizer()
tokenized_training_data = tokenizer.tokenize(training_data)

In [6]:
tokenized_training_data

[13,
 11,
 30,
 29,
 38,
 28,
 31,
 22,
 15,
 38,
 30,
 18,
 15,
 38,
 33,
 25,
 28,
 22,
 14,
 37,
 38,
 14,
 25,
 17,
 29,
 38,
 11,
 28,
 15,
 38,
 30,
 18,
 15,
 38,
 12,
 15,
 29,
 30,
 37,
 38,
 15,
 22,
 15,
 26,
 18,
 11,
 24,
 30,
 29,
 38,
 18,
 11,
 32,
 15,
 38,
 22,
 25,
 24,
 17,
 38,
 30,
 28,
 31,
 24,
 21,
 29,
 37,
 38,
 23,
 25,
 24,
 21,
 15,
 35,
 29,
 38,
 22,
 19,
 21,
 15,
 38,
 12,
 11,
 24,
 11,
 24,
 11,
 29,
 37,
 38,
 26,
 11,
 24,
 14,
 11,
 29,
 38,
 15,
 11,
 30,
 38,
 12,
 11,
 23,
 12,
 25,
 25,
 37,
 38,
 30,
 19,
 17,
 15,
 28,
 29,
 38,
 11,
 28,
 15,
 38,
 14,
 11,
 24,
 17,
 15,
 28,
 25,
 31,
 29,
 37,
 38,
 36,
 15,
 12,
 28,
 11,
 29,
 38,
 18,
 11,
 32,
 15,
 38,
 29,
 30,
 28,
 19,
 26,
 15,
 29,
 37,
 38,
 22,
 19,
 25,
 24,
 29,
 38,
 11,
 28,
 15,
 38,
 30,
 18,
 15,
 38,
 21,
 19,
 24,
 17,
 29,
 38,
 25,
 16,
 38,
 30,
 18,
 15,
 38,
 29,
 11,
 32,
 11,
 24,
 24,
 11,
 18,
 37,
 38,
 17,
 19,
 28,
 11,
 16,
 16,
 15,
 29,
 38,
 18,
 11,


In [7]:
# Shallow copy of the token list. list() replaces the element-by-element
# append loop and yields the same list contents.
l = list(tokenized_training_data)
# Bare expression so the notebook displays the copy.
l

[13,
 11,
 30,
 29,
 38,
 28,
 31,
 22,
 15,
 38,
 30,
 18,
 15,
 38,
 33,
 25,
 28,
 22,
 14,
 37,
 38,
 14,
 25,
 17,
 29,
 38,
 11,
 28,
 15,
 38,
 30,
 18,
 15,
 38,
 12,
 15,
 29,
 30,
 37,
 38,
 15,
 22,
 15,
 26,
 18,
 11,
 24,
 30,
 29,
 38,
 18,
 11,
 32,
 15,
 38,
 22,
 25,
 24,
 17,
 38,
 30,
 28,
 31,
 24,
 21,
 29,
 37,
 38,
 23,
 25,
 24,
 21,
 15,
 35,
 29,
 38,
 22,
 19,
 21,
 15,
 38,
 12,
 11,
 24,
 11,
 24,
 11,
 29,
 37,
 38,
 26,
 11,
 24,
 14,
 11,
 29,
 38,
 15,
 11,
 30,
 38,
 12,
 11,
 23,
 12,
 25,
 25,
 37,
 38,
 30,
 19,
 17,
 15,
 28,
 29,
 38,
 11,
 28,
 15,
 38,
 14,
 11,
 24,
 17,
 15,
 28,
 25,
 31,
 29,
 37,
 38,
 36,
 15,
 12,
 28,
 11,
 29,
 38,
 18,
 11,
 32,
 15,
 38,
 29,
 30,
 28,
 19,
 26,
 15,
 29,
 37,
 38,
 22,
 19,
 25,
 24,
 29,
 38,
 11,
 28,
 15,
 38,
 30,
 18,
 15,
 38,
 21,
 19,
 24,
 17,
 29,
 38,
 25,
 16,
 38,
 30,
 18,
 15,
 38,
 29,
 11,
 32,
 11,
 24,
 24,
 11,
 18,
 37,
 38,
 17,
 19,
 28,
 11,
 16,
 16,
 15,
 29,
 38,
 18,
 11,


In [11]:
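# NOTE: this padding step is currently disabled (commented out). It needs
# `max_sequence_length` to be defined before it can run. The leading 0 tokens
# in the output of the next cell suggest it was executed at least once before
# being commented out -- TODO confirm.
# for _ in range(max_sequence_length):
#     # Prepend padding tokens
#     tokenized_training_data.insert(0, tokenizer.character_to_token('<pad>'))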

In [12]:
tokenized_training_data

[0,
 0,
 0,
 0,
 0,
 0,
 13,
 11,
 30,
 29,
 38,
 28,
 31,
 22,
 15,
 38,
 30,
 18,
 15,
 38,
 33,
 25,
 28,
 22,
 14,
 37,
 38,
 14,
 25,
 17,
 29,
 38,
 11,
 28,
 15,
 38,
 30,
 18,
 15,
 38,
 12,
 15,
 29,
 30,
 37,
 38,
 15,
 22,
 15,
 26,
 18,
 11,
 24,
 30,
 29,
 38,
 18,
 11,
 32,
 15,
 38,
 22,
 25,
 24,
 17,
 38,
 30,
 28,
 31,
 24,
 21,
 29,
 37,
 38,
 23,
 25,
 24,
 21,
 15,
 35,
 29,
 38,
 22,
 19,
 21,
 15,
 38,
 12,
 11,
 24,
 11,
 24,
 11,
 29,
 37,
 38,
 26,
 11,
 24,
 14,
 11,
 29,
 38,
 15,
 11,
 30,
 38,
 12,
 11,
 23,
 12,
 25,
 25,
 37,
 38,
 30,
 19,
 17,
 15,
 28,
 29,
 38,
 11,
 28,
 15,
 38,
 14,
 11,
 24,
 17,
 15,
 28,
 25,
 31,
 29,
 37,
 38,
 36,
 15,
 12,
 28,
 11,
 29,
 38,
 18,
 11,
 32,
 15,
 38,
 29,
 30,
 28,
 19,
 26,
 15,
 29,
 37,
 38,
 22,
 19,
 25,
 24,
 29,
 38,
 11,
 28,
 15,
 38,
 30,
 18,
 15,
 38,
 21,
 19,
 24,
 17,
 29,
 38,
 25,
 16,
 38,
 30,
 18,
 15,
 38,
 29,
 11,
 32,
 11,
 24,
 24,
 11,
 18,
 37,
 38,
 17,
 19,
 28,
 11,
 16,
 16,
 

In [14]:
import torch

class TokenEmbedding(torch.nn.Module):
    """
    Lookup table that maps integer token ids to learned vectors of width d_model.

    Expected input shape: (batch_size, sequence_length)
    Produced output shape: (batch_size, sequence_length, d_model)
    """

    def __init__(self, d_model, number_of_tokens):
        super().__init__()
        # One trainable row of width d_model for every token id.
        self.embedding_layer = torch.nn.Embedding(number_of_tokens, d_model)

    def forward(self, x):
        """
        Replaces every token id in x with its embedding vector.
        """
        embedded = self.embedding_layer(x)
        return embedded

In [15]:
class PositionalEncoding(torch.nn.Module):
    """
    PyTorch module that adds sinusoidal positional encodings to its input.

    A fixed (max_sequence_length, d_model) matrix is computed once at construction
    time and added to the input embeddings in forward() to give the model a sense
    of the position of each sequence element.

    The matrix is stored as a non-persistent buffer named `positional_encoding`:
    it follows the module across `.to(device)` calls but is not written to the
    state_dict, so the set of state_dict keys stays empty for this module.
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.d_model = d_model
        self.max_sequence_length = max_sequence_length
        self.register_buffer(
            "positional_encoding",
            self.create_positional_encoding(),
            persistent=False,
        )

    def create_positional_encoding(self):
        """
        Creates a positional encoding matrix of size (max_sequence_length, d_model).

        For position `pos` and even column index `i`:
            matrix[pos, i]     = sin(pos / 10000 ** (i / d_model))
            matrix[pos, i + 1] = cos(pos / 10000 ** (i / d_model))

        `i` already steps over the even columns (0, 2, 4, ...), i.e. it equals the
        "2k" of the usual sinusoidal formula, so the exponent is i / d_model and
        must not be doubled again.

        Returns a float32 tensor.
        """
        # Work in float64 and down-cast once at the end to limit rounding error.
        positions = torch.arange(self.max_sequence_length, dtype=torch.float64).unsqueeze(1)
        even_columns = torch.arange(0, self.d_model, 2, dtype=torch.float64)
        divisors = torch.pow(10000.0, even_columns / self.d_model)

        # angles[pos, k] is the sin/cos argument shared by columns 2k and 2k + 1.
        angles = positions / divisors

        positional_encoding = torch.zeros(
            self.max_sequence_length, self.d_model, dtype=torch.float64
        )
        positional_encoding[:, 0::2] = torch.sin(angles)
        # With an odd d_model the last even column has no odd partner, so only
        # the first d_model // 2 angle columns receive a cosine.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])

        return positional_encoding.float()

    def forward(self, x):
        """
        Adds the positional encoding to the input embeddings at the corresponding positions.

        The code slices by x.size(1) and adds a (sequence_length, d_model) matrix,
        so x is assumed to be (batch_size, sequence_length, d_model) with
        sequence_length <= max_sequence_length.
        """
        # x.size(1) is the sequence length (x.size(0) is the batch size), so the slice
        # keeps only as many positional rows as there are positions in this batch.
        # The result broadcasts over the batch dimension.
        return x + self.positional_encoding[:x.size(1), :]