In [1]:
# Work from the project folder so that relative paths used later
# (e.g. train_model's default output_dir='models/') resolve against it.
%cd /content/drive/My Drive/Colab Notebooks/nlp/apps/language_models

/content/drive/My Drive/Colab Notebooks/nlp/apps/language_models


# What we need to do

- You will start by converting a line of text into a tensor
- Then you will create a generator to feed data into the model
- You will train a neural network in order to predict the new set of characters of defined length.
- You will use embeddings for each character and feed them as inputs to your model.
    - Many natural language tasks rely on using embeddings for predictions.

- Your model will convert each character to its embedding, run the embeddings through a Gated Recurrent Unit GRU, and run it through a linear layer to predict the next set of characters.

- You will get the embeddings;
- Stack the embeddings on top of each other;
- Run them through two layers with a relu activation in the middle;
- Finally, you will compute the softmax. 

To predict the next character:
- Use the softmax output and identify the character with the highest probability.
- The character with the highest probability is the prediction for the next character.

In [2]:
!pip install -q -U trax

[K     |████████████████████████████████| 471kB 13.6MB/s 
[K     |████████████████████████████████| 2.6MB 55.6MB/s 
[K     |████████████████████████████████| 174kB 62.1MB/s 
[K     |████████████████████████████████| 1.1MB 59.3MB/s 
[K     |████████████████████████████████| 71kB 11.9MB/s 
[K     |████████████████████████████████| 3.7MB 50.7MB/s 
[K     |████████████████████████████████| 1.4MB 49.0MB/s 
[K     |████████████████████████████████| 348kB 51.2MB/s 
[K     |████████████████████████████████| 2.9MB 50.3MB/s 
[K     |████████████████████████████████| 890kB 32.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [3]:
import trax
import trax.fastmath.numpy as np
import numpy
import random
import itertools
from trax import fastmath



# Get the data

We will treat the Sherlock novels as our data and treat each line as a sentence. Because we are going to predict characters instead of words, we need to convert each sentence into characters. After this, each line of characters is stored in a list; in other words, we end up with a list of lists of characters.       
Finally, we will create a generator that takes the batch_size and max_length, where max_length is the length of the longest sentence.



In [4]:
# Text file with the full corpus (presumably all the Sherlock novels).
path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/sherlock_novels.txt'
# Text file that the training-preparation cell further down actually loads.
testing_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/study in scarlet.txt'
# Folder meant for the model output.
# NOTE(review): the visible train_model call does not pass this value, so
# train_model falls back to its default 'models/'. Also, the %cd cell spells
# the mount 'My Drive' while these paths use 'MyDrive' - confirm both resolve.
output_dir = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/language_models/models'

In [5]:
def get_sentences(path):
    """
    Loads a text file and returns its lines as a list of strings.

    Each line is treated as one sentence and is returned untouched,
    i.e. it still carries its trailing newline character.
    """
    with open(path) as text_file:
        # iterating over a text file yields it line by line
        return list(text_file)

def get_max_length(sentences):
    """
    Finds the longest sentence of the list.

    Returns a tuple (length, sentence). When several sentences share
    the maximum length, the first one is returned.
    """
    longest = max(sentences, key=len)
    return len(longest), longest


# Preprocess



In [6]:
def preprocess(sentences):
    """
    Strips surrounding whitespace from every sentence and lowercases it.

    The list is modified in place and the very same list object is
    returned, so the caller's original list is cleaned as well.
    """
    for position in range(len(sentences)):
        cleaned = sentences[position].strip()
        sentences[position] = cleaned.lower()

    return sentences


# Create train and validation sets

In [7]:
def create_train_val(sentences):
    """
    Splits the sentences, in their current order, into a train part
    (first 90%) and a validation part (remaining 10%).

    Returns the tuple (train, validation).
    """
    # position of the first validation sentence (rounded down)
    split_at = int(len(sentences) * 0.9)

    return sentences[:split_at], sentences[split_at:]

# Convert sentences to tensors

Now we need to convert our sentences into numbers so that we can feed them into our model.

In [8]:
def sentence2tensor(sentence, end_token=1):
    """
    Encodes a sentence as a list of integers: one code point (ord)
    per character, followed by end_token to mark the end of the sentence.
    """
    return [*map(ord, sentence), end_token]

    
    


# Generate batches

We will convert our text sentences into numpy arrays and add padding to each sentence. The amount of padding is determined by max_length, the length of the longest sentence in our corpus.

The batch is a tuple with three values: inputs, targets, mask. Mask will be 1 for all non-padding tokens.

In [9]:
def generate_batch(batch_size, max_length, sentences, sentence2tensor=sentence2tensor, shuffle=True):
    """
    Endless generator of padded, fixed-size batches.

    Args:
        batch_size: int. number of sentences per batch (at least 1)
        max_length: int. length every sentence is padded to. Only sentences
            with len(sentence) < max_length are used, which leaves room for
            the end token added by the default sentence2tensor
        sentences: list of str
        sentence2tensor: func. converts a sentence into a list of ints
        shuffle: bool. visit the sentences in random order; the order is
            re-shuffled after every full pass over the sentences
    yields:
        (inputs, targets, mask): inputs and targets are the same array with
        one padded sentence per row; mask has the same layout and is 1 for
        every real token (end token included) and 0 for the padding.
    raises:
        ValueError: when the first batch is requested, if batch_size < 1 or
            if no sentence is shorter than max_length. Without this check
            the loop below would never be able to fill a batch.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if not any(len(sentence) < max_length for sentence in sentences):
        raise ValueError(
            f"no sentence is shorter than max_length={max_length}; "
            "cannot build a batch"
        )

    index = 0
    current_batch = []
    num_sentences = len(sentences)

    # create an array with the indexes of sentences that can be shuffled
    sentences_index = [*range(num_sentences)]

    if shuffle:
        random.shuffle(sentences_index)

    while True:
        if index >= num_sentences:
            # reset index if we used all the sentences
            index = 0

            if shuffle:
                random.shuffle(sentences_index)

        # get a sentence
        sentence = sentences[sentences_index[index]]
        index += 1

        # sentences that would not fit (with their end token) are skipped
        if len(sentence) < max_length:
            current_batch.append(sentence)

        # check if we already have our desired batch_size
        if len(current_batch) == batch_size:
            batch = []
            mask = []
            for batch_sentence in current_batch:
                # convert the batch sentence to a tensor
                tensor = sentence2tensor(batch_sentence)

                # add the padding
                n_pad = max_length - len(tensor)
                batch.append(tensor + [0] * n_pad)

                # mask by position, not by value, so that a real token
                # that happens to be 0 is not mistaken for padding
                mask.append([1] * len(tensor) + [0] * n_pad)

            # convert the padded lists into trax tensors
            trax_batch = np.array(batch)
            trax_mask = np.array(mask)

            # yield two copies of the batch and mask
            yield trax_batch, trax_batch, trax_mask

            # reset current_batch to an empty list
            current_batch = []
                


Once we have our function to generate batches, we need a way to cycle over them so that we can train for multiple epochs. One way to do it is with the itertools.cycle() function.

```python
import itertools

infinite_generator = itertools.cycle(generate_batch(batch_size, max_length, sentences))
```

# Create the model

In [10]:
def create_model(vocab_size=256, emb_dim=300, n_layers=2, mode='train'):
    """
    Builds the character-level GRU language model.

    The layers are applied in this order: ShiftRight, Embedding,
    n_layers GRU layers, Dense and LogSoftmax.
    Args:
        vocab_size: int. number of distinct characters; it is the size of
            the embedding table and of the final Dense layer
        emb_dim: int. depth of the embeddings, also used as the number of
            units of every GRU layer
        n_layers: int. number of GRU layers
        mode: str. passed to the ShiftRight layer
    returns:
        model: a trax Serial model
    """
    tl = trax.layers

    stack = [tl.ShiftRight(mode=mode), tl.Embedding(vocab_size, emb_dim)]
    stack += [tl.GRU(emb_dim) for _ in range(n_layers)]
    stack += [tl.Dense(vocab_size), tl.LogSoftmax()]

    return tl.Serial(*stack)



# Training

In [11]:
# Load the raw lines. Note that this reads `testing_path`, not `path`.
sentences = get_sentences(testing_path)
# preprocess rewrites the list it receives, so from here on `sentences`
# and `pre_sentences` are the same (cleaned) list object.
pre_sentences = preprocess(sentences)
# get_max_length returns (length, sentence); only the length is needed
max_length, _ = get_max_length(pre_sentences)
batch_size = 32
# first 90% of the lines for training, last 10% for validation (no shuffling)
train, val = create_train_val(pre_sentences)


In [12]:
# num_sentences = len(pre_sentences)
# print(num_sentences)
# print(int(num_sentences / batch_size))

def train_model(model, train_sentences, val_sentences, generate_batch,
                max_lenght, learning_rate=0.0001, batch_size=32, epochs=1, output_dir='models/'):
    """
    It trains our trax model
    Args:
        model: trax model
        train_sentences: list. sentences used for the training task
        val_sentences: list. sentences used for the evaluation task
        generate_batch: func. called as
            generate_batch(batch_size, max_length, sentences); must return
            an iterator of (inputs, targets, mask) batches
        max_lenght: int. the length every sentence is padded to (the length
            of the longest sentence). The misspelled name is kept so that
            existing keyword callers keep working
        learning_rate: float. learning rate of the Adam optimizer
        batch_size: int
        epochs: int. passed to the loop as n_steps, i.e. it is the number
            of training steps to run
        output_dir: str. folder handed to the trax training loop
    returns:
        a trax Training loop for the model.
    """
    def batch_stream(sentences):
        # Never-ending stream of batches. The generator is simply created
        # again when it runs out; itertools.cycle is avoided on purpose
        # because it stores a copy of every batch it has seen, which grows
        # without bound when the wrapped generator never finishes.
        while True:
            produced = False
            for batch in generate_batch(batch_size, max_lenght, sentences):
                produced = True
                yield batch
            if not produced:
                # an empty generator would otherwise make this loop spin forever
                return

    # use the function arguments (not notebook globals), so the training
    # task only ever sees the training sentences
    train_generator = batch_stream(train_sentences)
    val_generator = batch_stream(val_sentences)

    train_task = trax.supervised.training.TrainTask(
        labeled_data=train_generator,
        loss_layer=trax.layers.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(learning_rate)
    )

    val_task = trax.supervised.training.EvalTask(
        labeled_data=val_generator,
        metrics=[trax.layers.CrossEntropyLoss(), trax.layers.Accuracy()],
        n_eval_batches=3
    )

    training_loop = trax.supervised.training.Loop(model, train_task,
                                                  eval_tasks=val_task, output_dir=output_dir)

    training_loop.run(n_steps=epochs)

    return training_loop


In [None]:
%%timeit
training_loop = train_model(create_model(), train, val, generate_batch, max_length, epochs=1)

In [None]:
# sentences = get_sentences(path)
# # n_sen = len(sen)
# # print(f"Number of sen: {n_sen}")
# # print(f"Sample line at position 0 {sen[0]}")
# # print(f"Sample line at position 1000 {sen[1000]}")

# sentences = preprocess(sentences)
# train, val = create_train_val(sentences)
# # print(len(train))
# # print(len(val))

# tensor = sentence2tensor(train[1000])
# # print(tensor)

# max_length, sen = get_max_length(sentences)
# print(max_length)
# print(sen)