In [2]:
%cd /content/drive/My Drive/Colab Notebooks/nlp/apps/language_models

/content/drive/My Drive/Colab Notebooks/nlp/apps/language_models


# What we need to do

- You will start by converting a line of text into a tensor
- Then you will create a generator to feed data into the model
- You will train a neural network to predict the next set of characters of a defined length.
- You will use embeddings for each character and feed them as inputs to your model.
    - Many natural language tasks rely on using embeddings for predictions.

- Your model will convert each character to its embedding, run the embeddings through a Gated Recurrent Unit (GRU), and then through a linear layer to predict the next set of characters.

- You will get the embeddings;
- Stack the embeddings on top of each other;
- Run them through two layers with a relu activation in the middle;
- Finally, you will compute the softmax. 

To predict the next character:
- Use the softmax output and identify the character with the highest probability.
- The character with the highest probability is the prediction for the next character.

In [3]:
!pip install -q -U trax

[K     |████████████████████████████████| 471kB 9.0MB/s 
[K     |████████████████████████████████| 2.6MB 15.7MB/s 
[K     |████████████████████████████████| 174kB 47.1MB/s 
[K     |████████████████████████████████| 3.7MB 40.6MB/s 
[K     |████████████████████████████████| 71kB 8.5MB/s 
[K     |████████████████████████████████| 1.1MB 26.5MB/s 
[K     |████████████████████████████████| 348kB 43.0MB/s 
[K     |████████████████████████████████| 1.4MB 38.4MB/s 
[K     |████████████████████████████████| 2.9MB 46.3MB/s 
[K     |████████████████████████████████| 890kB 39.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [50]:
import trax
import trax.fastmath.numpy as np
import numpy
import random
from trax import fastmath
from trax import layers as tl

# Get the data

We will treat the Sherlock novels as our data. We will treat each line as a sentence and, because we are going to predict characters instead of words, we need to convert each sentence into characters. After this, each line of characters is stored in a list. In other words, we are going to have a list of lists of characters.       
Finally, we will create a generator that takes `batch_size` and `max_length`, where `max_length` is the length every sentence is padded to; sentences that do not fit within it are skipped.



In [6]:
# Locations of the text corpora on the mounted Google Drive.
# NOTE(review): these are absolute Colab paths — adjust them when running elsewhere.
# Full corpus (judging by the file name, all the Sherlock novels).
path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/sherlock_novels.txt'
# Single-novel file (judging by the file name); this is the one loaded by the cells below.
testing_path = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/data/study in scarlet.txt'

In [12]:
def get_sentences(path):
    """
    Load a text file and hand back its lines as a list.

    Every element is one line of the file (treated as a sentence),
    with its trailing newline still attached.
    """
    with open(path) as text_file:
        # iterating a text file yields it line by line
        return list(text_file)


# Preprocess


In [24]:
def preprocess(sentences):
    """
    Normalise every sentence of the list in place: surrounding
    whitespace is removed and the text is lowercased.

    The very same list object is returned for convenience.
    """
    for position in range(len(sentences)):
        cleaned = sentences[position].strip()
        sentences[position] = cleaned.lower()

    return sentences


# Create training and validation sets

In [49]:
def create_train_val(sentences):
    """
    Split the sentences, keeping their order, into a training part
    (the first 90%) and a validation part (the remaining 10%).

    Returns the pair (train, validation).
    """
    # index of the first validation sentence (rounded down)
    split_at = int(len(sentences) * 0.9)

    return sentences[:split_at], sentences[split_at:]

# Convert sentences to tensors

Now, we need to convert our sentences into numbers so that we can feed them into our model.

In [43]:
def sentence2tensor(sentence, end_token=1):
    """
    Encode a sentence as a list of integers: the Unicode code point
    of every character, followed by the end token.
    """
    codes = list(map(ord, sentence))

    # the end token marks where the sentence stops
    return codes + [end_token]

    
    


# Generate batches

We will convert our text sentences into numpy arrays and add padding to each sentence. The amount of padding is determined by `max_length`: every sentence is padded with zeros up to that length.

The batch is a tuple with three values: inputs, targets, mask. Mask will be 1 for all non-padding tokens.

In [None]:
def generate_batch(batch_size, max_length, sentences, sentence2tensor=sentence2tensor, shuffle=True):
    """
    Endless generator of padded batches built from a list of sentences.

    Sentences are visited in (optionally shuffled) order. Once every
    sentence has been visited, the order is reshuffled (if requested) and
    iteration starts over, so the generator never stops on its own.
    Sentences with `max_length` characters or more are skipped.

    Args:
        batch_size: number of sentences per batch (must be at least 1).
        max_length: length every tensor is padded to with zeros.
        sentences: list of sentences (strings).
        sentence2tensor: function turning a sentence into a list of
            integers. With the default one (characters + end token) every
            accepted sentence fits into `max_length` positions.
        shuffle: whether to visit the sentences in random order.

    Yields:
        A tuple (inputs, targets, mask). `inputs` and `targets` are the same
        array holding one padded tensor per row; `mask` has the same layout
        and holds 1 where the token is non-zero and 0 elsewhere (padding).

    Raises:
        ValueError: on the first `next()` call, if `batch_size` is smaller
            than 1 or if no sentence is shorter than `max_length` (in both
            cases a batch could never be completed).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    num_sentences = len(sentences)

    # Without at least one usable sentence the loop below would never fill a
    # batch and would spin forever without yielding.
    if not any(len(sentence) < max_length for sentence in sentences):
        raise ValueError("no sentence is shorter than max_length")

    index = 0
    current_batch = []

    # create a list with the indexes of the sentences that can be shuffled
    sentences_index = list(range(num_sentences))

    if shuffle:
        random.shuffle(sentences_index)

    while True:
        if index >= num_sentences:
            # start over (with a new order) once all the sentences were used
            index = 0

            if shuffle:
                random.shuffle(sentences_index)

        # get a sentence
        sentence = sentences[sentences_index[index]]

        # strictly shorter: one position is needed for the end token
        if len(sentence) < max_length:
            current_batch.append(sentence)

        index += 1

        # check if we already have our desired batch_size
        if len(current_batch) == batch_size:
            batch = []
            mask = []
            for batch_sentence in current_batch:
                # convert the batch sentence to a tensor
                tensor = sentence2tensor(batch_sentence)

                # add the padding
                pad = [0] * (max_length - len(tensor))
                tensor_padded = tensor + pad

                batch.append(tensor_padded)
                # 1 for every non-zero token, 0 for the zeros (padding)
                mask.append([0 if token == 0 else 1 for token in tensor_padded])

            # convert the padded tensors into trax arrays
            trax_batch = np.array(batch)
            trax_mask = np.array(mask)

            # the batch is yielded twice: as inputs and as targets
            yield trax_batch, trax_batch, trax_mask

            # reset current_batch to an empty list
            current_batch = []
            

    


In [45]:
 sentences = get_sentences(testing_path)
# ^ Raw lines of the testing corpus, one list entry per line of the file.
# NOTE(review): the statement above starts with a stray space. IPython
# tolerates it, plain Python would raise an IndentationError — remove it.

# Strip and lowercase every line, then split into train (90%) / validation (10%).
sentences = preprocess(sentences)
train, val = create_train_val(sentences)

# Sanity check: encode one training sentence and show its integer codes.
# NOTE(review): index 1000 assumes the training split has more than 1000 lines.
tensor = sentence2tensor(train[1000])
print(tensor)


[34, 116, 104, 101, 32, 102, 105, 110, 103, 101, 114, 32, 110, 97, 105, 108, 115, 32, 97, 110, 100, 32, 116, 104, 101, 32, 116, 114, 105, 99, 104, 105, 110, 111, 112, 111, 108, 121, 44, 34, 32, 105, 32, 115, 117, 103, 103, 101, 115, 116, 101, 100, 46, 1]
