# Machine Translation Project

In [1]:
import helper
from keras.layers import GRU, LSTM, Input, Dense, TimeDistributed, Bidirectional, Dropout, SpatialDropout1D
from keras.models import Model
from keras.layers import Activation
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.layers import RepeatVector
from keras.layers.embeddings import Embedding

# Load English data
english_sentences = helper.load_data('data/small_vocab_en')
# Load French data
french_sentences = helper.load_data('data/small_vocab_fr')



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Dataset Loaded


### Files
Each line in `small_vocab_en` contains an English sentence with the respective translation in each line of `small_vocab_fr`.  View the first two lines from each file.

In [2]:
for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en Line 2:  the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_fr Line 2:  les Ã©tats-unis est gÃ©nÃ©ralement froid en juillet , et il gÃ¨le habituellement en novembre .



### Vocabulary
The complexity of the problem is determined by the complexity of the vocabulary. A more complex vocabulary is a more complex problem. Let's look at the complexity of the dataset.

In [3]:
import collections

english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"



## 2) Preprocess <a class="anchor" id="preprocess"></a>
For this project, I'll convert the text into sequences of integers using the following preprocess methods:
1. Tokenize the words into ids
2. Add padding to make all the sequences the same length.

### Tokenize
For a neural network to predict on text data, the data first has to be turned into data it can understand. Text data like "dog" is a sequence of ASCII character encodings.  Since a neural network is a series of multiplication and addition operations, the input data needs to be number(s).

We can turn each character into a number or each word into a number. These are called character and word ids, respectively. Character ids are used for character level models that generate text predictions for each character.  A word level model uses word ids that generate text predictions for each word. Word level models tend to learn better, since they are lower in complexity, so I'll use those.

In the following cell, `tokenize` turns each sentence into a sequence of words ids using Keras's [`Tokenizer`](https://keras.io/preprocessing/text/#tokenizer) function. Running the cell will run `tokenize` on sample data and show output for debugging.

In [4]:
import project_tests as tests
from keras.preprocessing.text import Tokenizer

def tokenize(x):
    """Tokenize x.
    
    Parameters:
    x: List of sentences/strings to be tokenized
    
    Returns: 
    Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

tests.test_tokenize(tokenize)

# Tokenize example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


### Padding
When batching the sequence of word ids together, each sequence needs to be the same length. Since sentences are dynamic in length, we can add padding to the end of the sequences to make them the same length.

Make sure all the English sequences have the same length and all the French sequences have the same length by adding padding to the **end** of each sequence using Keras's [`pad_sequences`](https://keras.io/preprocessing/sequence/#pad_sequences) function.

In [5]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def pad(x, length=None):
    """Pad x.
    Parameters:
    x: List of sequences
    length: Length to pad the sequence to. If None, use length of longest sequence in x
    
    Returns: 
    Padded numpy array of sequences
    """
    if length == None:
        longest_sequence = max(x, key=len)
        return pad_sequences(x, len(longest_sequence), padding='post')
    return pad_sequences(x, length, padding='post')

tests.test_pad(pad)

# Pad tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


### Preprocess Pipeline
The `preprocess` function tokenizes and pads feature and label sentences.

In [6]:
def preprocess(x, y):
    """Preprocess x and y.
    
    Parameters:
    x: Feature list of sentences
    y: Label list of sentences
    
    Returns: 
    Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)

print('Data Preprocessed')

Data Preprocessed


In [7]:
print (preproc_english_sentences.shape)
print (preproc_french_sentences.shape)

(137861, 15)
(137861, 21, 1)


## 3) Models <a class="anchor" id="models"></a>
In this section, I will experiment with various neural network architectures.
I will begin by training four relatively simple architectures.
- Model 1 is a simple RNN
### Ids Back to Text
The neural network will be translating the input to words ids, which isn't the final form we want. We want the French translation. The function `logits_to_text` will bridge the gap between the logits from the neural network to the French translation.

In [8]:
def logits_to_text(logits, tokenizer):
    """Turn logits from a neural network into text using the tokenizer.
    
    Parameters:
    logits: Logits from a neural network
    tokenizer: Keras Tokenizer fit on the labels
    
    Returns: 
    String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    print ("logist shape: ", logits.shape)

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [9]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and train a basic RNN on x and y.
    Parameters:
    input_shape: Tuple of input shape
    output_sequence_length: Length of output sequence
    english_vocab_size: Number of unique English words in the dataset
    french_vocab_size: Number of unique French words in the dataset
    
    Returns: 
    Keras model built, but not trained
    """
    learning_rate = 0.01
    inputs = Input(shape=input_shape[1:])
    x = GRU(512, return_sequences=True)(inputs)
    x = TimeDistributed(Dense(french_vocab_size, activation='relu'))(x)
    predictions = Activation('softmax')(x)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model
tests.test_simple_model(simple_model)


# Reshape the input to work with a basic RNN
tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

# Train the neural network
simple_rnn_model = simple_model(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index) + 1, # Add 1 because padding introduces 0
    len(french_tokenizer.word_index) + 1) # Add 1 because padding introduces 0
simple_rnn_model.summary()
simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

# Print prediction(s)
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 21, 1)             0         
_________________________________________________________________
gru_2 (GRU)                  (None, 21, 512)           789504    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 21, 346)           177498    
_________________________________________________________________
activation_2 (Activation)    (None, 21, 346)           0         
Total params: 967,002
Trainable params: 967,002
Non-trainable params: 0
_________________________________________________________________
Train on 110288 samples, validate on 27573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
logist shape:  (21, 346)
paris <PAD> est parfois chaud en mois et l' est chaud en avril <PAD> <PAD> <PAD> 

In [10]:
simple_rnn_model_scores = simple_rnn_model.evaluate(tmp_x, preproc_french_sentences, verbose=0)
print("Model Accuracy: %.2f%%" % (simple_rnn_model_scores[1]*100))

Model Accuracy: 70.66%
