In [183]:
import numpy as np
import pandas as pd


from tensorflow.keras.layers import Dense, Input, LSTM, Bidirectional 
from tensorflow.keras.models import Sequential, Model

# Run configuration (everything a reader might want to tune lives here).
SAMPLE_SIZE = 10_000  # upper bound on the number of corpus lines that are used
LATENT_DIMS = 256     # number of units in the encoder and decoder LSTM layers
EPOCHS = 20           # value passed as `epochs` to model.fit
BATCH_SIZE = 64       # value passed as `batch_size` to model.fit

## Data Preprocessing

In [41]:
# Load the raw corpus as a list of lines (each line is later split on '\t').
lines = None

with open('dataset/deu.txt', encoding='utf-8') as corpus_file:
    raw_corpus = corpus_file.read()

lines = raw_corpus.split('\n')

In [55]:
# Collect the sentence pairs (input texts / output texts) and the set of
# characters that occur on each side.
input_texts = []
output_texts = []
input_chars = set()
output_chars = set()

# Slicing clamps to the number of available lines on its own, so no explicit
# `min(...)` against `len(lines)` is required.
for line in lines[:SAMPLE_SIZE]:
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank line (e.g. the empty string left after the file's trailing
        # newline) or a line without a second column: nothing to pair up.
        continue

    english = fields[0]
    # Wrap the target in '\t' (start-of-sequence marker) and '\n'
    # (end-of-sequence marker).
    german = '\t' + fields[1] + '\n'

    input_texts.append(english)
    output_texts.append(german)

    # A string is already an iterable of characters, so it can be fed to
    # set.update directly.
    input_chars.update(english)
    output_chars.update(german)

In [58]:
# Freeze both vocabularies into sorted lists so each character gets a stable
# position that can be used as its one-hot index.
input_chars = sorted(input_chars)
output_chars = sorted(output_chars)

# Vocabulary sizes.
num_encoder_tokens, num_decoder_tokens = len(input_chars), len(output_chars)

# Length, in characters, of the longest text on each side.
max_encoder_seq_len = max(map(len, input_texts))
max_decoder_seq_len = max(map(len, output_texts))

In [61]:
# Print the corpus statistics, one per line.
print(
    f'Number of input texts : {len(input_texts)}',
    f'Number of encoder tokens : {num_encoder_tokens}',
    f'Number of decoder tokens : {num_decoder_tokens}',
    f'Maximum number of input sequences : {max_encoder_seq_len}',
    f'Maximum number of output sequences : {max_decoder_seq_len}',
    sep='\n',
)

Number of input texts : 10000
Number of encoder tokens : 70
Number of decoder tokens : 85
Maximum number of input sequences : 15
Maximum number of output sequences : 51


In [87]:
# Character -> integer index lookup tables (index = position in the sorted
# character list).
input_chars_index = {char: idx for idx, char in enumerate(input_chars)}
output_chars_index = {char: idx for idx, char in enumerate(output_chars)}

### Prepare the input data for the model (3-dimensional data)
* 'encoder_input_data' is a 3D array of shape (num_pairs, max_english_sentence_length, num_english_characters) containing a one-hot vectorization of the English sentences.
* 'decoder_input_data' is a 3D array of shape (num_pairs, max_german_sentence_length, num_german_characters) containing a one-hot vectorization of the German sentences.
* 'decoder_output_data' (the decoder target) is the same as decoder_input_data but offset by one timestep: decoder_output_data[:, t, :] is the same as decoder_input_data[:, t + 1, :].

In [185]:
# Pre-allocate the all-zero tensors; axes are (sample, timestep, token index).
encoder_shape = (len(input_texts), max_encoder_seq_len, num_encoder_tokens)
decoder_shape = (len(output_texts), max_decoder_seq_len, num_decoder_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype=np.float32)
# Decoder input and decoder target share the same shape.
decoder_input_data = np.zeros(decoder_shape, dtype=np.float32)
decoder_output_data = np.zeros(decoder_shape, dtype=np.float32)

In [186]:
# Sanity check: display the shapes of the three allocated arrays.
encoder_input_data.shape, decoder_input_data.shape, decoder_output_data.shape

((10000, 15, 70), (10000, 51, 85), (10000, 51, 85))

In [187]:
CHAR_SPACE = ' '

# Fill the content of the empty (all-zero) arrays.
#
# LOGIC:
# 1. All the positions are initialized to 0 by default.
# 2. Set the position of every character present in the text to 1.
# 3. Fill the remaining timesteps (those for which the text has no character)
#    with the SPACE character, e.g. if max_encoder_seq_len = 10 and the input
#    has only 6 characters, the other 4 timesteps are one-hot SPACE.

# The padding index is the same for every sample, so look it up once.
input_space_idx = input_chars_index[CHAR_SPACE]
output_space_idx = output_chars_index[CHAR_SPACE]

for i, (inp, out) in enumerate(zip(input_texts, output_texts)):
    # Encoder input.
    for t, char in enumerate(inp):
        encoder_input_data[i, t, input_chars_index[char]] = 1
    # Pad from the end of the text. The offset comes from len(inp) rather than
    # from the loop variable `t`, which is undefined or stale when `inp` is
    # empty.
    encoder_input_data[i, len(inp):, input_space_idx] = 1

    # Decoder input and decoder target.
    for t, char in enumerate(out):
        decoder_input_data[i, t, output_chars_index[char]] = 1

        # The target is one timestep ahead of the decoder input and does not
        # include the START character.
        if t > 0:
            decoder_output_data[i, t - 1, output_chars_index[char]] = 1

    decoder_input_data[i, len(out):, output_space_idx] = 1
    # The target holds one character fewer than the input (no START), so its
    # padding starts one timestep earlier.
    decoder_output_data[i, max(len(out) - 1, 0):, output_space_idx] = 1

# Model building

### 1. Encoder

In [188]:
# Encoder: reads a one-hot source sequence of arbitrary length and returns
# its final LSTM states.
ENC_IP = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(units=LATENT_DIMS, return_state=True)

encoder_results = encoder(ENC_IP)
encoder_outputs, state_h, state_c = encoder_results

# Only the two state tensors are used further on; `encoder_outputs` is not.
ENC_STATE = [state_h, state_c]

### 2. Decoder

In [189]:
# Decoder: starts from the encoder states (`ENC_STATE`) and emits one softmax
# distribution over the target characters per timestep.
DEC_IP = Input(shape=(None, num_decoder_tokens))

# return_sequences=True gives the output for every timestep. return_state=True
# also exposes the internal states; they are ignored here but needed when the
# same layer is reused for inference.
decoder_lstm = LSTM(
    units=LATENT_DIMS,
    return_sequences=True,
    return_state=True,
)
DEC_OP, _, _ = decoder_lstm(DEC_IP, initial_state=ENC_STATE)

decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
DEC_OUT = decoder_dense(DEC_OP)

## Define and train the model

In [190]:
# Training model: maps `encoder_input_data` and `decoder_input_data`
# to `decoder_output_data`.
model = Model(inputs=[ENC_IP, DEC_IP], outputs=DEC_OUT)

In [191]:
# Configure optimizer, loss and the reported metric before training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [192]:
# Train the model; 20% of the samples are held out for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_output_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1f16e0850>

## Inferencing

Let's generate the output sequence using the trained model

#### Define the model for testing

In [193]:
# Encoder model for inference: source sequence -> [state_h, state_c].
encoder_model = Model(ENC_IP, ENC_STATE)

# Decoder model for inference. The LSTM states become explicit inputs so the
# decoder can be run one step at a time, feeding each step's states into the
# next call.
DEC_STATE_IP_H = Input(shape=(LATENT_DIMS,))
DEC_STATE_IP_C = Input(shape=(LATENT_DIMS,))
DEC_STATES_inputs = [DEC_STATE_IP_H, DEC_STATE_IP_C]

# Reuse the trained layers, but bind the results to inference-specific names.
# `DEC_OUT`, `state_h` and `state_c` name tensors of the training graph;
# overwriting them here would make re-running the training-model cell
# (`Model([ENC_IP, DEC_IP], DEC_OUT)`) build from the wrong tensor.
INF_DEC_SEQ, inf_state_h, inf_state_c = decoder_lstm(
    DEC_IP, initial_state=DEC_STATES_inputs)
DEC_STATES = [inf_state_h, inf_state_c]

INF_DEC_OUT = decoder_dense(INF_DEC_SEQ)
decoder_model = Model(
    [DEC_IP] + DEC_STATES_inputs,
    [INF_DEC_OUT] + DEC_STATES)


In [194]:
# Display the static shape of the decoder output tensor.
DEC_OUT.shape

TensorShape([None, None, 85])

In [195]:
# Index -> character lookup tables, used to decode predicted indices back
# into readable text.
reverse_input_char_index = {idx: char for char, idx in input_chars_index.items()}
reverse_output_char_index = {idx: char for char, idx in output_chars_index.items()}

In [196]:
def decode_sequence(input_seq):
    """Greedily decode one encoded source sentence into target text.

    Args:
        input_seq: One-hot encoded source batch handed to
            `encoder_model.predict`. The sampling loop assumes a batch of
            size 1 (e.g. `encoder_input_data[i:i+1]`).

    Returns:
        The decoded string. It ends with the newline stop character when the
        decoder produced one; otherwise decoding is cut off once
        `max_decoder_seq_len` characters have been generated.
    """
    # Encode the input as state vectors; they seed the decoder.
    # verbose=0 keeps predict from printing progress output on every call.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Target sequence of length 1 holding only the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, output_chars_index['\t']] = 1.

    # Collect characters in a list and join once at the end.
    decoded_chars = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + list(states_value), verbose=0)

        # Greedy sampling: take the most probable token of the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_output_char_index[sampled_token_index]
        decoded_chars.append(sampled_char)

        # Exit condition: stop character found or maximum length reached.
        # `>=` caps the result at exactly max_decoder_seq_len characters.
        if sampled_char == '\n' or len(decoded_chars) >= max_decoder_seq_len:
            break

        # Feed the sampled character and the updated states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return ''.join(decoded_chars)




In [197]:
# Qualitative check: translate the first 20 training samples.
for i in range(20):
    # Slice (rather than index) so the batch axis is kept.
    input_seq = encoder_input_data[i:i + 1]
    decoded_seq = decode_sequence(input_seq)
    print(
        f'Input seq : {input_texts[i]}',
        f'Output seq : {decoded_seq}',
        sep='\n',
    )

Input seq : Go.
Output seq : Verschwinde!

Input seq : Hi.
Output seq : Halle Tom!

Input seq : Hi.
Output seq : Halle Tom!

Input seq : Run!
Output seq : Halle dich!

Input seq : Run.
Output seq : Fangt Tom!

Input seq : Wow!
Output seq : Hol zu schnell!

Input seq : Wow!
Output seq : Hol zu schnell!

Input seq : Duck!
Output seq : Halte Tom!

Input seq : Fire!
Output seq : Werfen Sie Tom!

Input seq : Help!
Output seq : Halle!

Input seq : Help!
Output seq : Halle!

Input seq : Stay.
Output seq : Bleib das hin!

Input seq : Stop!
Output seq : Halte das!

Input seq : Stop!
Output seq : Halte das!

Input seq : Wait!
Output seq : Warte!

Input seq : Wait.
Output seq : Warten Sie es!

Input seq : Begin.
Output seq : Halte dich!

Input seq : Do it.
Output seq : Tuten Sie Tom!

Input seq : Do it.
Output seq : Tuten Sie Tom!

Input seq : Go on.
Output seq : Geh nicht!

