## Seq2Seq Language Translation Model: English to French

### 1. Character level model

In [87]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.python.keras.models import Model


#### Get the inputs and encode them

In [88]:
sample_len = 10000  # upper bound on the number of sentence pairs that are used
df = 'fra.txt'      # path of the tab-separated English/French corpus
input_texts = []
target_texts = []
ip_chars = set()
op_chars = set()
with open(df, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# Every corpus line holds the English text first and the French text second,
# separated by tabs; any further fields are ignored. The slice never reaches
# the last element of `lines`.
for line in lines[: min(sample_len, len(lines) - 1)]:
    input_text, target_text = line.split('\t')[:2]
    # '\t' is prepended as the start marker and '\n' appended as the end marker.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # Sets ignore repeated characters, so no membership test is required.
    ip_chars.update(input_text)
    op_chars.update(target_text)

# Sorted vocabularies give every character a stable position.
ip_chars = sorted(ip_chars)
op_chars = sorted(op_chars)
num_encoder_tokens, num_decoder_tokens = len(ip_chars), len(op_chars)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))


In [89]:
# Character -> position in the sorted vocabulary.
ip_tokens = {char: i for i, char in enumerate(ip_chars)}
op_tokens = {char: i for i, char in enumerate(op_chars)}

# One-hot tensors indexed as [sample, time step, vocabulary entry]; positions
# past the end of a sentence stay all-zero.
num_samples = len(input_texts)
decoder_shape = (num_samples, max_decoder_seq_length, num_decoder_tokens)
encoder_input_data = np.zeros((num_samples, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, ip_tokens[char]] = 1.
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, op_tokens[char]] = 1.
    # The target is the decoder input moved one step earlier, so the leading
    # start marker is not part of it.
    for t, char in enumerate(target_text[1:]):
        decoder_target_data[i, t, op_tokens[char]] = 1.

### Define the character-level model 

In [90]:
# Training hyper-parameters of the character-level model.
batch_size = 128 
epochs = 50
# Size of the LSTM state vectors, shared by the encoder and the decoder.
latent_dim = 256 

# Encoder: consumes one-hot character sequences of any length. Its per-step
# output is not used below; only the returned states h and c are kept.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]
# Decoder: initialised with the encoder states and returning the whole output
# sequence; the states it returns are discarded here.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,initial_state=encoder_states)
# Softmax over the target character vocabulary at every time step.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [91]:
# Training model: (encoder input, decoder input) -> per-step distribution over
# the target characters.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

Model: "functional_45"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           [(None, None, 71)]   0                                            
__________________________________________________________________________________________________
input_32 (InputLayer)           [(None, None, 93)]   0                                            
__________________________________________________________________________________________________
lstm_16 (LSTM)                  [(None, 256), (None, 335872      input_31[0][0]                   
__________________________________________________________________________________________________
lstm_17 (LSTM)                  [(None, None, 256),  358400      input_32[0][0]                   
                                                                 lstm_16[0][1]        

In [92]:
# The decoder receives the reference target sequence as input and is trained
# to output that sequence shifted by one character. 30% of the samples are
# set aside for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.3,
)
# Persist the trained model to disk.
model.save('s2s.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Inference

In [93]:
# Inference-time encoder: maps an input sequence to the encoder states [h, c].
encoder_model = Model(encoder_inputs, encoder_states)
# Placeholders through which the states of the previous decoding step are fed back in.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Reuse the decoder LSTM and dense layers from the training graph, seeded from
# the placeholders instead of from the encoder.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inference-time decoder: (target characters, h, c) -> (character probabilities, new h, new c).
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in ip_tokens.items())
reverse_target_char_index = dict((i, char) for char, i in op_tokens.items())

In [94]:
def decode_sequence(input_seq):
    """Greedily decode one input sequence with the character-level model.

    The encoder states for `input_seq` seed the decoder, which is then run one
    character at a time, always feeding back the most probable character.
    Decoding ends once '\\n' has been produced or the result has become longer
    than `max_decoder_seq_length`.

    Returns the decoded characters as a string; a produced '\\n' is included.
    """
    def one_hot(token_index):
        # Length-1 target sequence with a single active vocabulary entry.
        step = np.zeros((1, 1, num_decoder_tokens))
        step[0, 0, token_index] = 1.
        return step

    states_value = encoder_model.predict(input_seq)
    # Decoding starts from the '\t' start marker.
    target_seq = one_hot(op_tokens['\t'])
    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: the most probable character of the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence = decoded_sentence + sampled_char

        if sampled_char == '\n':
            break
        if len(decoded_sentence) > max_decoder_seq_length:
            break

        # Feed the chosen character and the new states into the next step.
        target_seq = one_hot(sampled_token_index)
        states_value = [h, c]

    return decoded_sentence

In [96]:
 for seq_index in range(50):
    # Slicing (instead of indexing) keeps the leading axis of length 1.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    source_sentence = input_texts[seq_index]
    print('Sentence Number: ' + str(seq_index))
    print('Input sentence:', source_sentence)
    print('Decoded sentence:', decoded_sentence)

Sentence Number: 0
Input sentence: Go.
Decoded sentence: Va !

Sentence Number: 1
Input sentence: Hi.
Decoded sentence: Salut !

Sentence Number: 2
Input sentence: Hi.
Decoded sentence: Salut !

Sentence Number: 3
Input sentence: Run!
Decoded sentence: Courez !

Sentence Number: 4
Input sentence: Run!
Decoded sentence: Courez !

Sentence Number: 5
Input sentence: Who?
Decoded sentence: Qui a fait ?

Sentence Number: 6
Input sentence: Wow!
Decoded sentence: Commence s'ast-mour.

Sentence Number: 7
Input sentence: Fire!
Decoded sentence: Vu veu !

Sentence Number: 8
Input sentence: Help!
Decoded sentence: Salue !

Sentence Number: 9
Input sentence: Jump.
Decoded sentence: Attrapez-moi.

Sentence Number: 10
Input sentence: Stop!
Decoded sentence: Arrête !

Sentence Number: 11
Input sentence: Stop!
Decoded sentence: Arrête !

Sentence Number: 12
Input sentence: Stop!
Decoded sentence: Arrête !

Sentence Number: 13
Input sentence: Wait!
Decoded sentence: Attends !

Sentence Number: 14
Input

Inference:
1. The character-level model predicts almost all single words correctly. For example: Hi. -> Salut !. It even adds the exclamation mark!
2. The model translates short sentences with a little ambiguity. For example: 
 * I fled. -> Je me suis partie. (which means "I left" from Google translate)
 * Go now. -> Marchez-le. (which means "walk it" from Google translate)


### 2. Word level model

#### Preprocessing the text

In [97]:
# Read the first 10,000 rows of the tab-separated corpus, keeping only the
# English and French columns.
line = pd.read_table('fra.txt', nrows=10000, usecols=[0,1], names=['eng', 'fra'])

exclude = set(string.punctuation) # Set of all special characters
remove_digits = str.maketrans('', '', string.digits)


def _normalise(text):
    """Lower-case `text`, drop apostrophes, then drop every remaining punctuation character."""
    lowered = text.lower()
    without_quotes = re.sub("'", '', str(lowered))
    return ''.join(ch for ch in without_quotes if ch not in exclude)


# English side: normalise, then strip digits.
line.eng = line.eng.map(_normalise).map(lambda text: text.translate(remove_digits))
# French side: normalise, then wrap in the start / end markers.
line.fra = line.fra.map(_normalise).map(lambda text: 'START_ ' + text + ' _END')


#### Create the Word List

In [98]:
# English vocabulary: every distinct whitespace-separated word.
all_eng_words = {word for eng in line.eng for word in eng.split()}

# French vocabulary. Despite its name, `all_german_words` holds the distinct
# French words (the START_ / _END markers included).
all_german_words = {word for fra in line.fra for word in fra.split()}

# Longest source and target sentence, counted in space-separated pieces.
max_length_src = np.max([len(sentence.split(' ')) for sentence in line.eng])
max_length_tar = np.max([len(sentence.split(' ')) for sentence in line.fra])

input_words = sorted(all_eng_words)
target_words = sorted(all_german_words)
num_encoder_tokens = len(all_eng_words)
# One extra decoder slot is added for zero padding.
num_decoder_tokens = len(all_german_words) + 1
num_decoder_tokens

4568

In [99]:
# Word -> id lookups. Ids start at 1, which leaves 0 unused by any word.
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}
# Id -> word lookups, used to turn predicted ids back into text.
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}
line.head()

Unnamed: 0,eng,fra
0,go,START_ va _END
1,hi,START_ salut _END
2,hi,START_ salut _END
3,run,START_ cours _END
4,run,START_ courez _END


In [100]:
# Split train and test data (80% / 20%).
# A fixed `random_state` makes the shuffle reproducible, so the split -- and
# every number derived from it further down -- is the same on each
# Restart & Run All.
X, Y = line.eng, line.fra
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((8000,), (2000,))

In [101]:
#Generating Zero arrays
def generate_batch(X = X_train, Y = Y_train, batch_size = 128):
    '''Yield batches of word-id arrays forever, cycling over the sentence pairs.

    X, Y       : aligned collections of source / target sentences whose words are
                 separated by whitespace. Note that the defaults are bound to
                 X_train / Y_train at the moment this cell is executed.
    batch_size : maximum number of sentence pairs per batch. The last batch of
                 each pass contains only the pairs that are left over.

    Every yielded item is ([encoder_input_data, decoder_input_data], decoder_target_data):
      * encoder_input_data  (rows, max_length_src): source word ids, zero-padded
      * decoder_input_data  (rows, max_length_tar): target word ids without the
        final word of the sentence, zero-padded
      * decoder_target_data (rows, max_length_tar, num_decoder_tokens): one-hot
        targets, one step ahead of the decoder input (the first word is dropped)

    Raises ValueError on the first request if X is empty, because the loop
    below could otherwise never yield.
    '''
    if len(X) == 0:
        raise ValueError('generate_batch needs at least one sentence pair')
    while True:
        for j in range(0, len(X), batch_size):
            batch_X = X[j:j+batch_size]
            batch_Y = Y[j:j+batch_size]
            # Allocate exactly as many rows as there are sentences. Sizing by
            # batch_size would fill the final, shorter batch with all-zero fake
            # samples that distort the reported loss and accuracy.
            n_rows = len(batch_X)
            encoder_input_data = np.zeros((n_rows, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((n_rows, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((n_rows, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(batch_X, batch_Y)):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                target_words = target_text.split()  # split once per sentence
                last_position = len(target_words) - 1
                for t, word in enumerate(target_words):
                    if t < last_position:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t > 0:
                        # The target at step t-1 is the word the decoder sees at step t.
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

# Width of the embeddings and LSTM states in the word-level model
# (the character-level model used 256).
latent_dim = 50

### Define the word level model

In [102]:
# Encoder
# Inputs are sequences of word ids; `mask_zero = True` makes id 0 act as padding.
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens+1, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
# NOTE(review): `num_decoder_tokens` was already incremented for the padding id
# when the vocabulary was built, so the extra `+1` here looks like it only adds
# one unused embedding row -- confirm before changing.
dec_emb_layer = Embedding(num_decoder_tokens+1, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
# Softmax over the target word ids at every time step.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [103]:
# Training configuration for the word-level model; the sample counts are used
# to derive the number of steps per epoch.
batch_size, epochs = 128, 50
train_samples, val_samples = len(X_train), len(X_test)

In [104]:
# Compile the model: categorical cross-entropy loss, RMSprop optimizer, and
# accuracy reported as `acc`.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model.summary()

Model: "functional_51"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_35 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_36 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 50)     107900      input_35[0][0]                   
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, None, 50)     228450      input_36[0][0]                   
______________________________________________________________________________________

In [105]:
# Re-declares the training configuration with the same values as the earlier
# configuration cell.
train_samples, val_samples = len(X_train), len(X_test)
batch_size, epochs = 128, 50

In [106]:
# Fit the Model
# `Model.fit` consumes Python generators directly; `Model.fit_generator` is
# deprecated in TensorFlow 2.x. Both generators loop forever, so the number of
# batches drawn per epoch is set explicitly.
model.fit(generate_batch(X_train, Y_train, batch_size = batch_size),
          steps_per_epoch = train_samples//batch_size,
          epochs=epochs,
          validation_data = generate_batch(X_test, Y_test, batch_size =batch_size),
          validation_steps = val_samples//batch_size)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x146887a10>

The accuracy of the word-level model is 59%.

### Inference

In [112]:
# Inference-time encoder: maps an input sequence to its "context vectors",
# i.e. the encoder states [h, c].
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]
# Get the embeddings of the decoder sequence (same embedding layer as in training)
dec_emb2= dec_emb_layer(decoder_inputs)
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_state_input)
decoder_states2 = [state_h2, state_c2]
# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)
# Final decoder model: (target word ids, h, c) -> (word probabilities, new h, new c)
decoder_model = Model(
    [decoder_inputs] + decoder_state_input,
    [decoder_outputs2] + decoder_states2)

In [113]:
def decode_sequence(input_seq):
    """Greedily translate one source sentence with the word-level model.

    Parameters
    ----------
    input_seq : array of source word ids for a single sentence (batch of one),
        e.g. the encoder input yielded by ``generate_batch(..., batch_size=1)``.

    Returns
    -------
    str
        The predicted words, each preceded by one space. The string ends with
        ``' _END'`` when the model produced the end marker. It does not when
        decoding stopped because ``max_length_tar`` words had been produced or
        because the padding id was predicted.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Length-1 target sequence holding the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']

    decoded_words = []
    # Sampling loop for a batch of size 1. The limit is expressed in words
    # (the longest training target) instead of characters, so that long but
    # valid translations are not cut off in the middle.
    for _ in range(max_length_tar):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: the most probable word id of the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Word ids start at 1, so id 0 (padding) has no vocabulary entry;
        # treat it as the end of the sentence instead of raising KeyError.
        sampled_word = reverse_target_char_index.get(sampled_token_index)
        if sampled_word is None:
            break
        decoded_words.append(sampled_word)

        # Exit condition: the end marker was produced.
        if sampled_word == '_END':
            break

        # Feed the chosen word and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + word for word in decoded_words)

In [114]:
# `k` is the position of the most recently drawn training sentence; it starts
# at -1 because each demo cell increments it before use.
k = -1
# Generator that hands out the training sentences one at a time.
train_gen = generate_batch(X=X_train, Y=Y_train, batch_size=1)

In [134]:
k += 1
# Draw the next training sentence; only the encoder input is needed for decoding.
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
source_sentence = X_train.iloc[k]
reference_sentence = Y_train.iloc[k]
print('Input Source sentence:', source_sentence)
# [6:-4] removes the 'START_' and '_END' markers; the spaces next to them remain.
print('Actual Target Translation:', reference_sentence[6:-4])
print('Predicted Target Translation:', decoded_sentence[:-4])

Input Source sentence: are you tom
Actual Target Translation:  êtesvous tom  
Predicted Target Translation:  vous êtes perdu 


In [145]:
k += 1
# Draw the next training sentence; only the encoder input is needed for decoding.
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
source_sentence = X_train.iloc[k]
reference_sentence = Y_train.iloc[k]
print('Input English sentence:', source_sentence)
# [6:-4] removes the 'START_' and '_END' markers; the spaces next to them remain.
print('Actual French Translation:', reference_sentence[6:-4])
print('Predicted French Translation:', decoded_sentence[:-4])

Input English sentence: i want to walk
Actual French Translation:  je veux marcher 
Predicted French Translation:  je veux des travail 


In [148]:
k += 1
# Draw the next training sentence; only the encoder input is needed for decoding.
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
source_sentence = X_train.iloc[k]
reference_sentence = Y_train.iloc[k]
print('Input English sentence:', source_sentence)
# [6:-4] removes the 'START_' and '_END' markers; the spaces next to them remain.
print('Actual French Translation:', reference_sentence[6:-4])
print('Predicted French Translation:', decoded_sentence[:-4])

Input English sentence: grab my hand
Actual French Translation:  saisissezmoi la main  
Predicted French Translation:  faisle tranquille 


## Inference:
1. The word-level model fails to predict the noun in some cases. For example, in "are you tom", "are you" is predicted correctly, but "tom" is not.
2. The verb is not predicted correctly in the second sentence: "i want to walk" is predicted as "je veux des travail" (roughly "I want work").