In [1]:
import re
import string
from unicodedata import normalize
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Attention, GRU,GlobalAveragePooling1D
from tensorflow.keras.models import Model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [3]:
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs


In [4]:
def clean_data(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return np.array(cleaned)

In [5]:
filename = "fra.txt"
doc = load_doc(filename)
pairs = to_pairs(doc)
# choose sample size
n_train = 20000
clean_pairs = clean_data(pairs)[0:n_train, :]
# clean_pairs = clean_data(pairs)
input_texts = clean_pairs[:, 0]
target_texts = ['\t' + text + '\n' for text in clean_pairs[:, 1]]
max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)

In [6]:
print('Length of input_texts:  ' + str(input_texts.shape))
print('Length of target_texts: ' + str(input_texts.shape))

Length of input_texts:  (20000,)
Length of target_texts: (20000,)


In [7]:
print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))

max length of input  sentences: 17
max length of target sentences: 56


In [8]:
# split data into training and testing 
X_train, X_test, y_train, y_test = train_test_split(input_texts, target_texts, test_size=0.2, random_state=42)

In [9]:
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index

In [10]:
encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, X_train)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, y_train)

In [11]:
print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

shape of encoder_input_seq: (16000, 17)
shape of input_token_index: 27
shape of decoder_input_seq: (16000, 56)
shape of target_token_index: 29


In [12]:
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 28
num_decoder_tokens: 30


In [13]:
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences)
    data = np.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

In [14]:
# encode and decode data
encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)
decoder_target_seq = np.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:, 1:]
decoder_target_data = onehot_encode(decoder_target_seq,
                                    max_decoder_seq_length,
                                    num_decoder_tokens)

In [15]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)

(16000, 17, 28)
(16000, 56, 30)


In [23]:
latent_dim = 256

# inputs of the encoder network
encoder_inputs = Input(shape=(None, num_encoder_tokens), 
                       name='encoder_inputs')

# set the LSTM layer
encoder_lstm = LSTM(latent_dim, return_state=True,return_sequences=True, 
                    dropout=0.5, name='encoder_lstm')
_, state_h, state_c = encoder_lstm(encoder_inputs)

# build the encoder network model
encoder_model = Model(inputs=encoder_inputs, 
                      outputs=[_, state_h, state_c],
                      name='encoder')

In [36]:
encoder_model.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_inputs (InputLayer)  [(None, None, 28)]        0         
_________________________________________________________________
encoder_lstm (LSTM)          [(None, None, 256), (None 291840    
Total params: 291,840
Trainable params: 291,840
Non-trainable params: 0
_________________________________________________________________


In [38]:
decoder_input_h = Input(shape=(latent_dim,), name='decoder_input_h')
decoder_input_c = Input(shape=(latent_dim,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')
attention_encoder_h = Input(shape=(None, 256), name='attention_encoder_h')

# set the LSTM layer
decoder_lstm = LSTM(latent_dim, return_sequences=True, 
                    return_state=True, dropout=0.5, name='decoder_lstm')
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x, 
                                                      initial_state=[decoder_input_h, decoder_input_c])

# set the attention layer
attention_layer = Attention(use_scale=True, name='attention_layer')
# attention_outputs = attention_layer([decoder_input_h, state_h])
attention_outputs= attention_layer([attention_encoder_h, state_h])


# cancatenate lstm output layer with attention layer
concatenate_outputs = Concatenate(name='concatenate_outputs')([decoder_lstm_outputs,attention_outputs])

# set the dense layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# build the decoder network model
decoder_model = Model(inputs=[decoder_input_x, decoder_input_h, decoder_input_c],
                      outputs=[decoder_outputs, state_h, state_c],
                      name='decoder')

In [39]:
decoder_model.summary()

Model: "decoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input_x (InputLayer)    [(None, None, 30)]   0                                            
__________________________________________________________________________________________________
decoder_input_h (InputLayer)    [(None, 256)]        0                                            
__________________________________________________________________________________________________
decoder_input_c (InputLayer)    [(None, 256)]        0                                            
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 256),  293888      decoder_input_x[0][0]            
                                                                 decoder_input_h[0][0]      

In [None]:
# input layers
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# connect encoder to decoder
encoder_final_states = encoder_model([encoder_input_x])
decoder_lstm_output, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_final_states)
decoder_pred = decoder_dense(decoder_lstm_output)

model = Model(inputs=[encoder_input_x, decoder_input_x], 
              outputs=decoder_pred, 
              name='model_training')

In [None]:
# inputs of the decoder network
decoder_input_h = Input(shape=(latent_dim,), name='decoder_input_h')
decoder_input_c = Input(shape=(latent_dim,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# set the LSTM layer
decoder_lstm = LSTM(latent_dim, return_sequences=True, 
                    return_state=True, dropout=0.5, name='decoder_lstm')
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x, 
                                                      initial_state=[decoder_input_h, decoder_input_c])

# set the attention layer
attention_layer = Attention(use_scale=True, name='attention_layer')
# attention_outputs = attention_layer([decoder_input_h, state_h])
attention_outputs= attention_layer([encoder_outputs, decoder_lstm_output])

attention_value = GlobalAveragePooling1D()(
    attention_outputs)

# cancatenate lstm output layer with attention layer
concatenate_outputs = Concatenate(name='concatenate_outputs')([decoder_lstm_output,attention_outputs])

# set the dense layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# build the decoder network model
decoder_model = Model(inputs=[decoder_input_x, decoder_input_h, decoder_input_c],
                      outputs=[decoder_outputs, state_h, state_c],
                      name='decoder')

In [None]:
# input layers
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# connect encoder to decoder
encoder_final_states = encoder_model([encoder_input_x])
decoder_lstm_output, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_final_states)
decoder_pred = decoder_dense(decoder_lstm_output)

model = Model(inputs=[encoder_input_x, decoder_input_x], 
              outputs=decoder_pred, 
              name='model_training')

In [95]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')


In [96]:
model.summary()

Model: "model_training"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input_x (InputLayer)    [(None, None, 28)]   0                                            
__________________________________________________________________________________________________
decoder_input_x (InputLayer)    [(None, None, 30)]   0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, None, 256),  291840      encoder_input_x[0][0]            
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 256),  293888      decoder_input_x[0][0]            
                                                                 encoder_lstm[0][1]  

In [97]:
filepath = "seq2seq-1106-loss.hdf5"        
# checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=False, mode='auto', period=1)        
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='auto', period=1)
# his = LossHistory()
model.fit([encoder_input_data, decoder_input_data],  # training data
          decoder_target_data,                       # labels 
          epochs=10,
          verbose=0)


ValueError: Dimension 1 in both shapes must be equal, but are 56 and 17. Shapes are [32,56] and [32,17]. for 'model_training/concatenate_outputs/concat' (op: 'ConcatV2') with input shapes: [32,56,256], [32,17,256], [] and with computed input tensors: input[2] = <2>.

In [26]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [29]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = np.argmax(output_tokens[0, :])
        
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence

In [31]:
for seq_index in range(500, 520):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    print(input_seq.shape)
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('English:       ', input_texts[seq_index])
    print('French (true): ', target_texts[seq_index][1:-1])
    print('French (pred): ', decoded_sentence[0:-1])

(1, 17, 28)
-
English:        come over
French (true):  viens
French (pred):  je suis en ente le parte
(1, 17, 28)
-
English:        come over
French (true):  venez
French (pred):  tom est pas ente
(1, 17, 28)
-
English:        come over
French (true):  venez ici
French (pred):  ette le pas ente
(1, 17, 28)
-
English:        come over
French (true):  viens chez nous
French (pred):  etes en e te pas ente
(1, 17, 28)
-
English:        come over
French (true):  venez chez nous
French (pred):  cest mon ent pas ente
(1, 17, 28)
-
English:        come over
French (true):  viens chez moi
French (pred):  je suis en ente le parte
(1, 17, 28)
-
English:        come over
French (true):  venez chez moi
French (pred):  sois ene coure
(1, 17, 28)
-
English:        come soon
French (true):  viens bientot
French (pred):  cest me te te pait
(1, 17, 28)
-
English:        come soon
French (true):  venez bientot
French (pred):  je suis en ente
(1, 17, 28)
-
English:        cool down
French (true):  calmez

In [247]:
#select a set of sentences from the testing data, and pringth translation results from the model:
test_encoder_input, test_input_token_index = text2sequences(max_encoder_seq_length, X_test[:30])
test_decoder_input_seq, test_target_token_index = text2sequences(max_decoder_seq_length, y_test[:30])
test_onehot_input = onehot_encode(test_encoder_input, max_encoder_seq_length, num_encoder_tokens)
test_onehot_input.shape
trans = []
for i in range(30):
#     print('English: ', X_test[i])
    translated_sentence = decode_sequence(test_onehot_input[i:i+1])
    trans.append(translated_sentence[:-1])

In [33]:
def translate(X_test, y_test):
    #select a set of sentences from the testing data, and pringth translation results from the model:
    test_encoder_input, test_input_token_index = text2sequences(max_encoder_seq_length, X_test[:30])
    test_decoder_input_seq, test_target_token_index = text2sequences(max_decoder_seq_length, y_test[:30])
    test_onehot_input = onehot_encode(test_encoder_input, max_encoder_seq_length, num_encoder_tokens)
    # make predictions
    for i in range(len(X_test)):
        
        translated_sentence = decode_sequence(test_onehot_input[i:i+1])
        trans.append(translated_sentence[:-1])
        print('-')
        print('English:       ', X_test[i])
        
        print('French (true): ', y_test[i].strip())
        print('French (pred): ', translated_sentence[:-1])
    return trans

In [34]:
translated_French = translate(X_test[:30], y_test[:30])

-
English:        i hate them all
French (true):  je les deteste tous
French (pred):  je suis en ente le parte
-
English:        he helps us
French (true):  il nous aide
French (pred):  vous etes pas ente
-
English:        no one knew it
French (true):  personne ne le sut
French (pred):  cest me te te pas
-
English:        come to me
French (true):  venez a moi
French (pred):  este le pas ente
-
English:        hes a bit tipsy
French (true):  il est un peu emeche
French (pred):  este le pas ente
-
English:        shes very tall
French (true):  elle est tres grande
French (pred):  etes eus en ere
-
English:        i was satisfied
French (true):  jetais satisfait
French (pred):  je suis en ente le parte
-
English:        whats this
French (true):  cest quoi
French (pred):  aresez e sous en ere
-
English:        lets draw straws
French (true):  tirons a la courtepaille
French (pred):  ne sons ne pas en ente
-
English:        i will sue you
French (true):  je vous poursuivrai en justice
Fr

In [35]:
from nltk.translate.bleu_score import sentence_bleu

In [36]:
sum = 0
for i in range(len(X_test[:30])):
    reference = y_test[i].split()
    candidate = translated_French[i].split()
    
    score = sentence_bleu(reference, candidate, weights=(1,0,0,0))
    sum += score
    print('-')
    print('English:       ', X_test[i])

    print('Spanish (true): ', reference)
    print('Spanish (pred): ', candidate)
    print(score)
avg = sum/30
print('SCORE IS :', avg)

-
English:        i hate them all
Spanish (true):  ['je', 'les', 'deteste', 'tous']
Spanish (pred):  ['je', 'suis', 'en', 'ente', 'le', 'parte']
0
-
English:        he helps us
Spanish (true):  ['il', 'nous', 'aide']
Spanish (pred):  ['vous', 'etes', 'pas', 'ente']
0
-
English:        no one knew it
Spanish (true):  ['personne', 'ne', 'le', 'sut']
Spanish (pred):  ['cest', 'me', 'te', 'te', 'pas']
0
-
English:        come to me
Spanish (true):  ['venez', 'a', 'moi']
Spanish (pred):  ['este', 'le', 'pas', 'ente']
0
-
English:        hes a bit tipsy
Spanish (true):  ['il', 'est', 'un', 'peu', 'emeche']
Spanish (pred):  ['este', 'le', 'pas', 'ente']
0
-
English:        shes very tall
Spanish (true):  ['elle', 'est', 'tres', 'grande']
Spanish (pred):  ['etes', 'eus', 'en', 'ere']
0
-
English:        i was satisfied
Spanish (true):  ['jetais', 'satisfait']
Spanish (pred):  ['je', 'suis', 'en', 'ente', 'le', 'parte']
0
-
English:        whats this
Spanish (true):  ['cest', 'quoi']
Spanish (p

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
