In [1]:
import re
import string
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Attention, 
from tensorflow.keras.models import Model
from keras.utils.vis_utils import model_to_dot, plot_model
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu

Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

def clean_data(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return np.array(cleaned)

In [33]:
filename = "fra.txt"
doc = load_doc(filename)
pairs = to_pairs(doc)
# choose sample size
n_train = 50000
clean_pairs = clean_data(pairs)[0:n_train, :]
# clean_pairs = clean_data(pairs)
input_texts = clean_pairs[:, 0]
target_texts = ['\t' + text + '\n' for text in clean_pairs[:, 1]]
max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)

In [34]:
# n_test = 3000
# clean_test = clean_data(pairs)[n_train:n_test, :]
# input_test = clean_test[:, 0]
 # target_texts = ['\t' + text + '\n' for text in clean_test[:, 1]]

In [35]:
print('Length of input_texts:  ' + str(input_texts.shape))
print('Length of target_texts: ' + str(input_texts.shape))

Length of input_texts:  (50000,)
Length of target_texts: (50000,)


In [36]:
print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))

max length of input  sentences: 22
max length of target sentences: 71


In [6]:
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index

In [8]:
encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, input_texts)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, target_texts)

In [40]:
# split data into training and testing 
X_train, X_test, y_train, y_test = train_test_split(input_texts, target_texts, test_size=0.2, random_state=42)
encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, X_train)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, y_train)

In [41]:
print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

shape of encoder_input_seq: (40000, 22)
shape of input_token_index: 27
shape of decoder_input_seq: (40000, 71)
shape of target_token_index: 29


In [42]:
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 28
num_decoder_tokens: 30


In [43]:
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences)
    data = np.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

In [44]:
# encode and decode data
encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)
decoder_target_seq = np.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:, 1:]
decoder_target_data = onehot_encode(decoder_target_seq,
                                    max_decoder_seq_length,
                                    num_decoder_tokens)

In [45]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)

(40000, 22, 28)
(40000, 71, 30)


In [48]:
# encoder model
latent_dim = 256

# encode input layers
encoder_inputs = Input(shape=(None, num_encoder_tokens), name='encoder_inputs')


# set the LSTM layer
encoder_lstm = LSTM(latent_dim, return_state=True, return_sequences=True,
                    dropout=0.5, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs) # lstm1 is 3d (batch_size, timesteps, units)


# build the encoder network model
encoder_model = Model(inputs=encoder_inputs, 
                      outputs=[encoder_outputs, encoder_state_h, encoder_state_c],
                      name='encoder')

In [49]:
# inputs of the decoder network
attention_input = Input(shape=(None, 256), name='attention_input')
decoder_input_h = Input(shape=(latent_dim,), name='decoder_input_h')
decoder_input_c = Input(shape=(latent_dim,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# set the LSTM layer
decoder_lstm = LSTM(latent_dim, return_sequences=True, 
                    return_state=True, dropout=0.5, name='decoder_lstm')
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x, 
                                                      initial_state=[decoder_input_h, decoder_input_c])

# set the attention layer
attention_layer = Attention(use_scale=True, name='attention_layer')
# attention_outputs = attention_layer([decoder_input_h, state_h])
attention_outputs= attention_layer([decoder_lstm_outputs,attention_input])

# cancatenate lstm output layer with attention layer
concatenate_outputs = Concatenate(name='concatenate_outputs')([decoder_lstm_outputs,attention_outputs])

# set the dense layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(concatenate_outputs)

# build the decoder network model
decoder_model = Model(inputs=[decoder_input_x, attention_input, decoder_input_h, decoder_input_c],
                      outputs=[decoder_outputs, state_h, state_c],
                      name='decoder')

In [50]:
encoder_model.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_inputs (InputLayer)  [(None, None, 28)]        0         
_________________________________________________________________
encoder_lstm (LSTM)          [(None, None, 256), (None 291840    
Total params: 291,840
Trainable params: 291,840
Non-trainable params: 0
_________________________________________________________________


In [54]:
# connect encoder and decoder

# input layers
encoder_input_x = Input(shape=(encoder_input_data.shape[1:]), name='encoder_input_x')
decoder_input_x = Input(shape=(decoder_input_data.shape[1:]), name='decoder_input_x')

# connect encoder to decoder
encoder_outputs, encoder_state_h, encoder_state_c = encoder_model([encoder_input_x])
decoder_lstm_output, _, _ = decoder_lstm(decoder_input_x, initial_state=[encoder_state_h, encoder_state_c])

attention_outputs= attention_layer([decoder_lstm_output,encoder_outputs])

concatenate_outputs = Concatenate(name='concatenate_outputs')([decoder_lstm_output,attention_outputs])

decoder_pred = decoder_dense(concatenate_outputs)

model = Model(inputs=[encoder_input_x, decoder_input_x], 
              outputs=decoder_pred, 
              name='model_training')

In [55]:
class LossHistory(tensorflow.keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.i = 0
        self.logs = []

    def on_batch_end(self, batch, logs={}):
        self.logs.append(logs)
        if self.i % 200 == 0:
            print('Info for {} iteration:'.format(self.i), logs)
        self.i += 1
                

In [58]:
filepath = "seq2seq-1113-rmsprop.hdf5"   
checkpoint = ModelCheckpoint(filepath, monitor='accuracy', verbose=2, save_best_only=True, mode='auto')
his = LossHistory()
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data],  # training data
          decoder_target_data,                        # labels 
          validation_split=0.2,
          batch_size=64,
          epochs=100,
          verbose=0,
          callbacks=[his])

Info for 0 iteration: {'batch': 0, 'size': 64, 'loss': 0.69010216, 'accuracy': 0.79005283}
Info for 200 iteration: {'batch': 200, 'size': 64, 'loss': 0.71332324, 'accuracy': 0.7926521}
Info for 400 iteration: {'batch': 400, 'size': 64, 'loss': 0.6723653, 'accuracy': 0.7936541}
Info for 600 iteration: {'batch': 100, 'size': 64, 'loss': 0.73230034, 'accuracy': 0.79975986}
Info for 800 iteration: {'batch': 300, 'size': 64, 'loss': 0.6356755, 'accuracy': 0.8008224}
Info for 1000 iteration: {'batch': 0, 'size': 64, 'loss': 0.66092414, 'accuracy': 0.8028169}
Info for 1200 iteration: {'batch': 200, 'size': 64, 'loss': 0.6314477, 'accuracy': 0.80528146}
Info for 1400 iteration: {'batch': 400, 'size': 64, 'loss': 0.6302875, 'accuracy': 0.80710965}
Info for 1600 iteration: {'batch': 100, 'size': 64, 'loss': 0.61117774, 'accuracy': 0.8100422}
Info for 1800 iteration: {'batch': 300, 'size': 64, 'loss': 0.6043601, 'accuracy': 0.81083155}
Info for 2000 iteration: {'batch': 0, 'size': 64, 'loss': 0.6

Info for 17400 iteration: {'batch': 400, 'size': 64, 'loss': 0.53757125, 'accuracy': 0.8442637}
Info for 17600 iteration: {'batch': 100, 'size': 64, 'loss': 0.5359948, 'accuracy': 0.84483075}
Info for 17800 iteration: {'batch': 300, 'size': 64, 'loss': 0.5136626, 'accuracy': 0.84502655}
Info for 18000 iteration: {'batch': 0, 'size': 64, 'loss': 0.5091352, 'accuracy': 0.84308976}
Info for 18200 iteration: {'batch': 200, 'size': 64, 'loss': 0.5131425, 'accuracy': 0.8451646}
Info for 18400 iteration: {'batch': 400, 'size': 64, 'loss': 0.523442, 'accuracy': 0.845229}
Info for 18600 iteration: {'batch': 100, 'size': 64, 'loss': 0.4962674, 'accuracy': 0.84669805}
Info for 18800 iteration: {'batch': 300, 'size': 64, 'loss': 0.52721804, 'accuracy': 0.8456824}
Info for 19000 iteration: {'batch': 0, 'size': 64, 'loss': 0.52202135, 'accuracy': 0.83978873}
Info for 19200 iteration: {'batch': 200, 'size': 64, 'loss': 0.5495677, 'accuracy': 0.8467259}
Info for 19400 iteration: {'batch': 400, 'size':

Info for 34600 iteration: {'batch': 100, 'size': 64, 'loss': 0.4291982, 'accuracy': 0.8534026}
Info for 34800 iteration: {'batch': 300, 'size': 64, 'loss': 0.49388304, 'accuracy': 0.8539566}
Info for 35000 iteration: {'batch': 0, 'size': 64, 'loss': 0.46014786, 'accuracy': 0.859375}
Info for 35200 iteration: {'batch': 200, 'size': 64, 'loss': 0.4621408, 'accuracy': 0.85322505}
Info for 35400 iteration: {'batch': 400, 'size': 64, 'loss': 0.48856503, 'accuracy': 0.85406697}
Info for 35600 iteration: {'batch': 100, 'size': 64, 'loss': 0.4874028, 'accuracy': 0.8536619}
Info for 35800 iteration: {'batch': 300, 'size': 64, 'loss': 0.462517, 'accuracy': 0.8545934}
Info for 36000 iteration: {'batch': 0, 'size': 64, 'loss': 0.48259446, 'accuracy': 0.85101235}
Info for 36200 iteration: {'batch': 200, 'size': 64, 'loss': 0.47393432, 'accuracy': 0.8545882}
Info for 36400 iteration: {'batch': 400, 'size': 64, 'loss': 0.4502625, 'accuracy': 0.85448295}
Info for 36600 iteration: {'batch': 100, 'size'

<tensorflow.python.keras.callbacks.History at 0x1a65535e80>

In [None]:
# model.load_model('s2s-1113.model')

In [59]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [60]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq) # return 3 values, output, state_h, state_c
    attention_input, state_h, state_c = states_value
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.
    
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, state_h, state_c = decoder_model.predict([target_seq, attention_input, state_h, state_c])

        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = np.argmax(output_tokens[0, :])
        
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

    return decoded_sentence

In [82]:
for seq_index in range(1000, 1010):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('English:       ', input_texts[seq_index])
    print('French (true): ', target_texts[seq_index][1:-1])
    print('French (pred): ', decoded_sentence[0:-1])

-
English:        whats up
French (true):  quoi de beau
French (pred):  vous etes tres occupes
-
English:        who cares
French (true):  qui sen preoccupe
French (pred):  ne le rener pas de mon coup
-
English:        who cares
French (true):  qui sen soucie
French (pred):  il peut faire une bonne chancon
-
English:        who cares
French (true):  a qui ceci importetil
French (pred):  allerai la moiture
-
English:        who is he
French (true):  qui estce
French (pred):  je ne lai jamais dit une conne
-
English:        who is he
French (true):  qui estil
French (pred):  jentends de la maison
-
English:        who is it
French (true):  qui estce
French (pred):  quelle est la boute
-
English:        who is it
French (true):  qui estil
French (pred):  jai desoin dune corte
-
English:        who knows
French (true):  qui sait
French (pred):  je le suis senti le courie
-
English:        who spoke
French (true):  qui a parle
French (pred):  je suis content


In [85]:
# encode and decode test data
test_encoder_input_seq, test_input_token_index = text2sequences(max_encoder_seq_length, X_test)
test_decoder_input_seq, test_target_token_index = text2sequences(max_decoder_seq_length, y_test)

test_encoder_input_data = onehot_encode(test_encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
test_decoder_input_data = onehot_encode(test_decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)
test_decoder_target_seq = np.zeros(test_decoder_input_seq.shape)
test_decoder_target_seq[:, 0:-1] = test_decoder_input_seq[:, 1:]
test_decoder_target_data = onehot_encode(test_decoder_target_seq,
                                    max_decoder_seq_length,
                                    num_decoder_tokens)

In [66]:
#select a set of sentences from the testing data, and pringth translation results from the model:
test_encoder_input, test_input_token_index = text2sequences(max_encoder_seq_length, X_test)
test_decoder_input_seq, test_target_token_index = text2sequences(max_decoder_seq_length, y_test)
test_onehot_input = onehot_encode(test_encoder_input, max_encoder_seq_length, num_encoder_tokens)


In [67]:
test_onehot_input.shape


(10000, 22, 28)

In [87]:
trans = []
english = []
french = []
for i in range(len(X_test[:1000])):
#     print('English: ', X_test[i])
    try:
        translated_sentence = decode_sequence(test_onehot_input[i:i+1])
        trans.append(translated_sentence[:-1])
        english.append(y_test[i])
        french.append(X_test[i])
    except KeyError:
        continue

In [88]:
def translate(X_test, y_test):
    trans = []
    english = []
    french = []
    #select a set of sentences from the testing data, and pringth translation results from the model:
    test_encoder_input, test_input_token_index = text2sequences(max_encoder_seq_length, X_test[:30])
    test_decoder_input_seq, test_target_token_index = text2sequences(max_decoder_seq_length, y_test[:30])
    test_onehot_input = onehot_encode(test_encoder_input, max_encoder_seq_length, num_encoder_tokens)
    # make predictions
    for i in range(len(X_test)):
        try:
            translated_sentence = decode_sequence(test_onehot_input[i:i+1])
            trans.append(translated_sentence[:-1])
            english.append(y_test[i])
            french.append(X_test[i])
        except KeyError:
            continue

#         print('-')
#         print('English:       ', X_test[i])
        
#         print('French (true): ', y_test[i].strip())
#         print('French (pred): ', translated_sentence[:-1])
    return trans, french, english

In [89]:
trans, french, english = translate(X_test[50:70], y_test[50:70])

In [92]:
for i in range(len(trans)):
    reference = trans[i]
    candidate = french[i]
    score = sentence_bleu(reference, candidate, weights=(1,0,0,0))
    sum += score

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [94]:
print(score/len(trans))

0.019047619047619046


In [91]:
# try 
sum = 0
start = 0
end = 2
for seq_index in range(start, end):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    reference = [target_texts[seq_index][1:-1].strip().split()]
    candidate = decoded_sentence[0:-1].strip().split()
    print(candidate)
    print(reference)
    score = sentence_bleu(reference, candidate, weights=(1,0,0,0))
    sum += score
    print('-')
    print('English:       ', input_texts[seq_index])

    print('French (true): ', reference)
    print('French (pred): ', candidate)
    print(score)
avg = sum/(end - start)
print('SCORE IS :', avg)

['astu', 'besoin', 'de', 'poisson']
[['va']]
-
English:        go
French (true):  [['va']]
French (pred):  ['astu', 'besoin', 'de', 'poisson']
0
['elles', 'ne', 'len', 'pas', 'dit']
[['salut']]
-
English:        hi
French (true):  [['salut']]
French (pred):  ['elles', 'ne', 'len', 'pas', 'dit']
0
SCORE IS : 0.0
