In [2]:
import re
import string
from unicodedata import normalize
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, LSTM, Dense, Attention
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu

In [3]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [4]:
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs


In [5]:
def clean_data(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return np.array(cleaned)

In [6]:
filename = "fra.txt"
doc = load_doc(filename)
pairs = to_pairs(doc)
# choose sample size
# n_train = 20000
# clean_pairs = clean_data(pairs)[0:n_train, :]
clean_pairs = clean_data(pairs)
input_texts = clean_pairs[:, 0]
target_texts = ['\t' + text + '\n' for text in clean_pairs[:, 1]]
max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)

In [7]:
print('Length of input_texts:  ' + str(input_texts.shape))
print('Length of target_texts: ' + str(input_texts.shape))

Length of input_texts:  (170651,)
Length of target_texts: (170651,)


In [8]:
print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))

max length of input  sentences: 281
max length of target sentences: 341


In [9]:
# split data into training and testing 
X_train, X_test, y_train, y_test = train_test_split(input_texts, target_texts, test_size=0.2, random_state=42)

In [10]:
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index

In [11]:
encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, X_train)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, y_train)

In [12]:
print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

shape of encoder_input_seq: (136520, 281)
shape of input_token_index: 27
shape of decoder_input_seq: (136520, 341)
shape of target_token_index: 29


In [13]:
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 28
num_decoder_tokens: 30


In [14]:
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences)
    data = np.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

In [15]:
# encode and decode data
encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)
decoder_target_seq = np.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:, 1:]
decoder_target_data = onehot_encode(decoder_target_seq,
                                    max_decoder_seq_length,
                                    num_decoder_tokens)

In [16]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)

(136520, 281, 28)
(136520, 341, 30)


CREATE TRAINING MODEL

In [18]:
# encoder model
latent_dim = 256

# inputs of the encoder network
encoder_inputs = Input(shape=(None, num_encoder_tokens), 
                       name='encoder_inputs')

# set the LSTM layer
encoder_lstm = LSTM(latent_dim, return_state=True, 
                    dropout=0.5, name='encoder_lstm')
_, state_h, state_c = encoder_lstm(encoder_inputs)

# build the encoder network model
encoder_model = Model(inputs=encoder_inputs, 
                      outputs=[state_h, state_c],
                      name='encoder')

In [19]:
# decoder model

# inputs of the decoder network
decoder_input_h = Input(shape=(latent_dim,), name='decoder_input_h')
decoder_input_c = Input(shape=(latent_dim,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# set the LSTM layer
decoder_lstm = LSTM(latent_dim, return_sequences=True, 
                    return_state=True, dropout=0.5, name='decoder_lstm')
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x, 
                                                  initial_state=[decoder_input_h, decoder_input_c])

# set the attention layer
attention_layer = Attention(use_scale=True, name='attention_layer')
attention_outputs = attention_layer([decoder_input_h, state_h])

# set the dense layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
# decoder_outputs = decoder_dense(decoder_lstm_outputs)
decoder_outputs = decoder_dense(attention_outputs)

# build the decoder network model
decoder_model = Model(inputs=[decoder_input_x, decoder_input_h, decoder_input_c],
                      outputs=[decoder_outputs, state_h, state_c],
                      name='decoder')

In [20]:
# connect encoder and decoder

# input layers
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# connect encoder to decoder
encoder_final_states = encoder_model([encoder_input_x])
decoder_lstm_output, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_final_states)
decoder_pred = decoder_dense(decoder_lstm_output)

model = Model(inputs=[encoder_input_x, decoder_input_x], 
              outputs=decoder_pred, 
              name='model_training')

In [21]:
print('shape of encoder_input_data' + str(encoder_input_data.shape))
print('shape of decoder_input_data' + str(decoder_input_data.shape))
print('shape of decoder_target_data' + str(decoder_target_data.shape))

shape of encoder_input_data(136520, 281, 28)
shape of decoder_input_data(136520, 341, 30)
shape of decoder_target_data(136520, 341, 30)


In [24]:
class LossHistory(tensorflow.keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.i = 0
        self.logs = []

    def on_batch_end(self, batch, logs={}):
        self.logs.append(logs)
        if self.i % 200 == 0:
            print('Info for {} iteration:'.format(self.i), logs)
        self.i += 1
                

In [26]:
filepath = "seq2seq-attention.hdf5"        
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='auto', period=1)        
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
loss_history = LossHistory()
model.fit([encoder_input_data, decoder_input_data],  # training data
          decoder_target_data,                       # labels 
          batch_size=100, 
          epochs=5,
          verbose=0,
          callbacks=[loss_history, checkpoint])
#          callbacks=[loss_history])

model.save('seq2seq_attention_model.h5')

Info for 0 iteration: {'batch': 0, 'size': 100, 'loss': 3.4912047}
Info for 200 iteration: {'batch': 200, 'size': 100, 'loss': 0.27875063}
Info for 400 iteration: {'batch': 400, 'size': 100, 'loss': 0.25521582}
Info for 600 iteration: {'batch': 600, 'size': 100, 'loss': 0.24696939}
Info for 800 iteration: {'batch': 800, 'size': 100, 'loss': 0.24654016}
Info for 1000 iteration: {'batch': 1000, 'size': 100, 'loss': 0.22076812}
Info for 1200 iteration: {'batch': 1200, 'size': 100, 'loss': 0.24703893}
Info for 1400 iteration: {'batch': 34, 'size': 100, 'loss': 0.2526994}
Info for 1600 iteration: {'batch': 234, 'size': 100, 'loss': 0.21534775}
Info for 1800 iteration: {'batch': 434, 'size': 100, 'loss': 0.23758927}
Info for 2000 iteration: {'batch': 634, 'size': 100, 'loss': 0.2232011}
Info for 2200 iteration: {'batch': 834, 'size': 100, 'loss': 0.1989719}
Info for 2400 iteration: {'batch': 1034, 'size': 100, 'loss': 0.22706828}
Info for 2600 iteration: {'batch': 1234, 'size': 100, 'loss': 

In [1]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

NameError: name 'input_token_index' is not defined

In [None]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = np.argmax(output_tokens[0, :])
        
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence

In [40]:
def translate(X_test, y_test):
    #select a set of sentences from the testing data, and pringth translation results from the model:
    test_encoder_input, test_input_token_index = text2sequences(max_encoder_seq_length, X_test[:30])
    test_decoder_input_seq, test_target_token_index = text2sequences(max_decoder_seq_length, y_test[:30])
    test_onehot_input = onehot_encode(test_encoder_input, max_encoder_seq_length, num_encoder_tokens)
    # make predictions
    trans = []
    for i in range(len(X_test)):
        translated_sentence = decode_sequence(test_onehot_input[i:i+1])
        trans.append(translated_sentence[:-1])
        print('-')
        print('English:       ', X_test[i])
        
        print('French (true): ', y_test[i].strip())
        print('French (pred): ', translated_sentence[:-1])
    return trans

In [42]:
for seq_index in range(500, 520):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    print(input_seq.shape)
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('English:       ', input_texts[seq_index])
    print('French (true): ', target_texts[seq_index][1:-1])
    print('French (pred): ', decoded_sentence[0:-1])

(1, 281, 28)
-
English:        come over
French (true):  viens
French (pred):  je ne suis pas de tout de monde a la porte
(1, 281, 28)
-
English:        come over
French (true):  venez
French (pred):  je ne suis pas de tout de monde a la porte
(1, 281, 28)
-
English:        come over
French (true):  venez ici
French (pred):  je ne suis pas de tout de monde a la porte
(1, 281, 28)
-
English:        come over
French (true):  viens chez nous
French (pred):  je ne suis pas de tout de monde a la porte
(1, 281, 28)
-
English:        come over
French (true):  venez chez nous
French (pred):  je ne suis pas de tout de monde a la porte
(1, 281, 28)
-
English:        come over
French (true):  viens chez moi
French (pred):  je ne suis pas de tout de monde a la porte
(1, 281, 28)
-
English:        come over
French (true):  venez chez moi
French (pred):  je ne suis pas de tout de monde a la porte
(1, 281, 28)
-
English:        come soon
French (true):  viens bientot
French (pred):  je ne suis pas de

In [41]:
translated_French = translate(X_test[:30], y_test[:30])

-
English:        i only wish that were possible
French (true):  je souhaiterais seulement que ce fut possible
French (pred):  je ne suis pas de tout de monde a la porte
-
English:        you have to stay fit
French (true):  vous devez garder la forme
French (pred):  je ne suis pas de tout de monde a la porte
-
English:        we want to help
French (true):  nous voulons aider
French (pred):  je ne suis pas de tout de monde a la porte
-
English:        why is life so full of suffering
French (true):  pourquoi la vie estelle si remplie de souffrance
French (pred):  je ne suis pas de tout de monde a la porte
-
English:        i think so too
French (true):  oui je le pense aussi
French (pred):  je ne suis pas de tout de monde a la porte
-
English:        he came from another country
French (true):  il venait dun autre pays
French (pred):  je ne suis pas de tout de monde a la porte
-
English:        my son has a black beard
French (true):  mon fils a la barbe noire
French (pred):  je ne su

In [None]:
sum = 0
for i in range(len(X_test)):
    reference = y_test[i].split()
    candidate = translated_French[i].split()
    
    score = sentence_bleu(reference, candidate, weights=(1,0,0,0))
    sum += score
    print('-')
    print('English:       ', X_test[i])

    print('Spanish (true): ', reference)
    print('Spanish (pred): ', candidate)
    print(score)
avg = sum/30
print('SCORE IS :', avg)

In [None]:
print('SCORE IS :', avg)