In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [2]:
def load_doc(filename):
    """Read a whole UTF-8 text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete contents of the file as one string.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [3]:
# split a loaded document into sentence pairs
def to_pairs(doc):
    """Split a tab-separated document into a list of per-line field lists.

    Leading/trailing whitespace of the whole document is stripped, the text
    is split on newlines, and every line is split on tab characters.

    Args:
        doc: The raw document text.

    Returns:
        A list with one entry per line; each entry is the list of
        tab-separated fields of that line.
    """
    return [row.split('\t') for row in doc.strip().split('\n')]

# clean a list of sentence pairs
def clean_pairs(lines):
    """Normalise every sentence of every pair and return them as an array.

    Each sentence is reduced to ASCII (accents dropped via NFD decomposition),
    split on whitespace, lower-cased, stripped of punctuation and
    non-printable characters, and finally filtered so that only purely
    alphabetic tokens remain. The surviving tokens are re-joined with spaces.

    Args:
        lines: Iterable of pairs, each pair being an iterable of strings.

    Returns:
        A numpy array built from the list of cleaned pairs.
    """
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def _clean(sentence):
        # decompose accented characters, then drop everything non-ASCII
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(strip_punct)
            token = non_printable.sub('', token)
            # drops empty tokens and tokens containing digits
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_clean(sentence) for sentence in pair] for pair in lines])

In [4]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Any picklable object (here: the cleaned sentence pairs).
        filename: Destination path; an existing file is overwritten.
    """
    # the context manager flushes and closes the file deterministically
    # instead of leaving the handle open until garbage collection
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [5]:
# load dataset
filename = 'fra.txt'
doc = load_doc(filename)
# split into english-french pairs
pairs = to_pairs(doc)
# clean sentences
# NOTE: the result is bound to its own name on purpose; rebinding
# `clean_pairs` would shadow the function and make this cell raise
# "'numpy.ndarray' object is not callable" when it is run a second time
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'english-french.pkl')
# spot check (at most the first 10 pairs, fewer if the file is shorter)
for i in range(min(10, len(cleaned_pairs))):
    print('%s -- %s' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

Saved: english-french.pkl
go -- va
run -- cours
run -- courez
fire -- au feu
help -- a laide
jump -- saute
stop -- ca suffit
stop -- stop
stop -- arretetoi
wait -- attends


In [1]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file written by ``save_clean_data``.

    Returns:
        The unpickled object.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were produced by this pipeline
    # the context manager closes the handle instead of leaking it
    with open(filename, 'rb') as handle:
        return load(handle)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Any picklable object (here: arrays of sentence pairs).
        filename: Destination path; an existing file is overwritten.
    """
    # close the file deterministically so the data is flushed to disk
    # before the confirmation is printed
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [2]:
# load the cleaned sentence pairs pickled as 'english-french.pkl'
raw_dataset = load_clean_sentences('english-french.pkl')
# last expression of the cell: displays the array shape (rows, columns)
raw_dataset.shape

(154883, 2)

In [3]:
from numpy.random import seed

# A one-third hold-out of the full corpus would be train = 103255 and
# test = 51628 pairs, but to keep training fast only the first N_SENTENCES
# pairs are used, of which N_TRAIN go to training and the rest to testing.
N_SENTENCES = 15000
N_TRAIN = 13000

# fix the RNG so the shuffle (and therefore the split) is reproducible
seed(1)
# copy the slice: slicing returns a view and shuffle() works in place, so
# without the copy the rows of raw_dataset itself would be reordered and
# re-running this cell would produce a different subset ordering
dataset = raw_dataset[:N_SENTENCES, :].copy()
# random shuffle (rows are permuted, each pair stays intact)
shuffle(dataset)
# split into train/test
train, test = dataset[:N_TRAIN], dataset[N_TRAIN:]
# save
save_clean_data(dataset, 'english-french-both.pkl')
save_clean_data(train, 'english-french-train.pkl')
save_clean_data(test, 'english-french-test.pkl')

Saved: english-french-both.pkl
Saved: english-french-train.pkl
Saved: english-french-test.pkl


In [1]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file to read.

    Returns:
        The unpickled object.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were produced by this pipeline
    # the context manager closes the handle instead of leaking it
    with open(filename, 'rb') as handle:
        return load(handle)

# fit a tokenizer
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` with default settings and fit it.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest whitespace-token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The number of tokens in the longest line.

    Raises:
        ValueError: If ``lines`` is empty.
    """
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad every sequence to ``length``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: The texts to encode.

    Returns:
        The result of ``pad_sequences`` with padding appended at the end
        of each sequence (``padding='post'``).
    """
    integer_encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(integer_encoded, maxlen=length, padding='post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence in ``sequences``.

    Args:
        sequences: 2-D array of integer token ids; its first two
            dimensions are read from ``sequences.shape``.
        vocab_size: Number of classes passed to ``to_categorical``.

    Returns:
        An array reshaped to
        ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) sequence-to-sequence ``Sequential`` model.

    Args:
        src_vocab: Input dimension of the embedding layer.
        tar_vocab: Number of units of the softmax output layer.
        src_timesteps: ``input_length`` of the embedding layer.
        tar_timesteps: Number of times the first LSTM's output is repeated.
        n_units: Embedding size and number of units of both LSTM layers.

    Returns:
        The assembled ``Sequential`` model.
    """
    layers = (
        # source ids -> dense vectors; mask_zero=True masks the 0 id
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        # first LSTM returns only its final output vector
        LSTM(n_units),
        # that vector is repeated once per target timestep
        RepeatVector(tar_timesteps),
        # second LSTM emits one output per timestep
        LSTM(n_units, return_sequences=True),
        # softmax over the target vocabulary at every timestep
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    )
    network = Sequential()
    for layer in layers:
        network.add(layer)
    return network

In [3]:
# load the combined, training and test splits, in that order
dataset, train, test = map(load_clean_sentences, (
    'english-french-both.pkl',
    'english-french-train.pkl',
    'english-french-test.pkl',
))

In [4]:
# prepare english tokenizer (column 0 of the combined dataset)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_length = max_length(dataset[:, 0])
# one more than the number of known words
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

# prepare french tokenizer (column 1 of the combined dataset)
fre_tokenizer = create_tokenizer(dataset[:, 1])
fre_length = max_length(dataset[:, 1])
# one more than the number of known words
fre_vocab_size = 1 + len(fre_tokenizer.word_index)
print('french Vocabulary Size: %d' % fre_vocab_size)
print('french Max Length: %d' % fre_length)

English Vocabulary Size: 2892
English Max Length: 5
french Vocabulary Size: 5778
french Max Length: 10


In [5]:
# prepare training data: french (column 1) is the model input,
# english (column 0) is the one-hot encoded target
trainX = encode_sequences(fre_tokenizer, fre_length, train[:, 1])
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, train[:, 0]),
    eng_vocab_size,
)

# prepare validation data the same way from the test split
testX = encode_sequences(fre_tokenizer, fre_length, test[:, 1])
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, test[:, 0]),
    eng_vocab_size,
)

In [6]:
# define model (french -> english, 256 units per layer)
model = define_model(fre_vocab_size, eng_vocab_size, fre_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 256)           1479168   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 2892)           743244    
Total params: 3,273,036
Trainable params: 3,273,036
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# fit model
filename = 'model.h5'
# save the model to `filename` only when val_loss reaches a new minimum
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# the test split doubles as validation data for the checkpoint criterion
model.fit(
    trainX,
    trainY,
    validation_data=(testX, testY),
    epochs=100,
    batch_size=64,
    callbacks=[checkpoint],
    verbose=2,
)

Train on 13000 samples, validate on 2000 samples
Epoch 1/100
 - 15s - loss: 4.3234 - val_loss: 3.6822

Epoch 00001: val_loss improved from inf to 3.68218, saving model to model.h5
Epoch 2/100
 - 13s - loss: 3.5257 - val_loss: 3.5223

Epoch 00002: val_loss improved from 3.68218 to 3.52230, saving model to model.h5
Epoch 3/100
 - 12s - loss: 3.3257 - val_loss: 3.3584

Epoch 00003: val_loss improved from 3.52230 to 3.35839, saving model to model.h5
Epoch 4/100
 - 12s - loss: 3.1116 - val_loss: 3.2073

Epoch 00004: val_loss improved from 3.35839 to 3.20734, saving model to model.h5
Epoch 5/100
 - 12s - loss: 2.9083 - val_loss: 3.0779

Epoch 00005: val_loss improved from 3.20734 to 3.07794, saving model to model.h5
Epoch 6/100
 - 12s - loss: 2.7166 - val_loss: 2.9416

Epoch 00006: val_loss improved from 3.07794 to 2.94158, saving model to model.h5
Epoch 7/100
 - 12s - loss: 2.5247 - val_loss: 2.8102

Epoch 00007: val_loss improved from 2.94158 to 2.81023, saving model to model.h5
Epoch 8/10

 - 12s - loss: 0.0889 - val_loss: 2.3174

Epoch 00071: val_loss did not improve from 2.03217
Epoch 72/100
 - 12s - loss: 0.0885 - val_loss: 2.3304

Epoch 00072: val_loss did not improve from 2.03217
Epoch 73/100
 - 12s - loss: 0.0902 - val_loss: 2.3300

Epoch 00073: val_loss did not improve from 2.03217
Epoch 74/100
 - 12s - loss: 0.0884 - val_loss: 2.3377

Epoch 00074: val_loss did not improve from 2.03217
Epoch 75/100
 - 12s - loss: 0.0866 - val_loss: 2.3468

Epoch 00075: val_loss did not improve from 2.03217
Epoch 76/100
 - 12s - loss: 0.0879 - val_loss: 2.3553

Epoch 00076: val_loss did not improve from 2.03217
Epoch 77/100
 - 12s - loss: 0.0878 - val_loss: 2.3567

Epoch 00077: val_loss did not improve from 2.03217
Epoch 78/100
 - 12s - loss: 0.0864 - val_loss: 2.3425

Epoch 00078: val_loss did not improve from 2.03217
Epoch 79/100
 - 15s - loss: 0.0866 - val_loss: 2.3533

Epoch 00079: val_loss did not improve from 2.03217
Epoch 80/100
 - 14s - loss: 0.0847 - val_loss: 2.3757

Epoc

<keras.callbacks.History at 0x2661736d8d0>

In [6]:
from pickle import load

from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [7]:
# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file to read.

    Returns:
        The unpickled object.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were produced by this pipeline
    # the context manager closes the handle instead of leaking it
    with open(filename, 'rb') as handle:
        return load(handle)
 
# fit a tokenizer
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` with default settings and fit it.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
 
# max sentence length
def max_length(lines):
    """Return the largest whitespace-token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The number of tokens in the longest line.

    Raises:
        ValueError: If ``lines`` is empty.
    """
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)
 
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad every sequence to ``length``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: The texts to encode.

    Returns:
        The result of ``pad_sequences`` with padding appended at the end
        of each sequence (``padding='post'``).
    """
    integer_encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(integer_encoded, maxlen=length, padding='post')
 
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` is ``integer``.

    Args:
        integer: The index to search for.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word mapped to ``integer``, or ``None`` if there is none.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)
 
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for ``source`` into a sentence.

    Args:
        model: Object whose ``predict`` returns a batch of per-timestep
            score vectors; only the first batch item is used.
        tokenizer: Tokenizer handed to ``word_for_id`` for the lookup.
        source: Encoded input passed to ``model.predict``.

    Returns:
        The decoded words joined by single spaces. Decoding stops at the
        first predicted index that has no word in the tokenizer.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        # greedy choice: highest-scoring index at this timestep
        word = word_for_id(argmax(scores), tokenizer)
        if word is None:
            break
        words.append(word)
    return ' '.join(words)

In [8]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source and print corpus-level BLEU scores.

    The first 10 source/target/prediction triples are printed for a
    visual spot check, followed by cumulative BLEU-1 to BLEU-4.

    Args:
        model: Trained model handed to ``predict_sequence``.
        tokenizer: Tokenizer used to decode the predicted indices.
        sources: Array of encoded source sequences, one row per sentence.
        raw_dataset: Indexable collection aligned with ``sources`` whose
            rows unpack as ``(target_text, source_text)``.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text (predict expects a batch of one)
        source = source.reshape((1, source.shape[0]))
        # use the tokenizer that was passed in, not the notebook global
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects, per hypothesis, a LIST of reference token
        # lists; without the wrapper each word is treated as a separate
        # reference and n-grams are counted over its characters
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted,
                                     weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted,
                                     weights=(0.5, 0.5, 0, 0)))
    # equal weights over 1- to 3-grams must sum to 1 (0.3 * 3 sums to 0.9)
    print('BLEU-3: %f' % corpus_bleu(actual, predicted,
                                     weights=(1.0 / 3, 1.0 / 3, 1.0 / 3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted,
                                     weights=(0.25, 0.25, 0.25, 0.25)))

In [9]:
# load the combined, training and test splits, in that order
dataset, train, test = map(load_clean_sentences, (
    'english-french-both.pkl',
    'english-french-train.pkl',
    'english-french-test.pkl',
))

# prepare english tokenizer (column 0 of the combined dataset)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_length = max_length(dataset[:, 0])
eng_vocab_size = 1 + len(eng_tokenizer.word_index)

# prepare french tokenizer (column 1 of the combined dataset)
fre_tokenizer = create_tokenizer(dataset[:, 1])
fre_length = max_length(dataset[:, 1])
fre_vocab_size = 1 + len(fre_tokenizer.word_index)

# prepare data: only the french inputs need encoding for evaluation
trainX, testX = (
    encode_sequences(fre_tokenizer, fre_length, train[:, 1]),
    encode_sequences(fre_tokenizer, fre_length, test[:, 1]),
)

In [None]:
# load the model saved as 'model.h5'
model = load_model('model.h5')
# evaluate on the training sequences: every row of trainX is translated,
# the first 10 translations are printed, then the BLEU scores
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# evaluate on the test sequences in the same way
print('test')
evaluate_model(model, eng_tokenizer, testX, test)