In [1]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

ModuleNotFoundError: No module named 'keras'

In [2]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [36]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

In [4]:
# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[das wird nicht funktionieren], target=[it wont work], predicted=[it wont work]
src=[hor auf damit], target=[stop that], predicted=[stop that]
src=[ich hatte spa], target=[i had fun], predicted=[i had fun]
src=[tom hat nicht gewonnen], target=[tom didnt win], predicted=[tom didnt win]
src=[guter junge], target=[that a boy], predicted=[no a boy]
src=[ist es das], target=[is that it], predicted=[is it yours]
src=[bilden sie zwei reihen], target=[form two lines], predicted=[form two lines]
src=[ihre lippen trafen sich], target=[their lips met], predicted=[their lips met]
src=[ich bin sehr dick], target=[im very fat], predicted=[im very fat]
src=[ruhe dich ein wenig aus], target=[get some rest], predicted=[get some rest]


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.074738
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
test
src=[maria ist ein madchen], target=[mary is a girl], predicted=[mary is a]
src=[komm herauf], target=[come up here], predicted=[come on up]
src=[warte mal kurz], target=[wait a moment], predicted=[wait a moment]
src=[ich liebe sport], target=[i love sports], predicted=[i like sports]
src=[du bist verruckt], target=[youre nuts], predicted=[you are]
src=[hab vertrauen], target=[have faith], predicted=[do me]
src=[ich schulde ihm], target=[i owe him], predicted=[i hugged him]
src=[wir hatten einen plan], target=[we had a plan], predicted=[we need a plan]
src=[ich esse obst], target=[i eat fruit], predicted=[i smell taller]
src=[er wird schlimmer], target=[it gets worse], predicted=[he is apart]
BLEU-1: 0.077198
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000


In [23]:
dataset[:,0]

array(['it wont work', 'stop that', 'i had fun', ..., 'im dancing',
       'they are busy', 'they voted'], dtype='<U527')

In [13]:
create_tokenizer()

(10000, 2)

In [166]:
a = array([['maria ist ein madchen'],
          ['komm herauf'],
          ['warte mal kurz'],
          ['ich wunsche dir einen schonen tag'],
          ['was ist dir name'],
          ['Ich mage dicht'],
          ['sie sind beschaftigt'],
          ['sie ist ein Mädchen']
          ],dtype='U527')

In [167]:
a.shape

(8, 1)

In [168]:
z = encode_sequences(ger_tokenizer, ger_length, a[:,0])

In [121]:
evaluate_model(model, eng_tokenizer,z,a)

src=[maria ist ein madchen], target=[mary is a girl], predicted=[mary is a]
src=[komm herauf], target=[come up here], predicted=[come on up]
src=[warte mal kurz], target=[wait a moment], predicted=[wait a moment]
src=[ich wunsche dir einen schonen tag], target=[i wish you a great day], predicted=[have a nice day]
src=[was ist dir name], target=[what is your name], predicted=[what it for]
src=[Ich mage dicht], target=[i like you], predicted=[wonderful]
src=[sie sind beschaftigt], target=[they are busy], predicted=[youre busy]
BLEU-1: 0.157895
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [169]:
# evaluate the skill of the model
def custom_evaluate_model(model, tokenizer, sources, raw_dataset):
    predicted = list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target = raw_dataset[i]
        if i < 10:
            print('target=[%s], predicted=[%s]' % (raw_target, translation))
        #actual.append(raw_target.split())
        predicted.append(translation.split())
    # calculate BLEU score
    #print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    #print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    #print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    #print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [163]:
z

array([[ 111,    3,   15,  428,    0,    0,    0,    0,    0,    0],
       [  50,  930,    0,    0,    0,    0,    0,    0,    0,    0],
       [ 286,   48,  256,    0,    0,    0,    0,    0,    0,    0],
       [   1,  892,   45,   38,  613,  271,    0,    0,    0,    0],
       [  41,    3,   45, 1812,    0,    0,    0,    0,    0,    0],
       [   1,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   4,   16,   70,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [170]:
custom_evaluate_model(model,eng_tokenizer,z,a)

target=[['maria ist ein madchen']], predicted=[mary is a]
target=[['komm herauf']], predicted=[come on up]
target=[['warte mal kurz']], predicted=[wait a moment]
target=[['ich wunsche dir einen schonen tag']], predicted=[have a nice day]
target=[['was ist dir name']], predicted=[what it for]
target=[['Ich mage dicht']], predicted=[wonderful]
target=[['sie sind beschaftigt']], predicted=[youre busy]
target=[['sie ist ein Mädchen']], predicted=[she is]
