In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from unicodedata import normalize
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, vis_utils
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, TimeDistributed, RepeatVector, Bidirectional
from keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import corpus_bleu

Using TensorFlow backend.


In [2]:
#Loading of data from txt file
data = pd.read_csv("deu-eng/deu.txt", sep='\t', names=['en','de', 'extra information'])
data.tail(10)

Unnamed: 0,en,de,extra information
208476,I recommend contributing sentences in your own...,"Ich empfehle, muttersprachliche Sätze beizutra...",CC-BY 2.0 (France) Attribution: tatoeba.org #6...
208477,A building with high ceilings and huge rooms m...,Ein Gebäude mit hohen Decken und riesigen Räum...,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
208478,"As a prank, some students let three goats loos...",Als Streich ließen einige Schüler drei Ziegen ...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
208479,"In today's world, we have to equip all our kid...",In der heutigen Welt müssen wir all unsere Kin...,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
208480,Death is something that we're often discourage...,"Wir werden oft davon abgehalten, über den Tod ...",CC-BY 2.0 (France) Attribution: tatoeba.org #1...
208481,"At a moment when our economy is growing, our b...","In einem Moment, in dem unsere Wirtschaft wäch...",CC-BY 2.0 (France) Attribution: tatoeba.org #3...
208482,Even if some sentences by non-native speakers ...,Auch wenn Sätze von Nichtmuttersprachlern mitu...,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
208483,If someone who doesn't know your background sa...,"Wenn jemand, der deine Herkunft nicht kennt, s...",CC-BY 2.0 (France) Attribution: tatoeba.org #9...
208484,If someone who doesn't know your background sa...,"Wenn jemand Fremdes dir sagt, dass du dich wie...",CC-BY 2.0 (France) Attribution: tatoeba.org #9...
208485,Doubtless there exists in this world precisely...,Ohne Zweifel findet sich auf dieser Welt zu je...,CC-BY 2.0 (France) Attribution: tatoeba.org #7...


In [3]:
# train_de = pd.read_csv("de_en_data/train.de", sep='\t', names=['de'])
# train_en = pd.read_csv("de_en_data/train.en", sep='\t', names=['en'])
# test_de = pd.read_csv("de_en_data/test.de", sep='\t', names=['de'])
# test_en = pd.read_csv("de_en_data/test.en", sep='\t', names=['en'])

# train_data = pd.concat([train_de, train_en], axis=1)
# test_data = pd.concat([test_de[:len(test_en)], test_en], axis=1)

#Extracting few data
data = data.drop(columns=['extra information'])
train_data = data.iloc[:100000]
train_data.tail(10)


Unnamed: 0,en,de
99990,Thanks for bringing Tom home.,"Danke, dass du Tom nach Hause gebracht hast!"
99991,Thanks for bringing Tom home.,"Danke, dass Sie Tom nach Hause gebracht haben!"
99992,Thanks for bringing Tom home.,"Danke, dass ihr Tom nach Hause gebracht habt!"
99993,Thanks for showing me around.,Danke für die Führung!
99994,Thanks for the fast response.,Danke für die schnelle Antwort!
99995,Thanks for your quick answer.,Danke für eure schnelle Antwort!
99996,Thanks for your quick answer.,Danke für Ihre schnelle Antwort!
99997,That article is out of stock.,Dieser Artikel ist nicht vorrätig.
99998,That bed is very comfortable.,Das Bett ist sehr gemütlich.
99999,That book had a lot of pages.,Das Buch hatte viele Seiten.


In [4]:
#Extracting the subset from the whole data
no_of_samples = int(35000) 
threshold = int((no_of_samples*0.10)+100)
train_de_en = train_data.iloc[:no_of_samples]
valid_de_en = train_data.iloc[no_of_samples-threshold:no_of_samples+threshold]
test_de_en = train_data.iloc[no_of_samples+threshold:no_of_samples+2*threshold]

In [5]:
# Clearning the data - Lowercase, Change the unicode, removing punctuations
# doc = [re.split('\s+', train_en['en'].iloc[i]) for i in range(len(train_en))]
def data_preprocessing(dataframe):

    doc = [re.split('\s+', dataframe.iloc[i]) for i in range(len(dataframe))]

    clean_doc = []
    for sent in doc:
        clean_sent = []
        clean_sent_str = ''
        for token in sent:
            token = token.casefold()
            # unicode
            token = normalize('NFD',token).encode('ascii', 'ignore')
            token = token.decode('UTF-8')
            # remove punctuations
            token = re.sub('[^\w|^\s]', '', token)
            if token != '':
                clean_sent.append(token)
        
        clean_sent_str = ' '.join(clean_sent)
        clean_doc.append(clean_sent_str)
    return clean_doc

#Processing the training data
train_de_processed = data_preprocessing(train_de_en['de'])
train_en_processed = data_preprocessing(train_de_en['en'])

In [6]:
#Processing the validation data
valid_de_processed = data_preprocessing(valid_de_en['de'])
valid_en_processed = data_preprocessing(valid_de_en['en'])
#Processing the test data
test_de_processed = data_preprocessing(test_de_en['de'])
test_en_processed = data_preprocessing(test_de_en['en'])

In [7]:
#Tokenizing
def create_tokenizer(data):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data)
    return tokenizer

In [8]:
#Tokenizing, getting vocab size and max length of the sentence
def get_vocab_size(data):
    tokenizer = create_tokenizer(data)
    vocab_size = len(tokenizer.word_index)+1
    max_length = max(len(sent.split()) for sent in data)
    return int(vocab_size), max_length, tokenizer

#Getting token, vocab size and max length for both english and german
de_vocab_size, de_length, de_tokenizer = get_vocab_size(train_de_processed)
en_vocab_size, en_length, en_tokenizer = get_vocab_size(train_en_processed)
print("German-the vocab size is {}, the sentence length {}, number of token {}"\
      .format(de_vocab_size, de_length, len(de_tokenizer.texts_to_sequences(train_de_processed))))
print("English-the vocab size is {}, the sentence length {}, number of token {}"\
      .format(en_vocab_size, en_length, len(en_tokenizer.texts_to_sequences(train_de_processed))))


German-the vocab size is 8178, the sentence length 12, number of token 35000
English-the vocab size is 5175, the sentence length 6, number of token 35000


In [9]:
#Padding the data
def encode_sequences(tokenizer, length, data):
    X = tokenizer.texts_to_sequences(data)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

#Creating one hot vector for the output
def encode_output(sequences, vocab_size):
    ylist = list()
    for seq in sequences:
        encode = to_categorical(seq, num_classes=vocab_size)
        ylist.append(encode)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

#Preparing the train data for the neural model
trainX = encode_sequences(de_tokenizer, de_length, train_de_processed)
trainY = encode_sequences(en_tokenizer, en_length, train_en_processed)
trainY = encode_output(trainY, en_vocab_size)

#Preparing the validation data for the neural model
validX = encode_sequences(de_tokenizer, de_length, valid_de_processed)
validY = encode_sequences(en_tokenizer, en_length, valid_en_processed)
validY = encode_output(validY, en_vocab_size)

#Preparing the test data for the neural model
testX = encode_sequences(de_tokenizer, de_length, test_de_processed)
testY = encode_sequences(en_tokenizer, en_length, test_en_processed)
testY = encode_output(testY, en_vocab_size)

In [10]:
#To cancel the epoch once required accuracy is being reached
class myCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('accuracy')>0.94):
            print("\nReached 94% accuracy so cancelling training!")
            self.model.stop_training = True
callbacks = myCallback()

In [11]:
#Creating the Encoder Decoder Model
def define_model(de_vocab_size, en_vocab_size, de_length, en_length, n_units):
    #Define the model will be stacked linear layers
    model = Sequential()
    #Creating embedding for the sentence
    model.add(Embedding(de_vocab_size, n_units, input_length=de_length, mask_zero=True))
    #Creating Bidirectional LSTM
    model.add(Bidirectional(LSTM(n_units)))
    #Creating the context vector available every time for the decoder
    model.add(RepeatVector(en_length))
    #Creating Bidirectional LSTM as a decoder which returns the full sequence   
    model.add(Bidirectional(LSTM(n_units, return_sequences=True)))
    #Creating a wrapper to get the output in a single time step
    model.add(TimeDistributed(Dense(en_vocab_size, activation='softmax')))
    return model


model = define_model(de_vocab_size, en_vocab_size, de_length, en_length, 256)
model.compile(optimizer='adam', loss = 'categorical_crossentropy', metrics=['accuracy', 'mse'])
#Print the created model structure
print(model.summary())
# vis_utils.plot_model(model, to_file="model.png", show_shapes=True)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 12, 256)           2093568   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               1050624   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 6, 512)            0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 6, 512)            1574912   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 6, 5175)           2654775   
Total params: 7,373,879
Trainable params: 7,373,879
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
fit_data = model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(validX, validY), verbose=2, callbacks=[callbacks])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 35000 samples, validate on 7200 samples
Epoch 1/30
 - 82s - loss: 3.4213 - accuracy: 0.4992 - mse: 1.1658e-04 - val_loss: 3.4235 - val_accuracy: 0.4477 - val_mse: 1.2910e-04
Epoch 2/30
 - 77s - loss: 2.3368 - accuracy: 0.6093 - mse: 9.4752e-05 - val_loss: 2.6424 - val_accuracy: 0.5330 - val_mse: 1.1514e-04
Epoch 3/30
 - 77s - loss: 1.7021 - accuracy: 0.6765 - mse: 8.1063e-05 - val_loss: 2.1897 - val_accuracy: 0.5845 - val_mse: 1.0655e-04
Epoch 4/30
 - 78s - loss: 1.2458 - accuracy: 0.7317 - mse: 6.9778e-05 - val_loss: 1.7963 - val_accuracy: 0.6391 - val_mse: 9.3972e-05
Epoch 5/30
 - 77s - loss: 0.9275 - accuracy: 0.7801 - mse: 5.9348e-05 - val_loss: 1.6008 - val_accuracy: 0.6779 - val_mse: 8.7109e-05
Epoch 6/30
 - 78s - loss: 0.7101 - accuracy: 0.8194 - mse: 5.0337e-05 - val_loss: 1.4710 - val_accuracy: 0.7086 - val_mse: 8.0588e-05
Epoch 7/30


In [None]:
model.save('MyModel.h5')
# loaded_model_h5 = tf.keras.models.load_model('MyModel.h5')

In [None]:
model = keras.models.load_model('MyModel.h5')

In [None]:
# remapping the vector to a word
def remap_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [None]:
# generate target from given source sequence
def predict_sequence(model, tokenizer, input):
    prediction = model.predict(input, verbose=0)[0]
    integers = [np.argmax(np.array(vector)) for vector in prediction]
    target = list()
    for i in integers:
        word = remap_to_word(i, tokenizer)
        if word is None:
            break
        target.append(word)
   
    return ' '.join(target)

In [None]:
#Predict the target (translating from German to English)
def predict(model, tokenizer, sources, de_data, en_data):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        if i<10:
            print("Source is [{}], Target is [{}], Predicted is [{}]".format(de_data[i], en_data[i], translation))
    
        actual.append([en_data[i].split()])
        predicted.append(translation.split())
    return actual, predicted
 
#Evaluate using bleu score
def evaluate_model(actual, predicted):
    print("BLEU-1 is {:.2f}".format(corpus_bleu(actual, predicted, weights=[1.0, 0.0, 0.0, 0.0])))
    print("BLEU-2 is {:.2f}".format(corpus_bleu(actual, predicted, weights=[0.5, 0.5, 0.0, 0.0])))
    print("BLEU-3 is {:.2f}".format(corpus_bleu(actual, predicted, weights=[0.3, 0.3, 0.3, 0.0])))
    print("BLEU-4 is {:.2f}".format(corpus_bleu(actual, predicted, weights=[0.25, 0.25, 0.25, 0.25])))
    


actual, predicted = predict(model, en_tokenizer, trainX, train_de_processed, train_en_processed)
#evaluate_model(actual, predicted)

In [None]:
evaluate_model(actual, predicted)

In [None]:
test_actual, test_predicted = predict(model, en_tokenizer, testX, test_de_processed, test_en_processed)
evaluate_model(test_actual, test_predicted)