In [None]:
# Download the deu-eng.zip archive from manythings.org into the working
# directory (needs network access and the `wget` command).
!wget http://www.manythings.org/anki/deu-eng.zip

In [None]:
# Extract the downloaded archive into the working directory.
# A later cell reads 'deu.txt' — presumably one of the extracted files.
!unzip deu-eng.zip

In [None]:
import numpy as np

In [None]:
def read_text(filename):
    """Return the full contents of a UTF-8 text file as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

def to_lines(text):
    """Split tab-separated text into a list of per-line field lists.

    Leading and trailing whitespace of the whole text is stripped first; the
    remainder is split on newlines and each line is split on tab characters.

    Args:
        text: the raw text, one record per line.

    Returns:
        A list with one list of string fields per line.
    """
    return [line.split('\t') for line in text.strip().split('\n')]

In [None]:
# Read the raw file, split it into per-line field lists and hold the result
# as a NumPy array (one row per line of the file).
data = read_text('deu.txt')
deu_eng = np.array(to_lines(data))

In [None]:
# Peek at the first parsed row.
deu_eng[0]

In [None]:
# Number of rows parsed from the file.
len(deu_eng)

In [None]:
# Keep only the first 50,000 rows; all columns are retained.
deu_eng = deu_eng[:50000, :]

In [None]:
import string

# Remove ASCII punctuation (string.punctuation) from column 0.
# The translation table is built once and reused instead of being rebuilt
# for every sentence.
_punct_table = str.maketrans('', '', string.punctuation)
deu_eng[:, 0] = [s.translate(_punct_table) for s in deu_eng[:, 0]]

In [None]:
# Remove ASCII punctuation (string.punctuation) from column 1.
# The translation table is built once and reused instead of being rebuilt
# for every sentence. Non-ASCII punctuation is not covered by
# string.punctuation and is left in place.
_punct_table = str.maketrans('', '', string.punctuation)
deu_eng[:, 1] = [s.translate(_punct_table) for s in deu_eng[:, 1]]

In [None]:
# Display the array after punctuation removal.
deu_eng

In [None]:
# Lower-case the first two (sentence) columns in one vectorised call instead
# of a per-row Python loop; any further columns are left untouched.
deu_eng[:, :2] = np.char.lower(deu_eng[:, :2])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [None]:
# Whitespace-delimited word count of every sentence, per column.
eng_len = list(map(lambda sentence: len(sentence.split()), deu_eng[:, 0]))
deu_len = list(map(lambda sentence: len(sentence.split()), deu_eng[:, 1]))

In [None]:
# Histogram of sentence lengths (in words) for each of the two columns.
lengths_df = pd.DataFrame(dict(eng=eng_len, deu=deu_len))
lengths_df.hist(bins=30)
plt.show()

In [None]:
# Longest sentence (in words) in each column, one value per line.
print(max(eng_len), max(deu_len), sep='\n')

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit a word-level tokenizer on column 0 and derive its vocabulary size.
# NOTE(review): the tokenizer is fitted on every row, before the train/test
# split further down, so its vocabulary also contains words from held-out rows.
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(deu_eng[:, 0])
# +1 because word_index ids start at 1; id 0 is left free (it is the padding
# value used by pad_sequences).
eng_vocab_size = len(eng_tokenizer.word_index) + 1
print(eng_vocab_size)

In [None]:
# Fit a word-level tokenizer on column 1 and derive its vocabulary size.
# NOTE(review): the tokenizer is fitted on every row, before the train/test
# split further down, so its vocabulary also contains words from held-out rows.
deu_tokenizer = Tokenizer()
deu_tokenizer.fit_on_texts(deu_eng[:, 1])
# +1 because word_index ids start at 1; id 0 is left free (it is the padding
# value used by pad_sequences).
deu_vocab_size = len(deu_tokenizer.word_index) + 1
print(deu_vocab_size)

In [None]:
# Fixed sequence length (in tokens) used when encoding sentences: shorter
# sequences are padded and longer ones truncated by pad_sequences.
# NOTE(review): if the maxima printed above exceed 8, longer sentences are
# truncated (pad_sequences truncates from the front by default) — confirm
# this is intended.
eng_max_length = 8
deu_max_length = 8

In [None]:
def encode_sequences(tokenizer, length, lines):
    """Convert sentences to fixed-length integer id sequences.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: number of ids per output row (``maxlen`` for ``pad_sequences``).
        lines: the sentences to encode.

    Returns:
        The result of ``pad_sequences`` on the encoded sentences, padded with
        ``padding='post'``; every other option keeps its default.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
train, test = train_test_split(deu_eng, test_size = 0.2, random_state = 12)

In [None]:
# Model inputs: column 1 of the training rows, encoded with deu_tokenizer.
Xtrain = encode_sequences(deu_tokenizer,deu_max_length, train[:, 1])

# Model targets: column 0 of the training rows, encoded with eng_tokenizer.
Ytrain = encode_sequences(eng_tokenizer,eng_max_length, train[:, 0])

print(Xtrain.shape, Ytrain.shape)

In [None]:
# First encoded input sequence (token ids followed by zero padding, if any).
print(Xtrain[0])

In [None]:
# First encoded target sequence (token ids followed by zero padding, if any).
print(Ytrain[0])

In [None]:
# Encode the held-out rows the same way as the training rows:
# column 1 -> inputs, column 0 -> targets.
Xtest = encode_sequences(deu_tokenizer,deu_max_length, test[:, 1])

Ytest = encode_sequences(eng_tokenizer,eng_max_length, test[:, 0])

In [None]:
# Shapes of the encoded test inputs and targets.
print(Xtest.shape, Ytest.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras import optimizers
from keras.callbacks import ModelCheckpoint

In [None]:
# Encoder-decoder stack:
#   Embedding    - maps input token ids to 512-d vectors; id 0 is masked
#   LSTM         - returns only its final output (return_sequences not set)
#   RepeatVector - repeats that vector eng_max_length times
#   LSTM         - returns an output for every repeated step
#   Dense        - softmax over eng_vocab_size classes at each step
model = Sequential([
    Embedding(deu_vocab_size, 512, input_length=deu_max_length, mask_zero=True),
    LSTM(512),
    RepeatVector(eng_max_length),
    LSTM(512, return_sequences=True),
    Dense(eng_vocab_size, activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# RMSprop optimizer with a learning rate of 0.001.
rms = optimizers.RMSprop(learning_rate=0.001)

In [None]:
# The sparse variant of categorical cross-entropy takes integer class ids as
# targets rather than one-hot vectors.
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [None]:
# Checkpoint callback: save the model to `filename` only when the monitored
# val_loss improves (mode='min', save_best_only=True).
# NOTE(review): '.h1' is not a standard Keras suffix ('.h5' / '.keras' are) —
# possibly a typo; confirm the intended save format.
filename = 'model.h1'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', mode='min', save_best_only=True, verbose=1)

In [None]:
# Train for 30 epochs, holding out 20% of the training arrays for validation
# and checkpointing through the `checkpoint` callback.
# The targets get a trailing axis of size 1 (one integer class id per step).
# The shape is derived from Ytrain itself rather than hard-coded, so the cell
# keeps working when the sample count or sequence length changes.
history = model.fit(
    Xtrain,
    Ytrain.reshape(Ytrain.shape[0], Ytrain.shape[1], 1),
    epochs=30,
    batch_size=512,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=1,
)

In [None]:
# Training vs. validation loss per epoch, drawn on the same axes.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.legend(['train', 'validation'])
plt.show()

In [None]:
from keras.models import load_model

In [None]:
# Replace the in-memory model with the one stored at `filename`, i.e. the
# checkpoint written during training.
model = load_model(filename)

In [None]:
# Shape of the encoded test inputs about to be fed to the model.
print(Xtest.shape)

In [None]:
# `Sequential.predict_classes` was removed from Keras (TensorFlow 2.6+).
# Taking the argmax over the last (vocabulary) axis of the softmax output
# yields the same class ids and works on old and new versions alike.
predictions = np.argmax(model.predict(Xtest), axis=-1)

# Reverse lookup table: token id -> English word.
dict_eng = {token_id: word for word, token_id in eng_tokenizer.word_index.items()}

In [None]:
# Turn every predicted id sequence back into a sentence.
# Ids <= 0 are treated as padding, and ids missing from dict_eng have no word;
# both are skipped so the joined text carries no stray spaces and join() never
# receives None.
eng_preds = []

for token_ids in predictions:
    words = [dict_eng[t] for t in token_ids if t > 0 and t in dict_eng]
    eng_preds.append(' '.join(words))

In [None]:
# Side-by-side table: column 0 of the test rows next to the decoded model
# predictions; show the first five rows.
pred_df = pd.DataFrame({'actual': test[:, 0], 'predicted': eng_preds})
pred_df.head()

In [None]:
# First 20 actual/predicted pairs.
pred_df.head(20)

In [None]:
# Last 20 actual/predicted pairs.
pred_df.tail(20)

In [None]:
# 15 randomly chosen pairs; no random_state is given, so the selection
# differs from run to run.
pred_df.sample(15)