In [1]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 200)

Using TensorFlow backend.


In [2]:
def read_text(filename):
        # open the file
        file = open(filename, mode='rt', encoding='utf-8')
        
        # read all text
        text = file.read()
        file.close()
        return text

In [3]:

data=read_text('../input/deu.txt')

In [4]:
def to_lines(text):
      sents = text.strip().split('\n')
      sents = [i.split('\t') for i in sents]
      return sents

In [5]:
deu_eng = to_lines(data)

In [6]:
deu_eng[:5]

[['Hi.', 'Hallo!'],
 ['Hi.', 'Grüß Gott!'],
 ['Run!', 'Lauf!'],
 ['Wow!', 'Potzdonner!'],
 ['Wow!', 'Donnerwetter!']]

In [7]:
deu_eng = array(deu_eng)
deu_eng[:5]

array([['Hi.', 'Hallo!'],
       ['Hi.', 'Grüß Gott!'],
       ['Run!', 'Lauf!'],
       ['Wow!', 'Potzdonner!'],
       ['Wow!', 'Donnerwetter!']], dtype='<U537')

In [8]:
deu_eng = deu_eng[:50000,:]

In [9]:
deu_eng[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in deu_eng[:,0]]
deu_eng[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in deu_eng[:,1]]
deu_eng

array([['Hi', 'Hallo'],
       ['Hi', 'Grüß Gott'],
       ['Run', 'Lauf'],
       ...,
       ['The man died of cancer', 'Der Mann starb an Krebs'],
       ['The man lay motionless', 'Der Mann lag bewegungslos da'],
       ['The man must be insane', 'Der Mann muss geistesgestört sein']],
      dtype='<U537')

In [10]:
for i in range(len(deu_eng)):
    deu_eng[i,0] = deu_eng[i,0].lower()
    deu_eng[i,1] = deu_eng[i,1].lower()


In [11]:
deu_eng

array([['hi', 'hallo'],
       ['hi', 'grüß gott'],
       ['run', 'lauf'],
       ...,
       ['the man died of cancer', 'der mann starb an krebs'],
       ['the man lay motionless', 'der mann lag bewegungslos da'],
       ['the man must be insane', 'der mann muss geistesgestört sein']],
      dtype='<U537')

In [12]:
eng_l = []
deu_l = []

for i in deu_eng[:,0]:
      eng_l.append(len(i.split()))

for i in deu_eng[:,1]:
      deu_l.append(len(i.split()))

In [13]:
t  = Tokenizer()
fit_text = "The earth is an awesome place live"
a=t.fit_on_texts(fit_text)
test_text = "The earth is an great place live"
sequences = t.texts_to_sequences(test_text)
print(sequences)


[[3], [4], [1], [], [1], [2], [8], [3], [4], [], [5], [6], [], [2], [9], [], [], [8], [1], [2], [3], [], [13], [7], [2], [14], [1], [], [7], [5], [15], [1]]


In [14]:
def tokenization(lines):
      tokenizer = Tokenizer()
      tokenizer.fit_on_texts(lines)
      return tokenizer
eng_tokenizer = tokenization(deu_eng[:, 0])
    

In [15]:
eng_vocab_size = len(eng_tokenizer.word_index) + 1

eng_length = 8

In [16]:
eng_vocab_size

6352

In [17]:
deu_tokenizer = tokenization(deu_eng[:, 1])
deu_vocab_size = len(deu_tokenizer.word_index) + 1


In [18]:
deu_vocab_size

10678

In [19]:
eng_length = 8
deu_length=8

In [20]:
def encode_sequences(tokenizer, length, lines):
         # integer encode sequences
         seq = tokenizer.texts_to_sequences(lines)
         # pad sequences with 0 values
         seq = pad_sequences(seq, maxlen=length, padding='post')
         return seq

In [21]:
from sklearn.model_selection import train_test_split

train,test=train_test_split(deu_eng,test_size=0.2,random_state=12)

In [22]:
train.shape

(40000, 2)

In [23]:
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences(deu_tokenizer, deu_length, train[:, 1])


testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(deu_tokenizer, deu_length, test[:, 1])

In [24]:
def define_model(in_vocab,out_vocab, in_timesteps,out_timesteps,units):
      model = Sequential()
      model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
      model.add(LSTM(units))
      model.add(RepeatVector(out_timesteps))
      model.add(LSTM(units, return_sequences=True))
      model.add(Dense(out_vocab, activation='softmax'))
      return model

In [25]:
model = define_model(eng_vocab_size, deu_vocab_size, eng_length, deu_length, 512)

In [26]:
rms = optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

In [27]:
filename = 'model.h1.24_jan_19'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                    epochs=30, batch_size=512, validation_split = 0.2,callbacks=[checkpoint], 
                    verbose=1)

Train on 32000 samples, validate on 8000 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 3.39574, saving model to model.h1.24_jan_19
Epoch 2/30

Epoch 00002: val_loss improved from 3.39574 to 3.22432, saving model to model.h1.24_jan_19
Epoch 3/30

Epoch 00003: val_loss improved from 3.22432 to 3.08295, saving model to model.h1.24_jan_19
Epoch 4/30

Epoch 00004: val_loss improved from 3.08295 to 2.94408, saving model to model.h1.24_jan_19
Epoch 5/30

Epoch 00005: val_loss improved from 2.94408 to 2.85199, saving model to model.h1.24_jan_19
Epoch 6/30

Epoch 00006: val_loss improved from 2.85199 to 2.78314, saving model to model.h1.24_jan_19
Epoch 7/30

Epoch 00007: val_loss improved from 2.78314 to 2.71597, saving model to model.h1.24_jan_19
Epoch 8/30

Epoch 00008: val_loss improved from 2.71597 to 2.65235, saving model to model.h1.24_jan_19
Epoch 9/30

Epoch 00009: val_loss improved from 2.65235 to 2.58586, saving model to model.h1.24_jan_19
Epoch 10/30

Epoch 00010: va

In [28]:
model = load_model('model.h1.24_jan_19')
preds = model.predict_classes(testX.reshape((testX.shape[0],testX.shape[1])))

In [29]:
def get_word(n, tokenizer):
      for word, index in tokenizer.word_index.items():
          if index == n:
              return word
      return None

In [30]:
preds_text = []
for i in preds:
       temp = []
       for j in range(len(i)):
            t = get_word(i[j], deu_tokenizer)
            if j > 0:
                if (t == get_word(i[j-1], deu_tokenizer)) or (t == None):
                     temp.append('')
                else:
                     temp.append(t)
            else:
                   if(t == None):
                          temp.append('')
                   else:
                          temp.append(t) 

       preds_text.append(' '.join(temp))

In [33]:
pred_df = pd.DataFrame({'actual' : test[:,1], 'predicted' : preds_text})

In [34]:
pred_df.head(15)

Unnamed: 0,actual,predicted
0,er wollte reich sein,er wollte reich sein
1,ich liebe tom,ich liebe tom
2,lasst uns nach hause gehen,lass uns nach hause gehen
3,ich fahre für mein leben gern,ich mag sehr
4,das ist mein wörterbuch,das ist mein wörterbuch
5,hallo tom guten morgen,hallo tom hierher
6,warum ist sie so beliebt,warum ist es so
7,ich zeige euch mein zimmer,ich zeige dir mein zimmer
8,hat tom verschlafen,hat tom adoptiert
9,leiste nur weiterhin so gute arbeit,mach die zu
