# Here's a basic implementation of an **Encoder-Decoder LSTM** Network.
*English-Persian*
![Encoder-Decoder LSTM](https://drive.google.com/uc?id=1OjiGejKdTNyAnmaNzH-CMchifjsrUQkV)

# **Pre-Processing**

In [8]:
import pandas as pd
import numpy as np
import re, string
from sklearn.utils import shuffle

lines = pd.read_table('/content/drive/My Drive/Colab Notebooks/TxtSteganography/Autoencoder/pes-eng/pes.txt', names=['eng', 'per', 'attribution'])
print(lines.shape)
# lowercase English sentences
lines.eng = lines.eng.apply(lambda x: x.lower())
lines.per=lines.per.apply(lambda x: x.lower())

# Remove quotes
lines.eng=lines.eng.apply(lambda x: re.sub("'", '', x))
lines.per=lines.per.apply(lambda x: re.sub("'", '', x))
exclude = set(string.punctuation) # Set of all special characters
# print(exclude)

# Remove all the special characters
lines.eng=lines.eng.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines.per=lines.per.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(lines.per)

# Remove all numbers from text
digits = "0123456789"
remove_digits = str.maketrans('', '', digits) # third argument removes the chars that show up in this string
lines.eng=lines.eng.apply(lambda x: x.translate(remove_digits))
lines.per = lines.per.apply(lambda x: re.sub("[0123456789]", "", x)) #It's weird cause the training data contains Eng digits in Persian text

# this record contains digits!
# print(lines.per[1890])

# Remove extra spaces
lines.eng=lines.eng.apply(lambda x: x.strip())
lines.per=lines.per.apply(lambda x: x.strip())
lines.eng=lines.eng.apply(lambda x: re.sub(" +", " ", x))
lines.per=lines.per.apply(lambda x: re.sub(" +", " ", x))

# print(lines.per[1890])

# Add manual start and end tokens to target language
lines.per = lines.per.apply(lambda x : 'START_ '+ x + ' _END')

print(lines.per[1890])
print(type(lines.per))
print(lines.sample(10))


(2230, 3)
START_ دمای دقیق، درجه سلسیوس است _END
<class 'pandas.core.series.Series'>
                                      eng  ...                                        attribution
1287     think before you open your mouth  ...  CC-BY 2.0 (France) Attribution: tatoeba.org #4...
34                               terrific  ...  CC-BY 2.0 (France) Attribution: tatoeba.org #5...
847            i thanked tom for his time  ...  CC-BY 2.0 (France) Attribution: tatoeba.org #1...
1336    our landlord has lowered the rent  ...  CC-BY 2.0 (France) Attribution: tatoeba.org #7...
944          can you think of any reasons  ...  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1199      i saw him scolded by his mother  ...  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1427  i will finish the work in five days  ...  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
59                              you tried  ...  CC-BY 2.0 (France) Attribution: tatoeba.org #2...
767             how do you heat t

## **Create Vocabulary Sets**
NB: for both source and target languages we need to create two vocabulary sets

In [9]:
eng_words = set()
[eng_words.add(word) for eng in lines.eng for word in eng.split() if word not in eng_words]
# print(eng_words)
print('Number of English vocab:', len(eng_words))

per_words = set()
[per_words.add(word) for per in lines.per for word in per.split() if word not in per_words]
# print(per_words)
print('Number of Persian vocab:', len(per_words))

# Max length of source language (We need to compute cause Autoencoders work with fix length)
length_list = []
[length_list.append(len(l.split(' '))) for l in lines.eng]
max_length_eng = np.max(length_list)
print('Max length of English sentences:', max_length_eng)

# Max length of target language (We need to compute cause Autoencoders work with fix length)
length_list = []
[length_list.append(len(l.split(' '))) for l in lines.per]
max_length_per = np.max(length_list)
print('Max length of Persian sentences:', max_length_per)

input_words = sorted(list(eng_words))
target_words = sorted(list(per_words))

# Set encoder and decoder tokens based on source and target languges
number_encoder_tokens = len(eng_words)
number_decoder_tokens = len(per_words)
# Decoder zero padding
number_decoder_tokens +=1

# Create dictionary for both source and target language
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
print(input_token_index)
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])
# print(target_token_index)

# Reverse top dictionary, index-->token
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())
print(reverse_input_char_index)
# print(reverse_target_char_index)

lines = shuffle(lines)
lines.head(10)

Number of English vocab: 2493
Number of Persian vocab: 3676
Max length of English sentences: 31
Max length of Persian sentences: 30
{'a': 1, 'abandoned': 2, 'aberration': 3, 'able': 4, 'abnormality': 5, 'about': 6, 'above': 7, 'abroad': 8, 'abrupt': 9, 'abruptly': 10, 'absolute': 11, 'absolutely': 12, 'absorbed': 13, 'absorbs': 14, 'abstain': 15, 'abstained': 16, 'abuse': 17, 'accident': 18, 'accidents': 19, 'accomplished': 20, 'account': 21, 'accurate': 22, 'accuse': 23, 'accustomed': 24, 'acknowledge': 25, 'acquaintance': 26, 'acquaintances': 27, 'acquainted': 28, 'across': 29, 'act': 30, 'active': 31, 'activities': 32, 'actually': 33, 'adamant': 34, 'add': 35, 'added': 36, 'address': 37, 'admire': 38, 'admit': 39, 'adults': 40, 'adventure': 41, 'advertisement': 42, 'advertising': 43, 'advice': 44, 'affected': 45, 'affects': 46, 'afford': 47, 'afraid': 48, 'africa': 49, 'african': 50, 'after': 51, 'afternoon': 52, 'again': 53, 'against': 54, 'age': 55, 'agency': 56, 'ages': 57, 'ago'

Unnamed: 0,eng,per,attribution
1897,tom dialed marys number and got a busy signal,START_ تام با شماره ی ماری تماس گرفت و به بوق ...,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
1130,it was a difficult year for us,START_ سال سختی برای ما بود _END,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
47,im sorry,START_ من متاسفم _END,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
1945,a policeman asked the girls if the car was theirs,START_ پلیس از دختران پرسید که آیا ماشین مال آ...,CC-BY 2.0 (France) Attribution: tatoeba.org #8...
1515,this book contains forty photographs,START_ این کتاب حاوی چهل عکس است _END,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
705,i dont want to hurt you,START_ من نمی خواهم به تو آسیبی بزنم _END,CC-BY 2.0 (France) Attribution: tatoeba.org #7...
1451,there are a lot of tools in the box,START_ ابزارهای زیادی در جعبه وجود دارد _END,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
755,everybody seeks happiness,START_ همه به دنبال خوشبختی هستند _END,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
2007,no matter how rich he may be he is never conte...,START_ فرقی ندارد که او چقدر ثروتمند باشد، او ...,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
1633,i walk my dog in the park every morning,START_ من هر روز صبح سگم را برای قدم زدن به پا...,CC-BY 2.0 (France) Attribution: tatoeba.org #3...


## **Preparing Dataset (Splitting)**

In [10]:
from sklearn.model_selection import train_test_split
# reform the clean dataset by combing eng and per
# data = pd.concat([lines.eng, lines.per], axis=1)
X_train, X_test, y_train, y_test = train_test_split(lines.eng, lines.per, test_size=0.1)
print(X_train.shape)
print(X_test.shape)
# X_train resembelence to Train set and y_train is similar to Test set

(2007,)
(223,)


In [0]:
def generate_batch(X= X_train, y= y_train, batch_size= 128):
  while True:
    for j in range(0, len(X), batch_size): # initial, end, step of increament
      encoder_input_data = np.zeros((batch_size, max_length_eng), dtype='float32')
      # print(encoder_input_data.shape)
      # encode_output_data is discared here
      decoder_input_data = np.zeros((batch_size, max_length_per), dtype='float32')
      # print(decoder_input_data.shape)
      decoder_target_data = np.zeros((batch_size, max_length_per, number_decoder_tokens), dtype='float32')  # numpy 3D (# of matrix, rows, columns)
      # print(decoder_target_data.shape)
      # return X[j:j+batch_size]
      # return y[j:j+batch_size]
      # return zip(X[j:j+batch_size], y[j:j+batch_size])
      for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])): # zip shows an iterator of tuples where the first item in each passed iterator is paired together
        #('she lost her way and on top of that it began to rain', 'START_ او راهش را گم کرد و علاوه بر این، باران نیز شروع به باریدن کرد _END')
        # return (input_text, target_text)
        for t, word in enumerate(input_text.split()):
          encoder_input_data[i, t] = input_token_index[word]
          # result.append(word)
          # result.append(encoder_input_data[i, t])
        for t, word in enumerate(target_text.split()):
          if t<len(target_text.split())-1:
            # result.append(len(target_text.split()))
            decoder_input_data[i, t]= target_token_index[word]
            #result.append(t)
            # result.append(word)
            #result.append(decoder_input_data[i, t])
          if t>0:
            decoder_target_data[i, t - 1, target_token_index[word]] = 1.
    yield([encoder_input_data, decoder_input_data], decoder_target_data) #used yeild to produce a custom data generator object without saving result in memory

In [14]:
# np.set_printoptions(threshold=np.inf)
# pd.DataFrame(generate_batch()).to_csv('/content/drive/My Drive/Colab Notebooks/TxtSteganography/Autoencoder/batches.csv')
generate_batch()

<generator object generate_batch at 0x7fe363865360>

## **Training the Model With Tensorflow, Keras**

In [15]:
from keras.layers import Embedding, LSTM
from keras.models import Model
from keras.layers import Input, Dense

latent_dim = 50
batch_size = 128
# Encoder ---------------------------------
encoder_inputs = Input(shape=(None,))
print(encoder_inputs.shape)
# Keras Embedding for turning positive integeres into dense vectors of fixed size
encoder_emb = Embedding(number_encoder_tokens, latent_dim, mask_zero= True)(encoder_inputs)
print(encoder_emb.shape)
encoder_lstm = LSTM(latent_dim, return_state=True)
# Whether to return the last state in addition to the output. The returned elements of the states list are the hidden state and the cell state, respectively.
encoder_outputs, hidden_state, cell_state = encoder_lstm(encoder_emb)
print(encoder_outputs.shape, hidden_state.shape, cell_state.shape)
# output of the encoder should be discarded in our case
encoder_states = [hidden_state, cell_state]

# Decoder ---------------------------------
decoder_inputs = Input(shape=(None, ))
print(decoder_inputs.shape)
decoder_emb_layer = Embedding(number_decoder_tokens, latent_dim, mask_zero= True)
# teacher forcing
decoder_emb = decoder_emb_layer(decoder_inputs)
print(decoder_emb.shape)
decoder_lstm = LSTM(latent_dim, return_sequences= True, return_state= True)
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)
print(decoder_outputs.shape, _.shape, _.shape)

# Use a softmax to generate a probability distribution over the target vocabulary for each time step
decoder_dense = Dense(number_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
print(decoder_outputs.shape)

# Form a model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

(None, None)
(None, None, 50)
(None, 50) (None, 50) (None, 50)
(None, None)
(None, None, 50)
(None, None, 50) (None, 50) (None, 50)
(None, None, 3677)


## **Run Training**

In [16]:
from math import ceil
epochs = 50
train_samples = len(X_train)
validation_samples = len(X_test)
steps_per_epoch = ceil(train_samples / batch_size)
validation_steps = ceil(validation_samples / batch_size)

model.fit_generator(generator = generate_batch(),
                    steps_per_epoch=steps_per_epoch,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test),
                    validation_steps = validation_steps,
                    verbose=1)
model.save('/content/drive/My Drive/Colab Notebooks/TxtSteganography/Autoencoder/S2S.h5')
model.save_weights('/content/drive/My Drive/Colab Notebooks/TxtSteganography/Autoencoder/S2S_weights.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# **Load the weights if the application closes**

In [0]:
model.load_weights('/content/drive/My Drive/Colab Notebooks/TxtSteganography/Autoencoder/S2S_weights.h5')

## **Inference/ Predict**

In [0]:
# encode the input and feed on the encoder
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim, ))

decoder_state_input_c = Input(shape=(latent_dim, ))

# Pass c and h states of encoders to the decoder
decoder_states_input = [decoder_state_input_h, decoder_state_input_c]

decoder_emb2 = decoder_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_emb2, initial_state=decoder_states_input)
decoder_states2 = [state_h2, state_c2]

decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs]+ decoder_states_input,
    [decoder_outputs2] + decoder_states2
    )

In [0]:
def decode_sequence(input_seq):

  states_value = encoder_model.predict(input_seq)

  target_seq = np.zeros((1,1))

  target_seq[0, 0]= target_token_index['START_']

  stop_condition = False
  decoded_sentence = ''

  while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq]+ states_value)

    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = reverse_target_char_index[sampled_token_index]
    decoded_sentence += ' ' + sampled_char

    if(sampled_char == '_END' or len(decoded_sentence) > 50):
      stop_condition = True

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sampled_token_index


    states_value = [h, c]

  return decoded_sentence

In [0]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [22]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Persian Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Persian Translation:', decoded_sentence[:-4])

Input English sentence: i am not concerned with this
Actual Persian Translation:  من نگران این نیستم 
Predicted Persian Translation:  دمکراسی بدترین نوع حاکمیت است، البته به جز انواع 
