### This is seq2seq (encoder-decoder) code, written in the style of a machine translation model, that learns to generate review summaries from review text on the Amazon Food Reviews dataset; you could adapt it to any other translation-style dataset. Training in the cloud on at least one GPU is recommended. The encoder uses a pretrained GloVe embedding layer.

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import io
import json
import os
import threading

import numpy as np
import pandas as pd
import tensorflow as tf
from keras_preprocessing.text import tokenizer_from_json
from sklearn.model_selection import train_test_split

In [None]:
# Show the installed TensorFlow version (the notebook echoes the last expression).
tf.__version__

In [None]:
# Report how many GPU devices TensorFlow can see on this machine.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

## import dataset

### I'm using the Amazon Food Reviews dataset here

In [None]:
# Load the reviews CSV (replace the placeholder path with your own) and keep
# only the two columns used below: the short summary and the full review text.
train = pd.read_csv('~/your_long_path/reviews.csv')
train = train[['Summary','Text']]
train.head()

## getting word counts

In [None]:
# Count the spaces in each review as a cheap proxy for its word count
# (a text with n single spaces has roughly n + 1 words).
train['text_length'] = train['Text'].str.count(' ')
train['text_length'].describe()

In [None]:
# Same space-count proxy for the summaries.
train['summary_length'] = train['Summary'].str.count(' ')
train['summary_length'].describe()

In [None]:
# Preview the frame with the two new length columns.
train.head()

## bounding data lengths

In [None]:
# Drop outliers: keep only rows whose summary contains between 2 and 20
# spaces and whose review text contains at most 100 spaces.
within_bounds = (
    (train['summary_length'] >= 2)
    & (train['summary_length'] <= 20)
    & (train['text_length'] <= 100)
)
train = train[within_bounds].reset_index(drop=True)

In [None]:
# Check how many rows survived the length filters and look at a sample.
print(train.shape)
print(train.head())

## cleaning data and making and saving test set

In [None]:
# Lower-case the review text, then remove every character that is neither a
# word character nor whitespace.
# The pattern is a raw string and regex=True is passed explicitly: since
# pandas 2.0 Series.str.replace treats the pattern as a literal string by
# default, which would silently leave all punctuation in place.
train['text_lower'] = train['Text'].str.lower()
train['text_no_punctuation'] = train['text_lower'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Lower-case the summary, strip punctuation, and wrap it in "_start_" and
# "_end_" delimiters; these tell the decoder where a summary begins and ends.
# The pattern is a raw string and regex=True is passed explicitly: since
# pandas 2.0 Series.str.replace treats the pattern as a literal string by
# default, which would silently leave all punctuation in place.
train['summary_lower'] = train["Summary"].str.lower()
train['summary_no_punctuation'] = (
    '_start_' + ' '
    + train['summary_lower'].str.replace(r'[^\w\s]', '', regex=True)
    + ' ' + '_end_'
)

In [None]:
# Shuffle every row and renumber the index from zero.
train = train.sample(frac=1)
train = train.reset_index(drop=True)

# Hold out the first 100 shuffled rows as the test set; the rest is training data.
test, train = train.iloc[:100], train.iloc[100:]

# Save the held-out rows so they can be inspected outside the notebook.
test.to_csv('test_set.csv')

## playing with max features

In [None]:
# Vocabulary caps (passed as num_words to the tokenizers) and padded sequence
# lengths for the model: the *1 values apply to the review text, the *2 values
# to the summary.

max_features1 = 100000
maxlen1 = 100

max_features2 = 100000
maxlen2 = 20

## making tokenizers and saving them

In [None]:
# Fit the review-text tokenizer on the cleaned text, turn every review into a
# sequence of word indices, and pad the sequences to a fixed width of maxlen1.
review_texts = list(train['text_no_punctuation'].astype(str))

tok1 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features1)
tok1.fit_on_texts(review_texts)

tf_train_text = tf.keras.preprocessing.sequence.pad_sequences(
    tok1.texts_to_sequences(review_texts),
    maxlen=maxlen1,
)

In [None]:
# Save the text tokenizer so the exact same vocabulary can be used for scoring.
# to_json() already returns a JSON string, so the file ends up holding a
# JSON-encoded string; the loading cell mirrors this with json.load followed by
# tokenizer_from_json.

tokenizer1_json = tok1.to_json()
with io.open('tok1.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer1_json, ensure_ascii=False))

In [None]:
# The same processing has to be done for the summaries,
# using a second, separate tokenizer.

In [None]:
# Summary tokenizer. filters='*' replaces the default punctuation filter so
# that the underscores in the '_start_' / '_end_' markers are preserved.
tok2 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features2, filters = '*') 
tok2.fit_on_texts(list(train['summary_no_punctuation'].astype(str))) #fit to cleaned text
tf_train_summary = tok2.texts_to_sequences(list(train['summary_no_punctuation'].astype(str)))
# Pad at the end (padding='post') so each summary starts at position 0.
# NOTE(review): truncation is left at its default; a summary longer than
# maxlen2 tokens (markers included) may lose leading tokens such as
# '_start_' - confirm this is acceptable.
tf_train_summary = tf.keras.preprocessing.sequence.pad_sequences(tf_train_summary, maxlen=maxlen2, padding ='post') 

In [None]:
# Save the summary tokenizer, in the same format as tok1.json, for scoring.
tokenizer2_json = tok2.to_json()
with io.open('tok2.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer2_json, ensure_ascii=False))

## setting dimensions and getting the shapes

In [None]:
# Teacher forcing: the decoder input is the summary without its last position,
# and the decoder target is the same summary shifted one step ahead.
vectorized_summary = tf_train_summary
decoder_input_data, decoder_target_data = (
    vectorized_summary[:, :-1],
    vectorized_summary[:, 1:],
)

print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')

# The encoder simply consumes the padded review text; its second dimension is
# the fixed document length used for the encoder Input layer.
encoder_input_data = vectorized_text = tf_train_text
doc_length = encoder_input_data.shape[1]
print(f'Shape of encoder input: {encoder_input_data.shape}')

In [None]:
# Vocabulary sizes for the encoder and decoder embeddings. Tokenizer word
# indices start at 1, so "+ 1" leaves room for index 0 (the padding value).

vocab_size_encoder = len(tok1.word_index) + 1 
vocab_size_decoder = len(tok2.word_index) + 1

In [None]:
# One size shared by the embeddings and the GRU hidden state. It also selects
# which GloVe file is read (glove.6B.<latent_dim>d.txt), so it must match an
# available GloVe dimension.

latent_dim = 100

## GloVe embedding layer

In [None]:
# Preparing GloVe: read the pretrained vectors into a {word: vector} dict.

GLOVE_DIR = "/your_long_path/GloVe"
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(latent_dim))

embeddings_index = {}
# Open with an explicit UTF-8 encoding (the platform default may differ and
# fail on non-ASCII tokens) and inside a context manager so the file is closed
# even if parsing raises. io.open matches how the tokenizers are written.
with io.open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>".
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Build the encoder embedding weights: row i holds the GloVe vector of the
# word whose tokenizer index is i. Row 0 and the rows of words missing from
# GloVe stay all-zero.

n_rows = len(tok1.word_index) + 1
embedding_matrix = np.zeros((n_rows, latent_dim))
for token, row in tok1.word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

# model

In [None]:
########################
#### Encoder Model ####

# Encoder input: one padded review per row, doc_length token ids wide.
encoder_inputs = tf.keras.Input(shape=(doc_length,), name='Encoder-Input')

# GloVe embedding for the encoder: initialised from embedding_matrix and
# frozen (trainable=False) so the pretrained vectors are not updated.
x = tf.keras.layers.Embedding(vocab_size_encoder,
                              latent_dim,
                              name='Body-Word-Embedding',
                              weights=[embedding_matrix],
                              mask_zero=False,
                              trainable=False)(encoder_inputs)

# Batch normalization is used so that the distribution of the inputs
# to a specific layer doesn't change over time.
x = tf.keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)

# We do not need the encoder output sequence, just the final hidden state.
_, state_h = tf.keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Set the encoder as a separate entity so we can encode without decoding if desired.
encoder_model = tf.keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

# Calling the encoder model here makes it a nested layer ('Encoder-Model')
# of the full seq2seq model.
seq2seq_encoder_out = encoder_model(encoder_inputs)


########################
#### Decoder Model ####
# Variable-length input holding the (shifted) summary, for teacher forcing.
decoder_inputs = tf.keras.Input(shape=(None,), name='Decoder-Input')

# Embedding for the decoder; learned from scratch, not GloVe.
dec_emb = tf.keras.layers.Embedding(vocab_size_decoder,
                                    latent_dim,
                                    name='Decoder-Word-Embedding',
                                    mask_zero=False)(decoder_inputs)

# Batch normalization
dec_bn = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder GRU; it returns the full output sequence and its state.
decoder_gru = tf.keras.layers.GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
# The decoder "decodes" the encoder output: the encoder's final hidden state
# is used as the decoder's initial state.
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction: a softmax over the decoder vocabulary per time step.
decoder_dense = tf.keras.layers.Dense(vocab_size_decoder, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)


########################
#### Seq2Seq Model ####
seq2seq_Model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)


# Parallelize data on N GPUs if desired
#seq2seq_Model = tf.keras.utils.multi_gpu_model(seq2seq_Model, gpus=N)

# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated in TF 2.x and rejected by newer Keras releases.
# Targets are integer token ids, hence the sparse categorical loss.
seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001),
                      loss='sparse_categorical_crossentropy')

**Examine Model Architecture Summary**

In [None]:
# Print a layer-by-layer summary of the full seq2seq model.
seq2seq_Model.summary()

## Generator

In [None]:
# Compute the train/validation split sizes: 85% / 15% of the rows, each
# rounded down to a whole number of samples.

data_len = len(encoder_input_data)
val_split = int(np.floor(data_len*.15))
train_split = int(np.floor(data_len*.85))

In [None]:
# Separate the arrays into training and validation data.
# The first train_split rows are for training; everything after them is for
# validation. Slicing from train_split onward (instead of [-val_split:-1])
# keeps the final sample, which the old slice silently dropped, and cannot
# accidentally select the whole array when val_split is 0.

# The sparse categorical loss expects targets with a trailing axis of size 1.
decoder_target_3d = np.expand_dims(decoder_target_data, -1)

X_enc_train = encoder_input_data[:train_split]
X_dec_train = decoder_input_data[:train_split]
y_t_train = decoder_target_3d[:train_split]

X_enc_val = encoder_input_data[train_split:]
X_dec_val = decoder_input_data[train_split:]
y_t_val = decoder_target_3d[train_split:]

In [None]:
class generatorClass(tf.keras.utils.Sequence):
    """Keras Sequence that serves the seq2seq training data in batches.

    Each item is ``([encoder_batch, decoder_batch], target_batch)``, built by
    slicing the three arrays along their first axis.

    Args:
        X_enc: Encoder inputs (one row per sample).
        X_dec: Decoder inputs, aligned row-for-row with ``X_enc``.
        y_t: Decoder targets, aligned row-for-row with ``X_enc``.
        batch_size: Number of samples per batch; the last batch may be smaller.
    """

    def __init__(self, X_enc, X_dec, y_t, batch_size):
        # The base class is referenced through the already-imported `tf`
        # namespace; the bare name `Sequence` was never imported.
        super().__init__()
        self.X_enc = X_enc
        self.X_dec = X_dec
        self.y_t = y_t
        self.batch_size = batch_size
        # Serialises batch construction when several worker threads share
        # this instance.
        self.lock = threading.Lock()

    def __len__(self):
        """Return the number of batches per epoch (the last may be partial)."""
        return int(np.ceil(len(self.X_enc) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch number ``idx`` as ``([X_enc, X_dec], y)``."""
        with self.lock:
            start = idx * self.batch_size
            stop = (idx + 1) * self.batch_size
            batch_Xe = self.X_enc[start:stop]
            batch_Xd = self.X_dec[start:stop]
            batch_y = self.y_t[start:stop]
            batch_X = [batch_Xe, batch_Xd]

            return batch_X, batch_y

    def on_epoch_end(self):
        """Method called at the end of every epoch (nothing to do here)."""
        pass

# train model

In [None]:
# TensorBoard logging, one timestamped directory per run.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Checkpoints: write the weights whenever the monitored validation loss improves.
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath='/tmp/weights.{epoch:02d}-{val_loss:.2f}.hdf5', verbose=1, save_best_only=True)

# Early stopping is configured here but intentionally NOT passed to the
# callbacks list below; add `early_stop` to that list to enable it.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=10,
                                              verbose=1, mode='auto',
                                              restore_best_weights=True)

# Training configuration
epochs = 10
batch_size = 700
gen_instance = generatorClass(X_enc_train, X_dec_train, y_t_train, batch_size)
val_instance = generatorClass(X_enc_val, X_dec_val, y_t_val, batch_size)

# steps_per_epoch is the number of batches in the *training* generator.
# Deriving it from the full dataset made every epoch run past the end of the
# training Sequence and replay part of it.
history = seq2seq_Model.fit_generator(generator=gen_instance,
                                      epochs=epochs,
                                      max_queue_size=50,
                                      validation_data=val_instance,
                                      validation_freq=1,
                                      steps_per_epoch=len(gen_instance),
                                      callbacks=[tensorboard_callback, checkpointer],
                                      use_multiprocessing=True,
                                      workers=7)


# Save the final model (architecture + weights) for the scoring section.
seq2seq_Model.save('your_model.h5')

# scoring

In [None]:
# Load the trained seq2seq model back from the file written after training.
seq2seq_Model = tf.keras.models.load_model('your_model.h5')

# Show the model architecture
seq2seq_Model.summary()

In [None]:
# Restore both tokenizers from disk. Each file holds a JSON-encoded string,
# so json.load yields the string that tokenizer_from_json expects.

with open('tok1.json') as f:
    data = json.load(f)
tok1 = tokenizer_from_json(data)

with open('tok2.json') as f:
    data = json.load(f)
tok2 = tokenizer_from_json(data)

In [None]:
# Look at the first rows of the held-out test set.
test.head()

In [None]:
# Pick one cleaned review (index label 6) from the test set, wrapped in a
# one-element list because the tokenizer works on lists of texts.
test_text = [test['text_no_punctuation'][6]]
test_text

In [None]:
# Do NOT refit tok1 on the test text. The tokenizer loaded from tok1.json
# already holds the vocabulary the model was trained with; calling
# fit_on_texts again rebuilds word_index from the updated word counts, so
# token ids could shift or exceed the size of the encoder embedding.
# Words that were never seen in training are simply skipped by
# texts_to_sequences in the next cell.

In [None]:
# Convert the test text to word indices and pad it to the encoder's input
# length, in one step.

raw_tokenized = tf.keras.preprocessing.sequence.pad_sequences(
    tok1.texts_to_sequences(test_text),
    maxlen=maxlen1,
)

In [None]:
# Predict the encoder state of the new sentence.
# Take the encoder from the model that was just loaded from disk (it is the
# nested layer named 'Encoder-Model'), so scoring uses the saved weights and
# also works in a fresh session where the training cells were not run.
encoder_model = seq2seq_Model.get_layer('Encoder-Model')
body_encoding = encoder_model.predict(raw_tokenized)

In [None]:
# Recover the hidden/embedding size from the loaded model: the decoder
# embedding's output dimension.
decoder_embedding_layer = seq2seq_Model.get_layer('Decoder-Word-Embedding')
latent_dim = decoder_embedding_layer.output_dim

In [None]:
# Rebuild the decoder for step-by-step inference by reusing the trained
# layers of the loaded model (looked up by name).

decoder_inputs = seq2seq_Model.get_layer('Decoder-Input').input
dec_emb = seq2seq_Model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = seq2seq_Model.get_layer('Decoder-Batchnorm-1')(dec_emb)

# At inference time the GRU state is fed in explicitly, one step at a time.
gru_inference_state_input = tf.keras.Input(shape=(latent_dim,), name='hidden_state_input')

# Pass the state through `initial_state=`, exactly as the training graph
# does, rather than relying on implicit unpacking of a list input.
gru_out, gru_state_out = seq2seq_Model.get_layer('Decoder-GRU')(
    dec_bn, initial_state=gru_inference_state_input)

# Reconstruct the normalisation and dense (softmax) layers on top of the GRU.
dec_bn2 = seq2seq_Model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = seq2seq_Model.get_layer('Final-Output-Dense')(dec_bn2)

In [None]:
# Inference decoder: takes (previous token, previous GRU state) and returns
# (vocabulary probabilities, new GRU state).
decoder_model = tf.keras.Model([decoder_inputs, gru_inference_state_input],
                          [dense_out, gru_state_out])

In [None]:
# Optional: keep a reference to the encoder's state before decoding starts.
# The decoding loop rebinds body_encoding to the decoder state on every step,
# so this name is the only way to get the original encoding back afterwards.
original_body_encoding = body_encoding

In [None]:
# First decoder input: the index of the '_start_' token, shaped (1, 1).
state_value = np.array(tok2.word_index['_start_']).reshape(1, 1)

In [None]:
# Display the start-token input.
state_value

In [None]:
# Accumulator for the predicted words and the loop flag used by the decoding
# loop below. Re-run this cell before re-running the loop to reset both.
decoded_sentence = []
stop_condition = False

In [None]:
# Reverse lookup for the summary vocabulary: word index -> word.
vocabulary_inv = {index: word for word, index in tok2.word_index.items()}

In [None]:
# Display the index -> word mapping.
vocabulary_inv

In [None]:
# Greedy decoding: feed the previous token and state to the decoder, take the
# most probable next word, and repeat until '_end_' is predicted or maxlen2
# words have been collected.
while not stop_condition:
    preds, st = decoder_model.predict([state_value, body_encoding])

    # Indices 0 and 1 are excluded from the argmax, hence the "+ 2" offset
    # (presumably padding and the '_start_' marker - confirm against
    # tok2.word_index).
    pred_idx = 2 + np.argmax(preds[:, :, 2:])
    pred_word_str = vocabulary_inv[pred_idx]
    print(pred_word_str)

    reached_end = pred_word_str == '_end_'
    too_long = len(decoded_sentence) >= maxlen2
    if reached_end or too_long:
        stop_condition = True
        break

    decoded_sentence.append(pred_word_str)

    # Update the decoder for the next word: the new state and the word just
    # predicted become the next step's inputs.
    body_encoding = st
    state_value = np.array(pred_idx).reshape(1, 1)

In [None]:
# Compare with the original (cleaned, delimiter-wrapped) summary of the same
# test row that was decoded above.

print([test['summary_no_punctuation'][6]])