In [1]:
import tensorflow as tf 

# A GRU-based encoder that encodes the input sequence into a state vector while also returning the output for each time step

In [2]:
def create_encoder(vocab_size, embedding_dim, enc_units, max_sentence_length):
    """
    Build the encoder as a Keras functional model.

    Pipeline:

    Input (max_sentence_length token ids) -> Embedding -> GRU

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        enc_units: number of GRU units.
        max_sentence_length: length of the input sequences.

    Returns:
        A ``tf.keras.models.Model`` mapping the input sequence to
        ``[sequence_output, final_state]``: the GRU output for every time
        step and the GRU's last hidden state (to be handed to the decoder).
    """
    layers = tf.keras.layers

    token_ids = layers.Input((max_sentence_length, ))
    embedded = layers.Embedding(vocab_size, embedding_dim)(token_ids)

    # return_sequences gives the per-step outputs, return_state the final state.
    gru = layers.GRU(enc_units,
                     return_sequences=True,
                     return_state=True,
                     reset_after=True,
                     recurrent_initializer='glorot_uniform')
    sequence_output, final_state = gru(embedded)

    return tf.keras.models.Model(token_ids, [sequence_output, final_state])

In [3]:
class BahdanauAttention(tf.keras.layers.Layer):
    """
    Keras layer implementing the Bahdanau (additive) attention mechanism.

    More details at https://www.tensorflow.org/tutorials/text/nmt_with_attention
    """
    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)   # projects the values
        self.W2 = tf.keras.layers.Dense(units)   # projects the query
        self.V = tf.keras.layers.Dense(1)        # collapses each position to one score
        self.R = tf.keras.layers.Dropout(0.25)   # regularization applied to the raw scores

    def call(self, query, values):
        """
        Compute the context vector for `query` attending over `values`.

        Args:
            query: hidden state, shape (batch_size, hidden_size).
            values: per-step outputs, shape (batch_size, max_length, hidden_size).

        Returns:
            Context vector of shape (batch_size, hidden_size).
        """
        # (batch_size, hidden_size) -> (batch_size, 1, hidden_size) so the query
        # projection broadcasts against the values along the time axis.
        query_with_time_axis = tf.expand_dims(query, 1)

        # Shape before self.V: (batch_size, max_length, units);
        # after self.V: (batch_size, max_length, 1).
        projected = self.W1(values) + self.W2(query_with_time_axis)
        score = self.V(tf.nn.tanh(projected))

        # Dropout on the raw scores; no explicit `training` argument is passed here.
        score = self.R(score)

        # Normalize over the time axis: (batch_size, max_length, 1).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the values over time: (batch_size, hidden_size).
        weighted_values = attention_weights * values
        return tf.reduce_sum(weighted_values, axis=1)

In [4]:
def create_decoder(vocab_size, embedding_dim, enc_units, dec_units, max_sentence_length):
    r"""
    Build a single-step decoder as a Keras functional model. Network is as follows:

    Encoder Output                  Hidden State of Encoder                 Decoder input of previous word
              \                         /                                            |
                  Bahdanau Attention                                             Embedding
                        |                                                            |
                    Context Vector                                                  /
                                 \                                                 /
                                            Concatenated Input
                                                 |
                                             GRU Cell
                                                 |
                                            Dense Layer
                                                 |
                                Predicted probabilities on the vocabulary

    (The docstring is a raw string because the diagram contains backslashes that
    would otherwise be parsed as invalid escape sequences.)

    Args:
        vocab_size: size of the target vocabulary (embedding rows and output classes).
        embedding_dim: size of each target-token embedding vector.
        enc_units: feature size of the encoder outputs and of the hidden-state input.
        dec_units: number of units of the attention projections and of the GRU.
        max_sentence_length: number of time steps in the encoder output.

    Returns:
        A ``tf.keras.models.Model`` taking ``[enc_output, hidden, dec_input]`` and
        returning ``[final, state]``: the softmax distribution over the vocabulary
        and the GRU state after this step.

    Note:
        The GRU is called without an ``initial_state``; the ``hidden`` input is
        only used as the attention query.
    """
    # Attention over the encoder outputs, queried by the incoming hidden state.
    enc_output = tf.keras.layers.Input((max_sentence_length, enc_units, ))
    hidden = tf.keras.layers.Input((enc_units, ))
    context_vector = BahdanauAttention(dec_units)(hidden, enc_output)

    # Embed the single previous target token: (batch, 1) -> (batch, 1, embedding_dim).
    dec_input = tf.keras.layers.Input((1, ))
    dec_embed = tf.keras.layers.Embedding(vocab_size, embedding_dim)(dec_input)

    # Give the context vector a time axis of length 1 so it can be concatenated
    # with the embedded token along the feature axis.
    context_expand = tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, 1))(context_vector)
    full_context = tf.keras.layers.Concatenate(axis = -1)([context_expand, dec_embed])
    output, state = tf.keras.layers.GRU(dec_units,
                                        return_sequences=True,
                                        return_state=True,
                                        reset_after = True,
                                        recurrent_initializer='glorot_uniform')(full_context)

    # Drop the length-1 time axis, then project onto the vocabulary.
    flat_output = tf.keras.layers.Flatten()(output)
    final = tf.keras.layers.Dense(vocab_size, activation = 'softmax')(flat_output)

    decoder = tf.keras.models.Model([enc_output, hidden, dec_input], [final, state])
    return decoder

In [5]:
def loss_func(target, pred):
    """
    Total sparse categorical cross-entropy of `pred` against `target`.

    The per-element losses are summed (not averaged) and returned as one value.
    """
    per_element_loss = tf.keras.losses.sparse_categorical_crossentropy(target, pred)
    # Sum over all elements
    return tf.math.reduce_sum(per_element_loss)

In [7]:
# Making the necessary imports
import tensorflow as tf
import json
import numpy as np
import glob
import argparse
import os
from keras.preprocessing.text import tokenizer_from_json
# Switch TensorFlow to eager execution mode
tf.enable_eager_execution()

Using TensorFlow backend.


In [8]:
# Path to the JSON file holding the serialized tokenizer of the input language
inp_lang_path = './processed_data_en-fra/inp_lang.json'

In [10]:
# Load the input-language tokenizer.
# The JSON files contain the encoding corresponding to each word in the input and output lines (done in preprocess.py)
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(inp_lang_path, 'r') as f:
    json_data = json.load(f)
    inp_lang = tokenizer_from_json(json_data)

print('Input Language Loaded...')

Input Language Loaded...


In [11]:
# Path to the JSON file holding the serialized tokenizer of the target language
targ_lang_path = './processed_data_en-fra/targ_lang.json'

In [12]:
# Load the target-language tokenizer from its JSON file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(targ_lang_path, 'r') as f:
    json_data = json.load(f)
    targ_lang = tokenizer_from_json(json_data)

print('Target Language Loaded...')

Target Language Loaded...


In [13]:
# Path to the NumPy .npz archive with the preprocessed data arrays
data_npz = './processed_data_en-fra/data.npz'

In [14]:
# Open the archive; its arrays are read by key ('arr_0', 'arr_1') in the cells below
npzfile = np.load(data_npz)

In [17]:
# Number of entries in 'arr_0' (used below as the shuffle buffer size)
len(npzfile['arr_0'])

121935

In [18]:
# define hyperparameters
BUFFER_SIZE = len(npzfile['arr_0'])          # shuffle buffer covers the whole dataset
BATCH_SIZE = 64
# Number of full batches per epoch. The floor division had been turned into a
# comment ("#BATCH_SIZE"), which made this equal to the number of examples
# rather than the number of steps.
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 128
units = 256                                  # GRU / attention units for encoder and decoder
# +1 on top of the tokenizer's word_index size
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1
max_sentence_length = 15

In [19]:
# Build the tf.data input pipeline: slice the two arrays into (input, target)
# pairs, shuffle them, and group them into fixed-size batches (the last
# incomplete batch is dropped).
dataset = (
    tf.data.Dataset
    .from_tensor_slices((npzfile['arr_0'], npzfile['arr_1']))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
print('Loaded dataset into memory...')

Loaded dataset into memory...


In [21]:
# Build the encoder model with create_encoder and print its layer summary
encoder = create_encoder(vocab_inp_size, embedding_dim, units, max_sentence_length)
encoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 15)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 15, 128)           1898240   
_________________________________________________________________
gru (GRU)                    [(None, 15, 256), (None,  296448    
Total params: 2,194,688
Trainable params: 2,194,688
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Build the decoder model with create_decoder and print its layer summary.
# `units` is passed for both enc_units and dec_units.
decoder = create_decoder(vocab_tar_size, embedding_dim, units, units, max_sentence_length)
decoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 15, 256)]    0                                            
__________________________________________________________________________________________________
bahdanau_attention (BahdanauAtt (None, 256)          131841      input_3[0][0]                    
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
____________________________________________________________________________________________

In [23]:
# There are lots of parameters, so more training would yield better results.
# Adam optimizer shared by encoder and decoder (used in train_step); 1e-2 is its first positional argument.
optimizer = tf.keras.optimizers.Adam(1e-2)

In [34]:
# the training step function that performs the optimization
@tf.function
def train_step(inp, targ):
    """
    Run one teacher-forced optimization step on a single batch.

    The input batch is encoded once; the decoder is then called once per target
    position (from position 1 onward), each time fed the previous ground-truth
    token. The summed loss is differentiated with respect to all encoder and
    decoder variables and applied through the module-level `optimizer`.

    Args:
        inp: batch of input sequences.
        targ: batch of target sequences; column 0 is not predicted.

    Returns:
        The accumulated loss divided by the target sequence length.
    """
    # Firstly <start> is passed to the decoder to begin the process of teacher forcing
    # Check out this article https://machinelearningmastery.com/teacher-forcing-for-recurrent-neural-networks/
    start_id = targ_lang.word_index['<start>']

    with tf.GradientTape() as tape:
        # Encode the batch; the encoder's final state is the first decoder hidden state.
        enc_output, dec_hidden = encoder(inp)
        dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

        accumulated_loss = 0
        for step in range(1, targ.shape[1]):
            expected = targ[:, step]

            # The encoder output is passed at every step for the attention.
            predictions, dec_hidden = decoder([enc_output, dec_hidden, dec_input])
            accumulated_loss = accumulated_loss + loss_func(expected, predictions)

            # Teacher forcing: the ground-truth token becomes the next decoder input.
            dec_input = tf.expand_dims(expected, 1)

    # Gradients are taken of the un-normalized accumulated loss.
    trainable_vars = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(accumulated_loss, trainable_vars)
    optimizer.apply_gradients(zip(grads, trainable_vars))

    return accumulated_loss / int(targ.shape[1])

In [35]:
def best_weight(files):
    """
    Return the weight-file path whose name encodes the lowest loss.

    File names are expected to end in ``loss-<value>.h5`` (e.g.
    ``models_en-fra/encoder_epoch-51_loss-0.1234.h5``). The loss is parsed from
    the text between the last ``loss-`` and the ``.h5`` extension, so it works
    regardless of how many files there are or how many digits the loss has.
    The input list is not modified.

    Args:
        files: list of weight-file paths.

    Returns:
        The path with the smallest loss (the first one on ties).

    Raises:
        ValueError: if `files` is empty or a name holds no parsable loss.
    """
    def _loss_of(path):
        # Strip the extension, then take everything after the last 'loss-'.
        stem = path[:-3] if path.endswith('.h5') else path
        return float(stem.rsplit('loss-', 1)[-1])

    if not files:
        raise ValueError('best_weight() needs at least one weight file')
    return min(files, key=_loss_of)

In [36]:
def get_epoch(files):
    """
    Return the highest epoch number encoded in the given weight-file names.

    File names are expected to contain ``epoch-<n>_`` (e.g.
    ``models_en-fra/encoder_epoch-51_loss-0.1234.h5``). The number is parsed
    relative to the last ``epoch-`` marker rather than by counting underscores,
    so directory or run names containing ``_`` do not break the parsing.
    The input list is not modified.

    Args:
        files: list of weight-file paths.

    Returns:
        The largest epoch number as an int.

    Raises:
        ValueError: if `files` is empty or a name holds no parsable epoch.
    """
    def _epoch_of(path):
        # Text after the last 'epoch-' up to the next underscore is the epoch number.
        return int(path.rsplit('epoch-', 1)[-1].split('_', 1)[0])

    if not files:
        raise ValueError('get_epoch() needs at least one weight file')
    return max(_epoch_of(path) for path in files)

In [None]:
## Here you have the training step
def training(EPOCHS, name, resume = False):
    """
    Train the module-level encoder/decoder and periodically save their weights.

    Weights are written to the directory ``models_<name>`` as
    ``encoder_epoch-<n>_loss-<loss>.h5`` / ``decoder_epoch-<n>_loss-<loss>.h5``.

    Args:
        EPOCHS: epoch index at which training stops (exclusive upper bound of the loop).
        name: suffix of the model directory.
        resume: when True, load the lowest-loss encoder/decoder weights found in
            the model directory (via best_weight) and continue from the highest
            saved encoder epoch number (via get_epoch).

    Raises:
        FileNotFoundError: if `resume` is True but no saved weights are found.

    Note:
        Weights are only saved when ``epoch % 50 == 0``, so the last epochs of a
        run are not necessarily checkpointed.
    """
    show_output = 10   # print the batch loss every `show_output` batches
    model_dir = 'models_' + name
    start = 0

    if resume:
        enc_models = glob.glob(model_dir + '/encoder_*.h5')
        dec_models = glob.glob(model_dir + '/decoder_*.h5')
        if not enc_models or not dec_models:
            raise FileNotFoundError('No saved weights found in ' + model_dir + '; cannot resume')

        # Select each best checkpoint once and reuse it for loading and reporting.
        best_enc = best_weight(enc_models)
        best_dec = best_weight(dec_models)
        encoder.load_weights(best_enc)
        decoder.load_weights(best_dec)
        start = get_epoch(enc_models)
        print("Encoder Model: ", best_enc)
        print("Decoder Model: ", best_dec)
        print("Weights Loaded")
        print("Resuming from Epoch", start+1)

    # Create the output directory if needed (no error when it already exists).
    os.makedirs(model_dir, exist_ok=True)

    for epoch in range(start, EPOCHS):
        print('=' * 80)
        print('EPOCH: ', epoch+1)
        total_loss = 0

        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ)
            total_loss += batch_loss

            if batch % show_output == 0:
                print(str(batch/show_output) + '\t\t Loss: ' + str(batch_loss))

        # Convert to a plain float so it can be formatted directly, instead of
        # slicing characters out of the tensor's string representation.
        epoch_loss = float(total_loss / steps_per_epoch)
        print('Epoch {} Loss {:.4f}'.format(epoch + 1, epoch_loss))

        if epoch % 50 == 0:
            # after training save the weights
            r_loss = '{:.4f}'.format(epoch_loss)
            encoder.save_weights(model_dir + '/encoder_epoch-{}_loss-{}.h5'.format(str(epoch+1), r_loss))
            decoder.save_weights(model_dir + '/decoder_epoch-{}_loss-{}.h5'.format(str(epoch+1), r_loss))

In [None]:
# For a first run keep resume=False; pass resume=True to continue from previously saved weights
training(1000, 'en-fra', resume = False) 