### Attention-Based Neural Translation Model

In [1]:
import numpy as np

import tensorflow as tf
from nmt_utils import *

In [2]:
# Load the dataset together with its vocabularies.
m = 10_000  # number of examples requested from load_dataset
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 16695.93it/s]


In [3]:
# Peek at the first ten (source, target) pairs of the loaded dataset.

dataset[:10]

[('9 may 1998', '1998-05-09'),
 ('10.11.19', '2019-11-10'),
 ('9/10/70', '1970-09-10'),
 ('saturday april 28 1990', '1990-04-28'),
 ('thursday january 26 1995', '1995-01-26'),
 ('monday march 7 1983', '1983-03-07'),
 ('sunday may 22 1988', '1988-05-22'),
 ('08 jul 2008', '2008-07-08'),
 ('8 sep 1999', '1999-09-08'),
 ('thursday january 1 1981', '1981-01-01')]

In [4]:
# Fixed sequence lengths, so that every example ends up with the same shape.
Tx = 30  # maximum length of an input date
Ty = 10  # targets are always formatted as xxxx-xx-xx, i.e. exactly 10 characters
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

# X and Y represent each date as a sequence of character indices; Xoh and Yoh
# hold the same sequences with every index expanded into a one-hot vector
# along the depth (last) axis.

for caption, array in (
    ("X.shape:", X),
    ("Y.shape:", Y),
    ("Xoh.shape:", Xoh),
    ("Yoh.shape:", Yoh),
):
    print(caption, array.shape)

X.shape: (10000, 30)
Y.shape: (10000, 10)
Xoh.shape: (10000, 30, 37)
Yoh.shape: (10000, 10, 11)


In [5]:
# Show a single example at every stage of preprocessing: raw text, index
# sequences and one-hot encodings.

index = 5
human_readable, machine_readable = dataset[index]
print("Source date:", human_readable)
print("Target date:", machine_readable)

# Each stage pairs the encoded source with the matching encoded target.
stages = (
    (("\nSource after preprocessing (indices):\n", X),
     ("\nTarget after preprocessing (indices):\n", Y)),
    (("\nSource after preprocessing (one-hot):\n", Xoh),
     ("\nTarget after preprocessing (one-hot):\n", Yoh)),
)
for stage in stages:
    print('\n')
    for caption, encoded in stage:
        print(caption, encoded[index])


Source date: monday march 7 1983
Target date: 1983-03-07



Source after preprocessing (indices):
 [24 26 25 16 13 34  0 24 13 28 15 20  0 10  0  4 12 11  6 36 36 36 36 36
 36 36 36 36 36 36]

Target after preprocessing (indices):
 [ 2 10  9  4  0  1  4  0  1  8]



Source after preprocessing (one-hot):
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]

Target after preprocessing (one-hot):
 [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]


In [6]:
def softmax(x, axis=1):
    """Softmax activation function.

    # Arguments
        x : Tensor with at least 2 dimensions.
        axis: Integer, axis along which the softmax normalization is applied.
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    ndim = K.ndim(x)
    if ndim < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')

    # The backend softmax is called without an axis and therefore normalizes
    # the last axis. It is only a valid shortcut for a 2D tensor when the
    # requested axis *is* the last one; otherwise `axis` would be ignored.
    if ndim == 2 and axis in (1, -1):
        return K.softmax(x)

    # Generic path: subtract the per-axis maximum before exponentiating for
    # numerical stability, then normalize along the requested axis.
    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
    s = K.sum(e, axis=axis, keepdims=True)
    return e / s

In [30]:
# Vocabulary sizes are derived from the loaded vocabularies instead of being
# hard-coded, so they cannot drift from the one-hot depth used elsewhere in the
# notebook (which is also computed with len(human_vocab) / len(machine_vocab)).
len_human_vocab = len(human_vocab)
len_machine_vocab = len(machine_vocab)

n_s = 64 # number of units for the post-attention LSTM's hidden state "s"
n_a = 32 # number of units for the pre-attention, bi-directional LSTM's hidden state 'a' 

In [23]:
from tensorflow.keras import layers

class neural_translation_model(layers.Layer):
    """Builder for an attention-based sequence-to-sequence Keras model.

    The layers used by the attention step and by the decoder are instantiated
    once in ``__init__`` and reused at every output time step, so their
    weights are shared across steps. Call ``model()`` to obtain the assembled
    ``tf.keras.Model``.

    # Arguments
        Tx: Length of the (padded) input sequence.
        Ty: Length of the output sequence, i.e. number of decoding steps.
        n_a: Units of the pre-attention bi-directional LSTM hidden state 'a'.
        n_s: Units of the post-attention LSTM hidden state 's'.
        human_vocab_size: One-hot depth of the input characters.
        machine_vocab_size: Number of output classes per decoding step.
    """

    def __init__(self, Tx = 30, Ty = 10, n_a = 32, n_s = 64, human_vocab_size = len_human_vocab
                                                           , machine_vocab_size = len_machine_vocab):
        # Initialise the Keras base class before assigning any sub-layer as an
        # attribute, so that attribute tracking is set up properly.
        super().__init__()

        # Model hyperparameters
        self.Tx = Tx
        self.Ty = Ty
        self.n_a = n_a # number of units for the pre-attention, bi-directional LSTM's hidden state 'a'
        self.n_s = n_s # number of units for the post-attention LSTM's hidden state "s"
        self.human_vocab_size = human_vocab_size
        self.machine_vocab_size = machine_vocab_size

        # These layers are shared by all Ty decoding steps. Creating them once
        # here (instead of inside the loop) keeps a single set of weights.
        self.repeator = layers.RepeatVector(Tx)
        self.concatenator = layers.Concatenate(axis=-1)
        self.densor1 = layers.Dense(10, activation = "tanh")
        self.densor2 = layers.Dense(1, activation = "relu")
        # Custom softmax defined in this notebook; its default axis=1
        # normalizes over the Tx input positions.
        self.activator = layers.Activation(softmax, name='attention_weights')
        self.dotor = layers.Dot(axes = 1)

        self.post_activation_LSTM_cell = layers.LSTM(n_s, return_state = True)
        # Size the output layer from the constructor argument rather than from
        # the global vocabulary, so the parameter is actually honoured.
        self.output_layer = layers.Dense(machine_vocab_size, activation=softmax)

    def a_step_attention(self, a, s_prev):
        """Compute the context vector for one decoding step.

        # Arguments
            a: Encoder hidden states for every input position,
                shape (batch, Tx, 2 * n_a).
            s_prev: Previous hidden state of the post-attention LSTM,
                shape (batch, n_s).
        # Returns
            Context tensor of shape (batch, 1, 2 * n_a): the attention-weighted
            sum of `a` over the Tx input positions.
        """
        # Repeat s_prev Tx times so it can be paired with every encoder state.
        s_prev = self.repeator(s_prev)
        # Concatenate each encoder state with the decoder state on the feature axis.
        concatenation = self.concatenator([a, s_prev])

        # Small fully connected network that scores every input position.
        e = self.densor1(concatenation)   # intermediate energies
        energies = self.densor2(e)        # one energy per input position
        # Softmax over the input positions turns energies into attention weights.
        alpha = self.activator(energies)

        # context = sum over t_x of alpha(t_y, t_x) * a(t_x)
        context = self.dotor([alpha, a])

        return context

    def model(self):
        """Assemble and return the full `tf.keras.Model`.

        # Returns
            A model taking `[X, s0, c0]` (one-hot input sequence plus the
            initial hidden and cell state of the post-attention LSTM) and
            producing a list of Ty outputs, one per decoding step.
        """
        X  = layers.Input(shape = (self.Tx, self.human_vocab_size))
        s0 = layers.Input(shape = (self.n_s,), name ='s0')
        c0 = layers.Input(shape = (self.n_s,), name ='c0')

        # Running decoder state, updated at every step of the loop below.
        s = s0
        c = c0

        # Pre-attention encoder: one hidden state per input position.
        a = layers.Bidirectional(layers.LSTM(self.n_a, return_sequences = True))(X)

        outputs = []

        for t in range(self.Ty):
            # Attend over the encoder states using the current decoder state.
            context = self.a_step_attention(a, s)

            # With return_state=True the LSTM returns (output, hidden, cell);
            # the output is kept as the new hidden state `s`.
            s, _, c = self.post_activation_LSTM_cell(context, initial_state=[s, c])

            out = self.output_layer(s)

            outputs.append(out)

        model = tf.keras.Model(inputs = [X, s0, c0], outputs = outputs)

        return model
    
    
    
    

In [24]:
# Build the model from the notebook-level configuration instead of relying on
# the class defaults, so the model always matches the shapes of the data and of
# the initial states created from Tx, Ty, n_a and n_s.
attention_model = neural_translation_model(
    Tx=Tx,
    Ty=Ty,
    n_a=n_a,
    n_s=n_s,
    human_vocab_size=len_human_vocab,
    machine_vocab_size=len_machine_vocab,
).model()

In [25]:
# Print the layer-by-layer summary of the assembled model.
attention_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 30, 37)]     0           []                               
                                                                                                  
 s0 (InputLayer)                [(None, 64)]         0           []                               
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 30, 64)      17920       ['input_5[0][0]']                
 )                                                                                                
                                                                                                  
 repeat_vector_4 (RepeatVector)  (None, 30, 64)      0           ['s0[0][0]',                 

                                                                  'attention_weights[4][0]',      
                                                                  'bidirectional_1[0][0]',        
                                                                  'attention_weights[5][0]',      
                                                                  'bidirectional_1[0][0]',        
                                                                  'attention_weights[6][0]',      
                                                                  'bidirectional_1[0][0]',        
                                                                  'attention_weights[7][0]',      
                                                                  'bidirectional_1[0][0]',        
                                                                  'attention_weights[8][0]',      
                                                                  'bidirectional_1[0][0]',        
          

In [28]:
# Adam optimizer with explicit hyperparameters.
opt = tf.keras.optimizers.Adam(
    learning_rate=0.005,
    beta_1=0.9,
    beta_2=0.999,
    decay=0.01,
)
# The same loss and metric are applied to every one of the model's outputs.
attention_model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [31]:
# Zero initial hidden and cell states for the post-attention LSTM,
# one row per training example.
initial_state_shape = (m, n_s)
s0 = np.zeros(initial_state_shape)
c0 = np.zeros(initial_state_shape)
# The model has one output per decoding step, so split the one-hot targets
# along their step axis (axis 1) into a list of per-step arrays.
outputs = [Yoh[:, step] for step in range(Yoh.shape[1])]

In [38]:
# Train on the one-hot inputs plus zero initial states against the per-step targets.
attention_model.fit(
    [Xoh, s0, c0],
    outputs,
    epochs=15,
    batch_size=64,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15


Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1c9efbb6310>

In [40]:
EXAMPLES = ['3 May 1979', '5 April 09', '21 of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']
# One example is predicted at a time, so the initial states have batch size 1.
s00 = np.zeros((1, n_s))
c00 = np.zeros((1, n_s))
for example in EXAMPLES:
    # Encode the text as a sequence of character indices
    # (presumably padded/truncated to Tx by string_to_int -- confirm in nmt_utils).
    source = string_to_int(example, Tx, human_vocab)
    # One-hot encode the whole sequence at once -> (sequence length, vocab size).
    # This replaces the per-element map followed by two swapaxes calls that
    # cancelled each other out.
    source = to_categorical(source, num_classes=len(human_vocab))
    # Add the batch axis expected by the model.
    source = np.expand_dims(source, axis=0)
    prediction = attention_model.predict([source, s00, c00])
    # `prediction` holds one (1, n_classes) array per output step. Take the
    # most likely class of every step and flatten to one scalar index per
    # step, so int() is applied to scalars rather than to 1-element arrays.
    prediction = np.argmax(prediction, axis = -1).ravel()
    output = [inv_machine_vocab[int(i)] for i in prediction]
    print("source:", example)
    print("output:", ''.join(output),"\n")

source: 3 May 1979
output: 1979-05-03 

source: 5 April 09
output: 2099-04-04 

source: 21 of August 2016
output: 2016-08-01 

source: Tue 10 Jul 2007
output: 2007-07-10 

source: Saturday May 9 2018
output: 2018-05-09 

source: March 3 2001
output: 2001-03-03 

source: March 3rd 2001
output: 2001-03-30 

source: 1 March 2001
output: 2001-03-01 



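The results are pretty good for such a short training time. We can see that the model successfully extracts the month and maps it to the corresponding month number. It is also visible that the model partially fails when the day is written with an ordinal suffix (1st, 2nd, 3rd): for example, 'March 3rd 2001' is translated to 2001-03-30. A few other formats are mistranslated as well ('5 April 09' becomes 2099-04-04 and '21 of August 2016' becomes 2016-08-01). We could fix this by training for longer and by adding more examples of these formats to the training data.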