In [1]:
import math
import numpy as np
import tensorflow as tf
import tensorflow.contrib.seq2seq as seq2seq

from tensorflow.python.ops.rnn_cell import GRUCell
from tensorflow.python.ops.rnn_cell import LSTMCell
from tensorflow.python.ops.rnn_cell import MultiRNNCell
from tensorflow.python.ops.rnn_cell import DropoutWrapper, ResidualWrapper

from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.layers.core import Dense
from tensorflow.python.util import nest


from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder

from preprocess import *
from loading_util import *

  from ._conv import register_converters as _register_converters


In [2]:
#Resetter
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [3]:
#embedding parameters
embedding_size = 50

#data parameters
eMax_allowed_length = 100
dMax_allowed_length = 20

In [4]:
#Fetching data
#default directory: 'data/data_10.csv'
X,Y= read_csv()

In [5]:
#Fetching glove vectors
#default directory: "./glove.6B.50d.txt"
embedding_size = 50
wi,iw,wv = read_glove_vecs()

In [6]:
#Adding extra tokens to glove dictionary
go_index,eos_index,unk_index = add_extra_to_dict(wi,iw,wv,embedding_size)
emb = map_dict_to_list(iw,wv)

In [7]:
#preprocessing data
#Mapping each word in a sentence to its glove index
eInput,eLengths = fit_encoder_text(data= X[1:],word_to_index = wi,max_allowed_seq_length = eMax_allowed_length)
dInput,dOutput,dLengths = fit_decoder_text(data= Y[1:],word_to_index = wi,max_allowed_seq_length = dMax_allowed_length)
dLengths

[8, 19, 13, 18, 11, 18]

In [8]:
class Seq2SeqModel():
    def __init__(self,config,mode):
        assert mode.lower() in ['train','decode']
        
        self.mode = mode.lower()
        
        #num_encoder_symbols and num_decoder_symbols
        self.encoder_vocab_size = 400003
        self.decoder_vocab_size = 400003

        
        self.config = config
        
        
        self.cell_type = config['cell_type']
        self.hidden_units = config['hidden_units']
        self.depth = config['depth']
        self.attention_type = config['attention_type']
        self.embedding_size = config['embedding_size']
        
        self.use_residual = config['use_residual']
        self.attn_input_feeding = config['attn_input_feeding']
        self.use_dropout = config['use_dropout']
        self.keep_prob = 1.0 - config['dropout_rate']
        
        self.optimizer = config['optimizer']
        self.learning_rate = config['learning_rate']
        self.max_gradient_norm = config['max_gradient_norm']
        self.global_step = tf.Variable(0, trainable = False, name = 'global_step')
        self.global_epoch_step = tf.Variable(0,trainable=False, name = "global_epoch_step")
        self.global_epoch_step_op= tf.assign(self.global_epoch_step,self.global_epoch_step+1)
        
        self.dtype = tf.float16 if config['use_float16'] else tf.float32
        self.keep_prob_placeholder = tf.placeholder(self.dtype, shape=[], name = 'keep_prob')
        
        self.use_beamsearch_decode = False
        if self.mode == 'decode':
            self.beam_width = config['beam_width']
            self.use_beamsearch_decode = True if self.beam_width > 1 else False
            self.max_decode_step = config['max_decode_step']
        
        self.build_model()
    
    def build_model(self):
            print('building model..')

            #building encoder and decoder networks
            self.init_placeholders()
            '''
            self.build_encoder()
            self.build_decoder()
            self.summary_op = tf.summary.merge_all()
            '''    
    def init_placeholders(self):
            #encoder inputs: [batch_size, max_time_steps]
            self.encoder_inputs = tf.placeholder(dtype = tf.int32, shape = (None,None), name = 'encoder_inputs')
            #encoder_inputs_length: [batch_size]
            self.encoder_inputs_length = tf.placeholder(dtype=tf.int32, shape=(None,) , name = 'encoder_inputs_length')
            
            #get dynamic batch_size
            self.batch_size = tf.shape(self.encoder_inputs)[0]
            
            if(self.mode=='train'):
                
                #decoder_inputs: [batch_size,max_time_steps]
                self.decoder_inputs = tf.placeholder(dtype=tf.int32,shape=(None,None), name ='decoder_inputs')
                #decoder_inputs_length: [batch_size]
                self.decoder_inputs_length = tf.placeholder(dtype=tf.int32, shape=(None,), name='decoder_inputs_length')
                
                self.decoder_targets = tf.placeholder(dtype=tf.int32,shape=(None,None), name ='decoder_targets')
                
                '''
                #No need, already preprocessed
                decoder_start_token=tf.ones(shape=[self.batch_size,1], dtype=tf.int32)*data_utils.start_token
                
                decoder_end_token=tf.ones(shape=[self.batch_size,1], dtype=tf.int32)*data_utils.end_token
                '''
                
    def build_single_cell(self):
        cell_type = LSTMCell
        if(self.cell_type.lower() == 'gru'):
            cell_type = GRUCell
        cell = cell_type(self.hidden_units)
        
        if self.use_dropout:
            cell = DropoutWrapper(cell,dtype=self.dtype,
                                 output_keep_prob = self.keep_prob_placeholder)
            
        if self.use_residual:
            cell = ResidualWrapper(cell)
            
        return cell

    def build_encoder_cell (self):
        return MultiRNNCell([self.build_single_cell() for i in range(self.depth)])
    
    def build_decoder_cell(self):
        encoder_outputs = self.encoder_outputs
        encoder_last_state = self.encoder_last_state
        encoder_inputs_length = self.encoder_inputs_length
        
        if self.use_beamsearch_decode:
            print('using beamsearch..')
            encoder_outputs = seq2seq.tile_batch(self.encoder_outputs,
                                                 multiplier=self.beam_width)
            encoder_last_state = nest.map_structure( lambda s: seq2seq.tile_batch(s,self.beam_width),
                                                   self.encoder_last_state)
            encoder_inputs_length = seq2seq.tile_batch(self.encoder_inputs_length,
                                                       multiplier=self.beam_width)
            
            
        #Building attention mechanism: Default Bahdanau
        #'Bahdanau' style attention
        self.attention_mechanism = attention_wrapper.BahdanauAttention(
        num_units=self.hidden_units, memory=encoder_outputs,
        memory_sequence_length=encoder_inputs_length,
        name='BahdanauAttention')
        
        # 'Luong' style attention:
        if self.attention_type.lower() == 'luong':
            self.attention_mechanism = attention_wrapper.LuongAttention(
            num_units = self.hidden_units, memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length,
            name='LuongAttention')
                
        #Building decoder_cell
        self.decoder_cell_list = [self.build_single_cell() for i in range(self.depth)]
        decoder_initial_state = encoder_last_state
        
        def attn_decoder_input_fn(inputs,attention):
            if not self.attn_input_feeding:
                return inputs
            
            _input_layer = Dense(self.hidden_units,dtype = self.dtype,
                                name = 'attn_input_feeding')
            return _input_layer(array_ops.concat([inputs,attention],-1))
        
        self.decoder_cell_list[-1] = attention_wrapper.AttentionWrapper(
        cell = self.decoder_cell_list[-1],
        attention_mechanism=self.attention_mechanism,
        attention_layer_size=self.hidden_units,
        cell_input_fn=attn_decoder_input_fn,
        initial_cell_state=encoder_last_state[-1],
        alignment_history=False,
        name='Attention_Wrapper')
        
        # Encoder last state must be compatible with AttentionWrapper
        #Attentionwrapper.zero_state is used for the purpose
        
        batch_size = self.batch_size if not self.use_beamsearch_decode else self.batch_size*self.beam_width
        initial_state = [state for state in encoder_last_state]
        
        initial_state[-1]= self.decoder_cell_list[-1].zero_state(
        batch_size = batch_size, dtype=self.dtype)
        decoder_initial_state = tuple(initial_state)
        
        return MultiRNNCell(self.decoder_cell_list),decoder_initial_state
        
    
    def init_optimizer(self):
        print("Setting optimizer..")
        #Gradients and SGD update operaton for training the model
        trainable_params = tf.trainable_variables()
        if self.optimizer.lower() == 'adadelta':
            self.opt = tf.train.AdamOptimizer(learning_rate = self.learning_rate)
        elif self.optimizer.lower() == 'adam':
            self.opt = tf.train.AdamOptimizer(learning_rate = self.learning_rate)
        elif self.optimizer.lower() == 'rmsprop':
            self.opt = tf.train.RMSPropOptimizer(learning_rate = self.learning_rate)
        else:
            self.opt = tf.train.GradientDescentOptimizer(learning_rate = self.learning_rate)
            
        
        #Compute gradients of loss w.r.t all trainable variables
        gradients = tf.gradients(self.loss,trainable_params)
        
        #Clip gradients of loss w.r.t all trainable variables
        clip_gradients,_ = tf.clip_by_global_norm(gradients,self.max_gradient_norm)
        
        #Update the model
        self.updates = self.opt.apply_gradients(zip(clip_gradients,trainable_params),
                                                global_step = self.global_step)
        
    def save(self,sess,path,var_list=None,global_step=None):
        saver = tf.train.Saver(var_list)
        
        save_path = saver.save(sess,save_path=path,global_step=step)
        print('model saved at ',save_path)
        
    def restore(self,sess,path,var_list=None):
        saver = tf.train.Saver(var_list)
        saver.restore(sess, save_path = path)
        print('model restored from ',path)
    
    def build_encoder(self):
        print('Building Encoder..')
        with tf.variable_scope('encoder'):
            self.encoder_cell = self.build_encoder_cell()
            
            #Instantiating pretrained embeddings
            embedding_variable = tf.Variable(tf.constant(0.0, shape = [self.encoder_vocab_size, embedding_size]),trainable = False, name = 'embedding')
                           
            self.encoder_embedding_placeholder = tf.placeholder(tf.float32, shape=[self.encoder_vocab_size,embedding_size], name = 'embedding_placeholder' )
            self.encoder_embeddings = embedding_variable.assign(self.encoder_embedding_placeholder)
            self.encoder_inputs_embedded=tf.nn.embedding_lookup(self.encoder_embeddings,self.encoder_inputs)
            
    
            #instantiating dense layer
            input_layer = Dense(self.hidden_units, dtype = self.dtype, name = 'input_projection')
            #passing the embedding through dense layer
            self.encoder_inputs_embedded = input_layer(self.encoder_inputs_embedded)
            
            #Encode input sequences into context vectors
            #encoder_outputs: [batch_size, max_time_step, cell_output_size]
            #encoder_state: [batch_size,cell_output_size]
            self.encoder_outputs, self.encoder_last_state = tf.nn.dynamic_rnn(cell = self.encoder_cell,
                                                                               inputs=self.encoder_inputs_embedded,
                                                                               sequence_length=self.encoder_inputs_length,
                                                                               dtype=self.dtype,
                                                                               time_major=False)
            
            '''
            init = tf.global_variables_initializer()
            with tf.Session() as sess:
                sess.run(init)
                enc_outputs,enc_laststate=sess.run([self.encoder_outputs,self.encoder_last_state], 
                                                   feed_dict={self.encoder_embedding_placeholder:emb ,
                                                              self.encoder_inputs:eInput, 
                                                              self.encoder_inputs_length: eLengths })
                print('encoder Outputs:',enc_outputs.shape)
                print(enc_outputs)
                print()
                print('Encoder last state:',len(enc_laststate))
                print(enc_laststate)
            '''
        

    def build_decoder(self):
        print('Building decoder and attention...')
        with tf.variable_scope('decoder'):
                
            #Recheck this code
            self.decoder_cell,self.decoder_initial_state = self.build_decoder_cell()
            
            #Instantiating pretrained embeddings
            embedding_variable = tf.Variable(tf.constant(0.0, shape = [self.decoder_vocab_size, embedding_size]),trainable = False, name = 'embedding')

            self.decoder_embedding_placeholder = tf.placeholder(tf.float32, shape=[self.decoder_vocab_size,embedding_size], name = 'embedding_placeholder' )
            self.decoder_embeddings = embedding_variable.assign(self.decoder_embedding_placeholder)

                
            #instantiating dense layer --> DOUBT
            input_layer = Dense(self.hidden_units, dtype = self.dtype, name = 'input_projection')
                
            #Output projection layer to convert cell outputs to logits --> DOUBT
            output_layer = Dense(self.decoder_vocab_size,name = "output_projection")
                
            if self.mode == 'train':
                #decoder_inputs_embedded: [batch_size,max_time_step,embedding_size]
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(self.decoder_embeddings,
                                                                           self.decoder_inputs)
                    
                #Embedded inputs going through projection layer
                self.decoder_inputs_embedded=input_layer(self.decoder_inputs_embedded)
                    
                #Helper to feed inputs for training: read inputs from dense ground truth vectors
                training_helper = seq2seq.TrainingHelper(inputs = self.decoder_inputs_embedded,
                                                            sequence_length=self.decoder_inputs_length,
                                                            time_major=False,
                                                            name='training_helper')
                training_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                           helper = training_helper,
                                                           initial_state = self.decoder_initial_state,
                                                           output_layer = output_layer)
                                                           #output_layer = output_layer
                    
                #Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
                
                # decoder_outputs_train: BasicDecoderOutput
                #                        namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_train.rnn_output: [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False
                #                                   [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True
                # decoder_outputs_train.sample_id: [batch_size], tf.int32
                
                (self.decoder_outputs_train, self.decoder_last_state_train,
                self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(decoder=training_decoder,
                                                                            output_time_major=False,
                                                                            impute_finished=True,
                                                                            maximum_iterations=max_decoder_length))
                
                    

                
    
                    
                
                # More efficient to do the projection on the batch-time-concatenated tensor
                # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
                # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
                self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output) 
                # Use argmax to extract decoder symbols to emit
                self.decoder_pred_train = tf.argmax(self.decoder_logits_train, axis=-1,
                                                        name='decoder_pred_train')
                    
                # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
                masks = tf.sequence_mask(lengths=self.decoder_inputs_length, 
                                         maxlen=max_decoder_length, dtype=self.dtype, name='masks')

                # Computes per word average cross-entropy over a batch
                # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
                self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train, 
                                                  targets=self.decoder_targets,
                                                  weights=masks,
                                                  average_across_timesteps=True,
                                                  average_across_batch=True,)
                # Training summary for the current batch_loss
                    
                # Training summary for the current batch_loss
                tf.summary.scalar('loss', self.loss)

                # Contruct graphs for minimizing loss
                self.init_optimizer()
                
                #return self.decoder_logits_train,self.decoder_pred_train
                #The above return can be removed from comment to test.
                
            
            #When decoding. The output of every time step will go as an input
            #to the next time step. Similar to a language model.
            elif self.mode == 'decode':
                
                #Must be of the size [batch_size,] --> int32 vector
                start_tokens = tf.ones([self.batch_size,],tf.int32)*go_index
                #Must be scalar to be passed to greedyEmbeddingHelper
                end_token = eos_index
                
                def embed_and_input_proj(inputs):
                    return input_layer(tf.nn.embedding_lookup(self.decoder_embeddings,inputs))
                
                
                if not self.use_beamsearch_decode:
                    #Helper to feed inputs for greedy decoding : Uses argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=start_tokens,
                                                                   end_token=end_token,
                                                                   embedding=embed_and_input_proj)
                    
                    #Basic decoder performs greedy decoding at each time step
                    print("Building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                            helper = decoding_helper,
                                                            initial_state=self.decoder_initial_state,
                                                            output_layer=output_layer)
                else:
                    #Less greedy approach since we see outputs from a few paths.
                    print("building beamsearch decoder..")
                    inference_decoder = beam_search_decoder.BeamSearchDecoder(cell=self.decoder_cell,
                                                                             embedding=embed_and_input_proj,
                                                                             start_tokens=start_tokens,
                                                                             end_token=end_token,
                                                                             initial_state=self.decoder_initial_state,
                                                                             beam_width=self.beam_width,
                                                                             output_layer=output_layer)
                    
                (self.decoder_outputs_decode, self.decoder_last_state_decode,
                self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=False,
                #impute_finished=True, #Could be an error
                maximum_iterations=self.max_decode_step))
                
                if not self.use_beamsearch_decode:
                    self.decoder_pred_decode = tf.expand_dims(self.decoder_outputs_decode.sample_id,-1)
                else:
                    self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
                    
                return self.decoder_pred_decode
                
            

In [9]:
#Testing Seq2Seq
reset_graph()

config ={'cell_type': 'lstm',
         'hidden_units': 64 ,
         'depth': 2,
         'attention_type': 'bahdanou',
          'embedding_size': 50,
           'use_residual': True,
          'attn_input_feeding': False ,
           'use_dropout': True,
        'dropout_rate' : 0.3,
        'optimizer' : 'Adam',
        'learning_rate' : 0.001,
        'max_gradient_norm': 1.0,
        'use_float16': False,
        'beam_width': 3,
        'max_decode_step': 18 }



phase = 'decode'
obj = Seq2SeqModel(config,phase)
obj.build_model()
obj.build_encoder()

if(phase=='train'):
    dec_logits,dec_argmax = obj.build_decoder()
    
if(phase=='decode'):
    decoder_predicted = obj.build_decoder()


init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    eie,enc_outputs,enc_laststate=sess.run([obj.encoder_inputs_embedded,obj.encoder_outputs,obj.encoder_last_state], 
                                                   feed_dict={obj.encoder_embedding_placeholder:emb ,
                                                              obj.encoder_inputs:eInput, 
                                                              obj.encoder_inputs_length: eLengths,
                                                              obj.keep_prob_placeholder : 0.3 })
    
    if(phase=='train'):
        dec_logits,dec_argmax = sess.run([dec_logits,dec_argmax],feed_dict={obj.decoder_embedding_placeholder:emb ,
                                                                  obj.decoder_inputs_length: dLengths,
                                                                  obj.decoder_targets:dOutput,
                                                                  obj.decoder_inputs: dInput,
                                                                  obj.encoder_embedding_placeholder:emb,
                                                                  obj.encoder_inputs:eInput, 
                                                                  obj.encoder_inputs_length: eLengths,
                                                                  obj.keep_prob_placeholder : 0.3} )
    
    if(phase=='decode'):
        decoder_predicted = sess.run(decoder_predicted,feed_dict={obj.decoder_embedding_placeholder:emb,
                                                                  obj.encoder_embedding_placeholder:emb,
                                                                  obj.encoder_inputs:eInput, 
                                                                  obj.encoder_inputs_length: eLengths,
                                                                  obj.keep_prob_placeholder : 0.3})

    print('encoder Outputs:',enc_outputs.shape)
    #print(enc_outputs[0])
    print()
    #print('Encoder last state:',len(enc_laststate))
    #print(enc_laststate)
    print()
    
    if(phase=='train'):
        print("Decoder Logits:",dec_logits.shape)
        print(dec_logits)
        print()
        print()
        print("Decoder Argmax:",dec_argmax.shape)
        print()
        print(dec_argmax)
        print()
        
    if(phase=='decode'):
        print("Decoder predicted:",decoder_predicted.shape)
        print(decoder_predicted)
        print()
        

building model..
building model..
Building Encoder..
Building decoder and attention...
using beamsearch..
building beamsearch decoder..
encoder Outputs: (6, 100, 64)


Decoder predicted: (6, 18, 3)
[[[203508 203508 203508]
  [327153 327153 327153]
  [218513 218513 218513]
  [131971 131971 131971]
  [108134 108134 108134]
  [142068 142068 142068]
  [335172 335172 335172]
  [267093 267093 267093]
  [277221 277221 277221]
  [ 73386  73386  73386]
  [251818 251818 251818]
  [128052 128052 128052]
  [318215 318215 318215]
  [254936 254936 254936]
  [214997 214997 214997]
  [ 73386  73386  73386]
  [238380 238380 238380]
  [363439 125410 361897]]

 [[392954 392954 392954]
  [245144 245144 245144]
  [ 52425  52425  52425]
  [ 69651  69651  69651]
  [  1174   1174   1174]
  [381634 381634 381634]
  [393803 393803 393803]
  [395522 395522 395522]
  [203914 203914 203914]
  [145147 145147 145147]
  [206348 206348 206348]
  [306376 306376 306376]
  [115473 115473 115473]
  [280450 280450 280450]
