# In [None]:
import tensorflow as tf

# In [None]:
class HierarchyAttentionNetwork:
    def __init__(self, W_embedding):
        """Build the graph: inputs, embedding, both encoders, classifier, loss.

        Args:
            W_embedding: Pre-trained embedding matrix. Its ``shape`` sizes the
                trainable ``embedding`` variable, its values initialise it, and
                ``shape[1]`` is stored as ``self.embedding_size``.

        This method reads ``self.title_len``, ``self.doc_len``,
        ``self.sent_len``, ``self.n_class``, ``self.hidden_size``,
        ``self.fc_hidden_size`` and ``self.keep_prob`` but does not assign
        them, and it calls ``self.bigru_inference``, ``self.weight_variable``
        and ``self.bias_variable``.
        NOTE(review): none of these are defined in this method; they must
        already exist on the instance/class -- confirm where they are set.
        """
        with tf.name_scope('Inputs'):
            # Title token ids, one row per example.
            self._X1_inputs = tf.placeholder(
                tf.int64, [None, self.title_len], name='X1_inputs')
            # Document token ids, flattened to doc_len * sent_len per example.
            self._X2_inputs = tf.placeholder(
                tf.int64, [None, self.doc_len * self.sent_len], name='X2_inputs')
            # One float row of per-class targets per example. The batch axis
            # is required so the labels line up with the [batch, n_class]
            # logits produced below.
            self._y_inputs = tf.placeholder(
                tf.float32, [None, self.n_class], name='y_input')

        with tf.variable_scope('embedding'):
            self.embedding = tf.get_variable(
                name='embedding', shape=W_embedding.shape,
                initializer=tf.constant_initializer(W_embedding), trainable=True)
        self.embedding_size = W_embedding.shape[1]

        with tf.variable_scope('bigru_text'):
            output_title = self.bigru_inference(self._X1_inputs)

        with tf.variable_scope('han_content'):
            output_content = self.han_inference(self._X2_inputs)

        with tf.variable_scope('fc-bn-layer'):
            # Title and content features are joined along the feature axis;
            # W_fc assumes the joined width is hidden_size * 4.
            output = tf.concat([output_title, output_content], axis=1)
            W_fc = self.weight_variable(
                [self.hidden_size * 4, self.fc_hidden_size], name='Weight_fc')
            h_fc = tf.matmul(output, W_fc, name='h_fc')
            self.fc_relu = tf.nn.relu(h_fc, name="relu")
            fc_drop = tf.nn.dropout(self.fc_relu, self.keep_prob)

        with tf.variable_scope('out_layer'):
            W_out = self.weight_variable(
                [self.fc_hidden_size, self.n_class], name='Weight_out')
            b_out = self.bias_variable([self.n_class], name='bias_out')
            # Unnormalised class scores (logits).
            self._y_pred = tf.nn.xw_plus_b(fc_drop, W_out, b_out, name='y_pred')

        with tf.name_scope('loss'):
            # The labels are dense float rows, so the dense softmax
            # cross-entropy is used; the sparse variant only accepts integer
            # class ids of shape [batch] and rejects float32 labels.
            # NOTE(review): if targets are multi-label rather than a single
            # distribution per example, a sigmoid cross-entropy may be the
            # intended loss -- confirm against the training data.
            self._loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=self._y_pred, labels=self._y_inputs))
    
    def gru_cell(self):
        """Create a single GRU cell with ``self.hidden_size`` units.

        The cell takes its ``reuse`` flag from the variable scope that is
        active when this method is called.

        Returns:
            A ``tf.contrib.rnn.GRUCell`` instance.
        """
        with tf.name_scope('gru_cell'):
            # Resolved through the imported `tf` module: no bare `rnn` name is
            # imported in this file.
            cell = tf.contrib.rnn.GRUCell(
                self.hidden_size, reuse=tf.get_variable_scope().reuse)
        return cell
    
    def bi_gru(self, inputs, seg_num):
        """Run a stacked bidirectional GRU over ``inputs``.

        ``self.n_layer`` forward cells and as many backward cells are built
        with ``gru_cell``; every layer starts from an all-zero state.

        Args:
            inputs: Sequence tensor fed to the first layer
                (assumed [seg_num, timesteps, input_size] -- TODO confirm).
            seg_num: Number of sequences in ``inputs``; used as the batch size
                of the zero initial states.

        Returns:
            The top layer's outputs with forward and backward results
            concatenated on the last axis:
            [seg_num, timesteps, hidden_size * 2].
        """
        cells_fw = [self.gru_cell() for _ in range(self.n_layer)]
        cells_bw = [self.gru_cell() for _ in range(self.n_layer)]
        initial_states_fw = [
            cell.zero_state(seg_num, tf.float32) for cell in cells_fw]
        initial_states_bw = [
            cell.zero_state(seg_num, tf.float32) for cell in cells_bw]
        # Resolved through the imported `tf` module: no bare `rnn` name is
        # imported in this file. The final fw/bw states are not needed.
        outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
            cells_fw, cells_bw, inputs,
            initial_states_fw=initial_states_fw,
            initial_states_bw=initial_states_bw,
            dtype=tf.float32)
        # outputs: [seg_num, timesteps, hidden_size * 2]
        return outputs
    
    def attention(self, inputs, output_size,
                  initializer=None,
                  activation_fn=tf.tanh, scope=None):
        """Pool a sequence into one vector with a learned attention context.

        Each timestep is projected through a fully connected layer, scored by
        a dot product with a trainable context vector, and the scores are
        softmax-normalised over the time axis. The result is the
        attention-weighted sum of the *unprojected* ``inputs``.

        Args:
            inputs: Tensor of shape [seg_num, timesteps, input_size].
            output_size: Width of the projection and of the context vector.
            initializer: Initializer for the context vector. ``None`` (the
                default) selects a Xavier initializer, created at call time.
            activation_fn: Activation applied to the projection.
            scope: Optional variable scope name; defaults to ``'attention'``.

        Returns:
            Tensor of shape [seg_num, input_size].
        """
        if initializer is None:
            # Built here rather than in the signature so that defining the
            # class does not evaluate TensorFlow code or need a `layers` name.
            initializer = tf.contrib.layers.xavier_initializer()

        with tf.variable_scope(scope or 'attention') as scope:
            # attention_context_vector: [output_size]
            attention_context_vector = tf.get_variable(
                name='attention_context_vector',
                shape=[output_size],
                initializer=initializer,
                dtype=tf.float32)

            # [seg_num, timesteps, input_size] -> [seg_num, timesteps, output_size]
            input_projection = tf.contrib.layers.fully_connected(
                inputs, output_size, activation_fn=activation_fn, scope=scope)

            # [seg_num, timesteps, output_size] -> [seg_num, timesteps, 1]
            # The trailing axis is kept so the weights broadcast over the
            # feature axis of `inputs` below.
            vector_attn = tf.reduce_sum(
                tf.multiply(input_projection, attention_context_vector),
                axis=2,
                keep_dims=True)

            # Normalise over the time axis: [seg_num, timesteps, 1]
            attention_weights = tf.nn.softmax(vector_attn, dim=1)

            # inputs:            [seg_num, timesteps, input_size]
            # attention_weights: [seg_num, timesteps, 1]
            weighted_projection = tf.multiply(inputs, attention_weights)

            # outputs: [seg_num, input_size]
            outputs = tf.reduce_sum(weighted_projection, axis=1)
            return outputs
    def han_inference(self, X_inputs):
        """Encode documents with a two-level (sentence, then document) encoder.

        Token ids are embedded and regrouped into sentences. Each sentence is
        encoded with ``bi_gru`` and pooled with ``attention``; the resulting
        sentence vectors are regrouped per document, encoded and pooled again.

        The number of documents is taken from the input at run time, so any
        batch size can be fed without matching a fixed ``self.batch_size``.

        Args:
            X_inputs: Integer token ids of shape
                [batch_size, doc_len * sent_len].

        Returns:
            Document representations of shape [batch_size, hidden_size * 2].
        """
        # inputs: [batch_size, doc_len * sent_len, embedding_size]
        inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)

        # sent_inputs: [batch_size * doc_len, sent_len, embedding_size]
        # -1 lets TensorFlow infer the sentence count from the fed batch.
        sent_inputs = tf.reshape(
            inputs, [-1, self.sent_len, self.embedding_size])
        # Scalar tensor; used as the batch size of the GRU zero states.
        n_sentences = tf.shape(sent_inputs)[0]

        with tf.variable_scope('sentence_encoder'):
            # sent_outputs: [batch_size * doc_len, sent_len, hidden_size * 2]
            sent_outputs = self.bi_gru(sent_inputs, seg_num=n_sentences)

            # sent_attn_outputs: [batch_size * doc_len, hidden_size * 2]
            sent_attn_outputs = self.attention(
                sent_outputs, self.hidden_size * 2)

        with tf.variable_scope('doc_encoder'):
            # doc_inputs: [batch_size, doc_len, hidden_size * 2]
            doc_inputs = tf.reshape(
                sent_attn_outputs, [-1, self.doc_len, self.hidden_size * 2])
            n_docs = tf.shape(doc_inputs)[0]

            # doc_outputs: [batch_size, doc_len, hidden_size * 2]
            doc_outputs = self.bi_gru(doc_inputs, n_docs)

            # doc_attn_outputs: [batch_size, hidden_size * 2]
            doc_attn_outputs = self.attention(
                doc_outputs, self.hidden_size * 2)
        return doc_attn_outputs
        