In [67]:
from __future__ import print_function

import os
import sys
import time

import numpy as np
import tensorflow as tf

import keras.backend as K
from keras import initializers
from keras.engine.topology import Layer
from keras.layers import Input, Dense, Flatten, Lambda, TimeDistributed
from keras.layers import Conv1D, MaxPooling1D, Embedding, Activation, Reshape, merge, Merge
from keras.layers import SimpleRNN, GRU, LSTM, Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

## Load the dataset

In [2]:
# Open the .npz archive; the arrays it holds (data, labels, embedding matrix
# and a few scalar settings) are pulled out of `f` by key further down.
f = np.load(file='data_and_embedding.npz')

In [11]:
# Scalar settings are read from the archive by key and coerced to plain ints.
num_words, embedding_dim, max_sequence_length = (
    int(f[key]) for key in ('num_words', 'embedding_dim', 'max_sequence_length')
)
# NOTE: hard-coded instead of being read from the archive; the archive lookup
# is kept in the trailing comment for reference.
max_sentence_length = 107 #int(f['max_sentence_length'])

embedding_matrix = f['embedding_matrix']

data = f['data']
labels = f['labels']

In [4]:
# Training configuration: fraction of the samples held out for validation,
# and the number of epochs passed to `fit`.
validation_split, epochs = 0.2, 60

In [5]:
# Shuffle samples and labels with one shared permutation so they stay aligned.
# A dedicated, seeded generator makes the train/validation split reproducible
# across kernel restarts without touching NumPy's global random state.
rng = np.random.RandomState(42)
indices = np.arange(data.shape[0])
rng.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Slice on an explicit boundary index rather than negative offsets: with
# n == 0, data[:-n] is empty and data[-n:] is the whole array, which would
# silently swap an "all training" split into an "all validation" one.
num_validation_samples = int(validation_split * data.shape[0])
num_training_samples = data.shape[0] - num_validation_samples

x_train = data[:num_training_samples]
y_train = labels[:num_training_samples]
x_val = data[num_training_samples:]
y_val = labels[num_training_samples:]

## Hierarchical Attention

### Build the model 

In [13]:
# Embedding layer initialised from the matrix loaded from the archive and
# frozen (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False,
)

In [62]:
# # Reference codes from Dusans 

# def get_birnn_attention(max_sequence_length, outputs=1):
#     input_layer = Input(shape=(max_sequence_length,), dtype='int32')
#     embedded_sequences = embedding_layer(input_layer)

#     bidir_layer = Bidirectional(GRU(50, return_sequences=True))(embedded_sequences)
#     print(bidir_layer.shape)

#     attlayer = TimeDistributed(Dense(100), input_shape=(max_sequence_length,))(bidir_layer)  # 2nd dimension does not matter here
#     attlayer = TimeDistributed(Activation('tanh'))(attlayer)
#     attlayer = TimeDistributed(Dense(1))(attlayer)
#     attlayer = Flatten()(attlayer)
#     attlayer = Activation('softmax')(attlayer)
#     attlayer = Reshape((-1,1), name='attention_weights')(attlayer)
#     print(attlayer.shape)
#     context_layer = merge([bidir_layer, attlayer], mode = lambda x: K.batch_dot(x[0], x[1], axes=[1,1]), output_shape= lambda x: (x[0][0], x[0][2]))
# #     final_model = merge([ll, attlayer], mode = lambda x: K.dot(K.transpose(x[0]), x[1]), output_shape= lambda x: (x[0][0],x[0][2]))
#     print(context_layer.shape)
#     return context_layer

# get_birnn_attention(101)



In [61]:
# sequence_input = Input(shape=(max_sentence_length, max_sequence_length, ), dtype='int32')
# # print(sequence_input.shape)

# x = TimeDistributed(embedding_layer)(sequence_input)

# x = TimeDistributed(Bidirectional(GRU(50, return_sequences=True)))(x)
# print(x)

# word_attn = TimeDistributed(TimeDistributed(Dense(100), input_shape=(max_sequence_length,)))(x)

# word_attn = TimeDistributed(TimeDistributed(Activation('tanh')))(word_attn)

# word_attn = TimeDistributed(TimeDistributed(Dense(1)))(word_attn)

# word_attn = TimeDistributed(Flatten())(word_attn)

# word_attn = TimeDistributed(Activation('softmax'))(word_attn)

# word_attn = TimeDistributed(Reshape((-1,1)))(word_attn)
# print(word_attn)

In [60]:
# def atten_dot(x):
#     # x[0] has shape (?, 107, 1000, 100), x[1] has shape (?, 107, 1000, 1)
#     xshape = x[0].shape
#     print('kk')
#     vec = np.reshape(x[0], (x[0].shape[0]*x[0].shape[1], x[0].shape[2], x[0].shape[3]))
#     wei = np.reshape(x[1], (x[1].shape[0]*x[1].shape[1], x[1].shape[2], x[1].shape[3]))
#     comp = np.multiply(vec, wei)
#     compshape = comp.shape
# #     print(compshape)
#     comp_reshaped = tf.reshape(comp, (xshape[0], xshape[1], compshape[1], compshape[2]))
# #     print(comp_reshaped.shape)
#     re = tf.reduce_mean(comp_reshaped, axis=2)
# #     print(re.shape)
#     return re

# def outputshape(input_shape):
#     print(type(input_shape))
#     shape = list(input_shape)
# #     assert len(shape)==2
#     outshape = (shape[0][0], shape[0][1], shape[0][3])
#     return tuple(outshape) 

# # print(atten_dot([np.ones((2,107,1000,100)), np.ones((2,107,1000,1))]))

# sentence_attn = TimeDistributed(Merge([x, word_attn], mode = atten_dot, output_shape = outputshape ))

# # Lambda(embedding_mean)


# # sentence_attn = TimeDistributed()()



# # word_level_att = TimeDistributed(get_birnn_attention(max_sentence_length))(sequence_input)

# # sentence_level_att = get_birnn_attention(max_sequence_length)(word_level_att)

# # preds = Dense(6, activation='softmax')(sentence_level_att)

# # model_attention = Model(sequence_input, preds)

In [79]:
class AttLayer(Layer):
    """Attention pooling that collapses axis 1 of a 3-D input.

    Every timestep of an input shaped (batch, timesteps, features) is scored
    with one learned vector, the scores are normalised over the timesteps,
    and the layer returns the score-weighted sum of the timesteps, shaped
    (batch, features).

    Arguments:
        output_dim: stored on the instance and included in `get_config`;
            it is not used in the computation.
        **kwargs: forwarded to the base `Layer` constructor.
    """

    def __init__(self, output_dim=None, **kwargs):
        self.output_dim = output_dim
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector: one weight per input feature."""
        self.kernel = self.add_weight(name='kernel',
                                      shape=(input_shape[-1], 1),
                                      initializer='normal',
                                      trainable=True)

        super(AttLayer, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1.

        `mask` is accepted for signature compatibility but is not applied.
        """
        # (batch, timesteps, features) . (features, 1) -> (batch, timesteps, 1)
        eij = K.tanh(K.dot(x, self.kernel))

        # Normalise the scores over the time axis. keepdims=True leaves the
        # denominator as (batch, 1, 1) so the division is done per sample;
        # without it the (batch, 1) sum is broadcast against the time axis
        # and the batch and time dimensions get mixed up.
        ai = K.exp(eij)
        weights = ai / K.sum(ai, axis=1, keepdims=True)

        # The trailing singleton axis of `weights` broadcasts over features,
        # so the product stays (batch, timesteps, features) and the reduction
        # yields (batch, features), matching compute_output_shape.
        return K.sum(x * weights, axis=1)

    def compute_output_shape(self, input_shape):
        """Drop the time axis: (batch, timesteps, features) -> (batch, features)."""
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Include `output_dim` so the layer can be rebuilt from its config."""
        config = super(AttLayer, self).get_config()
        config['output_dim'] = self.output_dim
        return config

In [82]:
# Inner model: embeds one integer sequence of length max_sequence_length,
# runs a bidirectional GRU over it, projects every timestep with a Dense
# layer and pools the timesteps into a single vector with AttLayer.
sentence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
gru_word = Bidirectional(GRU(units=50, return_sequences=True))(embedded_sequences)
dense_word = TimeDistributed(Dense(units=100))(gru_word)
att_word = AttLayer()(dense_word)
sentEncoder = Model(inputs=sentence_input, outputs=att_word)

# Outer model: applies the inner model to each of the max_sentence_length
# rows of the input, repeats the GRU -> Dense -> AttLayer stack over the
# resulting vectors, and ends in a 6-way softmax.
review_input = Input(shape=(max_sentence_length, max_sequence_length), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
gru_sent = Bidirectional(GRU(units=50, return_sequences=True))(review_encoder)
dense_sent = TimeDistributed(Dense(units=100))(gru_sent)
att_sent = AttLayer()(dense_sent)
preds = Dense(units=6, activation='softmax')(att_sent)
model_attention = Model(inputs=review_input, outputs=preds)

In [83]:
# Print the outer model's layers with their output shapes and parameter counts.
model_attention.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_32 (InputLayer)        (None, 107, 1000)         0         
_________________________________________________________________
time_distributed_87 (TimeDis (None, 107, 100)          2055500   
_________________________________________________________________
bidirectional_23 (Bidirectio (None, 107, 100)          45300     
_________________________________________________________________
time_distributed_88 (TimeDis (None, 107, 100)          10100     
_________________________________________________________________
att_layer_7 (AttLayer)       (None, 100)               100       
_________________________________________________________________
dense_36 (Dense)             (None, 6)                 606       
Total params: 2,111,606.0
Trainable params: 111,606.0
Non-trainable params: 2,000,000.0
______________________________________________________

### Train the model

In [None]:
# Configure training: Adam optimiser, categorical cross-entropy loss, and
# accuracy reported under the key 'acc'.
model_attention.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Wall-clock timing of the whole training run. `time` is imported with the
# other modules in the import cell at the top of the notebook.
start_time = time.time()

# Keep the object returned by fit() instead of discarding it, so the
# per-epoch training/validation metrics remain available to later cells.
history = model_attention.fit(x_train, y_train,
                              batch_size=128,
                              epochs=epochs,
                              validation_data=(x_val, y_val))

print("Training time: ", time.time() - start_time)

### Save the model 

In [None]:
# Save the model built and trained in this notebook. (`model_conv_rnn` is not
# defined anywhere here, so saving under that name raised NameError.)
# The target directory is created first because save() does not create it.
# NOTE: the model contains the custom AttLayer, so loading the file back
# needs custom_objects={'AttLayer': AttLayer}.
if not os.path.isdir('models'):
    os.makedirs('models')
model_attention.save('models/Attention.h5')