In [83]:
from __future__ import print_function 
import numpy as np
import tensorflow as tf 
import matplotlib.pyplot as plt
%matplotlib inline

import os
import sys
import pickle 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Lambda, TimeDistributed
from keras.layers import Conv1D, MaxPooling1D, Embedding, Activation, Reshape, merge, Merge
from keras.layers import SimpleRNN, GRU, LSTM, Bidirectional
from keras.engine.topology import Layer
from keras import initializers
from keras.callbacks import ModelCheckpoint

import keras.backend as K

## Load the dataset

In [85]:
f = np.load('data_and_embedding100.npz')

In [86]:
num_words = int(f['num_words'])
embedding_dim = int(f['embedding_dim'])
max_sequence_length = int(f['max_sequence_length'])
max_sentence_length = int(f['max_sentence_length'])

data = f['data']
labels = f['labels']

with open('word2index.pickle', 'rb') as handle:
    word_index = pickle.load(handle)

embedding_matrix = f['embedding_matrix']
sequences_sentences = f['sequences_sentences']

In [23]:
print(type(data), type(labels), type(sequences_sentences))
print(len(data), len(labels), len(sequences_sentences))
print(type(data[0]), type(labels[0]), type(sequences_sentences[0]), type(sequences_sentences[0][0]))
print(data[0].shape, labels[0].shape, len(sequences_sentences[0][0]))
print(data[0], labels[0])
print()
for i in range(len(sequences_sentences[0])):
    print(sequences_sentences[0][i])
    print()

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
389877 389877 389877
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'list'> <class 'list'>
(100,) (6,) 6
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  127  327   11   26 4992    1  750   19    2  327    9   22  107    3 1383
    8    5   34  107   45    5 1443   73    1  496    3  175    2  107    6
   37 6682   10 1598    9  170    1    7   15   40  532    4  281   71 9483
    1   13   38  261 3957  313   14    1    3  469   14  661    1   80   39
  281   60   29   36  132   51    9    2  255    1] [ 0.  0.  1.  0.  0.  0.]

[20, 30, 194, 310, 48, 198]

[368, 188, 6, 33, 19, 13, 33, 193, 149, 104, 38]

[357, 250, 11, 628, 1687]

[618, 102, 421, 9, 48, 892, 6, 549, 2079, 1113]



In [24]:
validation_split = 0.2 
epochs = 2

In [25]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(validation_split * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [26]:
print(x_train.shape)

(311902, 100)


## Hierarchical Attention

### Build the model 

In [27]:
embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)

In [28]:
class AttLayer(Layer):
    def __init__(self, output_dim=None, **kwargs):
        self.output_dim = output_dim
        self.attention_weights = None 
        super(AttLayer, self).__init__(**kwargs)
        
    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                     shape=(input_shape[-1], 1),
                                     initializer='normal',
                                     trainable=True)
        #print(self.kernel.get_shape())

        super(AttLayer, self).build(input_shape)
        
    def call(self, x, mask=None):
        eij = K.dot(x, self.kernel)
        #print(eij.get_shape(), 'eij')
        
        ai = K.exp(eij)
        weights = ai/tf.expand_dims(K.sum(ai, axis=1), -1) #ai/K.sum(ai, axis=1).dimshuffle(0, 'x')
        self.attention_weights = weights 
        #print(weights.get_shape(), 'weights')
        
        weighted_input = x*weights #tf.expand_dims(weights, -1) #x*weights.dimshuffle(0, 1, 'x')
        #print(weighted_input.get_shape(), 'weighted_input')
        return tf.reduce_sum(weighted_input, axis=1)
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [29]:
sentence_input = Input(shape=(max_sequence_length, ), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
gru_word = Bidirectional(GRU(50, return_sequences=True))(embedded_sequences)
dense_word = TimeDistributed((Dense(100)))(gru_word)
tanh_word = TimeDistributed(Activation('tanh'))(dense_word)
att_word = AttLayer()(tanh_word)
preds = Dense(6, activation='softmax')(att_word)
model_attnBidirGRU = Model(sentence_input, preds)

In [30]:
model_attnBidirGRU.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 100)          2000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 100)          45300     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 100, 100)          10100     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 100, 100)          0         
_________________________________________________________________
att_layer_2 (AttLayer)       (None, 100)               100       
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 606       
Total para

In [31]:
# load the model with trained weights 

model_attnBidirGRU.load_weights('models/WordlevelAttention-01-0.8720.hdf5')

### Train the model

In [32]:
model_attnBidirGRU.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [34]:
# # evaluate the model after loading weights 

# model_attnBidirGRU.evaluate(x_val[:1000], y_val[:1000], verbose=1)
# print(model_attnBidirGRU.metrics_names)



[0.40624880757927895, 0.87]

In [None]:
# filepath = "models/WordlevelAttention-{epoch:02d}-{val_acc:.4f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# callbacks_list = [checkpoint]

In [None]:
# import time 
# start_time = time.time()

# model_attnBidirGRU.fit(x_train, y_train,
#                       batch_size=128,
#                       epochs=epochs,
#                       validation_data=(x_val, y_val),
#                       callbacks=callbacks_list)

# print("Training time: ", time.time() - start_time)

Train on 311902 samples, validate on 77975 samples
Epoch 1/2

### Save the model 

In [None]:
# model_attnBidirGRU.save('models/AttentionBidirGRU.h5')

## Attention Visualization

### Retrieve attention weights

In [39]:
for i, layer in enumerate(model_attnBidirGRU.layers):
    print(i, layer)

0 <keras.engine.topology.InputLayer object at 0x11bc00898>
1 <keras.layers.embeddings.Embedding object at 0x11e21d828>
2 <keras.layers.wrappers.Bidirectional object at 0x11d2f4160>
3 <keras.layers.wrappers.TimeDistributed object at 0x11915de10>
4 <keras.layers.wrappers.TimeDistributed object at 0x1195054e0>
5 <__main__.AttLayer object at 0x11910fe80>
6 <keras.layers.core.Dense object at 0x11953c518>


In [61]:
print(x_val[0])
print(len(x_val[0]))

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0   10   65  211   10   14    8  350    4
   13   38  143  390   54   99   92   13 1282    1]
100


In [47]:
get_layer_output = K.function([model_attnBidirGRU.layers[0].input, K.learning_phase()], [model_attnBidirGRU.layers[4].output])
test_seq = x_val[:1]
out = get_layer_output([test_seq, 0])[0]  # test mode

In [49]:
print(out.shape)

(1, 100, 100)


In [56]:
attn_w = model_attnBidirGRU.layers[5].get_weights()[0]
print(attn_w.shape)

(100, 1)


In [60]:
eij = np.dot(out, attn_w)
print(eij.shape)
ai = np.exp(eij)
print(ai.shape)
weights = ai/np.sum(ai)

(1, 100, 1)
(1, 100, 1)


In [67]:
print(weights.shape)
print(weights)
weight_reshaped = np.reshape(weights, 100)
print(weight_reshaped.shape)

(1, 100, 1)
[[[  1.85092395e-05]
  [  1.33841386e-05]
  [  1.10222727e-05]
  [  1.01951318e-05]
  [  1.02347649e-05]
  [  1.07651058e-05]
  [  1.15378343e-05]
  [  1.23702266e-05]
  [  1.31317638e-05]
  [  1.37435472e-05]
  [  1.41736600e-05]
  [  1.44259093e-05]
  [  1.45259546e-05]
  [  1.45090999e-05]
  [  1.44117266e-05]
  [  1.42664712e-05]
  [  1.41000310e-05]
  [  1.39327340e-05]
  [  1.37789120e-05]
  [  1.36477302e-05]
  [  1.35440896e-05]
  [  1.34694865e-05]
  [  1.34228503e-05]
  [  1.34012753e-05]
  [  1.34006314e-05]
  [  1.34161255e-05]
  [  1.34428092e-05]
  [  1.34759221e-05]
  [  1.35112559e-05]
  [  1.35453583e-05]
  [  1.35756882e-05]
  [  1.36006602e-05]
  [  1.36196295e-05]
  [  1.36327872e-05]
  [  1.36409963e-05]
  [  1.36455374e-05]
  [  1.36478566e-05]
  [  1.36493409e-05]
  [  1.36509807e-05]
  [  1.36532235e-05]
  [  1.36558701e-05]
  [  1.36580975e-05]
  [  1.36586550e-05]
  [  1.36562385e-05]
  [  1.36499621e-05]
  [  1.36399058e-05]
  [  1.36275667e-05]
 

In [77]:
print(model_attnBidirGRU.layers[0].input_shape)

(None, 100)


In [91]:
print(type(word_index))

index2word = {index:word for word, index in word_index.items()}

print(index2word[10])

<class 'dict'>
in


In [95]:
def get_layer_output(model, layer, seq):
    seq_length = model.layers[layer].input_shape[1]
    func = K.function([model.layers[0].input, K.learning_phase()], [model.layers[layer].output])
    if isinstance(seq[0], list):
        test_seq = [pad_sequences(se, seq_length) if len(se) < seq_length else se[:seq_length] for se in seq]
    else:
        if len(seq) < seq_length:
            test_seq = [pad_sequences(seq, seq_length)]
        else:
            test_seq = [seq[:seq_length]] 
    out = func([test_seq, 0])[0]  # test mode
    return out 

def get_attention_weights(model, layer, out):
    attn_w = model.layers[layer+1].get_weights()[0]
    eij = np.dot(out, attn_w)
    ai = np.exp(eij)
    weights = ai/np.sum(ai)
    return weights

def get_top_weights_words(seq, weights, index2word):
    top_indices = np.argsort(weights, axis=0)
    top_weights = weights[top_indices]
    top_words = [(index2word[seq[top_indices[i]]], top_weights[i]) for i in range(len(top_indices))]
    return top_words
     
def visualize_words_attention(model, layer, seq, index2word):
    out = get_layer_output(model, layer, seq)
    weights = get_attention_weights(model, layer, out)
    top_words = get_top_weights_words(seq, weights, index2word)
    return top_words


In [96]:
visualize_words_attention(model_attnBidirGRU, 4, x_val[0], index2word)

TypeError: unhashable type: 'numpy.ndarray'

### Visualize

In [None]:
K = 10
topKeys = np.argpartition(weights,-K)[-K:]
print topKeys
print test_seq[0][topKeys]