In [83]:
from __future__ import print_function 
import numpy as np
import tensorflow as tf 
import matplotlib.pyplot as plt
%matplotlib inline

import os
import sys
import pickle 
import time 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Lambda, TimeDistributed
from keras.layers import Conv1D, MaxPooling1D, Embedding, Activation, Reshape, merge, Merge
from keras.layers import SimpleRNN, GRU, LSTM, Bidirectional
from keras.engine.topology import Layer
from keras import initializers
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

import keras.backend as K

from Classifier import Classifierfier 
from Attention_BidirGRU import WordAttentionClassifier, AttLayer

In [None]:
class AttentionVisualizer():
    """ the class contains various methods to visualize the attention effects in the model
    """
    def __init__(self, model_path=None, weights_path='models/word_attention/weights-improvement-10-0.95-0.93.hdf5'):
        
        # load the trained model 
        if model_path != None:
            # load model from model file
            self.model = load_model(model_path)
        elif weights_path != None:
            # create a model and load trained weights 
            self.model = WordAttentionClassifier(batch_size=128, epochs=20, raw_data_path=None, embedded_data_path='data/data_and_embedding100.npz', embedding_dim=100)
            self.model.load_weights(weights_path)
        else:
            # train a new model 
            self.model = WordAttentionClassifier(batch_size=128, epochs=20, raw_data_path=None, embedded_data_path='data/data_and_embedding100.npz', embedding_dim=100)
            self.model.build()
            self.model.train()
            self.model.evaluate()
            
            
        

## Hierarchical Attention

### Build the model 

In [27]:
embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)

In [28]:
class AttLayer(Layer):
    def __init__(self, output_dim=None, **kwargs):
        self.output_dim = output_dim
        self.attention_weights = None 
        super(AttLayer, self).__init__(**kwargs)
        
    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                     shape=(input_shape[-1], 1),
                                     initializer='normal',
                                     trainable=True)
        #print(self.kernel.get_shape())

        super(AttLayer, self).build(input_shape)
        
    def call(self, x, mask=None):
        eij = K.dot(x, self.kernel)
        #print(eij.get_shape(), 'eij')
        
        ai = K.exp(eij)
        weights = ai/tf.expand_dims(K.sum(ai, axis=1), -1) #ai/K.sum(ai, axis=1).dimshuffle(0, 'x')
        self.attention_weights = weights 
        #print(weights.get_shape(), 'weights')
        
        weighted_input = x*weights #tf.expand_dims(weights, -1) #x*weights.dimshuffle(0, 1, 'x')
        #print(weighted_input.get_shape(), 'weighted_input')
        return tf.reduce_sum(weighted_input, axis=1)
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [29]:
sentence_input = Input(shape=(max_sequence_length, ), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
gru_word = Bidirectional(GRU(50, return_sequences=True))(embedded_sequences)
dense_word = TimeDistributed((Dense(100)))(gru_word)
tanh_word = TimeDistributed(Activation('tanh'))(dense_word)
att_word = AttLayer()(tanh_word)
preds = Dense(6, activation='softmax')(att_word)
model_attnBidirGRU = Model(sentence_input, preds)

In [30]:
model_attnBidirGRU.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 100)          2000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 100)          45300     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 100, 100)          10100     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 100, 100)          0         
_________________________________________________________________
att_layer_2 (AttLayer)       (None, 100)               100       
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 606       
Total para

In [31]:
# load the model with trained weights 

model_attnBidirGRU.load_weights('models/WordlevelAttention-01-0.8720.hdf5')

### Train the model

In [32]:
model_attnBidirGRU.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [34]:
# # evaluate the model after loading weights 

# model_attnBidirGRU.evaluate(x_val[:1000], y_val[:1000], verbose=1)
# print(model_attnBidirGRU.metrics_names)



[0.40624880757927895, 0.87]

In [None]:
# filepath = "models/WordlevelAttention-{epoch:02d}-{val_acc:.4f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# callbacks_list = [checkpoint]

In [None]:
# import time 
# start_time = time.time()

# model_attnBidirGRU.fit(x_train, y_train,
#                       batch_size=128,
#                       epochs=epochs,
#                       validation_data=(x_val, y_val),
#                       callbacks=callbacks_list)

# print("Training time: ", time.time() - start_time)

Train on 311902 samples, validate on 77975 samples
Epoch 1/2

### Save the model 

In [None]:
# model_attnBidirGRU.save('models/AttentionBidirGRU.h5')

## Attention Visualization

### Retrieve attention weights

In [39]:
for i, layer in enumerate(model_attnBidirGRU.layers):
    print(i, layer)

0 <keras.engine.topology.InputLayer object at 0x11bc00898>
1 <keras.layers.embeddings.Embedding object at 0x11e21d828>
2 <keras.layers.wrappers.Bidirectional object at 0x11d2f4160>
3 <keras.layers.wrappers.TimeDistributed object at 0x11915de10>
4 <keras.layers.wrappers.TimeDistributed object at 0x1195054e0>
5 <__main__.AttLayer object at 0x11910fe80>
6 <keras.layers.core.Dense object at 0x11953c518>


In [164]:
index2word = {index:word for word, index in word_index.items()}

def get_layer_output(model, layer, seq):
    seq_length = model.layers[layer].input_shape[1]
    func = K.function([model.layers[0].input, K.learning_phase()], [model.layers[layer].output])
    if len(seq.shape) > 1:
        test_seq = [pad_sequences(se, seq_length) if len(se) < seq_length else se[:seq_length] for se in seq]
    else:
        if len(seq) < seq_length:
            test_seq = [pad_sequences(seq, seq_length)]
        else:
            test_seq = [seq[:seq_length]] 
    out = func([test_seq, 0])[0]  # test mode
    return out 

def get_attention_weights(model, layer, out):
    attn_w = model.layers[layer+1].get_weights()[0]
    eij = np.dot(out, attn_w)
    ai = np.exp(eij)
    weights = ai/np.sum(ai)
    return weights

def get_top_weights_words(seq, weights, index2word, top):
    if len(seq.shape) == 1:
        seq = [seq]
    top_indices = np.argsort(np.squeeze(weights, axis=2), axis=1)
    top_indices = top_indices.tolist()
    top_weights = np.squeeze(weights, axis=2)
    top_words = [[(index2word.get(seq[i][idx]), top_weights[i][idx]) for idx in top_indices[i][::-1][:top]] for i in range(len(top_indices))]
    return top_words
     
def visualize_words_attention(model, layer, seq, index2word):
    out = get_layer_output(model, layer, seq)
    weights = get_attention_weights(model, layer, out)
    top_words = get_top_weights_words(seq, weights, index2word, 10)
    return top_words


In [165]:
visualize_words_attention(model_attnBidirGRU, 4, x_val[0], index2word)

[[('and', 0.7966454),
  ('my', 0.061176118),
  ("i'm", 0.026619527),
  ('never', 0.024806533),
  ('hours', 0.02436862),
  ('when', 0.024179246),
  ('feet', 0.011610609),
  ('my', 0.0081744259),
  ('wearing', 0.0059362808),
  ('in', 0.0049668676)]]

### Visualize

In [166]:
top_words_weights = visualize_words_attention(model_attnBidirGRU, 4, x_val[:4], index2word)

for l in top_words_weights:
    print(l)

[('and', 0.24031627), ('my', 0.018454414), ("i'm", 0.0080300542), ('never', 0.0074831522), ('hours', 0.0073510441), ('when', 0.0072939144), ('feet', 0.0035024595), ('my', 0.0024658996), ('wearing', 0.001790741), ('in', 0.0014983066)]
[('and', 0.017910801), ('good', 0.0030010997), ('good', 0.0028796026), ('i', 0.0023982744), ('every', 0.0023295416), ('was', 0.0020864937), ('day', 0.002032995), ('end', 0.001385105), ('very', 0.00088452862), ('end', 0.00073413132)]
[('every', 0.061171599), ('penny', 0.056040905), ('nice', 0.02631308), ('end', 0.020942625), ('comfortable', 0.017963026), ('i', 0.013953256), ('everyday', 0.01353569), ('end', 0.0042797718), ('wear', 0.0035446836), ('end', 0.0033914854)]
[('end', 0.19966494), ('color', 0.13983636), ('love', 0.019674947), ('still', 0.014997981), ('different', 0.014063908), ('i', 0.011758228), ('well', 0.0037981439), ('end', 0.0032080233), ('beat', 0.0020150593), ('and', 0.0019658704)]
