## Bidirectional LSTM with Attention for sentiment analysis (IMDB):

### Importing Libraries :

In [102]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
from keras.layers import *
from keras.models import *
from keras import backend as K

### Data :

In [103]:
# Vocabulary cap passed to the IMDB loader as `num_words`; it is also
# reused as the Embedding input dimension further down.
n_unique_words = 10000

# load_data returns ((x_train, y_train), (x_test, y_test)).
train_split, test_split = imdb.load_data(num_words=n_unique_words)
x_train, y_train = train_split
x_test, y_test = test_split

In [104]:
# Every review is padded/truncated to this many tokens; the same value
# is used as `input_length` of the Embedding layers below.
maxlen = 100

# Bring both feature sets to a fixed length of `maxlen`.
x_train, x_test = (
    sequence.pad_sequences(reviews, maxlen=maxlen)
    for reviews in (x_train, x_test)
)

# Make sure the labels are NumPy arrays before handing them to fit/evaluate.
y_train, y_test = np.array(y_train), np.array(y_test)

### Initial LSTM model (bi-directional) without attention :

In [105]:
# Baseline: embedding -> bidirectional LSTM (final state only) -> dropout
# -> single sigmoid unit for the binary sentiment label.
model = Sequential([
    Embedding(n_unique_words, 128, input_length=maxlen),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_35"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_35 (Embedding)    (None, 100, 128)          1280000   
                                                                 
 bidirectional_33 (Bidirecti  (None, 128)              98816     
 onal)                                                           
                                                                 
 dropout_34 (Dropout)        (None, 128)               0         
                                                                 
 dense_34 (Dense)            (None, 1)                 129       
                                                                 
Total params: 1,378,945
Trainable params: 1,378,945
Non-trainable params: 0
_________________________________________________________________


In [106]:
# Train the baseline model, then score it on both splits.
history1 = model.fit(x_train, y_train, batch_size=256, epochs=12)

# `model` is compiled with loss='binary_crossentropy' and
# metrics=['accuracy'], so evaluate() returns [loss, accuracy] -- not an MSE.
# The `*_mse_*` variable names are kept unchanged so any later cell that
# refers to them keeps working; only the printed labels are corrected.
train_mse_wo_attn = model.evaluate(x_train, y_train)
test_mse_wo_attn = model.evaluate(x_test, y_test)
print("Train set [loss, accuracy] without attention = ", train_mse_wo_attn)
print("Test set [loss, accuracy] without attention = ", test_mse_wo_attn)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Train set MSE without attention =  [0.024160468950867653, 0.9955199956893921]
Test set MSE without attention =  [0.712272584438324, 0.8227599859237671]


### Building with Attention layer :

In [107]:
class attention(Layer):
    """Attention weighting over axis 1 (the time axis) of a 3-D input.

    For an input ``x`` the layer computes one score per timestep,
    ``e = tanh(x . W + b)``, turns the scores into weights with a softmax
    across timesteps, and scales ``x`` by those weights.

    Args:
        return_sequences: if True, return the weighted sequence (same shape
            as the input); if False, return the weighted sum over axis 1,
            i.e. one vector per sample.
        **kwargs: standard ``Layer`` keyword arguments (``name``, ``dtype``,
            ...). Accepting them is required for ``from_config`` to be able
            to rebuild the layer.
    """

    def __init__(self, return_sequences=True, **kwargs):
        super(attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        """Create the score weights.

        ``W`` has shape (features, 1). ``b`` has shape (timesteps, 1), so
        the time dimension ``input_shape[1]`` must be known when the layer
        is built.
        """
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")

        super(attention, self).build(input_shape)

    def call(self, x):
        """Apply attention weights to ``x`` and optionally pool over time."""
        # One scalar score per timestep; drop the trailing size-1 axis so the
        # softmax (default last axis) normalises across timesteps.
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        at = K.softmax(et)
        # Restore the trailing axis so the weights broadcast over features.
        at = K.expand_dims(at, axis=-1)
        output = x * at

        # Honour the constructor flag: previously the sum was always returned
        # and `return_sequences` had no effect.
        if self.return_sequences:
            return output
        return K.sum(output, axis=1)

    def compute_output_shape(self, input_shape):
        """Return the input shape when sequences are kept, else (batch, features)."""
        if self.return_sequences:
            return input_shape
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Include ``return_sequences`` so the layer round-trips through save/load."""
        config = super(attention, self).get_config()
        config.update({"return_sequences": self.return_sequences})
        return config


In [108]:
# Same stack as the baseline, except the LSTM emits every timestep and the
# custom attention layer collapses them into one vector per review.
model2 = Sequential([
    Embedding(n_unique_words, 128, input_length=maxlen),
    Bidirectional(LSTM(64, return_sequences=True)),
    attention(return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model2.summary()

Model: "sequential_36"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_36 (Embedding)    (None, 100, 128)          1280000   
                                                                 
 bidirectional_34 (Bidirecti  (None, 100, 128)         98816     
 onal)                                                           
                                                                 
 attention_19 (attention)    (None, 128)               228       
                                                                 
 dropout_35 (Dropout)        (None, 128)               0         
                                                                 
 dense_35 (Dense)            (None, 1)                 129       
                                                                 
Total params: 1,379,173
Trainable params: 1,379,173
Non-trainable params: 0
___________________________________________

In [101]:
# Train the attention model, then score it on both splits.
history2 = model2.fit(x_train, y_train, batch_size=256, epochs=12)

# `model2` is compiled with loss='binary_crossentropy' and
# metrics=['accuracy'], so evaluate() returns [loss, accuracy] -- not an MSE.
# The `*_mse_*` variable names are kept unchanged so any later cell that
# refers to them keeps working; only the printed labels are corrected.
train_mse_attn = model2.evaluate(x_train, y_train)
test_mse_attn = model2.evaluate(x_test, y_test)
print("Train set [loss, accuracy] with attention = ", train_mse_attn)
print("Test set [loss, accuracy] with attention = ", test_mse_attn)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Train set MSE with attention =  [0.009753917343914509, 0.9973599910736084]
Test set MSE with attention =  [0.6855257749557495, 0.8439199924468994]
