# Sentiment Classification


## Loading the dataset

In [1]:
# Mount Google Drive so that files stored under "My Drive" can be read from
# /content/drive (the GloVe archive is loaded from there further down).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from keras.datasets import imdb

# Number of words to keep from the dataset; words are ranked by frequency.
vocab_size = 10000

# Load the train/test splits, restricted to the `vocab_size` most frequent words.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

Using TensorFlow backend.


In [0]:
from keras.preprocessing.sequence import pad_sequences
vocab_size = 10000  # number of distinct words kept from the dataset (same value as in the loading cell)
maxlen = 300  # number of words kept from each review

## Train test split

In [0]:
# Reload the reviews; each one is a list of ints.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

# Bring every review to the same length (`maxlen`) so the splits become 2-D arrays.
x_train, x_test = (
    pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)

In [5]:
# Sanity check: both splits should be (number of reviews, maxlen).
print(x_train.shape)
print(x_test.shape)

(25000, 300)
(25000, 300)


In [6]:
# Length of a single review after padding; expected to equal `maxlen`.
len(x_train[0])

300

In [7]:
# `words` is the word -> index table shipped with the dataset.
# `index` is its inverse (index -> word), limited to indices up to 10000.
words = imdb.get_word_index()
print(words['the'])

index = {idx: token for token, idx in words.items() if idx <= 10000}
index[1]

1


u'the'

In [8]:
# Number of entries kept in the index -> word dictionary.
len(index)

10000

In [0]:
# Unpack the GloVe archive stored on Drive into the current working directory.
from zipfile import ZipFile

with ZipFile('/content/drive/My Drive/Colab Notebooks/NLP/glove.6B.zip') as glove_archive:
    glove_archive.extractall()

In [10]:
import io

import numpy as np

# Load the whole 100-dimensional GloVe table into memory as {word: vector}.
# Every line of the file holds a word followed by its coefficients, separated
# by single spaces.
embeddings_index = dict()

# io.open with an explicit encoding behaves the same on Python 2 and 3 and does
# not depend on the platform's default encoding. The `with` block guarantees the
# file is closed even if a line fails to parse.
with io.open('./glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Split on the literal space only, so a word that itself contains an
        # unusual whitespace character is not broken into pieces.
        values = line.rstrip().split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [11]:
# Build the initial weights of the embedding layer: row `t` holds the GloVe
# vector of the word that the dataset encodes as token id `t`.
#
# imdb.load_data() shifts every index returned by imdb.get_word_index() by
# `index_from` (default 3), because ids 0, 1 and 2 are reserved for padding,
# start-of-sequence and out-of-vocabulary. Writing the vectors at the raw
# index would therefore attach every pretrained vector to the wrong token.
index_from = 3
embedding_matrix = np.zeros((vocab_size+1, 100))

for i, word in index.items():
    token_id = i + index_from
    if token_id >= vocab_size:
        # load_data(num_words=vocab_size) replaces such words by the
        # out-of-vocabulary id, so they never appear in the data.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from GloVe keep their all-zero row.
        embedding_matrix[token_id] = embedding_vector

print(embedding_matrix.shape)

(10001, 100)


## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dictionary that maps an index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* We could also load pre-trained word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The Keras embedding layer doesn't require us to one-hot encode our words; instead, we have to give each word a unique integer number as an id. For the IMDB dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [0]:
%tensorflow_version 2.x
import tensorflow
tensorflow.__version__
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, TimeDistributed,Bidirectional
from tensorflow.keras.layers import Embedding
import tensorflow as tf
from sklearn.metrics import confusion_matrix

# The model consumes `maxlen` integer token ids per review and looks each one up
# in an embedding table initialised from `embedding_matrix`; the table stays
# trainable, so the vectors are fine-tuned during training.
input_layer = Input(shape=(maxlen,), dtype=tensorflow.int64)
embedding = Embedding(
    input_dim=vocab_size + 1,
    output_dim=100,
    weights=[embedding_matrix],
    trainable=True,
    input_length=maxlen,
)(input_layer)

In [0]:
# A bidirectional LSTM with 20 units per direction summarises the review, and a
# 2-way softmax turns that summary into class probabilities.
lstm = Bidirectional(LSTM(units=20))(embedding)
out = Dense(units=2, activation="softmax")(lstm)

In [0]:
from tensorflow.keras import backend as K
def custom_sparse_categorical_accuracy(y_true, y_pred):
    """Per-sample accuracy for integer labels against class scores.

    Args:
        y_true: tensor of integer class labels; it is reduced with a max over
            its last axis before the comparison.
        y_pred: tensor of per-class scores; the predicted class is the argmax
            over its last axis.

    Returns:
        A tensor in the backend float type holding 1.0 where the predicted
        class equals the label and 0.0 elsewhere.
    """
    float_type = K.floatx()
    true_class = K.max(y_true, axis=-1)
    # argmax yields integers; cast so both sides of the comparison are floats.
    predicted_class = K.cast(K.argmax(y_pred, axis=-1), float_type)
    is_correct = K.equal(true_class, predicted_class)
    return K.cast(is_correct, float_type)

In [16]:
# Assemble the graph from token ids to class probabilities, then compile it with
# integer-label cross-entropy and the custom accuracy metric.
model = Model(inputs=input_layer, outputs=out)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=[custom_sparse_categorical_accuracy],
)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 100)          1000100   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 40)                19360     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 82        
Total params: 1,019,542
Trainable params: 1,019,542
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train on the first 10000 reviews only.
x_train = x_train[:10000]
y_train = y_train[:10000]

model.fit(
    x=np.array(x_train),
    y=np.array(y_train),
    batch_size=20,
    epochs=5,
    verbose=1,
)

Train on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f72b8208f10>

In [18]:
# Evaluate on the first 5000 test reviews only.
x_test = x_test[:5000]
y_test = y_test[:5000]

# One row of two class probabilities per review.
test_pred = model.predict(x=np.array(x_test), verbose=1)



In [19]:
# Turn each (class-0 probability, class-1 probability) pair into a hard label:
# 1 when the second probability is strictly larger, otherwise 0.
test_pred = [int(prob_one > prob_zero) for prob_zero, prob_one in test_pred]

test_pred[1]

1

In [20]:
# Compare the true test labels with the predicted hard labels.
confusion_matrix(y_test, test_pred)

array([[2082,  489],
       [ 351, 2078]])

## Retrieve the output of each layer in Keras for a given single test sample from the trained model you built

In [21]:
# Symbolic output tensor of the layer at position 2 in the model.
model.layers[2].output

<tf.Tensor 'bidirectional_1/Identity:0' shape=(None, 40) dtype=float32>

In [0]:
# Symbolic outputs of every layer except the first one, in model order.
# The loop variable is deliberately not called `out`: that name holds the
# model's output tensor, and on Python 2 a list-comprehension variable leaks
# into the enclosing scope and would overwrite it.
layer_outputs = [layer.output for layer in model.layers[1:]]
model_input = model.input

In [23]:
# One backend function per layer, mapping the model input to that layer's
# output. The loop variables avoid the name `out`, which holds the model's
# output tensor and must not be shadowed or (on Python 2) overwritten.
functions = [
    K.function([model_input], [layer_output]) for layer_output in layer_outputs
]

# NOTE(review): the section title speaks of a single test sample, but the whole
# of `x_test` is evaluated here - confirm which one is intended.
outs = [layer_function([x_test]) for layer_function in functions]

print(outs[1])    # Output of Bidirectional layer

print(outs[2])    # Output of Dense softmax layer

print(outs[0])    # Output of the embedding layer

[array([[-0.06290968,  0.01574332, -0.08873846, ..., -0.00488403,
         0.00516531, -0.00949626],
       [-0.83777916,  0.3790793 , -0.24576673, ..., -0.00471083,
         0.00494218, -0.00946333],
       [-0.5466458 ,  0.24356303, -0.13953371, ..., -0.3159853 ,
         0.13335492, -0.00292477],
       ...,
       [-0.8677911 ,  0.11779376, -0.36396813, ..., -0.00488403,
         0.00516531, -0.00949626],
       [-0.81795   ,  0.22889858, -0.21950084, ..., -0.00488403,
         0.00516531, -0.00949626],
       [-0.7861555 ,  0.05340945, -0.30148953, ..., -0.00488403,
         0.00516531, -0.00949626]], dtype=float32)]
[array([[1.21324725e-01, 8.78675222e-01],
       [3.18076258e-04, 9.99681950e-01],
       [7.54319504e-03, 9.92456794e-01],
       ...,
       [6.83021091e-04, 9.99316931e-01],
       [5.02069760e-03, 9.94979322e-01],
       [3.71700473e-04, 9.99628305e-01]], dtype=float32)]
[array([[[-0.01995359, -0.03995002, -0.0019082 , ...,  0.03327145,
         -0.04077213, -0.01