# Sentiment Classification


## Loading the dataset

In [2]:
from keras.datasets import imdb

vocab_size = 10000 #vocab size

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size) # vocab_size is no.of words to consider from the dataset, ordering based on frequency.

In [3]:
(x_train.shape, y_train.shape), (x_test.shape, y_test.shape)

(((25000,), (25000,)), ((25000,), (25000,)))

In [4]:
import numpy as np

# concatenating both the train and test data as both are same length
data = np.concatenate((x_train, x_test), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

print("Categories:", np.unique(targets))
print("Number of unique words:", len(np.unique(data)))

Categories: [0 1]
Number of unique words: 49579


In [5]:
# Let's look at a single training example (first movie review indices)

print("Label:", targets[0])

Label: 1
print(data[0])

Label: 1
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [6]:
# Getting all the unique words and thier frequencies in the whole dataset
key_value = imdb.get_word_index()

# Reversing the key, value pairs
reverse = dict()
for (key, value) in key_value.items():
    reverse.update([(value, key)])

In [43]:
reverse

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

In [14]:
reverse.get(0-5)

In [17]:
# The code below retrieves the dictionary mapping word indices back into the original words so that we can read them.

first_review = []

for i in data[0]: # looping through first review
    first_review.append(reverse.get(i - 3, "#"))
print(" ".join(first_review))

# this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for # and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also # to the two little boy's that played the # of norman and paul they were just brilliant children are often left out of the # list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you thi

In [42]:
from keras.preprocessing.sequence import pad_sequences
vocab_size = 10000 #vocab size
maxlen = 300  #number of word used from each review

## Train test split

In [43]:
# we split our data into a training and a testing set. The training set will contain 40,000 reviews and the testing set 10,000.

x_train = data[10000:]
y_train = targets[10000:]

x_test = data[:10000]
y_test = targets[:10000]


In [44]:
# #load dataset as a list of ints
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
# make all sequences of the same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test =  pad_sequences(x_test, maxlen=maxlen)

In [45]:
x_train.shape, y_train.shape

((40000, 300), (40000,))

In [46]:
x_test.shape, y_test.shape

((10000, 300), (10000,))

## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [54]:
from keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, Dropout

In [55]:
# Input - Layer
model = Sequential()

# Hidden - Layers (Embedding and LSTM layers)
model.add(Embedding(vocab_size, 100, input_length=maxlen))
model.add(LSTM(100))
model.add(Dropout(0.5))
# model.add(Flatten())
model.add(Dense(250, activation='relu'))


# Output- Layer
model.add(Dense(1,activation='sigmoid'))

In [56]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 300, 100)          1000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 250)               25250     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 251       
Total params: 1,105,901
Trainable params: 1,105,901
Non-trainable params: 0
_________________________________________________________________


### Now we compile our model, which is nothing but configuring the model for training. We use the “adam” optimizer, an algorithm that changes the weights and biases during training. We also choose binary-crossentropy as loss (because we deal with binary classification) and accuracy as our evaluation metric.

In [57]:
model.compile(
    optimizer="adam", 
    loss="binary_crossentropy", 
    metrics=["accuracy"])

### Now we're able to train our model. We'll do this with a batch_size of 64

In [58]:
results = model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=512, epochs=3) # Train

Epoch 1/3
Epoch 2/3
Epoch 3/3


### It is time to evaluate our model:

In [59]:
y_pred = model.predict_classes(x_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [60]:
import pandas as pd
# Comparision of sample predicted values with actuals
df = pd.DataFrame({'y_test': y_test[0:15], 'y_pred': y_pred[0:15].reshape(15,)})
df

Unnamed: 0,y_test,y_pred
0,1,1
1,0,0
2,0,0
3,1,1
4,0,0
5,0,0
6,1,1
7,0,0
8,1,1
9,0,0


In [61]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,y_pred)


array([[4337,  610],
       [ 531, 4522]], dtype=int64)

In [62]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8859

### With this simple model we could able to achieve 88% of test accuracy which is Awesome! and further it can be improved by more epochs and tuning the hyperparameters along with the regularization techniques.

## Retrive the output of each layer in keras for a given single test sample from the trained model you built

In [63]:
for i in range(0, len(model.layers)):
    tmp_model = Model(model.layers[0].input, model.layers[i].output)
    tmp_output = tmp_model.predict(x_test)[3]
    print(i+1, 'layer output\n')
    print(tmp_output,'\n','Size:',tmp_output.shape, '\n\n')

1 layer output

[[-0.01994763  0.04415705 -0.02281809 ... -0.04648542 -0.00789339
   0.05011789]
 [-0.03828428 -0.01711298 -0.04496679 ... -0.02465373  0.03539054
  -0.04351547]
 [ 0.00256273  0.01656945 -0.02159555 ...  0.00181516 -0.03512169
   0.00111119]
 ...
 [ 0.04091096  0.00753152 -0.03107029 ...  0.00526861 -0.00216971
   0.02148043]
 [ 0.00411884  0.02632968  0.02189445 ...  0.02500214  0.0302964
  -0.05076516]
 [ 0.00874224  0.01604227 -0.00744087 ...  0.00504621 -0.00247939
  -0.04942224]] 
 Size: (300, 100) 


2 layer output

[ 6.42046332e-02  6.46007657e-02 -5.07841632e-02 -7.03097656e-02
 -6.49217144e-02  9.05142426e-02 -1.11441888e-01 -5.68717078e-04
  1.83887016e-02  2.47482881e-01 -6.72579035e-02  1.61057010e-01
 -6.54228264e-03  6.55505583e-02  1.80995479e-01  1.10972956e-01
  9.64758992e-02  4.91496883e-02  6.16583228e-02 -7.76321664e-02
  6.96655316e-03 -2.27410458e-02 -1.21789174e-02  1.79617926e-02
  3.97892185e-02 -6.65654093e-02  2.63730250e-03  7.02043250e-02
