In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Objective:

   Tobuild sequential NLP classifier to determine the customer sentiments based on the review comments.

## About Dataset:
   
   *) Using IMDB movie review dataset has 50000 movie reviews with balanced sample of positive/negative reviews.

   *) The reviews are already preprocessed and encoded with word index(integers) by the frequency of each words. Hence text preprocessing is not required.

#Steps:

*) Loading the dataset and preparing Train & Test data.

*) Since the reviews had already been preprocessed and indexed the data is good to go with tokenisation and vectorization.

*) Padding the sequence to make all the reviews of same length with 250 characters.

*) Decoding the original features.

*) Model Building using Embedding layer along with Pre trained glove model.

*) LSTM layer and Dense layer as output layer with sigmoid activation function.

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.datasets import imdb

##Loading Dataset & Preparing Train and Test data

In [213]:
(x_train,y_train),(x_test, y_test) = imdb.load_data(num_words=10000) ##using the top 10000 frequent words and elimination the top 15 most frequent words.

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [214]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(25000,) (25000,) (25000,) (25000,)


In [215]:
np.unique(x_train)

array([list([1, 2, 2, 2, 2, 9, 4, 86, 6594, 20, 7, 1300, 2, 6457, 5, 1238, 9, 24, 44, 2, 2, 2, 42, 2, 12, 9, 1004, 6, 55, 338, 1830, 11, 6, 2, 11, 6327, 121, 4, 2, 26, 3276, 2, 2, 2, 5233, 372, 4780, 13, 81, 24, 124, 4, 282, 138, 6457, 6546, 14, 1579, 756, 5, 2776, 8, 276, 2511, 21, 849, 36, 26, 55, 619, 537, 49, 7, 98, 483, 2244, 13, 1276, 40, 4, 1301, 65, 7, 4, 430, 5, 27, 336, 15, 925, 19, 6, 313, 7, 68, 205, 5, 2171, 34, 98, 4, 65, 7, 4, 3000, 430, 15, 2251, 29, 7446, 6, 1374, 4, 65, 7, 4, 132, 15, 5245, 677, 476, 17, 48, 36, 71, 68, 205, 3197, 5, 2847, 5, 4, 65, 7, 4, 185, 255, 5233, 34, 41, 2, 61, 2302, 9, 3020, 10, 10, 425, 3829, 2, 475, 1604, 2, 5438, 2, 2, 475, 4, 96, 7, 4, 113]),
       list([1, 2, 2, 2, 2, 9, 6, 87, 20, 24, 64, 11, 4, 291, 12, 16, 324, 21, 82, 150, 45, 35, 498, 20, 63, 9, 24, 5903, 60, 11, 6161, 1117, 4, 167, 2, 6, 52, 1321, 2703, 5, 1346, 93, 4, 65, 38, 3557, 10, 10, 4, 20, 1791, 72, 7, 4, 303, 785, 162, 2869, 20, 2, 2, 5203, 2, 2, 63, 9, 7, 4, 729, 2989]),

In [216]:
max(max(x_train))

9995

In [217]:
print(x_train[88])

[1, 18, 6, 20, 19, 6, 114, 40, 14, 13, 62, 1760, 7625, 2, 11, 4, 86, 747, 234, 5, 471, 12, 125, 21, 14, 16, 55, 73, 93, 19, 921, 9271, 87, 116, 5, 49, 2070, 163, 388, 12, 16, 82, 221, 8, 67, 6, 275, 1181, 6, 2, 31, 33, 15, 61, 322, 5, 13, 199, 2, 12]


In [218]:
print(y_train[88])

1


In [219]:
print(x_test[235])

[1, 1760, 13, 244, 6, 801, 948, 2, 250, 21, 103, 149, 14, 22, 23, 6221, 7, 265, 146, 260, 275, 8353, 5, 82, 13, 122, 24, 124, 15, 61, 514, 748, 1182, 2, 93, 160, 22, 198, 44, 1830, 7, 9415, 625, 543, 4056, 344, 9, 5907, 34, 1714, 2, 2, 5, 6920, 2, 9374, 525, 5, 89, 6, 2, 185, 2, 773, 3220, 4, 251, 5, 2939, 4, 483, 7, 9180, 4, 2, 13, 119, 4, 1334, 1563, 9180, 25, 124, 103, 149, 4, 22, 4, 3096, 1578, 72, 7, 4, 49, 7, 4, 3096, 39, 1561, 5, 46, 7, 4, 690, 11, 4, 22, 13, 119, 1106, 6, 378, 11, 4, 1662, 54, 13, 1941, 15, 613, 12, 220, 93, 72, 1415, 10, 10, 14, 389, 22, 16, 4, 333, 5, 477, 792, 8, 216, 46, 7, 4, 2, 1182, 4, 22, 16, 1822, 170, 8, 30, 626, 23, 9561, 7, 9549, 21, 237, 4, 2, 3008, 910, 626, 2, 2485, 908, 5510, 1194, 4, 1304, 8, 8290, 7, 4, 172, 291, 21, 443, 5698, 472, 435, 83, 6, 95, 2, 3627, 7, 394, 2852, 260, 4, 6343, 7, 636, 107, 504, 103, 4, 1274, 23, 4658, 8691, 443, 5698, 16, 6, 4118, 1690, 5, 1638, 8, 4, 7, 2431, 5, 3833, 2, 39, 4, 1182, 36, 69, 2921, 11, 2, 5, 2, 4, 1169

In [220]:
print(y_test[235])

1


##Pad Sequences

In [221]:
max_length = 250 ## considering the first 250 words of each review

####Tried with first 20,50, 70, 100,150,200 words in each review but the model gives lesser accuracy than 250. Hence considering first 250 words

In [222]:
X_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen = max_length,padding = 'pre', truncating='post')

In [223]:
X_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,maxlen = max_length,padding = 'pre', truncating='post')

In [224]:
X_train.shape ## Feature shape

(25000, 250)

In [225]:
X_test.shape

(25000, 250)

In [226]:
y_train.shape ##label shape

(25000,)

In [227]:
print(X_train[95]) ## feature values

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    1   13   66  215   28 1059    6  275
   22   39    4  360    7    4    2   23   14 2138   88   33    6 2824
    7    4   22  236  314    4  311   16   38    2   34    4    2   15
  146   24   60 3988  320 2529   46  324   17   48    4 1723  197   29
   71 6428   49 3922  162 3641   34 5267    6 4717  582    7  559   65
    4   22    2  628 2219 1581 2057   19 7707 1581  116    4  354   11
   14   22   26   38  753    5 1100   15   13 2626   31    7    4  156
   16  170    8 1345   46   34   89 3658   36  468    8   30   34    4
  229  803  433    9   15   45 6594   40   35  390    7 6127 4583   13
   92  124   37   14  167 1291   29    9  279   29   47    2    7    4
 2427   40    2    2  525   21    4  439    9   15   32    7    4    2
 1046 

In [228]:
print(y_train[95]) ## label

0


## Decoding the fearture to get original sentence

In [229]:
word_index = imdb.get_word_index() ## loading the word index from imdb

In [230]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) ## mapping the index to its respective words

In [235]:
decoded_review1= ' '.join([reverse_word_index.get(i-3,'') for i in x_train[156]]) 
# indices are off by 3 because 0, 1, and 2 are reserverd indices for "padding", "Start of sequence" and "unknown"

In [236]:
decoded_review = ' '.join([reverse_word_index.get(i-3,'') for i in X_train[156]]) 

In [237]:
decoded_review ## after padding first 250 words

"                                                                                                      what i hate about this show is how poorly the leads are written these women have no self respect or dignity the entire plot is them throwing themselves at guys amanda  talent is completely wasted she was brilliant on all that and her own show why they would write her and jenny  as vapid  desperate men chasing old maid  is beyond me br br their plots and dialog remind me of the  homer says whenever his cartoon character  is not on screen everyone should ask where's  all the talk centers on whining about some guy and then whining to some guy sometimes they change it up and the guy  instead then they get back together or break up at the end the 2 women are either shallow stupid or sex  the only word i can think of is sucks"

In [238]:
decoded_review1 ## original sentence

" what i hate about this show is how poorly the leads are written these women have no self respect or dignity the entire plot is them throwing themselves at guys amanda  talent is completely wasted she was brilliant on all that and her own show why they would write her and jenny  as vapid  desperate men chasing old maid  is beyond me br br their plots and dialog remind me of the  homer says whenever his cartoon character  is not on screen everyone should ask where's  all the talk centers on whining about some guy and then whining to some guy sometimes they change it up and the guy  instead then they get back together or break up at the end the 2 women are either shallow stupid or sex  the only word i can think of is sucks"

## Building Model using Embedding layer(Word2Vec)

In [239]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM, Dense,Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [240]:
tf.keras.backend.clear_session()
model = Sequential()

#### Adding Embedding layer
    Embedding layer input = Batch_Size * Length of each review

In [39]:
vocab_size = 10000

In [None]:
model.add(Embedding(vocab_size + 1,  # Vocabulary size and padding value
                    50, #Embedding size
                    input_length = max_length)) ## number of words in each review

In [None]:
model.output

<KerasTensor: shape=(None, 250, 50) dtype=float32 (created by layer 'embedding')>

In [None]:
model.add(Dropout(0.3))
model.add(LSTM(128))
model.add(Dropout(0.3))

### Output layer using dense layer

In [None]:
model.add(Dense(1, activation = 'sigmoid'))

In [None]:
model.compile(optimizer = 'adam',loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 50)           500050    
_________________________________________________________________
dropout (Dropout)            (None, 250, 50)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 591,827
Trainable params: 591,827
Non-trainable params: 0
_________________________________________________________________


## Training the model

In [None]:
model.fit(X_train,y_train,batch_size = 64, epochs = 10, validation_data = (X_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3ff0a51f10>

*) There is overfit in the model

##Using Pre-Trained Embedding(Glove model)

In [241]:
import gensim.downloader as api

In [242]:
glove_model = api.load('glove-wiki-gigaword-50')

In [243]:
glove_model.vectors.shape

(400000, 50)

In [244]:
glove_model['great']

array([-0.026567,  1.3357  , -1.028   , -0.3729  ,  0.52012 , -0.12699 ,
       -0.35433 ,  0.37824 , -0.29716 ,  0.093894, -0.034122,  0.92961 ,
       -0.14023 , -0.63299 ,  0.020801, -0.21533 ,  0.96923 ,  0.47654 ,
       -1.0039  , -0.24013 , -0.36325 , -0.004757, -0.5148  , -0.4626  ,
        1.2447  , -1.8316  , -1.5581  , -0.37465 ,  0.53362 ,  0.20883 ,
        3.2209  ,  0.64549 ,  0.37438 , -0.17657 , -0.024164,  0.33786 ,
       -0.419   ,  0.40081 , -0.11449 ,  0.051232, -0.15205 ,  0.29855 ,
       -0.44052 ,  0.11089 , -0.24633 ,  0.66251 , -0.26949 , -0.49658 ,
       -0.41618 , -0.2549  ], dtype=float32)

In [245]:
#Initialize embedding matrix for our dataset with 10000+1 rows (1 for padding word)
#and 50 columns (as embedding size is 50)
embedding_matrix = np.zeros((vocab_size + 1, 50))

In [246]:
for word, i in sorted(word_index.items(),key=lambda x:x[1]):
    if i > (vocab_size+1):
        break
    try:
        embedding_vector = glove_model[word] #Reading word's embedding from Glove model for a given word
        embedding_matrix[i] = embedding_vector
    except:
        pass

In [247]:
embedding_matrix[155]

array([-0.14751001,  0.55555999,  1.07640004,  0.044167  ,  0.49217001,
        0.31183001, -0.62123001, -0.28246   , -0.45550001, -0.37761   ,
       -0.23383   , -0.75712001, -0.19904   , -0.19379   ,  1.16320002,
       -0.56375998, -0.49566001, -0.19437   , -1.49870002,  0.1349    ,
        0.56518   , -0.15299   ,  1.12220001,  0.11022   , -0.59064001,
       -0.7489    ,  0.77516001, -0.62996   ,  0.18706   , -0.16483   ,
        3.74780011,  0.51148999, -0.19912   ,  0.46902999,  0.69338   ,
       -0.20723   ,  0.47422999,  0.22966   ,  0.53956002, -0.12704   ,
       -0.29328999, -0.15497001,  0.89543998, -0.33169001, -0.4892    ,
        0.29824999, -0.10244   , -0.3635    ,  0.12941   ,  0.18798   ])

##Building Model using pre trained embedding layer

In [248]:
tf.keras.backend.clear_session()
model = Sequential()

In [249]:
model.add(Embedding(vocab_size + 1, #Vocablury size
                                    50, #Embedding size
                                    weights=[embedding_matrix],
                                    trainable=False,
                                    input_length=max_length) #Number of words in each review
          )

In [250]:
model.add(Dropout(0.2))
model.add(LSTM(130)) #RNN State - size of cell state and hidden state
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

In [251]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [252]:
es = EarlyStopping(monitor='val_loss', patience = 5)
checkpoint= ModelCheckpoint('/content/drive/MyDrive/sequence.h5',save_best_only=True, monitor='val_accuracy',mode='max', verbose=1)

In [253]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 50)           500050    
_________________________________________________________________
dropout (Dropout)            (None, 250, 50)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 130)               94120     
_________________________________________________________________
dropout_1 (Dropout)          (None, 130)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 131       
Total params: 594,301
Trainable params: 94,251
Non-trainable params: 500,050
_________________________________________________________________


In [254]:
model.output

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dense')>

In [255]:
model.fit(X_train,y_train,epochs=25,batch_size=32,validation_data=(X_test, y_test), callbacks = [checkpoint])

Epoch 1/25

Epoch 00001: val_accuracy improved from -inf to 0.55828, saving model to /content/drive/MyDrive/sequence.h5
Epoch 2/25

Epoch 00002: val_accuracy improved from 0.55828 to 0.57728, saving model to /content/drive/MyDrive/sequence.h5
Epoch 3/25

Epoch 00003: val_accuracy did not improve from 0.57728
Epoch 4/25

Epoch 00004: val_accuracy improved from 0.57728 to 0.61040, saving model to /content/drive/MyDrive/sequence.h5
Epoch 5/25

Epoch 00005: val_accuracy improved from 0.61040 to 0.66368, saving model to /content/drive/MyDrive/sequence.h5
Epoch 6/25

Epoch 00006: val_accuracy improved from 0.66368 to 0.66780, saving model to /content/drive/MyDrive/sequence.h5
Epoch 7/25

Epoch 00007: val_accuracy improved from 0.66780 to 0.69236, saving model to /content/drive/MyDrive/sequence.h5
Epoch 8/25

Epoch 00008: val_accuracy improved from 0.69236 to 0.72420, saving model to /content/drive/MyDrive/sequence.h5
Epoch 9/25

Epoch 00009: val_accuracy improved from 0.72420 to 0.74076, sav

<tensorflow.python.keras.callbacks.History at 0x7f30078b0c50>

#Predicting a value using best model

In [256]:
Best_model = tf.keras.models.load_model('/content/drive/MyDrive/sequence.h5')

In [257]:
print(X_test[563])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    1   14
   20    9    6 1557  991    5    2   47   32    4 9574  383  519 6141
    2    5    6 9093  969   15 2282  210  473   21  115  188   14    9
    2  356    2 6553  405    2    5 6339    9   44  107 3008    2  773
    4    5    4  701    7    4    2  126  188  367  126  237    2    2
  336 5822 6498    2  336   11    4 6480  970  107 3008    2 2306  972
  366    2    5 6339  169  283  119  295    5   54  257  499  842    7
   14 

In [258]:
decoded_review = ' '.join([reverse_word_index.get(i-3,'') for i in X_test[563]]) 

In [259]:
decoded_review

'                                                                                                               this movie is a absolute masterpiece and  has all the kinky sex car crashes  and a penis monster that shakespeare always wanted but never got this is  classic  troma style  and juliet is about two rival  named the and the non of the  ever got along ever since   father screwed monty  father in the filmmaking business two rival  grow apart until  and juliet find true love together and when each side hear of this blood shed is the least that happens yes  and juliet is the troma classic  by fans world wide witness harry balls the penis monster first feature film and also has first troma appearance what are you waiting for now go out and rent the movie br br 10 10 br br'

In [262]:
to_predict = np.array(X_test[563])

In [263]:
np.round(np.average(Best_model.predict(to_predict))) ## prediction of review at 563

1.0

In [264]:
y_test[563] ##original rating

1

In [265]:
predicted = np.round(Best_model.predict(X_test))

In [266]:
predicted.shape

(25000, 1)

In [267]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.78      0.80      0.79     12500
           1       0.80      0.78      0.79     12500

    accuracy                           0.79     25000
   macro avg       0.79      0.79      0.79     25000
weighted avg       0.79      0.79      0.79     25000

