# Sentiment Classification with RNN

We implement an RNN-based (LSTM) sentiment classifier for the IMDB movie reviews dataset.

## Dependencies and Parameters

In [None]:
import keras
from keras.datasets import imdb
from google.colab import drive
from datetime import datetime
# Mount Google Drive at /content/drive; SAVE_PATH (defined below) lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Run configuration: every tunable value of the notebook lives in this cell.
VOCAB_SIZE = 10000       # words kept by imdb.load_data and rows of the Embedding layer (the full word2id dictionary has 88584 entries)
EMBEDDING_SIZE = 32      # output dimension of the Embedding layer
HIDDEN_DIM = 200         # number of LSTM units
NUM_LAYERS = 1           # only used in the name of the save directory
MAX_LEN = 500            # reviews are padded/truncated to this many tokens
NUM_SAMPLES = 25000      # number of training reviews used
NUM_EPOCH = 10           # training epochs
CALLBACK = True          # switch for attaching the callbacks to model.fit
TEST_SIZE = round(NUM_SAMPLES * 0.15)   # number of test reviews used for validation/evaluation

In [None]:
# Output directory on the mounted Drive; the run configuration is encoded in its name as
# rnn-<embedding>-<hidden>-<layers>-<max_len>-<samples>-<epochs>-<vocab>.
SAVE_PATH = f"/content/drive/MyDrive/models/rnn-{EMBEDDING_SIZE}-{HIDDEN_DIM}-{NUM_LAYERS}-{MAX_LEN}-{NUM_SAMPLES}-{NUM_EPOCH}-{VOCAB_SIZE}"
# Sub-directory handed to the TensorBoard and CSVLogger callbacks.
LOGDIR = SAVE_PATH + '/logs/'
print(SAVE_PATH)

In [None]:
!mkdir $SAVE_PATH

## Loading and Preprocessing the Data

In [None]:
# Token-id conventions passed to imdb.load_data: ids 1 and 2 are reserved for the
# start-of-review and out-of-vocabulary markers, `index_from` is the offset given for word ids.
vocabulary_size = VOCAB_SIZE
start_char, oov_char, index_from = 1, 2, 3

train_split, test_split = imdb.load_data(
    num_words=vocabulary_size,
    start_char=start_char,
    oov_char=oov_char,
    index_from=index_from,
)
X_train, y_train = train_split
X_test, y_test = test_split
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Loaded dataset with 25000 training samples, 25000 test samples


In [None]:
# Show the first training example: its encoded token ids followed by its label.
sample_review, sample_label = X_train[0], y_train[0]
print('---review---', sample_review, '---label---', sample_label, sep='\n')

---review---
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
---label---
1


In [None]:
# Decode token ids back to words.

word2id = imdb.get_word_index()
# Shift every dictionary index by `index_from` so the keys line up with the ids in the
# loaded reviews, then register the two reserved markers.
inverted_word_index = {idx + index_from: word for word, idx in word2id.items()}
inverted_word_index.update({start_char: "[START]", oov_char: "[OOV]"})
print(len(inverted_word_index)) # total number of entries, including the two markers

# Decode the first review once and show it both as a token list and as a sentence;
# ids without an entry are rendered as a single space.
decoded_review = [inverted_word_index.get(token, ' ') for token in X_train[0]]
print('---review with words---')
print(decoded_review)
print(' '.join(decoded_review))
print('---label---')
print(y_train[0])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
88586
---review with words---
['[START]', 'this', 'film', 'was', 'just', 'brilliant', 'casting', 'location', 'scenery', 'story', 'direction', "everyone's", 'really', 'suited', 'the', 'part', 'they', 'played', 'and', 'you', 'could', 'just', 'imagine', 'being', 'there', 'robert', '[OOV]', 'is', 'an', 'amazing', 'actor', 'and', 'now', 'the', 'same', 'being', 'director', '[OOV]', 'father', 'came', 'from', 'the', 'same', 'scottish', 'island', 'as', 'myself', 'so', 'i', 'loved', 'the', 'fact', 'there', 'was', 'a', 'real', 'connection', 'with', 'this', 'film', 'the', 'witty', 'remarks', 'throughout', 'the', 'film', 'were', 'great', 'it', 'was', 'just', 'brilliant', 'so', 'much', 'that', 'i', 'bought', 'the', 'film', 'as', 'soon', 'as', 'it', 'was', 'released', 'for', '[OOV]', 'and', 'would', 'recommend', 'it', 'to', 'everyone', 'to', 'watch', 'and', 'the', 'fly', 'fishing', 'was', 'amazing',

In [None]:
# Longest and shortest review over BOTH splits.
# The lengths are collected review by review: using `+` on the two splits would not
# chain them (on NumPy object arrays it concatenates the i-th train review with the
# i-th test review), and the minimum must look at the training split as well.
review_lengths = [len(review) for split in (X_train, X_test) for review in split]
print('Maximum review length: {}'.format(max(review_lengths)))
print('Minimum review length: {}'.format(min(review_lengths)))
# Size of the full word -> index dictionary.
print(len(word2id))

Maximum review length: 2697
Minimum review length: 14
88584


### Pad Sequences
To feed this data into our RNN, all input documents must have the same length. We fix the review length to `max_words` by truncating longer reviews and padding shorter reviews with zeros, using the `pad_sequences()` function in Keras.

In [None]:
from keras_preprocessing.sequence import pad_sequences

# Bring every review of both splits to exactly `max_words` tokens.
max_words = MAX_LEN
X_train, X_test = (pad_sequences(split, maxlen=max_words) for split in (X_train, X_test))

## Creating the RNN Model Instance

In [None]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Embedding -> single LSTM -> one sigmoid unit for the binary sentiment label.
embedding_size = EMBEDDING_SIZE
model = Sequential([
    Embedding(vocabulary_size, embedding_size, input_length=max_words),
    LSTM(HIDDEN_DIM),
    Dense(1, activation='sigmoid'),
])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 32)           320000    
                                                                 
 lstm (LSTM)                 (None, 200)               186400    
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 506,601
Trainable params: 506,601
Non-trainable params: 0
_________________________________________________________________
None


## Callbacks

In [None]:
class LossAndErrorPrintingCallback(keras.callbacks.Callback):
  """Collects the loss and accuracy reported after every batch, grouped by epoch.

  After training, `train_losses`, `train_accuracy`, `test_losses` and
  `test_accuracy` map an epoch index to the list of values that were logged
  for the batches of that epoch.

  NOTE(review): Keras usually reports running averages over the epoch in the
  batch logs rather than raw per-batch values -- confirm before interpreting
  the recorded numbers as per-batch metrics.
  """

  def __init__(self):
    # Let the Keras base class initialise its own attributes.
    super().__init__()
    # Buffers for the epoch that is currently running.
    self.train_batch = []
    self.test_batch = []
    self.train_accuracy_batch = []
    self.test_accuracy_batch = []
    # Finished epochs: epoch index -> list of logged values.
    self.train_losses = {}
    self.test_losses = {}
    self.train_accuracy = {}
    self.test_accuracy = {}

  def on_train_batch_end(self, batch, logs=None):
    """Store the loss and accuracy logged for a training batch."""
    self.train_batch.append(logs["loss"])
    self.train_accuracy_batch.append(logs["accuracy"])

  def on_test_batch_end(self, batch, logs=None):
    """Store the loss and accuracy logged for a validation/test batch."""
    self.test_batch.append(logs["loss"])
    self.test_accuracy_batch.append(logs["accuracy"])

  def on_epoch_end(self, epoch, logs=None):
    """File the buffered values under `epoch`, reset the buffers and print the epoch loss."""
    self.train_losses[epoch] = self.train_batch
    self.test_losses[epoch] = self.test_batch
    self.train_accuracy[epoch] = self.train_accuracy_batch
    self.test_accuracy[epoch] = self.test_accuracy_batch
    # Start the next epoch with fresh lists (the stored ones must not be mutated).
    self.train_batch = []
    self.test_batch = []
    self.train_accuracy_batch = []
    self.test_accuracy_batch = []
    print("The average loss for epoch {} is {:7.2f} ".format(epoch, logs["loss"]))

loss_and_error = LossAndErrorPrintingCallback()

In [None]:
import time

class TimeCallback(keras.callbacks.Callback):
    """Measures wall-clock time per training batch, per epoch and for the whole run.

    Attributes:
      batch_times: epoch index -> list of durations (seconds) of its training batches.
      epoch_times: epoch index -> sum of the batch durations of that epoch.
      total_time:  duration of the last training run in seconds; only valid
                   after `on_train_end` has been called.
    """

    def __init__(self):
        # Let the Keras base class initialise its own attributes.
        super().__init__()
        self.times = []
        # reference point used to compute the duration of the next batch
        self.timetaken = time.time()
        self.batch_times = {}
        self.epoch_times = {}
        self.total_time = time.time()

    def on_train_begin(self, logs=None):
        """Restart the clocks when training actually starts.

        Without this, the first batch and the total time would include the
        time between creating the callback and calling fit, and a second
        fit call would compute the total from a stale value.
        """
        self.times = []
        self.timetaken = time.time()
        self.total_time = self.timetaken

    def on_train_batch_end(self, batch, logs=None):
        """Record the time elapsed since the previous reference point."""
        now = time.time()
        self.times.append(now - self.timetaken)
        self.timetaken = now

    def on_epoch_end(self, epoch, logs=None):
        """Store the timings of the finished epoch and report its duration."""
        self.batch_times[epoch] = self.times
        self.epoch_times[epoch] = sum(self.times)

        # Report the measured epoch duration; this must use the stored value,
        # because the reference point is reset below.
        print(f"it took {(epoch, self.epoch_times[epoch])} time")

        # reset variables for the next epoch
        self.times = []
        self.timetaken = time.time()

    def on_train_end(self, logs=None):
        """Turn the start timestamp into the total training duration."""
        self.total_time = time.time() - self.total_time

time_callback = TimeCallback()

## Training
We use the binary cross-entropy loss function, Adam optimizer, and accuracy metric.

In [None]:
# Binary classification set-up: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
batch_size = 64
test_size, num_epochs, samples = TEST_SIZE, NUM_EPOCH, NUM_SAMPLES

# Keep only the leading `samples` training reviews and the leading `test_size` test reviews.
X_train2, y_train2 = X_train[:samples], y_train[:samples]
X_test, y_test = X_test[:test_size], y_test[:test_size]

In [None]:
# Quick look at the training labels that will be passed to model.fit.
print(y_train2)

[1 0 0 ... 0 1 0]


In [None]:
from datetime import datetime
import os
import tensorflow as tf

callbacks = []
logdir = LOGDIR

# Always define `my_callbacks`: model.fit(..., callbacks=my_callbacks) is executed
# regardless of CALLBACK and would otherwise fail with a NameError when CALLBACK is False.
my_callbacks = []

if CALLBACK:
  # CSVLogger writes its file inside `logdir`, so make sure the directory exists.
  os.makedirs(logdir, exist_ok=True)

  my_callbacks = [
      # NOTE(review): this checkpoint path is relative to the current working
      # directory, not to SAVE_PATH -- confirm that this is intended.
      tf.keras.callbacks.ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),
      tf.keras.callbacks.TensorBoard(log_dir=logdir),
      # append=True: rows are appended to the file when it already exists
      tf.keras.callbacks.CSVLogger(logdir + 'logs.csv', append=True, separator=','),
      time_callback,
      loss_and_error
  ]


In [None]:
# Train the model; the test slice doubles as validation data after every epoch.
history = model.fit(
    X_train2,
    y_train2,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(X_test, y_test),
    callbacks=my_callbacks,
)

Epoch 1/10
The average loss for epoch 0 is    0.52 
Epoch 2/10
The average loss for epoch 1 is    0.31 
Epoch 3/10
The average loss for epoch 2 is    0.23 
Epoch 4/10
The average loss for epoch 3 is    0.19 
Epoch 5/10
The average loss for epoch 4 is    0.14 
Epoch 6/10
The average loss for epoch 5 is    0.11 
Epoch 7/10
The average loss for epoch 6 is    0.07 
Epoch 8/10
The average loss for epoch 7 is    0.09 
Epoch 9/10
The average loss for epoch 8 is    0.05 
Epoch 10/10
The average loss for epoch 9 is    0.04 


## Evaluating

In [None]:
# Print the per-epoch metrics recorded by fit, then score the model on the test slice.
print(history.history)
scores = model.evaluate(X_test, y_test, verbose=0)
# scores holds the loss first, followed by the compiled metrics (here: accuracy)
test_accuracy = scores[1]
print('Test accuracy:', test_accuracy)

{'loss': [0.5164880156517029, 0.3067905008792877, 0.2298823595046997, 0.18688984215259552, 0.14443299174308777, 0.10630108416080475, 0.0724228248000145, 0.08635959029197693, 0.04910784214735031, 0.04260509833693504], 'accuracy': [0.7390000224113464, 0.8773599863052368, 0.9106000065803528, 0.9305599927902222, 0.9463599920272827, 0.9631199836730957, 0.9774399995803833, 0.9721999764442444, 0.9842000007629395, 0.9863600134849548], 'val_loss': [0.34014835953712463, 0.330218106508255, 0.3185320496559143, 0.35855311155319214, 0.49280136823654175, 0.4512496888637543, 0.5241455435752869, 0.5965407490730286, 0.6645947098731995, 0.6722164750099182], 'val_accuracy': [0.8589333295822144, 0.8602666854858398, 0.867733359336853, 0.8575999736785889, 0.7826666831970215, 0.8581333160400391, 0.8354666829109192, 0.8469333052635193, 0.8560000061988831, 0.850933313369751]}
Test accuracy: 0.850933313369751


In [None]:
# total_time is turned into a duration by TimeCallback.on_train_end, so this is only meaningful after model.fit has finished.
print(f'Total time = {time_callback.total_time}')

In [None]:
# Persist the trained model in the run directory on Drive.
PATH = SAVE_PATH
model.save(PATH)



## Loading the Model

In [None]:
# loading the saved model
# NOTE(review): hard-coded directory of a previously saved run; it is not derived from
# SAVE_PATH and its name ends in "-new" -- confirm which configuration produced it.
LOADPATH = "/content/drive/MyDrive/models/rnn-32-100-1-500-5000-5-new"

In [None]:
# List the contents of the saved-model directory.
!ls $LOADPATH

acc_vs_batch.png	 logs			    time_per_epoch.csv
acc_vs_epoch.png	 loss_vs_batch.png	    train_losses_per_batch.csv
assets			 saved_model.pb		    variables
cumul_time_vs_batch.png  test_losses_per_batch.csv
keras_metadata.pb	 time_per_batch.csv


In [None]:
import numpy as np
import tensorflow as tf
# NOTE: this rebinds the name `keras`, which the first cell imported as the standalone package, to tensorflow.keras.
from tensorflow import keras
# Restore the model stored under LOADPATH.
new_model = keras.models.load_model(LOADPATH)

In [None]:
# if you need to process data again
# NOTE(review): these values must match the configuration the loaded model was trained
# with. If LOADPATH follows the SAVE_PATH naming pattern, its sequence length would be
# 500 rather than 100 -- confirm `max_words` before trusting the results.
vocabulary_size = 88584
start_char, oov_char, index_from = 1, 2, 3
max_words = 100
samples = 5000
test_size = round(0.15 * samples)

train_split, test_split = imdb.load_data(
    num_words=vocabulary_size,
    start_char=start_char,
    oov_char=oov_char,
    index_from=index_from,
)
# Pad/truncate the reviews of both splits; the labels are taken over unchanged.
X_train, X_test = (pad_sequences(split[0], maxlen=max_words) for split in (train_split, test_split))
y_train, y_test = train_split[1], test_split[1]

# Keep only the leading slices of each split.
X_train2, y_train2 = X_train[:samples], y_train[:samples]
X_test, y_test = X_test[:test_size], y_test[:test_size]

In [None]:
# Score the restored model on the test slice.
scores = new_model.evaluate(X_test, y_test, verbose=0)
# presumably [loss, accuracy], as for the model trained above -- verify against the saved model's metrics
loaded_accuracy = scores[1]
print('Test accuracy:', loaded_accuracy)

In [None]:
# Load a fresh, unpadded copy of the data for spot checks of the restored model.
load_train_split, load_test_split = imdb.load_data(num_words=vocabulary_size)
X_load_train, y_load_train = load_train_split
X_load_test, y_load_test = load_test_split

In [None]:
# Run the restored model on the first five test reviews and compare with their labels.
print(X_load_train[0])
first_reviews = pad_sequences(X_load_test[0:5], maxlen=max_words)
output = new_model(tf.constant(first_reviews, dtype=np.float32))
print(output)
# Take the labels from the same split object as the inputs above, so that predictions
# and labels stay aligned regardless of what `y_test` currently holds.
print(y_load_test[0:5])

In [None]:
# testing on random inputs
inputs = ["i hate this movie", "this sucks", "i never want to watch this again"]

def encode_review(text):
  """Encode raw text with the same conventions imdb.load_data applied to the reviews.

  Each review starts with `start_char`, dictionary indices are shifted by
  `index_from`, and words that are unknown or fall outside the
  `vocabulary_size` most frequent words become `oov_char`.
  """
  token_ids = [start_char]
  for word in text.lower().split():
    rank = word2id.get(word)
    if rank is None or rank + index_from >= vocabulary_size:
      token_ids.append(oov_char)
    else:
      token_ids.append(rank + index_from)
  return token_ids

seqs = [encode_review(inp) for inp in inputs]
print(seqs)
output_bad = new_model(tf.constant(pad_sequences(seqs, maxlen=max_words), dtype=np.float32))
print(output_bad)

## Plotting Using the Model History


In [None]:
# CSV Logger has all this
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Names of the metrics that model.fit recorded per epoch.
print(history.history.keys())

In [None]:
# summarize history for accuracy
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
# the second curve is the validation data handed to fit, i.e. the test slice
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# summarize history for loss
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
# the second curve is the validation data handed to fit, i.e. the test slice
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Write every recorded metric to "<metric name>.txt" in the working directory and echo it.
for loss_type, recorded in history.history.items():
  metric_values = np.array(recorded)
  np.savetxt(f"{loss_type}.txt", metric_values, delimiter=",")
  print(metric_values)

## Saving the Info from the Callbacks

In [None]:
# remove current logs if exists
# NOTE(review): these names are relative to the working directory, while the export
# cells below write under SAVE_PATH -- confirm which files are meant.
import os
for file in ['train_losses_per_batch.csv', 'test_losses_per_batch.csv', 'time_per_epoch.csv', 'time_per_batch.csv', 'log.csv']:
  try:
    os.remove(file)
  except FileNotFoundError:
    # Nothing to clean up for this file. Any other error (permissions, the name
    # being a directory, ...) is no longer swallowed silently.
    continue

In [None]:
# save the per-batch training/test loss and accuracy of every epoch
import csv

def _write_per_batch_csv(path, value_column, values_by_epoch):
  """Write one CSV row per recorded value.

  Args:
    path: file to create (an existing file is overwritten).
    value_column: header of the value column.
    values_by_epoch: dict mapping an epoch index to the list of values
      recorded for the batches of that epoch.
  """
  # newline='' follows the csv module documentation: the writer emits its own
  # line endings and must not have them translated again by the file object.
  with open(path, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=["Epoch", value_column])
    writer.writeheader()
    for epoch, values in values_by_epoch.items():
      for value in values:
        writer.writerow({"Epoch": epoch, value_column: value})

_write_per_batch_csv(SAVE_PATH + '/train_losses_per_batch.csv', "Training Loss", loss_and_error.train_losses)
_write_per_batch_csv(SAVE_PATH + '/test_losses_per_batch.csv', "Test Loss", loss_and_error.test_losses)
_write_per_batch_csv(SAVE_PATH + '/train_accuracy_per_batch.csv', "Train Accuracy", loss_and_error.train_accuracy)
_write_per_batch_csv(SAVE_PATH + '/test_accuracy_per_batch.csv', "Test Accuracy", loss_and_error.test_accuracy)

print(loss_and_error.test_losses)

{0: [0.20284655690193176, 0.2619050145149231, 0.28528931736946106, 0.2870345413684845, 0.30299824476242065, 0.31459352374076843, 0.31224605441093445, 0.29143571853637695, 0.2943876087665558, 0.30005109310150146, 0.3015383780002594, 0.30754032731056213, 0.3157164752483368, 0.3220962584018707, 0.31983014941215515, 0.3309316039085388, 0.3299590051174164, 0.33018290996551514, 0.34008970856666565, 0.33774691820144653, 0.33699968457221985, 0.3347349762916565, 0.33153796195983887, 0.3256082236766815, 0.33024701476097107, 0.32964199781417847, 0.3301847577095032, 0.33240824937820435, 0.33125990629196167, 0.3304252326488495, 0.3295424282550812, 0.3263230621814728, 0.3243754208087921, 0.3240882158279419, 0.3280411660671234, 0.326635479927063, 0.32752570509910583, 0.32603493332862854, 0.3252958357334137, 0.3280918002128601, 0.32883554697036743, 0.3303927779197693, 0.3281922936439514, 0.3281281888484955, 0.3263099193572998, 0.3290502727031708, 0.32868653535842896, 0.3284500539302826, 0.331947565078

In [None]:
# save the time
file1 = SAVE_PATH + '/time_per_epoch.csv'
file2 = SAVE_PATH + '/time_per_batch.csv'
try:
    # One row holding the duration of every epoch; the epoch indices are the column names.
    # newline='' follows the csv module documentation for files passed to a writer.
    with open(file1, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(time_callback.epoch_times.keys()))
        writer.writeheader()
        writer.writerow(time_callback.epoch_times)
    # One row per training batch.
    with open(file2, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=["Epoch", "Batch", "Time"])
        writer.writeheader()
        for epoch, batch_durations in time_callback.batch_times.items():
            # The loop variable must not be called `time`: at notebook scope that would
            # overwrite the `time` module and break TimeCallback on any later run.
            for batch, duration in enumerate(batch_durations):
                writer.writerow({"Epoch": epoch, "Batch": batch, "Time": duration})
except IOError:
    print("I/O error")

print(time_callback.epoch_times)

{0: 408.1657633781433, 1: 396.4657554626465, 2: 390.7009654045105, 3: 389.960205078125, 4: 387.50929284095764, 5: 386.586377620697, 6: 386.9248082637787, 7: 387.29252099990845, 8: 386.8334963321686, 9: 386.12139916419983}
{0: 408.1657633781433, 1: 396.4657554626465, 2: 390.7009654045105, 3: 389.960205078125, 4: 387.50929284095764, 5: 386.586377620697, 6: 386.9248082637787, 7: 387.29252099990845, 8: 386.8334963321686, 9: 386.12139916419983}


## References

- We implemented the model as an adaptation of [Sentiment Analysis with RNN](https://github.com/susanli2016/NLP-with-Python/blob/master/Sentiment%20Analysis%20with%20RNN.ipynb)
- We created the plots in this notebook referencing [this post on using the model's history for plotting](https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/)