In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
import pandas as pd
import os
import re
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Load Data

In [3]:
def load_directory_data(directory):
    """Read every file in ``directory`` into a single-column DataFrame.

    Args:
        directory: Path of a folder; each entry returned by ``os.listdir``
            is opened in text mode and read in full.

    Returns:
        A ``pandas.DataFrame`` with one column, ``"review"``, holding the
        contents of each file in the order ``os.listdir`` yields them.
    """
    reviews = []
    for file_name in os.listdir(directory):
        review_path = os.path.join(directory, file_name)
        with tf.gfile.GFile(review_path, "r") as review_file:
            reviews.append(review_file.read())

    return pd.DataFrame.from_dict({"review": reviews})


def load_dataset(directory, random_state=None):
    """Load the positive and negative reviews under ``directory`` and shuffle them.

    Args:
        directory: Folder containing a ``pos`` and a ``neg`` sub-folder, each
            read with ``load_directory_data``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default ``None`` keeps the
            original unseeded shuffle.

    Returns:
        A ``pandas.DataFrame`` with the columns ``review`` and ``polarity``
        (1 for rows from ``pos``, 0 for rows from ``neg``), with the rows in
        shuffled order and a fresh 0..n-1 index.
    """
    pos_df = load_directory_data(os.path.join(directory, "pos")).assign(polarity=1)
    neg_df = load_directory_data(os.path.join(directory, "neg")).assign(polarity=0)

    # sample(frac=1) returns every row in random order; the old index is
    # dropped so positions in the result run from 0 upwards again.
    shuffled = pd.concat([pos_df, neg_df]).sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


def download_and_load_datasets(force_download=False):
    """Fetch the IMDB review archive and load its train and test splits.

    Args:
        force_download: Accepted by the signature but never read by the
            body, so it currently has no effect.

    Returns:
        A ``(train_df, test_df)`` tuple, each produced by ``load_dataset``.
    """
    archive_path = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True)

    # Both splits are read from the "aclImdb" folder that sits beside the
    # downloaded archive.
    corpus_dir = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_df = load_dataset(os.path.join(corpus_dir, "train"))
    test_df = load_dataset(os.path.join(corpus_dir, "test"))

    return train_df, test_df

In [4]:
# Restrict TensorFlow's log output to the ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Load both splits, then preview the first rows of the training frame.
train_df, test_df = download_and_load_datasets()
train_df.head()

Unnamed: 0,review,polarity
0,I couldn't believe that this movie dates from ...,0
1,If Ashanti had been a serious attempt at a fil...,0
2,"**SPOILERS**This was an ugly movie, and I'm so...",0
3,I originally reviewed this film on Amazon abou...,1
4,OK. I admit. I'm one of those nerds who have s...,1


In [6]:
# Split each frame into the raw review texts (model inputs) and the
# polarity labels (targets), as plain Python lists.
x_train_text = train_df["review"].tolist()
y_train = train_df["polarity"].tolist()

x_test_text = test_df["review"].tolist()
y_test = test_df["polarity"].tolist()

# Combined corpus: all training texts followed by all test texts.
data_text = x_train_text + x_test_text

In [7]:
# Show the raw text of the second training review.
x_train_text[1]

"If Ashanti had been a serious attempt at a film about the institution of slavery, still prevalent in third world countries the film might have been better received. Instead it turns into a star studded disaster of a movie where the stars came in, said their lines, and picked up their paychecks without much conviction.<br /><br />Michael Caine and his wife Beverly Johnson work for the United Nations World Health Organization and are busy doing their humanitarian thing in East Africa. Along comes Peter Ustinov who can barely summon enough ham in him to make a go of the part as a Moslem slave dealer. As Johnson is black he grabs her anyway along with a lot of children and a few adults as well.<br /><br />Of course Caine doesn't take kindly to the kidnapping and the rest of the film is spent in a rescue attempt. The rest of the cast has such folks as William Holden, Rex Harrison, Omar Sharif and Indian film star Kebir Bedi in parts and looking so incredibly bored with the whole thing.<br 

In [8]:
# Polarity label of that same review (1 = positive, 0 = negative).
y_train[1]

0

# Tokenizer

In [9]:
# Restrict the tokenizer to the `num_words` most frequent words of the corpus.
# Set num_words = None to use the entire vocabulary instead.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)

# The vocabulary is built from the combined training and test texts so that it
# is as complete as possible. The network itself is only trained on the
# training set, but note that test-set word frequencies do take part in
# deciding which words make the `num_words` cut.
tokenizer.fit_on_texts(data_text)

# With num_words = None the vocabulary size has to be derived from the fitted
# tokenizer. Word indices start at 1 and index 0 is used for padding, so the
# number of distinct token values (which the Embedding layer needs as its
# input_dim) is one more than the number of words.
if num_words is None:
    num_words = len(tokenizer.word_index) + 1

# Inspect the word -> index mapping gathered by the tokenizer.
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [10]:
# Convert every training review into a list of integer tokens using the
# fitted tokenizer.

x_train_tokens = tokenizer.texts_to_sequences(x_train_text)

In [11]:
# For example, here is the raw text of the second training review:
x_train_text[1]

"If Ashanti had been a serious attempt at a film about the institution of slavery, still prevalent in third world countries the film might have been better received. Instead it turns into a star studded disaster of a movie where the stars came in, said their lines, and picked up their paychecks without much conviction.<br /><br />Michael Caine and his wife Beverly Johnson work for the United Nations World Health Organization and are busy doing their humanitarian thing in East Africa. Along comes Peter Ustinov who can barely summon enough ham in him to make a go of the part as a Moslem slave dealer. As Johnson is black he grabs her anyway along with a lot of children and a few adults as well.<br /><br />Of course Caine doesn't take kindly to the kidnapping and the rest of the film is spent in a rescue attempt. The rest of the cast has such folks as William Holden, Rex Harrison, Omar Sharif and Indian film star Kebir Bedi in parts and looking so incredibly bored with the whole thing.<br 

In [12]:
# The same review as a list of integer tokens (wrapped in an array for compact display):
np.array(x_train_tokens[1])

array([  43,   66,   75,    3,  599,  602,   30,    3,   19,   42,    1,
       6229,    4, 8089,  130, 8928,    8,  873,  181, 3042,    1,   19,
        233,   25,   75,  126, 2015,  298,    9,  511,   82,    3,  324,
       1566,    4,    3,   17,  117,    1,  405,  384,    8,  301,   65,
        409,    2, 1624,   53,   65,  208,   72, 5663,    7,    7,  498,
       3249,    2,   24,  325, 5422, 2638,  158,   15,    1, 2249, 6378,
        181, 3353, 6813,    2,   23, 3217,  402,   65,  148,    8, 2499,
       2588,  340,  270,  822, 7017,   35,   67, 1183,  193, 4543,    8,
         87,    5,   94,    3,  139,    4,    1,  173,   14,    3, 4352,
       5423,   14, 2638,    6,  317,   28, 5511,   40,  548,  340,   16,
          3,  169,    4,  466,    2,    3,  171, 1540,   14,   69,    7,
          7,    4,  265, 3249,  149,  187, 8230,    5,    1, 5877,    2,
          1,  370,    4,    1,   19,    6,  995,    8,    3, 2168,  602,
          1,  370,    4,    1,  174,   45,  138, 14

In [13]:
# The test reviews are converted to integer tokens with the same tokenizer.

x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

# Padding and Truncating Data

In [14]:
# Length, in tokens, of every tokenized review (training and test together).
num_tokens = np.array([len(sequence) for sequence in x_train_tokens + x_test_tokens])

# Cap the sequence length at the mean plus two standard deviations,
# truncated to an integer.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

544

In [15]:
# Fraction of reviews that fit within max_tokens (about 95% of the data-set).
# The bound is inclusive: a review with exactly max_tokens tokens needs no
# truncation, so it counts as covered.

np.mean(num_tokens <= max_tokens)

0.94528

In [16]:
# 'pre' versus 'post' controls two things: which end of an over-long sequence
# is discarded when truncating, and which end of a short sequence receives the
# zero padding. The same value is used for both below, so zeros are added at
# the beginning and over-long reviews lose their first tokens. The choice can
# affect how well the Recurrent Neural Network learns.

pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,padding=pad, truncating=pad)
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,padding=pad, truncating=pad)

In [17]:
# For reference, the unpadded token sequence of the second training review:

np.array(x_train_tokens[1])

array([  43,   66,   75,    3,  599,  602,   30,    3,   19,   42,    1,
       6229,    4, 8089,  130, 8928,    8,  873,  181, 3042,    1,   19,
        233,   25,   75,  126, 2015,  298,    9,  511,   82,    3,  324,
       1566,    4,    3,   17,  117,    1,  405,  384,    8,  301,   65,
        409,    2, 1624,   53,   65,  208,   72, 5663,    7,    7,  498,
       3249,    2,   24,  325, 5422, 2638,  158,   15,    1, 2249, 6378,
        181, 3353, 6813,    2,   23, 3217,  402,   65,  148,    8, 2499,
       2588,  340,  270,  822, 7017,   35,   67, 1183,  193, 4543,    8,
         87,    5,   94,    3,  139,    4,    1,  173,   14,    3, 4352,
       5423,   14, 2638,    6,  317,   28, 5511,   40,  548,  340,   16,
          3,  169,    4,  466,    2,    3,  171, 1540,   14,   69,    7,
          7,    4,  265, 3249,  149,  187, 8230,    5,    1, 5877,    2,
          1,  370,    4,    1,   19,    6,  995,    8,    3, 2168,  602,
          1,  370,    4,    1,  174,   45,  138, 14

In [18]:
# After padding, the same review is prefixed with zeros up to a length of max_tokens.
x_train_pad[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

# Tokenizer Inverse Map

In [19]:
# Invert the tokenizer's word -> index mapping into index -> word.
idx = tokenizer.word_index
inverse_map = {token: word for word, token in idx.items()}

In [20]:
# A function for converting a list of tokens back to a string of words.

def tokens_to_string(tokens):
    """Turn a sequence of integer tokens back into a space-separated string.

    Args:
        tokens: Iterable of integer tokens. Zeros are skipped; every other
            token is looked up in the module-level ``inverse_map``.

    Returns:
        The corresponding words joined by single spaces.
    """
    return " ".join(inverse_map[token] for token in tokens if token != 0)

In [21]:
# For example, this is the original text of the second training review:

x_train_text[1]

"If Ashanti had been a serious attempt at a film about the institution of slavery, still prevalent in third world countries the film might have been better received. Instead it turns into a star studded disaster of a movie where the stars came in, said their lines, and picked up their paychecks without much conviction.<br /><br />Michael Caine and his wife Beverly Johnson work for the United Nations World Health Organization and are busy doing their humanitarian thing in East Africa. Along comes Peter Ustinov who can barely summon enough ham in him to make a go of the part as a Moslem slave dealer. As Johnson is black he grabs her anyway along with a lot of children and a few adults as well.<br /><br />Of course Caine doesn't take kindly to the kidnapping and the rest of the film is spent in a rescue attempt. The rest of the cast has such folks as William Holden, Rex Harrison, Omar Sharif and Indian film star Kebir Bedi in parts and looking so incredibly bored with the whole thing.<br 

In [22]:
# Rebuild the text from its tokens. Punctuation and capitalisation are gone,
# and words the tokenizer did not encode (e.g. "Ashanti" in this review) are
# missing from the result as well.

tokens_to_string(x_train_tokens[1])

"if had been a serious attempt at a film about the institution of slavery still prevalent in third world countries the film might have been better received instead it turns into a star disaster of a movie where the stars came in said their lines and picked up their without much conviction br br michael caine and his wife beverly johnson work for the united nations world health organization and are busy doing their thing in east africa along comes peter ustinov who can barely enough ham in him to make a go of the part as a slave dealer as johnson is black he grabs her anyway along with a lot of children and a few adults as well br br of course caine doesn't take kindly to the kidnapping and the rest of the film is spent in a rescue attempt the rest of the cast has such folks as william holden rex harrison omar and indian film star in parts and looking so incredibly bored with the whole thing br br usually in something like this talented people like those mentioned above will just outrag

# Create the Recurrent Neural Network

- We are now ready to create the Recurrent Neural Network (RNN). 
- We will use the Keras API for this because of its simplicity.


In [23]:
# Length of the vector each word is embedded into.
embedding_size = 8

model = Sequential([
    # The embedding layer needs the vocabulary size (num_words) and the length
    # of the padded token sequences (max_tokens). It is given a name so that
    # its weights can be retrieved by name.
    Embedding(input_dim=num_words,
              output_dim=embedding_size,
              input_length=max_tokens,
              name='layer_embedding'),

    # First Gated Recurrent Unit (GRU) with 16 outputs. Another GRU follows,
    # and it expects a sequence as input, so this one returns sequences.
    GRU(units=16, return_sequences=True),

    # Second GRU with 8 outputs, again followed by a GRU, so it also returns
    # sequences.
    GRU(units=8, return_sequences=True),

    # Third and final GRU with 4 outputs; only its last output is passed on.
    GRU(units=4),

    # Fully-connected layer producing a single value between 0.0 and 1.0 that
    # serves as the classification output.
    Dense(1, activation='sigmoid'),
])

# Adam optimizer with the given learning rate.
optimizer = Adam(lr=1e-3)

# Compile the Keras model so it is ready for training.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
gru_1 (GRU)                  (None, 544, 16)           1200      
_________________________________________________________________
gru_2 (GRU)                  (None, 544, 8)            600       
_________________________________________________________________
gru_3 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 81,961
Trainable params: 81,961
Non-trainable params: 0
_________________________________________________________________


# Train the Recurrent Neural Network

In [24]:
%%time
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Wall time: 14min 46s


<tensorflow.python.keras._impl.keras.callbacks.History at 0x9765e4eb8>

# Performance on Test-Set
Now that the model has been trained we can calculate its classification accuracy on the test-set.

In [25]:
%%time
result = model.evaluate(x_test_pad, y_test)

Wall time: 1min 32s


In [26]:
# result[1] holds the 'accuracy' metric requested in model.compile(); show it as a percentage.
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 86.43%


# Classification of New Reviews

In [27]:
# A few short hand-written reviews to try the trained model on.
# NOTE(review): review4 and review6 contain the same text, so they yield
# identical predictions; one of them was presumably meant to differ.
review1 = "This movie is fantastic!"
review2 = "Good movie!"
review3 = "I like this movie."
review4 = "Bad movie!"
review5 = "Not a good movie!"
review6 = "Bad movie!"
reviews = [review1, review2, review3, review4, review5, review6]

# Convert these reviews to lists of integer tokens, which is what the model consumes.
tokens = tokenizer.texts_to_sequences(reviews)

# Pad and truncate them exactly like the training data so that reviews of
# different lengths all have max_tokens entries.
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,padding=pad, truncating=pad)

In [28]:
# Predict the sentiment of the new reviews with the trained model.
# A value close to 0.0 means negative sentiment; a value close to 1.0 means positive sentiment.
# Note that "Not a good movie!" (the fifth review) is scored as positive in the
# output below, i.e. the model misses the negation in that review.
model.predict(tokens_pad)

array([[0.68778384],
       [0.8573307 ],
       [0.60903764],
       [0.31820264],
       [0.777006  ],
       [0.31820264]], dtype=float32)