In [1]:
import tensorflow as tf
import pandas as pd
import re
import numpy as np
import os
import time

from sklearn.model_selection import train_test_split

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

2 Physical GPUs, 1 Logical GPUs


In [3]:
# load the dataset
movie_reviews = pd.read_csv("./IMDB Dataset.csv")

In [4]:
# check if there is any null value in the dataset
movie_reviews.isnull().values.any()

False

In [5]:
# show the size of the dataset
movie_reviews.shape

(50000, 2)

In [6]:
# show the first five data in the dataset
movie_reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
movie_reviews["review"][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [8]:
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    return TAG_RE.sub('', text)


def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [9]:
X = []
sentences = list(movie_reviews['review'])
for sen in sentences:
    X.append(preprocess_text(sen))


# replace the positive with 1, replace the negative with 0
y = movie_reviews['sentiment']
y = np.array(list(map(lambda x: 1 if x == "positive" else 0, y)))

In [10]:
# Split the training dataset and test dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [11]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

max_len = 100

# padding sentences to the same length
X_train = tf.keras.preprocessing.sequence.pad_sequences(
    X_train, padding='post', maxlen=max_len)
X_test = tf.keras.preprocessing.sequence.pad_sequences(
    X_test, padding='post', maxlen=max_len)

In [12]:
# show the preprocessed data
X_train[1]

array([ 100,   20,  155,   81,   17,   46,   14,    1,   98, 4696,    2,
       2508, 1560,    2,    1,  864,    1,  100,  639,   51,    7,    5,
         19,  188,  403,    1,  546,    9,   21,    5, 4342,   51,    7,
       1810,  585,    4, 2408,  417,   33,    1,  296,   41, 1264,   63,
         19,  250,  137,   30, 1069,  100,   31,  107,   60,   14,    1,
         76,   98,    7,    1,   12,    2,  628, 4808,    5, 1082, 6747,
        541,   13,  259,    4, 2408,  897,    2,    1,  204,  132, 1181,
         32,  700,    2, 7163,   26,    1,  603,  455,  220,   94, 1052,
          8,   12,   91,   23,   71, 1681,   15,    6,  211,   82,   99,
          6], dtype=int32)

In [13]:
BUFFER_SIZE = len(X_train)
BATCH_SIZE = 64
steps_per_epoch = len(X_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# only reserve 10000 words
vocab_size = 10000

dataset = tf.data.Dataset.from_tensor_slices(
    (X_train, y_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 100]), TensorShape([64]))

In [14]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        # vacab_size=10000, embedding_dim=256 enc_units=1024 batch_sz=64
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_activation='sigmoid',
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        # x is the training data，shape == (batch_size，max_length)  -> (64, 100)
        # which means there are batch_size sentences in one batch, the length of each sentence is max_length
        # hidden state shape == (batch_size, units) -> (64, 1024)
        # after embedding, x shape == (batch_size, max_length, embedding_dim) -> (64, 100, 256)
        x = self.embedding(x)
        # output contains the state(in GRU, hidden state equals to output in each timestamp) from all timestamps,
        # output shape == (batch_size, max_length, units) -> (64, 100, 1024)
        # state is the hidden state of the last timestamp, shape == (batch_size, units) -> (64, 1024)
        output, state = self.gru(x, initial_state=hidden)
        # output contains the whole output of the sequence, state is the hidden state of the last timestamp
        return output, state

    def initialize_hidden_state(self):
        # initialize the first state of the gru,  shape == (batch_size, units) -> (64,1024)
        return tf.zeros((self.batch_sz, self.enc_units))

In [15]:
encoder = Encoder(vocab_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units) {}'.format(
    sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))
# the output and the hidden state of GRU is equal
print(sample_output[-1, -1, :] == sample_hidden[-1, :])

Encoder output shape: (batch size, sequence length, units) (64, 100, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)
tf.Tensor([ True  True  True ...  True  True  True], shape=(1024,), dtype=bool)


In [19]:
class LuongAttention(tf.keras.Model):
    def __init__(self, units):
        super(LuongAttention, self).__init__()
        self.W = tf.keras.layers.Dense(units)

    def call(self, query, values):
        # Dot score: h_t (dot) Wa (dot) h_s
        # query shape: (batch_size, max_len, rnn_size)
        # values shape: (batch_size, 1, rnn_size)
        # score will have shape: (batch_size, 1, max_len)
        score = tf.matmul(query, self.W(values), transpose_b=True)
        # attention_weights
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context vector c_t is the average sum of encoder output
        context_vector = tf.matmul(attention_weights, values)
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [20]:
class Decoder(tf.keras.Model):
    def __init__(self, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        # through four fully connected layers, the model will return the probability of the positivity of the sentence
        self.fc_1 = tf.keras.layers.Dense(2048)
        self.fc_2 = tf.keras.layers.Dense(512)
        self.fc_3 = tf.keras.layers.Dense(64)
        self.fc_4 = tf.keras.layers.Dense(1)

        # used for attention
        self.attention = LuongAttention(self.dec_units)

    def call(self, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)

        output = self.fc_1(context_vector)
        output = self.fc_2(output)
        output = self.fc_3(output)
        output = self.fc_4(output)

        return output, attention_weights

In [21]:
decoder = Decoder(units, BATCH_SIZE)

sample_decoder_output, _ = decoder(sample_hidden, sample_output)

print('Decoder output shape: (batch_size, vocab size) {}'.format(
    sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 1)


In [22]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)


def loss_function(real, pred):
    loss_ = loss_object(real, pred)

    return tf.reduce_mean(loss_)

In [23]:
checkpoint_dir = 'checkpoints/sentiment-analysis-luong-attention'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [24]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        # passing enc_output to the decoder
        predictions, _ = decoder(enc_hidden, enc_output)

        loss = loss_function(targ, predictions)

    # collect all trainable variables
    variables = encoder.trainable_variables + decoder.trainable_variables

    # calculate the gradients for the whole variables
    gradients = tape.gradient(loss, variables)

    # apply the gradients on the variables
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [25]:
# set the epochs for training
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    # get the initial hidden state of gru
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))

    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.7771
Epoch 1 Batch 100 Loss 0.3525
Epoch 1 Batch 200 Loss 0.4424
Epoch 1 Batch 300 Loss 0.2447
Epoch 1 Batch 400 Loss 0.3448
Epoch 1 Batch 500 Loss 0.4542
Epoch 1 Batch 600 Loss 0.3048
Epoch 1 Loss 0.4435
Time taken for 1 epoch 31.402924299240112 sec

Epoch 2 Batch 0 Loss 0.3050
Epoch 2 Batch 100 Loss 0.2585
Epoch 2 Batch 200 Loss 0.2450
Epoch 2 Batch 300 Loss 0.3248
Epoch 2 Batch 400 Loss 0.3018
Epoch 2 Batch 500 Loss 0.2097
Epoch 2 Batch 600 Loss 0.2990
Epoch 2 Loss 0.2781
Time taken for 1 epoch 29.82162594795227 sec

Epoch 3 Batch 0 Loss 0.1645
Epoch 3 Batch 100 Loss 0.1533
Epoch 3 Batch 200 Loss 0.2549
Epoch 3 Batch 300 Loss 0.1547
Epoch 3 Batch 400 Loss 0.2486
Epoch 3 Batch 500 Loss 0.2186
Epoch 3 Batch 600 Loss 0.3238
Epoch 3 Loss 0.2384
Time taken for 1 epoch 29.842041730880737 sec

Epoch 4 Batch 0 Loss 0.1184
Epoch 4 Batch 100 Loss 0.1107
Epoch 4 Batch 200 Loss 0.2549
Epoch 4 Batch 300 Loss 0.2190
Epoch 4 Batch 400 Loss 0.1804
Epoch 4 Batch 500 Loss 0.203

In [26]:
print(tf.train.latest_checkpoint(checkpoint_dir))
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

checkpoints/sentiment-analysis-luong-attention/ckpt-5


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb774175390>

In [27]:
def evaluate(test_data):

    y_predicts = []
    attention_weights_list = []

    for i in range(len(test_data)):

        input_data = tf.expand_dims(tf.convert_to_tensor(test_data[i]), 0)
        enc_hidden = [tf.zeros((1, units))]
        enc_output, enc_hidden = encoder(input_data, enc_hidden)

        # passing enc_output to the decoder
        predictions, attention_weights = decoder(enc_hidden, enc_output)
        # convert the tensor to numpy list
        attention_weights_list.append(
            attention_weights.numpy().flatten().tolist())
        y_predicts.append(1 if predictions >= 0.5 else 0)

    return y_predicts, attention_weights_list

In [28]:
y_predicts, attention_weights_list = evaluate(X_test)

In [29]:
print('Accuracy: ', (y_predicts == y_test).sum() / len(y_test))

Accuracy:  0.8424
