In [5]:
# importing dependencies for RNN
import tensorflow as tf
import numpy as np
import pandas as pd
import time 
import os

from sklearn.model_selection import train_test_split
from sklearn import datasets, metrics
from utils import separate_dataset, build_dataset, str_idx

In [3]:
os.chdir(r'C:\Users\Anuj\Documents\GitHub\Natural Language Processing\NLP Chatbot')

### Data Preparation

In [10]:
traindata = datasets.load_files(container_path = './sentiment data', encoding = 'utf-8')
traindata.data , traindata.target = separate_dataset(traindata, 1.0)

In [11]:
print(traindata.target_names)

['Negative', 'Positive']


In [28]:
#Transforming the dataset into ONE HOT Encoding
ONEHOT = np.zeros((len(traindata.data), len(traindata.target_names)))
ONEHOT[np.arange(len(traindata.data)), traindata.target] = 1.0
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(
    traindata.data,
    traindata.target,
    ONEHOT, test_size=0.2
)
concat = ' '.join(traindata.data).split()
#print(concat)
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)

In [29]:
print('vocab from size:', vocabulary_size)
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 13997
Most common words [('the', 10132), ('a', 6936), ('of', 5502), ('and', 5294), ('to', 4528), ('is', 3334)]
Sample data [683, 197, 7, 370, 12, 36, 6243, 7, 2067, 51] ['simplistic', 'silly', 'and', 'tedious', 'its', 'so', 'laddish', 'and', 'juvenile', 'only'] 



In [30]:
# Tag to mark the beginning of the sentence
GO = dictionary['GO']       # 0th position
# Tag to add extra padding in the sentence
PAD = dictionary['PAD']     # 1st position
# Tag to mark the end of the sentence
EOS = dictionary['EOS']     # 2nd position
# Tag to mark the unknown word
UNK = dictionary['UNK']     # 3rd position

In [35]:
size_layer = 128 #The number of units in RNN cell
num_layers = 2 #The number of hidden layers
embedded_size = 128 #The size of embeddding
dimension_output = len(traindata.target_names) # Number of classes  
learning_rate = 1e-3 # The lr of the optimization algorithm
maxlen = 50 
batch_size = 128

In [36]:
class Model:
    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output, learning_rate):

        def cells(reuse=False):
            return tf.nn.rnn_cell.BasicRNNCell(size_layer, reuse=reuse)
        '''
        2 place holder: 1. to feed the sequence data to the model
                        2. for the output
        '''
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])

        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        '''
        Variable to store the embedded lookup for the dictionary
        '''
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        '''
        Add the RNN Layer
        '''
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded, dtype=tf.float32)
        '''
        Create weights and Bias
        '''
        W = tf.get_variable('w',
                            shape=(size_layer, dimension_output),
                            initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',
                            shape=(dimension_output),
                            initializer=tf.zeros_initializer())
        '''
        logits are computed by performing a matrix multiplication of the weight, 
        the output from the RNN layer, and addition of bias
        '''
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.Y)
        )
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)

        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [38]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_layer, num_layers, embedded_size,
              vocabulary_size + 4, dimension_output,
              learning_rate)
sess.run(tf.global_variables_initializer())

saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
checkpoint_dir = os.path.abspath(os.path.join('./', "checkpoints_basic_rnn"))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(train_X[i: i + batch_size], dictionary, maxlen)
        acc, loss, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict={
                model.X: batch_x,
                model.Y: train_onehot[i: i + batch_size]
            }
        )
        train_loss += loss
        train_acc += acc

    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = str_idx(test_X[i: i + batch_size], dictionary, maxlen)
        acc, loss = sess.run(
            [model.accuracy, model.cost],
            feed_dict={
                model.X: batch_x,
                model.Y: train_onehot[i: i + batch_size]
            }
        )
        test_loss += loss
        test_acc += acc

    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)

    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f' % (EPOCH, CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n' %
          (EPOCH, train_loss, train_acc, test_loss, test_acc))
    path = saver.save(sess, checkpoint_prefix, global_step=EPOCH)
    EPOCH += 1



epoch: 0, pass acc: 0.000000, current acc: 0.510742
time taken: 10.172515392303467
epoch: 0, training loss: 0.732301, training acc: 0.448982, valid loss: 0.702369, valid acc: 0.510742

time taken: 6.98599910736084
epoch: 1, training loss: 0.707971, training acc: 0.489347, valid loss: 0.706188, valid acc: 0.506836

epoch: 2, pass acc: 0.510742, current acc: 0.516602
time taken: 10.062481880187988
epoch: 2, training loss: 0.700831, training acc: 0.525450, valid loss: 0.722394, valid acc: 0.516602

Instructions for updating:
Use standard file APIs to delete files with this prefix.
time taken: 10.400992393493652
epoch: 3, training loss: 0.688384, training acc: 0.559422, valid loss: 0.728733, valid acc: 0.501465

time taken: 7.053036689758301
epoch: 4, training loss: 0.658756, training acc: 0.612926, valid loss: 0.759279, valid acc: 0.502930

time taken: 9.946547746658325
epoch: 5, training loss: 0.632482, training acc: 0.646307, valid loss: 0.791856, valid acc: 0.516113

time taken: 7.6449

In [45]:
# Evaluation metrics
logits = sess.run(model.logits,
                  feed_dict={model.X: str_idx(test_X, dictionary, maxlen)})
print(metrics.classification_report(
    test_Y,
    np.argmax(logits, 1),
    target_names=traindata.target_names))

              precision    recall  f1-score   support

    Negative       0.49      1.00      0.66      1053
    Positive       0.00      0.00      0.00      1080

   micro avg       0.49      0.49      0.49      2133
   macro avg       0.25      0.50      0.33      2133
weighted avg       0.24      0.49      0.33      2133



  'precision', 'predicted', average, warn_for)
