In [1]:
import math
import numpy as np
import tensorflow as tf

from tensorflow.python.ops.rnn_cell import GRUCell
from tensorflow.python.ops.rnn_cell import LSTMCell
from tensorflow.python.ops.rnn_cell import MultiRNNCell
from tensorflow.python.ops.rnn_cell import DropoutWrapper, ResidualWrapper

from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.layers.core import Dense
from tensorflow.python.util import nest

from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
from datetime import datetime

from preprocess import *
from loading_util import *

from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters


In [2]:
#Resetter
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [3]:
#embedding parameters
embedding_size = 50
vocab_size = 400003

#data parameters
eMax_allowed_length = 3000
#dMax_allowed_length = 15

#network parameters
hidden_units = 64
n_outputs = 5
depth = 2
n_epochs = 5
learning_rate = 0.001
batch_size = 64

#Saving Parameters
#Tensorboard Logs
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tmp/LSTM_evaluate/tf_logs"
log_dir = "{}/run-{}/".format(root_logdir,now)
#Tensorboard logs end

save_path = 'tmp/LSTM_Evaluate/model/mymodel.ckpt'

In [4]:
#Fetching data
#default directory: 'data/essays.csv'
X,Y= read_csv_essay('data/essays.csv')
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,shuffle=True,train_size = 0.7,test_size=0.3 ,stratify=Y)

In [5]:
#Fetching glove vectors
#default directory: "./glove.6B.50d.txt"
embedding_size = 50
wi,iw,wv = read_glove_vecs()
len(wi)
len(iw)
reset_graph()

In [6]:
#Adding extra tokens to glove dictionary
#IMPORTANT: Don't run more than once
go_index,eos_index,unk_index = add_extra_to_dict(wi,iw,wv,embedding_size)
emb = map_dict_to_list(iw,wv)

In [7]:
#preprocessing data
#Mapping each word in a sentence to its glove index
eInput_train,eLengths_train = fit_essay_text(data= X_train,word_to_index = wi,max_allowed_seq_length = eMax_allowed_length)
eInput_test,eLengths_test = fit_essay_text(data= X_test,word_to_index = wi,max_allowed_seq_length = eMax_allowed_length)

eInput_train = np.array(eInput_train) 
eLengths_train = np.array(eLengths_train)

eInput_test = np.array(eInput_test) 
eLengths_test = np.array(eLengths_test)

print(eInput_train)
print()
print(eInput_test)

[[102729 268045 357265 ... 400001 400001 400001]
 [185456 302351  52942 ... 400001 400001 400001]
 [357265 269797 132032 ... 400001 400001 400001]
 ...
 [268744 390159 185456 ... 400001 400001 400001]
 [361079 383513  43009 ... 400001 400001 400001]
 [185456 388582 357211 ... 400001 400001 400001]]

[[193715 335306 302351 ... 400001 400001 400001]
 [185456 302351 383067 ... 400001 400001 400001]
 [185456 180882 383071 ... 400001 400001 400001]
 ...
 [390159 185456 200034 ... 400001 400001 400001]
 [185456  52942 154322 ... 400001 400001 400001]
 [268744 336113 177230 ... 400001 400001 400001]]


In [8]:
#encoder inputs: [batch_size, max_time_steps]
encoder_inputs = tf.placeholder(dtype = tf.int32, shape = (None,None), name = 'encoder_inputs')
#encoder_inputs_length: [batch_size]
encoder_inputs_length = tf.placeholder(dtype=tf.int32, shape=(None,) , name = 'encoder_inputs_length')

#encoder_outputs
traits = tf.placeholder(tf.float32,[None,n_outputs])

In [9]:

#Instantiating embeddings
embedding_variable = tf.Variable(tf.constant(0.0, shape = [vocab_size, embedding_size]),trainable = False, name = 'embedding')
embedding_placeholder = tf.placeholder(tf.float32, shape=[vocab_size,embedding_size], name = 'embedding_placeholder' )
encoder_embeddings = embedding_variable.assign(embedding_placeholder)

encoder_inputs_embedded=tf.nn.embedding_lookup(encoder_embeddings,encoder_inputs)

In [10]:
#Testing embedding lookup
with tf.Session() as sess:
    embed=sess.run(encoder_inputs_embedded, feed_dict={embedding_placeholder:emb ,encoder_inputs:eInput_train })
    print(embed.shape)
    print(embed[0][0])

(1726, 3000, 50)
[ 0.17212   -1.0375     0.86829    0.18981    0.1175    -0.47162
 -0.41299   -0.98873    0.13708    0.5745    -0.16735   -0.041845
 -0.31854    0.82375   -0.016155   0.2247    -0.97144   -0.80812
  0.39363   -1.1919    -0.32366   -0.33609   -0.0058044  0.98123
  1.3945    -0.18385   -0.28007    0.93575    0.70044   -2.1868
  1.0874     0.233     -0.07168   -0.62216   -0.69471    0.1657
  0.37339   -0.33158   -0.47699    0.065543   0.72013    0.038212
 -0.68997    0.97234   -0.040409  -0.067067   2.0884    -0.33596
  0.45466   -0.4402   ]


In [11]:
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units = hidden_units)
outputs,states = tf.nn.dynamic_rnn(basic_cell,encoder_inputs_embedded,dtype=tf.float32, sequence_length=encoder_inputs_length)

In [12]:
logits = tf.layers.dense(states,n_outputs)
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=traits,
    logits=logits,
    name='CostFunction_Sigmoid'
)
loss = tf.reduce_mean(xentropy)

In [13]:
#Testing Purposes
probs = tf.nn.sigmoid(logits)

In [14]:
#Testing sigmoid

init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    index = 20
    p = sess.run(probs,feed_dict = {encoder_inputs: eInput_train[:index],encoder_inputs_length: eLengths_train[:index],traits: Y_train[:index],embedding_placeholder:emb})
    print(p)

[[0.42670852 0.42172858 0.6228923  0.6031751  0.531172  ]
 [0.54152936 0.4478493  0.66808563 0.6195538  0.45637307]
 [0.51664764 0.5017774  0.55509317 0.4846529  0.44111398]
 [0.6639931  0.4677229  0.6372662  0.51697755 0.378357  ]
 [0.5892946  0.38766056 0.6585822  0.53872454 0.3844948 ]
 [0.54764706 0.37348288 0.5226442  0.6184045  0.41819406]
 [0.64314955 0.48325562 0.5729023  0.39328915 0.45246458]
 [0.6024687  0.37487152 0.6178862  0.55659413 0.4640369 ]
 [0.65077925 0.30212817 0.59735173 0.52226055 0.40486568]
 [0.6117587  0.3518935  0.5141206  0.6774777  0.5719723 ]
 [0.653818   0.44803816 0.50178343 0.62581134 0.35186666]
 [0.49001867 0.34542397 0.57937723 0.52284384 0.4180128 ]
 [0.42772368 0.47527426 0.596334   0.49086615 0.46024498]
 [0.4825744  0.33369347 0.6510847  0.60149574 0.6206156 ]
 [0.62399817 0.36454672 0.5734464  0.47022405 0.54766434]
 [0.6103695  0.44031054 0.58074754 0.5413931  0.48415104]
 [0.64497447 0.332827   0.6024348  0.6906779  0.4366395 ]
 [0.56883323 0

In [15]:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss) ;
correct = tf.equal(tf.round(probs),tf.round(traits))
accuracy = tf.reduce_mean(tf.cast(correct,tf.float32))

In [16]:
#Testing accuracy

init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    index = 20
    c,p = sess.run([correct,accuracy],feed_dict = {encoder_inputs: eInput_train[:index],
                                                   encoder_inputs_length: eLengths_train[:index],
                                                   traits: Y_train[:index],embedding_placeholder:emb})
    print(p)
    print(c)

0.54
[[ True False  True False  True]
 [ True  True  True  True False]
 [False  True  True  True  True]
 [ True False False  True False]
 [ True  True  True  True False]
 [ True False  True False False]
 [False  True  True False False]
 [ True  True  True  True  True]
 [False  True  True  True False]
 [False False  True  True False]
 [False  True  True  True  True]
 [False False  True False  True]
 [False  True  True False False]
 [ True False  True False  True]
 [False False  True  True False]
 [False False False  True False]
 [False False False  True  True]
 [ True False False  True  True]
 [False  True False False  True]
 [ True False False  True False]]


In [17]:
acc_summary = tf.summary.scalar('Accuracy',accuracy)
file_writer = tf.summary.FileWriter(log_dir+'_train',tf.get_default_graph()) 
file_writer_test = tf.summary.FileWriter(log_dir+'_test',tf.get_default_graph()) 

In [18]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [19]:
def fetch_batch(data_x,data_y,lengths,batch_index,batch_size):
    return (data_x[batch_index:batch_index+batch_size,:],
            data_y[batch_index:batch_index+batch_size,:],
            lengths[batch_index:batch_index+batch_size])

In [20]:
total_batches = int(eInput_train.shape[0]/batch_size)


with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for batch_index in range(total_batches):
            batch_X,batch_Y,batch_lengths = fetch_batch(eInput_train,Y_train,eLengths_train,batch_index,batch_size)
            
            sess.run(training_op,feed_dict = {encoder_inputs: batch_X,
                                       encoder_inputs_length: batch_lengths,
                                       traits: batch_Y,
                                       embedding_placeholder:emb})
            acc_train = accuracy.eval(feed_dict = {encoder_inputs: batch_X,
                                       encoder_inputs_length: batch_lengths,
                                       traits: batch_Y,
                                       embedding_placeholder:emb})
            acc_test = accuracy.eval(feed_dict = {encoder_inputs: eInput_test,
                                       encoder_inputs_length: eLengths_test,
                                       traits: Y_test,
                                       embedding_placeholder:emb})
            
            batch_loss = accuracy.eval(feed_dict = {encoder_inputs: batch_X,
                                       encoder_inputs_length: batch_lengths,
                                       traits: batch_Y,
                                       embedding_placeholder:emb})
            
            print('Epoch:',epoch,' Batch_number: ',batch_index,
                  'Batch_loss:',batch_loss, 
                  'TRAIN_accuracy:',acc_train,' TEST_accuracy:',acc_test)
        

Epoch: 0  Batch_number:  0 Batch_loss: 0.55 TRAIN_accuracy: 0.55  TEST_accuracy: 0.50931174
Epoch: 0  Batch_number:  1 Batch_loss: 0.571875 TRAIN_accuracy: 0.571875  TEST_accuracy: 0.5109312
Epoch: 0  Batch_number:  2 Batch_loss: 0.603125 TRAIN_accuracy: 0.603125  TEST_accuracy: 0.5203779
Epoch: 0  Batch_number:  3 Batch_loss: 0.63125 TRAIN_accuracy: 0.63125  TEST_accuracy: 0.51983804
Epoch: 0  Batch_number:  4 Batch_loss: 0.653125 TRAIN_accuracy: 0.653125  TEST_accuracy: 0.5174089
Epoch: 0  Batch_number:  5 Batch_loss: 0.659375 TRAIN_accuracy: 0.659375  TEST_accuracy: 0.51282054
Epoch: 0  Batch_number:  6 Batch_loss: 0.66875 TRAIN_accuracy: 0.66875  TEST_accuracy: 0.5160594
Epoch: 0  Batch_number:  7 Batch_loss: 0.671875 TRAIN_accuracy: 0.671875  TEST_accuracy: 0.51578945
Epoch: 0  Batch_number:  8 Batch_loss: 0.68125 TRAIN_accuracy: 0.68125  TEST_accuracy: 0.51578945
Epoch: 0  Batch_number:  9 Batch_loss: 0.6875 TRAIN_accuracy: 0.6875  TEST_accuracy: 0.5149798
Epoch: 0  Batch_number:

KeyboardInterrupt: 

In [None]:
#Problem with the model due to less data

with tf.Session() as sess:
    saver.restore(sess,save_path)
    testing = test_ques.eval(feed_dict = {encoder_inputs: eInput_test,encoder_inputs_length: eLengths_test,embedding_placeholder:emb})
    print('Ans:',X_test[2])
    print()
    print(ques_dict[testing[2]])

In [None]:
def network_pass(candidate_ans):
    
    candidate_ans = [candidate_ans]
    candidate_input,input_length = fit_encoder_text(data= candidate_ans,word_to_index = wi,max_allowed_seq_length = eMax_allowed_length)
    new_Q = test_ques.eval(feed_dict = {encoder_inputs: candidate_input,encoder_inputs_length: input_length,embedding_placeholder:emb})
    return ques_dict[new_Q[0]] 
    

In [None]:
def activate_bot():
    with tf.Session() as sess:
        saver.restore(sess,save_path)
        
        count = 1 
        Q = 'Tell me about yourself.'
        while(count!=5):
            candidate_ans = input(Q)
            Q = network_pass(candidate_ans)
            print()
            count+=1
        
activate_bot()