In [1]:
import math
import numpy as np
import tensorflow as tf

from tensorflow.python.ops.rnn_cell import GRUCell
from tensorflow.python.ops.rnn_cell import LSTMCell
from tensorflow.python.ops.rnn_cell import MultiRNNCell
from tensorflow.python.ops.rnn_cell import DropoutWrapper, ResidualWrapper

from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.layers.core import Dense
from tensorflow.python.util import nest

from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
from datetime import datetime

from preprocess import *
from loading_util import *
from ques_dictionary import ques_dict

  from ._conv import register_converters as _register_converters


In [2]:
#Resetter
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [3]:
#embedding parameters
embedding_size = 50
vocab_size = 400003

#data parameters
eMax_allowed_length = 3000
#dMax_allowed_length = 15

#network parameters
hidden_units = 90
n_outputs = 5
depth = 1
n_epochs = 10
learning_rate = 0.001
batch_size = 64

#Saving Parameters
#Tensorboard Logs
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tmp/LSTM_evaluate/tf_logs"
log_dir = "{}/run-{}/".format(root_logdir,now)
#Tensorboard logs end

save_path = 'tmp/LSTM_Evaluate/model/mymodel.ckpt'

In [4]:
#Fetching data
#default directory: 'data/essays.csv'
X,Y= read_csv_essay('data/essays.csv')

In [5]:
#Fetching glove vectors
#default directory: "./glove.6B.50d.txt"
embedding_size = 50
wi,iw,wv = read_glove_vecs()
len(wi)
len(iw)
reset_graph()

In [6]:
#Adding extra tokens to glove dictionary
#IMPORTANT: Don't run more than once
go_index,eos_index,unk_index = add_extra_to_dict(wi,iw,wv,embedding_size)
emb = map_dict_to_list(iw,wv)

In [7]:
#preprocessing data
#Mapping each word in a sentence to its glove index
eInput,eLengths = fit_essay_text(data= X,word_to_index = wi,max_allowed_seq_length = eMax_allowed_length)

eInput = np.array(eInput) 
eLengths = np.array(eLengths)
#dInput = np.array(dInput)
#dOutput = np.array(dOutput)
#dLengths = np.array(dLengths)
print(eInput[2])
print(eLengths[1])

[ 54272 270500 205844 ... 400001 400001 400001]
646


In [8]:


#encoder inputs: [batch_size, max_time_steps]
encoder_inputs = tf.placeholder(dtype = tf.int32, shape = (None,None), name = 'encoder_inputs')
#encoder_inputs_length: [batch_size]
encoder_inputs_length = tf.placeholder(dtype=tf.int32, shape=(None,) , name = 'encoder_inputs_length')

#encoder_outputs
traits = tf.placeholder(tf.float32,[None,n_outputs])

In [9]:

#Instantiating embeddings
embedding_variable = tf.Variable(tf.constant(0.0, shape = [vocab_size, embedding_size]),trainable = False, name = 'embedding')
embedding_placeholder = tf.placeholder(tf.float32, shape=[vocab_size,embedding_size], name = 'embedding_placeholder' )
encoder_embeddings = embedding_variable.assign(embedding_placeholder)

encoder_inputs_embedded=tf.nn.embedding_lookup(encoder_embeddings,encoder_inputs)

In [10]:
#Testing embedding lookup
with tf.Session() as sess:
    embed=sess.run(encoder_inputs_embedded, feed_dict={embedding_placeholder:emb ,encoder_inputs:eInput })
    print(embed.shape)
    print(embed[0][0])

(2467, 3000, 50)
[ 2.7691e-01  2.8745e-01 -2.9935e-01 -1.9964e-01  1.2956e-01  1.5555e-01
 -6.4522e-01 -3.4090e-01 -1.1833e-01  1.5798e-01  1.3969e-01  2.4872e-01
 -1.5901e-01 -3.3439e-02  1.1895e-01  7.6535e-02  4.5263e-01  2.6494e-01
 -1.9157e-01 -5.6768e-01  2.9286e-02  2.1745e-01  4.3406e-01  1.4981e-01
  7.5774e-02 -1.4453e+00 -5.8394e-01 -4.6063e-02  6.6214e-02 -2.6417e-01
  3.9650e+00  2.5196e-01  2.4855e-01 -5.0524e-01  2.5806e-01  2.8683e-01
 -1.7994e-01  6.2885e-01 -1.2040e-01 -4.2143e-02 -4.4911e-02  1.8561e-01
  1.6266e-01 -2.6127e-03  1.3083e-01  2.0179e-01 -2.9667e-01 -9.4820e-02
 -2.1250e-01  2.2074e-02]


In [11]:
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units = hidden_units)
outputs,states = tf.nn.dynamic_rnn(basic_cell,encoder_inputs_embedded,dtype=tf.float32, sequence_length=encoder_inputs_length)

In [12]:
logits = tf.layers.dense(states,n_outputs)
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=traits,
    logits=logits,
    name='CostFunction_Sigmoid'
)
loss = tf.reduce_mean(xentropy)

In [13]:
#Testing Purposes
probs = tf.nn.sigmoid(logits)

In [14]:
#Testing sigmoid

init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    index = 20
    p = sess.run(probs,feed_dict = {encoder_inputs: eInput[:index],encoder_inputs_length: eLengths[:index],traits: Y[:index],embedding_placeholder:emb})
    print(p)

[[0.46080476 0.65937465 0.40133417 0.496585   0.31023803]
 [0.6714892  0.39019325 0.41171765 0.48625496 0.41019386]
 [0.66519624 0.35543075 0.5912706  0.4032026  0.43505606]
 [0.4396047  0.69373745 0.3284235  0.35791615 0.5821552 ]
 [0.5403732  0.5991934  0.4185316  0.4548149  0.619352  ]
 [0.5224103  0.61351436 0.3569265  0.38378924 0.72455114]
 [0.6248909  0.5293847  0.2367164  0.25008565 0.5450527 ]
 [0.5889836  0.28774312 0.4597887  0.5511989  0.47979635]
 [0.5883174  0.53732264 0.36808395 0.52243596 0.5126058 ]
 [0.63208276 0.48975295 0.69125515 0.60061264 0.6098951 ]
 [0.58261144 0.48515907 0.48880145 0.5102033  0.37506896]
 [0.6805589  0.40056866 0.50416505 0.35897252 0.5945645 ]
 [0.43699357 0.34254834 0.36023796 0.6148156  0.3705528 ]
 [0.64748263 0.54289925 0.39646497 0.2446766  0.64683235]
 [0.5624788  0.5219125  0.3689972  0.4687736  0.5470733 ]
 [0.5140218  0.36854708 0.5458346  0.54155195 0.4315662 ]
 [0.5573012  0.4981787  0.33942744 0.52188283 0.5463681 ]
 [0.5596738  0

In [15]:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss) ;
correct = tf.equal(tf.round(probs),tf.round(traits))
accuracy = tf.reduce_mean(tf.cast(correct,tf.float32))

In [16]:
#Testing accuracy

init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    index = 20
    c,p = sess.run([correct,accuracy],feed_dict = {encoder_inputs: eInput[:index],
                                                   encoder_inputs_length: eLengths[:index],
                                                   traits: Y[:index],embedding_placeholder:emb})
    print(p)
    print(c)

0.46
[[ True  True False  True False]
 [False  True False  True  True]
 [False False False False False]
 [False False False False False]
 [ True False False  True  True]
 [ True False False  True  True]
 [False False False  True False]
 [False  True  True  True False]
 [ True  True  True  True  True]
 [ True False  True  True False]
 [ True False False  True False]
 [False  True False  True False]
 [ True False  True False  True]
 [False  True False  True False]
 [False False False False False]
 [ True  True  True False  True]
 [False  True  True False  True]
 [False False False  True False]
 [False  True False  True  True]
 [ True False False False  True]]


In [17]:
acc_summary = tf.summary.scalar('Accuracy',accuracy)
file_writer = tf.summary.FileWriter(log_dir+'_train',tf.get_default_graph()) 
file_writer_test = tf.summary.FileWriter(log_dir+'_test',tf.get_default_graph()) 

In [18]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [None]:
def fetch_batch(data_x,data_y,lengths,batch_index,batch_size):
    return (data_x[batch_index:batch_index+batch_size,:],
            data_y[batch_index:batch_index+batch_size,:],
            lengths[batch_index:batch_index+batch_size])

In [None]:

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for batch_index in range(int(eInput.shape[0]/batch_size)):
            batch_X,batch_Y,batch_lengths = fetch_batch(eInput,Y,eLengths,batch_index,batch_size)
            
            sess.run(training_op,feed_dict = {encoder_inputs: batch_X,
                                       encoder_inputs_length: batch_lengths,
                                       traits: batch_Y,
                                       embedding_placeholder:emb})
            acc = accuracy.eval(feed_dict = {encoder_inputs: batch_X,
                                       encoder_inputs_length: batch_lengths,
                                       traits: batch_Y,
                                       embedding_placeholder:emb})
            
            batch_loss = accuracy.eval(feed_dict = {encoder_inputs: batch_X,
                                       encoder_inputs_length: batch_lengths,
                                       traits: batch_Y,
                                       embedding_placeholder:emb})
            print('Epoch:',epoch,' Batch_number: ',batch_index,' Batch_loss:',batch_loss,' Accuracy:',acc)
        
        print('Epoch:',epoch,' | Accuracy:',acc)

Epoch: 0  Batch_number:  0  Batch_loss: 0.5625  Accuracy: 0.5625
Epoch: 0  Batch_number:  1  Batch_loss: 0.584375  Accuracy: 0.584375
Epoch: 0  Batch_number:  2  Batch_loss: 0.615625  Accuracy: 0.615625
Epoch: 0  Batch_number:  3  Batch_loss: 0.634375  Accuracy: 0.634375
Epoch: 0  Batch_number:  4  Batch_loss: 0.66875  Accuracy: 0.66875
Epoch: 0  Batch_number:  5  Batch_loss: 0.69375  Accuracy: 0.69375
Epoch: 0  Batch_number:  6  Batch_loss: 0.709375  Accuracy: 0.709375
Epoch: 0  Batch_number:  7  Batch_loss: 0.73125  Accuracy: 0.73125
Epoch: 0  Batch_number:  8  Batch_loss: 0.740625  Accuracy: 0.740625
Epoch: 0  Batch_number:  9  Batch_loss: 0.7625  Accuracy: 0.7625
Epoch: 0  Batch_number:  10  Batch_loss: 0.784375  Accuracy: 0.784375
Epoch: 0  Batch_number:  11  Batch_loss: 0.790625  Accuracy: 0.790625
Epoch: 0  Batch_number:  12  Batch_loss: 0.803125  Accuracy: 0.803125
Epoch: 0  Batch_number:  13  Batch_loss: 0.80625  Accuracy: 0.80625
Epoch: 0  Batch_number:  14  Batch_loss: 0.803

In [19]:
#Problem with the model due to less data

with tf.Session() as sess:
    saver.restore(sess,save_path)
    testing = test_ques.eval(feed_dict = {encoder_inputs: eInput_test,encoder_inputs_length: eLengths_test,embedding_placeholder:emb})
    print('Ans:',X_test[2])
    print()
    print(ques_dict[testing[2]])

INFO:tensorflow:Restoring parameters from tmp/Classification/model/mymodel.ckpt
Ans: I am a hard working person, and I am ambitious about my goals. I would love to fulfill organization objectives so that I am recognised as someone significant to the company. I can work under pressure when things are not my side and I never give up.

Can you work under pressure?


In [20]:
def network_pass(candidate_ans):
    
    candidate_ans = [candidate_ans]
    candidate_input,input_length = fit_encoder_text(data= candidate_ans,word_to_index = wi,max_allowed_seq_length = eMax_allowed_length)
    new_Q = test_ques.eval(feed_dict = {encoder_inputs: candidate_input,encoder_inputs_length: input_length,embedding_placeholder:emb})
    return ques_dict[new_Q[0]] 
    

In [21]:
def activate_bot():
    with tf.Session() as sess:
        saver.restore(sess,save_path)
        
        count = 1 
        Q = 'Tell me about yourself.'
        while(count!=5):
            candidate_ans = input(Q)
            Q = network_pass(candidate_ans)
            print()
            count+=1
        
activate_bot()

INFO:tensorflow:Restoring parameters from tmp/Classification/model/mymodel.ckpt
Tell me about yourself.Hi i am gitesh khanna and i love working in machine learning.

AI and machine learning is hyped all around the world. What makes you stand out from the other candidates?i am a hardworkinh person who is passionate to gain more knowledge. I am a quick learner and I can easily handle multiple tasks in minimum supervision.

What makes you angry?Anger is a term naturally associated with everybody I guess. I am not known to get angry normally but there are instances when I did become angry, particularly in case of working in teams if I find that fellow members are taking full credit for a work in which they didn't contribute.

What was the toughest decision you ever had to make?I had to choose between software devlopment and machine learning. I decided choosing ML. Later realized that ML itself has so much application focus on software development too.

