In [1]:
import keras as ks
import numpy as np
import tensorflow as tf
import re
﻿from __future__ import print_function
import numpy as np
import sys
import os
import cntk as C
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from sklearn.metrics import precision_recall_fscore_support
from sklearn.exceptions import UndefinedMetricWarning
import warnings
warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 

Using TensorFlow backend.


In [3]:
#pre_processing

#Initialize Global variables 
GloveEmbeddings = {}      # word -> embedding stored as one space-separated string
max_query_words = 12      # TextDataToCTF pads/trims every query to this many tokens
max_passage_words = 50    # TextDataToCTF pads/trims every passage to this many tokens
emb_dim = 50              # number of "0.0" entries written for the padding vector


def loadEmbeddings(embeddingfile):
    """Populate ``GloveEmbeddings`` from a GloVe-style text file.

    Each non-blank line is expected to look like ``word v1 v2 ... vN``. The
    word becomes the dictionary key and the remaining fields are stored as a
    single space-joined string (they are not converted to floats).

    After the file is read, a ``"zerovec"`` entry made of ``emb_dim`` zeros is
    added; it is used for padding and for out-of-vocabulary words.

    Args:
        embeddingfile: path of the embedding file (read as UTF-8, undecodable
            bytes are ignored).

    Raises:
        OSError: if the file cannot be opened.
    """
    global GloveEmbeddings, emb_dim

    # `with` guarantees the handle is released even if parsing fails midway.
    with open(embeddingfile, "r", encoding="utf-8", errors="ignore") as fe:
        for line in fe:
            tokens = line.split()
            if not tokens:
                # Blank / whitespace-only lines carry no embedding; skipping
                # them avoids an IndexError on tokens[0].
                continue
            GloveEmbeddings[tokens[0]] = " ".join(tokens[1:])

    # Padding vector. The trailing space is kept on purpose so the stored value
    # is unchanged for existing consumers; they strip the final feature string.
    GloveEmbeddings["zerovec"] = "0.0 " * emb_dim


def _text_to_feature_vector(text, max_words):
    """Return the embedding string for ``text`` padded/trimmed to ``max_words`` tokens."""
    # Tokenise on runs of non-word characters and drop the empty pieces that
    # re.split produces at the edges.
    words = [w for w in re.split(r'\W+', text) if w]
    # Pad with the "zerovec" pseudo-word (a non-positive multiplier adds nothing),
    # then cut anything beyond the limit.
    words += ["zerovec"] * (max_words - len(words))
    words = words[:max_words]
    # Out-of-vocabulary words fall back to the zero vector.
    vectors = [
        GloveEmbeddings[w] if w in GloveEmbeddings else GloveEmbeddings["zerovec"]
        for w in words
    ]
    return " ".join(vectors).strip()


def TextDataToCTF(inputfile,outputfile,isEvaluation):
    """Convert a tab-separated query/passage file to CNTK Text Format (CTF).

    Input lines have the form ``query_id \\t query \\t passage \\t label \\t passage_id``
    and are lower-cased before processing. Queries are padded/trimmed to
    ``max_query_words`` tokens and passages to ``max_passage_words`` tokens;
    every token is replaced by its entry in ``GloveEmbeddings`` (the
    ``"zerovec"`` entry for unknown words and padding).

    Args:
        inputfile: path of the TSV file to read (UTF-8, undecodable bytes ignored).
        outputfile: path of the CTF file to write (overwritten).
        isEvaluation: when false, each output line ends with a one-hot
            ``|labels`` field (``" 1 0 "`` for label ``"0"``, ``" 0 1 "``
            otherwise); when true, it ends with ``|qid <query_id>`` instead.

    Raises:
        IndexError: if a non-blank line has fewer than four tab-separated fields.
        KeyError: if ``GloveEmbeddings`` has no ``"zerovec"`` entry when one is needed.
    """
    # Both handles are managed by `with` so the output is flushed and closed
    # even if a malformed line raises part-way through.
    with open(inputfile,"r",encoding="utf-8",errors="ignore") as f, \
         open(outputfile,"w",encoding="utf-8") as fw:
        for line in f:
            record = line.strip().lower()
            if not record:
                # Blank lines (e.g. a trailing newline at EOF) hold no example.
                continue
            tokens = record.split("\t")
            query_id,query,passage,label = tokens[0],tokens[1],tokens[2],tokens[3]

            query_feature_vector = _text_to_feature_vector(query, max_query_words)
            passage_feature_vector = _text_to_feature_vector(passage, max_passage_words)

            if(not isEvaluation):
                # One-hot encode the binary label.
                label_str = " 1 0 " if label=="0" else " 0 1 "
                fw.write("|qfeatures "+query_feature_vector+" |pfeatures "+passage_feature_vector+" |labels "+label_str+"\n")
            else:
                # NOTE(review): unlike the training format there is no space
                # before "|qid"; kept byte-identical — confirm the reader is fine with it.
                fw.write("|qfeatures "+query_feature_vector+" |pfeatures "+passage_feature_vector+"|qid "+str(query_id)+"\n")



if __name__ == "__main__":
    # Input files for the preprocessing step.
    embeddingFileName = "glove.6B.50d.txt"
    trainFileName = "data.tsv"
    EvaluationFileName = "eval1_unlabelled.tsv"

    # The embeddings must be in memory before any text can be converted.
    loadEmbeddings(embeddingFileName)

    # (source, destination, isEvaluation, completion message) for each
    # Query/Passage file converted to CNTK Text Format with the 50-d GloVe
    # vectors. Validation conversion is currently disabled.
    _conversion_jobs = [
        (trainFileName, "TrainData.ctf", False, "Train Data conversion is done"),
        (EvaluationFileName, "EvaluationData.ctf", True, "Evaluation Data conversion is done"),
    ]
    for _src, _dst, _is_eval, _done_msg in _conversion_jobs:
        TextDataToCTF(_src, _dst, _is_eval)
        print(_done_msg)

KeyboardInterrupt: 

In [None]:
#Loading Data

#Initialize Global variables
validation_query_vectors = []     # every parsed query vector, in file order
validation_passage_vectors = []   # every parsed passage vector, in file order
validation_labels = []            # every parsed label (0 or 1), in file order
q_max_words=12
p_max_words=50
emb_dim=50


def LoadData(validationfile,batch_size):
    """Yield mini-batches parsed from a labelled CTF file.

    Each non-blank line is expected to look like
    ``|qfeatures <floats> |pfeatures <floats> |labels <a> <b>``. The label is
    the second one-hot entry (``"1 0"`` -> 0, ``"0 1"`` -> 1).

    Every parsed example is also appended to the module-level
    ``validation_query_vectors`` / ``validation_passage_vectors`` /
    ``validation_labels`` lists, as the original implementation did.
    NOTE(review): those lists are never cleared here, so they keep growing
    across calls — confirm whether any consumer still needs them.

    Args:
        validationfile: path of the CTF file (UTF-8).
        batch_size: number of examples per yielded batch; must be >= 1.

    Yields:
        ``(passage_vectors, query_vectors, labels)`` — three lists holding only
        the examples of the current batch (passages come first). The last batch
        may be shorter than ``batch_size`` when the file length is not a
        multiple of it.

    Raises:
        ValueError: if ``batch_size`` is less than 1, or a field is not numeric.
        IndexError: if a non-blank line has fewer than three ``|`` fields.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {!r}".format(batch_size))

    batch_queries, batch_passages, batch_labels = [], [], []

    # `with` closes the file when the generator is exhausted, closed, or fails.
    with open(validationfile,'r',encoding="utf-8") as f:
        for line in f:
            record = line.strip()
            if not record:
                continue  # blank lines hold no example
            tokens = record.split("|")
            # tokens[0] is empty because every line starts with "|".
            x1 = [float(v) for v in tokens[1].replace("qfeatures","").split()]  # query features
            x2 = [float(v) for v in tokens[2].replace("pfeatures","").split()]  # passage features
            y = [int(w) for w in tokens[3].replace("labels","").split()]
            y = y[1]  # second one-hot entry is the class index

            validation_query_vectors.append(x1)
            validation_passage_vectors.append(x2)
            validation_labels.append(y)

            batch_queries.append(x1)
            batch_passages.append(x2)
            batch_labels.append(y)

            if len(batch_labels) == batch_size:
                yield (batch_passages, batch_queries, batch_labels)
                # Start fresh lists so an already-yielded batch is never mutated.
                batch_queries, batch_passages, batch_labels = [], [], []

    # Emit the leftover examples instead of silently dropping them.
    if batch_labels:
        yield (batch_passages, batch_queries, batch_labels)
    
# Driver for this cell: load the validation vectors, train a model, then write
# predictions for the evaluation set to the submission file.
# NOTE(review): LoadValidationSet, TrainAndValidate and GetPredictionOnEvalSet
# are not defined in the visible cells of this notebook (the cell above defines
# LoadData instead), so this raises NameError unless they exist elsewhere — confirm.
# NOTE(review): "ValidationData.ctf" is not produced by the visible conversion
# cell (its validation step is commented out) — confirm the file is available.
if __name__ == "__main__":

    # CTF inputs and the output path for the predictions.
    trainSetFileName = "TrainData.ctf"
    validationSetFileName = "ValidationData.ctf"
    testSetFileName = "EvaluationData.ctf"
    submissionFileName = "answer.tsv"
   
    LoadValidationSet(validationSetFileName)    #Load Validation Query, Passage Vectors from Validation CTF File
    model = TrainAndValidate(trainSetFileName) # Training and validation methods    
    GetPredictionOnEvalSet(model,testSetFileName,submissionFileName) # Get Predictions on Evaluation Set

In [3]:
#parameters
# Sizes consumed by the graph-building cell.
q_max = 12       # time steps in the query_data placeholder
p_max = 50       # time steps in the passage_data placeholder
emb_size = 50    # last dimension of both input placeholders
nodes = 256      # units passed to every LSTMCell

In [75]:
# placeholders
# Graph definition (TF1 graph mode): two bidirectional LSTM encoders whose final
# states are concatenated and scored by a stack of dense layers.
# Start from an empty default graph before any op is created.
tf.reset_default_graph()
# Inputs: (batch, q_max, emb_size) and (batch, p_max, emb_size) float32 tensors.
q_data=tf.placeholder(dtype=tf.float32,shape=(None,q_max,emb_size),name="query_data")
p_data=tf.placeholder(dtype=tf.float32,shape=(None,p_max,emb_size),name="passage_data")
# One float target per example, shape (batch, 1).
targets=tf.placeholder(dtype=tf.float32,shape=(None,1),name="score")

#query_encoder
# Forward/backward LSTM cells with `nodes` units each; the per-step outputs are
# discarded and only the final states are kept.
with tf.variable_scope("q_encoder"):
    enc_cell_q_f=tf.nn.rnn_cell.LSTMCell(nodes,activation=tf.nn.tanh)
    enc_cell_q_b=tf.nn.rnn_cell.LSTMCell(nodes,activation=tf.nn.tanh)
    _,q_states=tf.nn.bidirectional_dynamic_rnn(enc_cell_q_f,enc_cell_q_b,q_data,dtype=tf.float32)    

#passage_encoder
# Same structure as the query encoder, under its own variable scope.
with tf.variable_scope("p_encoder"):
    enc_cell_p_f=tf.nn.rnn_cell.LSTMCell(nodes,activation=tf.nn.tanh)
    enc_cell_p_b=tf.nn.rnn_cell.LSTMCell(nodes,activation=tf.nn.tanh)
    _,p_states=tf.nn.bidirectional_dynamic_rnn(enc_cell_p_f,enc_cell_p_b,p_data,dtype=tf.float32) 

#concatenation (None,2048)=>2048 features
# Eight state tensors (h and c of both directions, for query and passage) are
# joined on the last axis; with nodes=256 from the parameters cell that is 8*256.
encoded_data=tf.concat([q_states[0].h,q_states[0].c,q_states[1].h,q_states[1].c,
                        p_states[0].h,p_states[0].c,p_states[1].h,p_states[1].c],
                       axis=-1)

#evaluating
# Dense tanh layers narrowing 2048 -> 128 units, then a single sigmoid unit.
with tf.variable_scope("eval_ans"):
    l1=tf.keras.layers.Dense(units=2048,activation="tanh",name="l1")(encoded_data)
    l2=tf.keras.layers.Dense(units=1024,activation="tanh",name="l2")(l1)
    l3=tf.keras.layers.Dense(units=512,activation="tanh",name="l3")(l2)
    l4=tf.keras.layers.Dense(units=256,activation="tanh",name="l4")(l3)
    l5=tf.keras.layers.Dense(units=128,activation="tanh",name="l5")(l4)
    output=tf.keras.layers.Dense(units=1,activation="sigmoid",name="output")(l5)

#loss
# Mean squared error between the sigmoid output and the targets.
# NOTE(review): if targets are 0/1 relevance labels, binary cross-entropy is the
# more usual pairing with a sigmoid output — confirm MSE is intended.
loss=tf.keras.losses.mean_squared_error(y_pred=output,y_true=targets)
#optimizer
# Adam with its default hyper-parameters.
opti=tf.train.AdamOptimizer(name="opti")
#step
# Training op: running it applies one optimizer update that minimizes `loss`.
step=opti.minimize(loss)

In [81]:
# # Train
# epochs=20
# batch_size=32

# for i in range(epochs):
#     k=0
#     batch_loss=0

#     while k <500:
#         batch_loss=0
#         batch_logits=0
#         acc=0
# #         if k+batch_size > data_len:
# #             _, batch_loss, batch_logits = sess.run([optimizer, loss, logits],feed_dict={X:data_x[k:],Y:data_y[k:],
# #                                                                                         T:data_y[k:]})
# #         else:
#         data_x,data_y,data_t=get_batch(k,k+batch_size)
# #         print(data_x[0],data_y[0],data_t[0])
#         _, batch_loss, batch_logits = sess.run([optimizer, loss, logits_h],feed_dict={X:data_x,Y:data_y,T:data_t})
                                                                                            
#         accuracy = np.mean(batch_logits.argmax(axis=-1) == data_t[k:k+batch_size])

#         print('Epoch {:3} Loss: {:>6.3f} Accuracy: {:>6.4f}'.format(i, batch_loss,accuracy))
#         k+=batch_size
#         break

In [73]:
#remaining
'''
*preprocessing
*dropout
'''

<tf.Tensor 'eval_ans/dense_5/Sigmoid:0' shape=(15, 1) dtype=float32>