In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
# Select seaborn's "whitegrid" style for this session.
# NOTE(review): no plotting code is visible in the rest of this notebook --
# confirm that this setup (and the plotting imports above) is still needed.
sns.set(style="whitegrid")

In [2]:
import tensorflow as tf

class reader(object):
    """Training input pipeline: CSV rows -> padded batches of word ids.

    Every data row of ``config.train_file_path`` is parsed into a ``Context``
    string, an ``Utterance`` string and an integer ``Label``.  Both strings
    are split on single spaces and each token is replaced by its id from the
    vocabulary file ``config.dict_file_path`` (one ``<word> <id>`` pair per
    line, separated by a single space).

    Rows whose Context or Utterance holds ``config.max_sequence`` tokens or
    more are dropped; the remaining rows are padded to ``config.max_sequence``
    and grouped into batches of ``config.train_batch``.  The stream repeats
    indefinitely.

    Attributes:
        word_dict: Python-side copy of the vocabulary (word -> id as float).
        UNK_ID: id of the ``<unk>`` vocabulary entry; also returned by the
            lookup table for words missing from the vocabulary.
        lookup_table: TensorFlow hash table performing the word -> id lookup.
        iterator: initializable iterator over the batched dataset.
    """

    def __init__(self,config):
        """Load the vocabulary and assemble the (uninitialized) pipeline.

        Args:
            config: object exposing ``train_file_path``, ``dict_file_path``,
                ``max_sequence`` and ``train_batch``.

        Raises:
            KeyError: if the vocabulary file has no ``<unk>`` entry.
        """
        self.config = config
        self.train_file_path  = config.train_file_path
        self.dict_file_path = config.dict_file_path
        self._CSV_COLUMN_DEFAULTS = [[''],[''],[1]]
        self._CSV_CLOLUMNS = ['Context','Utterance','Label']

        # Python-side copy of the vocabulary; iterate the file lazily and
        # split every line only once.
        self.word_dict = {}
        with open(self.dict_file_path, 'r') as dict_file:
            for entry in dict_file:
                fields = entry.split(' ')
                self.word_dict[fields[0]] = float(fields[1])
        print("word_dict size:", len(self.word_dict))
        self.UNK_ID = self.word_dict['<unk>']

        kv_initializer = tf.contrib.lookup.TextFileInitializer(
            self.dict_file_path,tf.string,0,tf.float32,1,delimiter=" ")
        # Unknown words map to <unk>, exactly as test_reader does; the
        # previous default of 0 is also the value the model masks
        # (Embedding(mask_zero=True)).
        self.lookup_table = tf.contrib.lookup.HashTable(
            kv_initializer,self.UNK_ID)

        max_len = self.config.max_sequence
        dataset = tf.data.TextLineDataset(self.train_file_path)
        # Drop the header BEFORE repeating; skipping after repeat() removes
        # it only on the first pass, so later passes would feed the header
        # row into decode_csv.
        dataset = dataset.skip(1)
        dataset = dataset.repeat()
        dataset = dataset.map(self.parseCSVLine)
        dataset = dataset.map(self.lookUpDict)
        dataset = dataset.filter(
            lambda line: tf.size(line['Context']) < max_len)
        dataset = dataset.filter(
            lambda line: tf.size(line['Utterance']) < max_len)
        dataset = dataset.padded_batch(
            self.config.train_batch,
            padded_shapes={'Context':[max_len],
                           'Utterance':[max_len],
                           'Label':[1]})
        self.iterator = dataset.make_initializable_iterator()

    def parseCSVLine(self,value):
        """Decode one CSV line into a ``{column name: tensor}`` dict."""
        columns = tf.decode_csv(value,self._CSV_COLUMN_DEFAULTS)
        return dict(zip(self._CSV_CLOLUMNS,columns))

    def _tokens_to_ids(self,text):
        """Split a scalar string tensor on spaces and look up every token."""
        # string_split expects a 1-D batch of strings, hence the one-element
        # list; the delimiter is a separate argument.
        tokens = tf.string_split([text]," ")
        return self.lookup_table.lookup(tokens).values

    def lookUpDict(self,value):
        """Replace the text columns by id vectors and the label by a float.

        The dict is updated in place and returned.
        """
        value['Context'] = self._tokens_to_ids(value['Context'])
        value['Utterance'] = self._tokens_to_ids(value['Utterance'])
        # Wrapped in a list so the label has shape [1], as padded_batch expects.
        value['Label'] = [tf.cast(value['Label'],tf.float32)]
        return value

    def init_reader(self,sess):
        """Run the lookup table's init op in ``sess``."""
        sess.run(self.lookup_table.init)

    def epoch_input(self):
        """Return the dataset iterator (the caller runs its initializer)."""
        return self.iterator


In [17]:

class test_reader(object):
    """Evaluation input pipeline: CSV rows -> padded batches of word ids.

    Every data row of ``config.test_file_path`` holds eleven text columns:
    ``Context``, ``GroundTruth`` and ``D0`` .. ``D8``.  Each column is split
    on single spaces and every token is replaced by its id from the
    vocabulary file ``config.dict_file_path`` (one ``<word> <id>`` pair per
    line, separated by a single space); unknown words map to ``<unk>``.

    Rows in which any column holds ``config.max_sequence`` tokens or more are
    dropped; the remaining rows are padded to ``config.max_sequence`` and
    grouped into batches of ``config.test_batch``.  The stream repeats
    indefinitely.

    Attributes:
        word_dict: Python-side copy of the vocabulary (word -> int id).
        UNK_ID: id of the ``<unk>`` vocabulary entry.
        lookup_table: TensorFlow hash table performing the word -> id lookup.
        iterator: initializable iterator over the batched dataset.
    """

    def __init__(self,config):
        """Load the vocabulary and assemble the (uninitialized) pipeline.

        Args:
            config: object exposing ``test_file_path``, ``dict_file_path``,
                ``max_sequence`` and ``test_batch``.

        Raises:
            KeyError: if the vocabulary file has no ``<unk>`` entry.
        """
        self.config = config
        self.test_file_path  = config.test_file_path
        self.dict_file_path = config.dict_file_path
        self._CSV_COLUMN_DEFAULTS = [[''],[''],[''],[''],['']
                                    ,[''],[''],[''],[''],[''],['']]
        self._CSV_CLOLUMNS = ['Context','GroundTruth','D0'
                             ,'D1','D2','D3','D4','D5','D6'
                             ,'D7','D8']

        # Python-side copy of the vocabulary; iterate the file lazily and
        # split every line only once.
        self.word_dict = {}
        with open(self.dict_file_path, 'r') as dict_file:
            for entry in dict_file:
                fields = entry.split(' ')
                self.word_dict[fields[0]] = int(fields[1])
        print("word_dict size:", len(self.word_dict))
        self.UNK_ID = self.word_dict['<unk>']

        kv_initializer = tf.contrib.lookup.TextFileInitializer(
            self.dict_file_path,tf.string,0,tf.float32,1,delimiter=" ")
        self.lookup_table = tf.contrib.lookup.HashTable(
            kv_initializer,self.UNK_ID)

        dataset = tf.data.TextLineDataset(self.test_file_path)
        dataset = dataset.skip(1)      # drop the header once ...
        dataset = dataset.repeat()     # ... then cycle over the data rows
        dataset = dataset.map(self.parseCSVLine)
        dataset = dataset.map(self.lookUpDict)

        # One length filter per column.  Each predicate is built by a helper
        # so it is bound to its own column instead of sharing the loop
        # variable.
        for column in self._CSV_CLOLUMNS:
            dataset = dataset.filter(self._shorter_than_limit(column))

        padded_shapes = {column: [self.config.max_sequence]
                         for column in self._CSV_CLOLUMNS}
        dataset = dataset.padded_batch(self.config.test_batch,
            padded_shapes=padded_shapes)
        self.iterator = dataset.make_initializable_iterator()

    def _shorter_than_limit(self,column):
        """Build a predicate keeping rows whose ``column`` is short enough."""
        limit = self.config.max_sequence
        return lambda line: tf.size(line[column]) < limit

    def parseCSVLine(self,value):
        """Decode one CSV line into a ``{column name: tensor}`` dict."""
        columns = tf.decode_csv(value,self._CSV_COLUMN_DEFAULTS)
        return dict(zip(self._CSV_CLOLUMNS,columns))

    def _tokens_to_ids(self,text):
        """Split a scalar string tensor on spaces and look up every token."""
        # string_split expects a 1-D batch of strings, hence the one-element
        # list; the delimiter is a separate argument.
        tokens = tf.string_split([text]," ")
        return self.lookup_table.lookup(tokens).values

    def lookUpDict(self,value):
        """Replace every column of ``value`` by its vector of word ids.

        The dict is updated in place and returned.
        """
        for column in list(value):
            value[column] = self._tokens_to_ids(value[column])
        return value

    def init_reader(self,sess):
        """Run the lookup table's init op in ``sess``."""
        sess.run(self.lookup_table.init)

    def epoch_input(self):
        """Return the dataset iterator (the caller runs its initializer)."""
        return self.iterator
    

In [5]:
import numpy as np
# --- Pre-trained GloVe vectors: word -> float32 vector -----------------------
embedding_file = "./glove.6B.100d.txt"
embeddings_index = {}
with open(embedding_file, 'r') as glove_handle:
    for glove_line in glove_handle:
        tokens = glove_line.split()
        glove_word = tokens[0]
        try:
            vector = np.asarray(tokens[1:], dtype='float32')
        except ValueError:
            # Lines whose tail does not parse as floats are skipped on purpose.
            continue
        embeddings_index[glove_word] = vector

# --- Tokenizer vocabulary: word -> integer id --------------------------------
dict_file_path = "./tokenizer.dict"
word_dict = {}
with open(dict_file_path, 'r') as dict_handle:
    for dict_line in dict_handle.readlines():
        pieces = dict_line.split(' ')
        word_dict[pieces[0]] = int(pieces[1])
print("word_dict size:", len(word_dict))

# --- Embedding matrix: row i holds the GloVe vector of the word with id i ----
MAX_NB_WORDS = len(embeddings_index)
num_words = 1 + min(MAX_NB_WORDS, len(word_dict))
embedding_matrix = np.zeros((num_words , 100))

# NOTE(review): ids >= MAX_NB_WORDS get no row here, yet the lookup tables
# built from the same dictionary file can still emit them -- confirm that the
# Embedding layer never receives an id outside the matrix.
for word, i in word_dict.items():
    # Rows of words without a GloVe vector (and row 0 if unused) stay all-zero.
    if i < MAX_NB_WORDS and word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

word_dict size: 434055


In [19]:
class config(object):
    """Hyper-parameters and file locations shared by the notebook.

    Every setting has a default and can be replaced per instance through a
    keyword argument, e.g. ``config(test_steps=10)``.

    Raises:
        TypeError: if a keyword does not name an existing setting.
    """

    def __init__(self, **overrides):
        self.train_batch = 512      # rows per training batch
        self.test_batch = 2         # rows per evaluation batch
        self.max_sequence = 160     # sequences are filtered/padded to this length
        self.dict_file_path = "./tokenizer.dict"
        self.train_file_path = "../data/train.csv"
        self.test_file_path = "../data/test.csv"
        self.embedding_dim = 100    # width of the word embeddings
        self.lstm_units = 200       # size of the LSTM encoder state
        self.epochs = 1
        self.steps_per_epoch = 100  # training batches per epoch
        self.test_steps = 2         # batches predicted per candidate column
        self.weight_path = './saved_wt.h5'  # weights file loaded by test_model

        # Apply caller overrides last; refuse unknown names so a typo cannot
        # silently create a setting nobody reads.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError(
                    "config got an unexpected setting %r" % (name,))
            setattr(self, name, value)

m_config = config()

In [10]:
import keras
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Bidirectional
from keras.layers import Embedding, LSTM, Input
from keras.models import Sequential
from keras.models import Model
from keras.optimizers import SGD
from keras import backend as K
from keras.layers import merge
from keras.utils import multi_gpu_model


# --- Input pipeline -----------------------------------------------------------
# A single reader and a single get_next() supply context, response and label,
# so all three tensors always come from the same batch and the training file
# is read once per step instead of three times.
train_reader = reader(m_config)
# Former per-tensor reader names, kept as aliases for any cell still using them.
train_xc_textline = train_xr_textline = train_xl_textline = train_reader
train_session = K.get_session()
train_reader.init_reader(train_session)
train_session.run(train_reader.epoch_input().initializer)
train_batch = train_reader.epoch_input().get_next()
train_c = train_batch['Context']
train_r = train_batch['Utterance']
train_l = train_batch['Label']

# --- Shared encoder: embedding followed by an LSTM ------------------------------
encoder = Sequential()
embeddin_layer = Embedding(output_dim=m_config.embedding_dim,
                        # Sized from the matrix passed as weights so the two
                        # cannot disagree.
                        input_dim=embedding_matrix.shape[0],
                        input_length=m_config.max_sequence,
                        weights=[embedding_matrix],
                        mask_zero=True,
                        trainable=True)
lstm_layer = LSTM(units=m_config.lstm_units)
encoder.add(embeddin_layer)
encoder.add(lstm_layer)

# --- Dual encoder: the same encoder is applied to context and response ----------
context_input = Input(shape=(m_config.max_sequence,),tensor=train_c)
response_input = Input(shape=(m_config.max_sequence,),tensor=train_r)

context_branch = encoder(context_input)
response_branch = encoder(response_input)

# Dot product of the two encodings, squashed to a match probability.
concatenated = keras.layers.Dot(axes=1)([context_branch, response_branch])
out = Dense((1), activation = "sigmoid") (concatenated)
dual_encoder = Model([context_input, response_input], out)
dual_encoder.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'],
                target_tensors = [train_l])
dual_encoder.summary()
# Inputs and targets are graph tensors, so fit() only needs the step counts.
history = dual_encoder.fit(epochs = m_config.epochs,
                           steps_per_epoch = m_config.steps_per_epoch)
# Persist the trained weights: test_model() loads them from this path.
dual_encoder.save_weights(m_config.weight_path)

word_dict size: 434055
word_dict size: 434055
word_dict size: 434055
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 160)          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 160)          0                                            
__________________________________________________________________________________________________
sequential_3 (Sequential)       (None, 200)          40240900    input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
dot_3 (Dot)                     (None, 1

<keras.callbacks.History at 0x7fd224d1ed68>

In [15]:
def test_model(key,config):
    """Score the test contexts against one candidate-response column.

    Resets the Keras session, rebuilds the dual-encoder architecture, loads
    the weights stored at ``config.weight_path`` and predicts
    ``config.test_steps`` batches read from ``config.test_file_path``.

    Args:
        key: name of the test-set column used as the response, e.g.
            ``'GroundTruth'`` or ``'D0'``.
        config: settings object; reads ``embedding_dim``, ``max_sequence``,
            ``lstm_units``, ``weight_path`` and ``test_steps`` (plus whatever
            ``test_reader`` needs).

    Returns:
        The array returned by ``Model.predict`` for the predicted batches.
    """
    K.clear_session()

    # One reader and one get_next() feed both inputs, so context and response
    # always belong to the same rows and the file is read only once per step.
    test_input = test_reader(config)
    session = K.get_session()
    test_input.init_reader(session)
    session.run(test_input.epoch_input().initializer)
    test_batch = test_input.epoch_input().get_next()
    test_c = test_batch['Context']
    test_r = test_batch[key]

    test_encoder = Sequential()
    embeddin_layer = Embedding(output_dim=config.embedding_dim,
                            # Sized from the matrix passed as weights so the
                            # two cannot disagree.
                            input_dim=embedding_matrix.shape[0],
                            input_length=config.max_sequence,
                            weights=[embedding_matrix],
                            mask_zero=True,
                            trainable=True)
    lstm_layer = LSTM(units=config.lstm_units)
    test_encoder.add(embeddin_layer)
    test_encoder.add(lstm_layer)

    test_context_input = Input(shape=(config.max_sequence,),tensor=test_c)
    test_response_input = Input(shape=(config.max_sequence,),tensor=test_r)

    test_context_branch = test_encoder(test_context_input)
    test_response_branch = test_encoder(test_response_input)

    test_concatenated = keras.layers.Dot(axes=1)(
                                [test_context_branch,
                                 test_response_branch])
    test_out = Dense((1), activation = "sigmoid") (test_concatenated)
    test_dual_encoder = Model([test_context_input, test_response_input], test_out)
    # Replace the freshly initialised weights by the stored ones.
    test_dual_encoder.load_weights(config.weight_path)
    test_dual_encoder.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])

    # The inputs are graph tensors, hence x=None and an explicit step count.
    predictions = test_dual_encoder.predict(x=None,steps=config.test_steps)
    return predictions

In [20]:
# Response columns to score: the true reply first, then the nine distractors.
key_list = [
    'GroundTruth',
    'D0', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8',
]

# res[j] receives the predictions obtained with column key_list[j] as response.
res = []
for key in key_list:
    print(key)
    column_scores = test_model(key, m_config)
    res.append(column_scores)



GroundTruth
word_dict size: 434055
word_dict size: 434055
D0
word_dict size: 434055
word_dict size: 434055
D1
word_dict size: 434055
word_dict size: 434055
D2
word_dict size: 434055
word_dict size: 434055
D3
word_dict size: 434055
word_dict size: 434055
D4
word_dict size: 434055
word_dict size: 434055
D5
word_dict size: 434055
word_dict size: 434055
D6
word_dict size: 434055
word_dict size: 434055
D7
word_dict size: 434055
word_dict size: 434055
D8
word_dict size: 434055
word_dict size: 434055
0.25


In [24]:
# Stack the per-column predictions; axis 0 follows key_list, so index 0 is the
# GroundTruth column.
res = np.array(res)

# An example is a hit when candidate 0 is among the 8 highest-scored of its
# 10 candidates (argsort is ascending, so the last 8 positions are the top 8).
num_examples = len(res[0])
count = sum(
    1
    for i in range(num_examples)
    if 0 in np.argsort(res[:, i].reshape(10))[-8:]
)

# Fraction of hits over the number of examples requested from the test reader.
print(count/(m_config.test_batch*m_config.test_steps))

1.0
